#!/usr/bin/env node
/**
 * Dump the DOM of a URL using Chrome/Puppeteer.
 *
 * Requires a Chrome session (from the chrome plugin) and connects to it via CDP.
 *
 * Usage: on_Snapshot__53_dom.js --url=<url> --snapshot-id=<uuid>
 * Output: Writes dom/output.html
 *
 * Environment variables:
 *   DOM_ENABLED: Enable DOM extraction (default: true)
 */
  const fs = require('fs');
  const path = require('path');

  // Add NODE_MODULES_DIR to module resolution paths if set, so the requires
  // below can also be resolved from that directory.
  if (process.env.NODE_MODULES_DIR) {
    module.paths.unshift(process.env.NODE_MODULES_DIR);
  }

  const {
    getEnvBool,
    parseArgs,
    readCdpUrl,
  } = require('../chrome/chrome_utils.js');

  // Check if DOM is enabled BEFORE requiring puppeteer: when the plugin is
  // disabled the script exits here and puppeteer-core is never loaded.
  if (!getEnvBool('DOM_ENABLED', true)) {
    console.error('Skipping DOM (DOM_ENABLED=False)');
    // Temporary failure (config disabled) - NO JSONL emission
    process.exit(0);
  }

  // Now safe to require puppeteer
  const puppeteer = require('puppeteer-core');

  // Extractor metadata
  const PLUGIN_NAME = 'dom';
  // The DOM dump is written to OUTPUT_DIR/OUTPUT_FILE, relative to the current
  // working directory.
  const OUTPUT_DIR = '.';
  const OUTPUT_FILE = 'output.html';
  // Directory the chrome plugin's session files are read from.
  const CHROME_SESSION_DIR = '../chrome';

  // Directory inspected to check whether the staticfile extractor already
  // downloaded this URL.
  const STATICFILE_DIR = '../staticfile';
  /**
   * Report whether the staticfile extractor already recorded a successful result.
   *
   * Reads `stdout.log` inside STATICFILE_DIR and scans it line by line for a
   * JSON object with `type === 'ArchiveResult'` and `status === 'succeeded'`.
   *
   * @returns {boolean} true if such a record is found; false if the log is
   *   missing or contains no matching record.
   * @throws {Error} if the log exists but cannot be read for a reason other
   *   than it having disappeared (e.g. permissions).
   */
  function hasStaticFileOutput() {
    const stdoutPath = path.join(STATICFILE_DIR, 'stdout.log');
    // A missing directory also makes this false, so no separate directory check.
    if (!fs.existsSync(stdoutPath)) return false;

    let stdout;
    try {
      stdout = fs.readFileSync(stdoutPath, 'utf8');
    } catch (err) {
      // The file can vanish between the existence check and the read; treat
      // that the same as "no staticfile output" instead of failing the hook.
      if (err && err.code === 'ENOENT') return false;
      throw err;
    }

    for (const line of stdout.split('\n')) {
      const trimmed = line.trim();
      // Only lines that look like JSON objects can be result records.
      if (!trimmed.startsWith('{')) continue;
      try {
        const record = JSON.parse(trimmed);
        if (record.type === 'ArchiveResult' && record.status === 'succeeded') {
          return true;
        }
      } catch (e) {
        // Deliberately ignored: the log may contain lines that start with '{'
        // but are not valid JSON; they simply are not result records.
      }
    }
    return false;
  }
  /**
   * Wait for the chrome tab to be fully loaded.
   *
   * Polls for the existence of `navigation.json` inside CHROME_SESSION_DIR.
   * Only the presence of the file is checked; its contents are not read.
   * The file is always checked at least once (even with a zero timeout) and
   * once more when the deadline is reached.
   *
   * @param {number} [timeoutMs=60000] Maximum time to wait, in milliseconds.
   * @param {number} [pollIntervalMs=100] Delay between checks, in milliseconds.
   * @returns {Promise<boolean>} true as soon as the file exists, false if it
   *   still does not exist when the timeout elapses.
   */
  async function waitForChromeTabLoaded(timeoutMs = 60000, pollIntervalMs = 100) {
    const navigationFile = path.join(CHROME_SESSION_DIR, 'navigation.json');
    const deadline = Date.now() + timeoutMs;

    for (;;) {
      if (fs.existsSync(navigationFile)) {
        return true;
      }
      const remainingMs = deadline - Date.now();
      if (remainingMs <= 0) {
        return false;
      }
      // Never sleep past the deadline, so the final check happens on time.
      const delayMs = Math.min(pollIntervalMs, remainingMs);
      await new Promise(resolve => setTimeout(resolve, delayMs));
    }
  }
  /**
   * Save the DOM of a page from the existing Chrome session to OUTPUT_FILE.
   *
   * Connects to the browser at the CDP URL returned by readCdpUrl, picks the
   * first page whose URL starts with "http" (falling back to the first page,
   * or a newly created one if the browser reports no pages), and writes
   * `page.content()` to OUTPUT_DIR/OUTPUT_FILE. Content of 100 characters or
   * fewer is treated as a failure and nothing is written.
   *
   * The browser is only disconnected from, never closed.
   *
   * @param {string} url URL being snapshotted. Not used for page selection in
   *   this function; the page is taken from the Chrome session as described.
   * @returns {Promise<{success: boolean, output?: string, error?: string}>}
   *   `{ success: true, output }` with the written path, or
   *   `{ success: false, error }` describing why nothing was written. Errors
   *   raised while connecting or reading are reported this way, not thrown.
   */
  async function dumpDom(url) {
    // Output directory is current directory (hook already runs in output dir)
    const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
    let browser = null;

    try {
      // Connect to existing Chrome session (required)
      const cdpUrl = readCdpUrl(CHROME_SESSION_DIR);
      if (!cdpUrl) {
        return { success: false, error: 'No Chrome session found (chrome plugin must run first)' };
      }

      browser = await puppeteer.connect({
        browserWSEndpoint: cdpUrl,
        defaultViewport: null,
      });

      // Get existing pages or create new one
      const pages = await browser.pages();
      let page = pages.find(p => p.url().startsWith('http')) || pages[0];
      if (!page) {
        page = await browser.newPage();
      }

      // Get the full DOM content
      const domContent = await page.content();
      if (!domContent || domContent.length <= 100) {
        return { success: false, error: 'DOM content too short or empty' };
      }

      fs.writeFileSync(outputPath, domContent, 'utf8');
      return { success: true, output: outputPath };
    } catch (e) {
      // Anything can be thrown; only Error instances reliably have name/message.
      const reason = e instanceof Error ? `${e.name}: ${e.message}` : String(e);
      return { success: false, error: reason };
    } finally {
      if (browser) {
        try {
          // disconnect() may return a promise; await it so the connection is
          // released before the caller continues (and possibly exits).
          await browser.disconnect();
        } catch (disconnectError) {
          // Best effort: a failed disconnect must not replace the result above.
          console.error(`Warning: failed to disconnect from Chrome: ${disconnectError}`);
        }
      }
    }
  }
  /**
   * Command-line entry point for the DOM extractor hook.
   *
   * Reads `url` and `snapshot_id` from parseArgs(), then:
   *  - exits 1 with a usage message if either is missing;
   *  - emits a `skipped` ArchiveResult on stdout and exits 0 if the staticfile
   *    extractor already recorded a success;
   *  - otherwise requires a Chrome session, waits for the navigation marker,
   *    dumps the DOM, emits a `succeeded` ArchiveResult on stdout and exits 0.
   *
   * Any failure is logged to stderr and exits 1 without emitting JSONL.
   * Human-readable messages go to stderr; stdout carries only the JSON record.
   *
   * @returns {Promise<void>} Every handled path ends in process.exit().
   */
  async function main() {
    // Single source of truth for the wait and for the message that reports it.
    const PAGE_LOAD_TIMEOUT_MS = 60000;

    const args = parseArgs();
    const url = args.url;
    const snapshotId = args.snapshot_id;

    if (!url || !snapshotId) {
      console.error('Usage: on_Snapshot__53_dom.js --url=<url> --snapshot-id=<uuid>');
      process.exit(1);
    }

    try {
      // Check if staticfile extractor already handled this (permanent skip)
      if (hasStaticFileOutput()) {
        console.error(`Skipping DOM - staticfile extractor already downloaded this`);
        // Permanent skip - emit ArchiveResult with status='skipped'
        console.log(JSON.stringify({
          type: 'ArchiveResult',
          status: 'skipped',
          output_str: 'staticfile already handled',
        }));
        process.exit(0);
      }

      // Fail fast, before waiting for the page, if there is no session at all.
      const cdpUrl = readCdpUrl(CHROME_SESSION_DIR);
      if (!cdpUrl) {
        throw new Error('No Chrome session found (chrome plugin must run first)');
      }

      // Wait for page to be fully loaded
      const pageLoaded = await waitForChromeTabLoaded(PAGE_LOAD_TIMEOUT_MS);
      if (!pageLoaded) {
        throw new Error(`Page not loaded after ${PAGE_LOAD_TIMEOUT_MS / 1000}s (chrome_navigate must complete first)`);
      }

      const result = await dumpDom(url);
      if (!result.success) {
        // Transient error - emit NO JSONL
        console.error(`ERROR: ${result.error}`);
        process.exit(1);
      }

      // Success - emit ArchiveResult
      const size = fs.statSync(result.output).size;
      console.error(`DOM saved (${size} bytes)`);
      console.log(JSON.stringify({
        type: 'ArchiveResult',
        status: 'succeeded',
        output_str: result.output,
      }));
      process.exit(0);
    } catch (e) {
      // Transient error - emit NO JSONL
      console.error(`ERROR: ${e.name}: ${e.message}`);
      process.exit(1);
    }
  }
  // Run the hook. main() handles its own errors; this is the last-resort
  // handler for anything that escapes it, so the process still exits non-zero.
  main().catch(e => {
    // A rejection reason is not guaranteed to be an Error instance.
    const detail = e instanceof Error ? e.message : String(e);
    console.error(`Fatal error: ${detail}`);
    process.exit(1);
  });