on_Snapshot__30_chrome_navigate.js 7.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225
  1. #!/usr/bin/env node
  2. /**
  3. * Navigate the Chrome browser to the target URL.
  4. *
  5. * This is a simple hook that ONLY navigates - nothing else.
  6. * Pre-load hooks (21-29) should set up their own CDP listeners.
  7. * Post-load hooks (31+) can then read from the loaded page.
  8. *
  9. * Usage: on_Snapshot__30_chrome_navigate.js --url=<url> --snapshot-id=<uuid>
  10. * Output: Writes navigation.json, plus page_loaded.txt and final_url.txt markers when navigation succeeds
  11. *
  12. * Environment variables:
  13. * CHROME_PAGELOAD_TIMEOUT: Timeout in seconds (falls back to CHROME_TIMEOUT, then TIMEOUT; default: 60)
  14. * CHROME_DELAY_AFTER_LOAD: Extra delay after load in seconds (default: 0)
  15. * CHROME_WAIT_FOR: Wait condition (default: networkidle2)
  16. */
  17. const fs = require('fs');
  18. const path = require('path');
  19. // Add NODE_MODULES_DIR to module resolution paths if set
  20. if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
  21. const puppeteer = require('puppeteer');
  22. const PLUGIN_NAME = 'chrome_navigate';
  23. const CHROME_SESSION_DIR = '.';
  24. const OUTPUT_DIR = '.';
  25. const CHROME_SESSION_REQUIRED_ERROR = 'No Chrome session found (chrome plugin must run first)';
  /**
   * Collect `--key=value` style options from the command line.
   *
   * Only arguments that start with `--` are considered. Dashes in the option
   * name are converted to underscores, everything after the first `=` is the
   * value (later `=` characters are preserved), and an option with a missing
   * or empty value is stored as `true`.
   *
   * @returns {Object<string, (string|boolean)>} Parsed options keyed by name.
   */
  function parseArgs() {
    const parsed = {};
    for (const token of process.argv.slice(2)) {
      if (!token.startsWith('--')) {
        continue;
      }
      const pieces = token.slice(2).split('=');
      const optionName = pieces[0].replace(/-/g, '_');
      const optionValue = pieces.slice(1).join('=');
      parsed[optionName] = optionValue || true;
    }
    return parsed;
  }
  /**
   * Read an environment variable as a trimmed string.
   *
   * @param {string} name - Environment variable to look up.
   * @param {string} [defaultValue=''] - Used when the variable is unset or empty.
   * @returns {string} The variable's value (or the default) with surrounding
   *   whitespace removed.
   */
  function getEnv(name, defaultValue = '') {
    const raw = process.env[name];
    const chosen = raw ? raw : defaultValue;
    return chosen.trim();
  }
  /**
   * Read an environment variable as a base-10 integer.
   *
   * @param {string} name - Environment variable to look up.
   * @param {number} [defaultValue=0] - Returned when the variable is unset,
   *   empty, or does not start with a parseable integer.
   * @returns {number} The parsed integer, or `defaultValue`.
   */
  function getEnvInt(name, defaultValue = 0) {
    const text = getEnv(name, String(defaultValue));
    const parsed = parseInt(text, 10);
    if (isNaN(parsed)) {
      return defaultValue;
    }
    return parsed;
  }
  /**
   * Read an environment variable as a floating-point number.
   *
   * @param {string} name - Environment variable to look up.
   * @param {number} [defaultValue=0] - Returned when the variable is unset,
   *   empty, or does not start with a parseable number.
   * @returns {number} The parsed number, or `defaultValue`.
   */
  function getEnvFloat(name, defaultValue = 0) {
    const text = getEnv(name, String(defaultValue));
    const parsed = parseFloat(text);
    if (isNaN(parsed)) {
      return defaultValue;
    }
    return parsed;
  }
  /**
   * Poll until both Chrome session files (`cdp_url.txt` and `target_id.txt`)
   * exist in CHROME_SESSION_DIR, or until the timeout elapses.
   *
   * The files are always checked at least once, even when `timeoutMs` is 0 or
   * negative, and one final check happens when the deadline is reached, so a
   * file that appears during the last sleep is not missed.
   *
   * @param {number} [timeoutMs=60000] - Maximum time to wait, in milliseconds.
   * @returns {Promise<boolean>} `true` once both files exist, `false` if the
   *   timeout elapsed first.
   */
  async function waitForChromeTabOpen(timeoutMs = 60000) {
    const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
    const targetIdFile = path.join(CHROME_SESSION_DIR, 'target_id.txt');
    const pollIntervalMs = 100;
    const deadline = Date.now() + timeoutMs;

    for (;;) {
      if (fs.existsSync(cdpFile) && fs.existsSync(targetIdFile)) {
        return true;
      }

      const remainingMs = deadline - Date.now();
      // Written as a negated comparison so a NaN timeout also stops the loop.
      if (!(remainingMs > 0)) {
        return false;
      }

      // Never sleep past the deadline; the next iteration does the last check.
      const waitMs = Math.min(pollIntervalMs, remainingMs);
      await new Promise(resolve => setTimeout(resolve, waitMs));
    }
  }
  /**
   * Read the Chrome DevTools endpoint recorded in `cdp_url.txt` inside
   * CHROME_SESSION_DIR.
   *
   * @returns {string|null} Trimmed file contents, or `null` when the file does
   *   not exist.
   */
  function getCdpUrl() {
    const location = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
    return fs.existsSync(location)
      ? fs.readFileSync(location, 'utf8').trim()
      : null;
  }
  /**
   * Read the target id recorded in `target_id.txt` inside CHROME_SESSION_DIR.
   *
   * @returns {string|null} Trimmed file contents, or `null` when the file does
   *   not exist.
   */
  function getPageId() {
    const location = path.join(CHROME_SESSION_DIR, 'target_id.txt');
    return fs.existsSync(location)
      ? fs.readFileSync(location, 'utf8').trim()
      : null;
  }
  /**
   * Resolve the `waitUntil` value to pass to the page navigation.
   *
   * Reads CHROME_WAIT_FOR (case-insensitive). Anything other than the four
   * recognised values falls back to `networkidle2`.
   *
   * @returns {string} One of `domcontentloaded`, `load`, `networkidle0`,
   *   `networkidle2`.
   */
  function getWaitCondition() {
    const requested = getEnv('CHROME_WAIT_FOR', 'networkidle2').toLowerCase();
    switch (requested) {
      case 'domcontentloaded':
      case 'load':
      case 'networkidle0':
      case 'networkidle2':
        return requested;
      default:
        return 'networkidle2';
    }
  }
  /**
   * Pause for a number of milliseconds.
   *
   * @param {number} ms - Delay passed to `setTimeout`.
   * @returns {Promise<void>} Resolves (with no value) once the timer fires.
   */
  function sleep(ms) {
    return new Promise((done) => {
      setTimeout(done, ms);
    });
  }
  /**
   * Connect to the running Chrome over CDP and load `url` in the session tab.
   *
   * The tab is chosen by matching the id from `target_id.txt`; when no id is
   * recorded or no page matches, the last page reported by the browser is
   * used. On success this writes `navigation.json`, `page_loaded.txt` and
   * `final_url.txt` into OUTPUT_DIR.
   *
   * The timeout comes from CHROME_PAGELOAD_TIMEOUT, then CHROME_TIMEOUT, then
   * TIMEOUT (default 60), in seconds. CHROME_DELAY_AFTER_LOAD (seconds) adds an
   * optional pause after the page has loaded.
   *
   * Failures during connect/navigation/writing are reported through the return
   * value rather than thrown. The CDP connection is released on every path,
   * including the early "no pages" return.
   *
   * @param {string} url - Address to navigate to.
   * @param {string} cdpUrl - Browser WebSocket endpoint to connect to.
   * @returns {Promise<Object>} On success
   *   `{ success: true, finalUrl, status, waitUntil, elapsed }`; on failure
   *   `{ success: false, error, waitUntil, elapsed }`. `elapsed` is in ms.
   */
  async function navigate(url, cdpUrl) {
    const timeout = (getEnvInt('CHROME_PAGELOAD_TIMEOUT') || getEnvInt('CHROME_TIMEOUT') || getEnvInt('TIMEOUT', 60)) * 1000;
    const delayAfterLoad = getEnvFloat('CHROME_DELAY_AFTER_LOAD', 0) * 1000;
    const waitUntil = getWaitCondition();
    const targetId = getPageId();

    let browser = null;
    const navStartTime = Date.now();

    try {
      browser = await puppeteer.connect({ browserWSEndpoint: cdpUrl });

      const pages = await browser.pages();
      if (pages.length === 0) {
        return { success: false, error: 'No pages found in browser', waitUntil, elapsed: Date.now() - navStartTime };
      }

      // Find page by target ID if available
      let page = null;
      if (targetId) {
        page = pages.find(p => {
          const target = p.target();
          return target && target._targetId === targetId;
        });
      }
      if (!page) {
        page = pages[pages.length - 1];
      }

      // Navigate
      console.log(`Navigating to ${url} (wait: ${waitUntil}, timeout: ${timeout}ms)`);
      const response = await page.goto(url, { waitUntil, timeout });

      // Optional delay
      if (delayAfterLoad > 0) {
        console.log(`Waiting ${delayAfterLoad}ms after load...`);
        await sleep(delayAfterLoad);
      }

      const finalUrl = page.url();
      // goto() may resolve without a response object, hence the null status.
      const status = response ? response.status() : null;
      const elapsed = Date.now() - navStartTime;

      // Write navigation state as JSON
      const navigationState = {
        waitUntil,
        elapsed,
        url,
        finalUrl,
        status,
        timestamp: new Date().toISOString()
      };
      fs.writeFileSync(path.join(OUTPUT_DIR, 'navigation.json'), JSON.stringify(navigationState, null, 2));

      // Write marker files for backwards compatibility
      fs.writeFileSync(path.join(OUTPUT_DIR, 'page_loaded.txt'), new Date().toISOString());
      fs.writeFileSync(path.join(OUTPUT_DIR, 'final_url.txt'), finalUrl);

      return { success: true, finalUrl, status, waitUntil, elapsed };
    } catch (e) {
      const elapsed = Date.now() - navStartTime;
      // Anything can be thrown; only Error instances carry name/message.
      const error = e instanceof Error ? `${e.name}: ${e.message}` : String(e);
      return { success: false, error, waitUntil, elapsed };
    } finally {
      // Single release point so every return path above drops the connection.
      if (browser) {
        try {
          await browser.disconnect();
        } catch (disconnectError) {
          // Best effort: a failed disconnect must not mask the navigation result.
          console.error(`WARN: failed to disconnect from Chrome: ${disconnectError}`);
        }
      }
    }
  }
  /**
   * Script entry point: validate arguments, wait for the Chrome session,
   * navigate, and report the outcome.
   *
   * Prints one JSON line of type `ArchiveResult` on stdout after navigation
   * was attempted. On a failed navigation the error details are also saved to
   * `navigation.json`. The process exits with 0 on success and 1 otherwise
   * (bad usage, missing Chrome session, or failed navigation).
   *
   * @returns {Promise<void>} Does not normally resolve; every path ends in
   *   `process.exit`.
   */
  async function main() {
    const args = parseArgs();
    const url = args.url;
    const snapshotId = args.snapshot_id;

    // parseArgs() stores `true` for a flag given without a value, so a bare
    // `--url` must be rejected here rather than passed on as a URL.
    if (typeof url !== 'string' || !snapshotId) {
      console.error('Usage: on_Snapshot__30_chrome_navigate.js --url=<url> --snapshot-id=<uuid>');
      process.exit(1);
    }

    let status = 'failed';
    let output = null;
    let error = '';

    // Wait for chrome tab to be open (up to 60s)
    const tabOpen = await waitForChromeTabOpen(60000);
    if (!tabOpen) {
      console.error(`ERROR: ${CHROME_SESSION_REQUIRED_ERROR}`);
      process.exit(1);
    }

    const cdpUrl = getCdpUrl();
    if (!cdpUrl) {
      console.error(`ERROR: ${CHROME_SESSION_REQUIRED_ERROR}`);
      process.exit(1);
    }

    const result = await navigate(url, cdpUrl);

    if (result.success) {
      status = 'succeeded';
      output = 'navigation.json';
      console.log(`Page loaded: ${result.finalUrl} (HTTP ${result.status}) in ${result.elapsed}ms (waitUntil: ${result.waitUntil})`);
    } else {
      error = result.error;
      // Save navigation state even on failure
      const navigationState = {
        waitUntil: result.waitUntil,
        elapsed: result.elapsed,
        url,
        error: result.error,
        timestamp: new Date().toISOString()
      };
      fs.writeFileSync(path.join(OUTPUT_DIR, 'navigation.json'), JSON.stringify(navigationState, null, 2));
    }

    if (error) console.error(`ERROR: ${error}`);

    // Output clean JSONL (no RESULT_JSON= prefix)
    console.log(JSON.stringify({
      type: 'ArchiveResult',
      status,
      output_str: output || error || '',
    }));

    process.exit(status === 'succeeded' ? 0 : 1);
  }
  // Run the hook; any rejection escaping main() is reported and turned into a
  // non-zero exit code.
  main().catch((fatal) => {
    const { message } = fatal;
    console.error(`Fatal error: ${message}`);
    process.exit(1);
  });