on_Snapshot__10_chrome_tab.bg.js 8.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264
  1. #!/usr/bin/env node
  2. /**
  3. * Create a Chrome tab for this snapshot in the shared crawl Chrome session.
  4. *
  5. * Connects to the crawl-level Chrome session (from on_Crawl__90_chrome_launch.bg.js)
  6. * and creates a new tab. This hook does NOT launch its own Chrome instance.
  7. *
  8. * Usage: on_Snapshot__10_chrome_tab.bg.js --url=<url> --snapshot-id=<uuid> --crawl-id=<uuid>
  9. * Output: Creates chrome/ directory under snapshot output dir with:
  10. * - cdp_url.txt: WebSocket URL for CDP connection
  11. * - chrome.pid: Chrome process ID (from crawl)
  12. * - target_id.txt: Target ID of this snapshot's tab
  13. * - url.txt: The URL to be navigated to
  14. *
  15. * Environment variables:
  16. * CRAWL_OUTPUT_DIR: Crawl output directory (to find crawl's Chrome session)
  17. * CHROME_BINARY: Path to Chromium binary (optional, for version info)
  18. *
  19. * This is a background hook that stays alive until SIGTERM so the tab
  20. * can be closed cleanly at the end of the snapshot run.
  21. */
  22. const fs = require('fs');
  23. const path = require('path');
  24. const { execSync } = require('child_process');
  25. // Add NODE_MODULES_DIR to module resolution paths if set
  26. if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
  27. const puppeteer = require('puppeteer');
  28. const { getEnv, getEnvInt } = require('./chrome_utils.js');
// Extractor metadata
const PLUGIN_NAME = 'chrome_tab';
const OUTPUT_DIR = '.'; // Hook already runs in chrome/ output directory
const CHROME_SESSION_DIR = '.'; // NOTE(review): not referenced in this file — confirm it is used elsewhere or dead
const CHROME_SESSION_REQUIRED_ERROR = 'No Chrome session found (chrome plugin must run first)';
// Mutable result state consumed by emitResult() / cleanup() at shutdown.
let finalStatus = 'failed';   // flipped to 'succeeded' once the tab is created
let finalOutput = '';         // output path reported on success
let finalError = '';          // error text reported on failure
let cmdVersion = '';          // Chrome --version string, when obtainable
let finalized = false;        // ensures the ArchiveResult JSON is printed at most once
  39. // Parse command line arguments
  40. function parseArgs() {
  41. const args = {};
  42. process.argv.slice(2).forEach(arg => {
  43. if (arg.startsWith('--')) {
  44. const [key, ...valueParts] = arg.slice(2).split('=');
  45. args[key.replace(/-/g, '_')] = valueParts.join('=') || true;
  46. }
  47. });
  48. return args;
  49. }
  50. function emitResult(statusOverride) {
  51. if (finalized) return;
  52. finalized = true;
  53. const status = statusOverride || finalStatus;
  54. const outputStr = status === 'succeeded'
  55. ? finalOutput
  56. : (finalError || finalOutput || '');
  57. const result = {
  58. type: 'ArchiveResult',
  59. status,
  60. output_str: outputStr,
  61. };
  62. if (cmdVersion) {
  63. result.cmd_version = cmdVersion;
  64. }
  65. console.log(JSON.stringify(result));
  66. }
  67. // Cleanup handler for SIGTERM - close this snapshot's tab
  68. async function cleanup(signal) {
  69. if (signal) {
  70. console.error(`\nReceived ${signal}, closing chrome tab...`);
  71. }
  72. try {
  73. const cdpFile = path.join(OUTPUT_DIR, 'cdp_url.txt');
  74. const targetIdFile = path.join(OUTPUT_DIR, 'target_id.txt');
  75. if (fs.existsSync(cdpFile) && fs.existsSync(targetIdFile)) {
  76. const cdpUrl = fs.readFileSync(cdpFile, 'utf8').trim();
  77. const targetId = fs.readFileSync(targetIdFile, 'utf8').trim();
  78. const browser = await puppeteer.connect({ browserWSEndpoint: cdpUrl });
  79. const pages = await browser.pages();
  80. const page = pages.find(p => p.target()._targetId === targetId);
  81. if (page) {
  82. await page.close();
  83. }
  84. browser.disconnect();
  85. }
  86. } catch (e) {
  87. // Best effort
  88. }
  89. emitResult();
  90. process.exit(finalStatus === 'succeeded' ? 0 : 1);
  91. }
// Register signal handlers so the tab is closed and an ArchiveResult is
// emitted when the supervisor terminates this background hook.
process.on('SIGTERM', () => cleanup('SIGTERM'));
process.on('SIGINT', () => cleanup('SIGINT'));
  95. // Try to find the crawl's Chrome session
  96. function getCrawlChromeSession() {
  97. // Use CRAWL_OUTPUT_DIR env var set by get_config() in configset.py
  98. const crawlOutputDir = getEnv('CRAWL_OUTPUT_DIR', '');
  99. if (!crawlOutputDir) {
  100. throw new Error(CHROME_SESSION_REQUIRED_ERROR);
  101. }
  102. const crawlChromeDir = path.join(crawlOutputDir, 'chrome');
  103. const cdpFile = path.join(crawlChromeDir, 'cdp_url.txt');
  104. const pidFile = path.join(crawlChromeDir, 'chrome.pid');
  105. if (!fs.existsSync(cdpFile)) {
  106. throw new Error(CHROME_SESSION_REQUIRED_ERROR);
  107. }
  108. if (!fs.existsSync(pidFile)) {
  109. throw new Error(CHROME_SESSION_REQUIRED_ERROR);
  110. }
  111. const cdpUrl = fs.readFileSync(cdpFile, 'utf-8').trim();
  112. const pid = parseInt(fs.readFileSync(pidFile, 'utf-8').trim(), 10);
  113. if (!cdpUrl) {
  114. throw new Error(CHROME_SESSION_REQUIRED_ERROR);
  115. }
  116. if (!pid || Number.isNaN(pid)) {
  117. throw new Error(CHROME_SESSION_REQUIRED_ERROR);
  118. }
  119. // Verify the process is still running
  120. try {
  121. process.kill(pid, 0); // Signal 0 = check if process exists
  122. } catch (e) {
  123. throw new Error(CHROME_SESSION_REQUIRED_ERROR);
  124. }
  125. return { cdpUrl, pid };
  126. }
  127. async function waitForCrawlChromeSession(timeoutMs, intervalMs = 250) {
  128. const startTime = Date.now();
  129. let lastError = null;
  130. while (Date.now() - startTime < timeoutMs) {
  131. try {
  132. return getCrawlChromeSession();
  133. } catch (e) {
  134. lastError = e;
  135. }
  136. await new Promise(resolve => setTimeout(resolve, intervalMs));
  137. }
  138. if (lastError) {
  139. throw lastError;
  140. }
  141. throw new Error(CHROME_SESSION_REQUIRED_ERROR);
  142. }
  143. // Create a new tab in an existing Chrome session
  144. async function createTabInExistingChrome(cdpUrl, url, pid) {
  145. console.log(`[*] Connecting to existing Chrome session: ${cdpUrl}`);
  146. // Connect Puppeteer to the running Chrome
  147. const browser = await puppeteer.connect({
  148. browserWSEndpoint: cdpUrl,
  149. defaultViewport: null,
  150. });
  151. // Create a new tab for this snapshot
  152. const page = await browser.newPage();
  153. // Get the page target ID
  154. const target = page.target();
  155. const targetId = target._targetId;
  156. // Write session info
  157. fs.writeFileSync(path.join(OUTPUT_DIR, 'cdp_url.txt'), cdpUrl);
  158. fs.writeFileSync(path.join(OUTPUT_DIR, 'chrome.pid'), String(pid));
  159. fs.writeFileSync(path.join(OUTPUT_DIR, 'target_id.txt'), targetId);
  160. fs.writeFileSync(path.join(OUTPUT_DIR, 'url.txt'), url);
  161. // Disconnect Puppeteer (Chrome and tab stay alive)
  162. browser.disconnect();
  163. return { success: true, output: OUTPUT_DIR, cdpUrl, targetId, pid };
  164. }
  165. async function main() {
  166. const args = parseArgs();
  167. const url = args.url;
  168. const snapshotId = args.snapshot_id;
  169. const crawlId = args.crawl_id || getEnv('CRAWL_ID', '');
  170. if (!url || !snapshotId) {
  171. console.error('Usage: on_Snapshot__10_chrome_tab.bg.js --url=<url> --snapshot-id=<uuid> [--crawl-id=<uuid>]');
  172. process.exit(1);
  173. }
  174. let status = 'failed';
  175. let output = '';
  176. let error = '';
  177. let version = '';
  178. try {
  179. // Get Chrome version
  180. try {
  181. const binary = getEnv('CHROME_BINARY', '').trim();
  182. if (binary) {
  183. version = execSync(`"${binary}" --version`, { encoding: 'utf8', timeout: 5000 }).trim().slice(0, 64);
  184. }
  185. } catch (e) {
  186. version = '';
  187. }
  188. // Try to use existing crawl Chrome session (wait for readiness)
  189. const timeoutSeconds = getEnvInt('CHROME_TAB_TIMEOUT', getEnvInt('CHROME_TIMEOUT', getEnvInt('TIMEOUT', 60)));
  190. const crawlSession = await waitForCrawlChromeSession(timeoutSeconds * 1000);
  191. console.log(`[*] Found existing Chrome session from crawl ${crawlId}`);
  192. const result = await createTabInExistingChrome(crawlSession.cdpUrl, url, crawlSession.pid);
  193. if (result.success) {
  194. status = 'succeeded';
  195. output = result.output;
  196. console.log(`[+] Chrome tab ready`);
  197. console.log(`[+] CDP URL: ${result.cdpUrl}`);
  198. console.log(`[+] Page target ID: ${result.targetId}`);
  199. } else {
  200. status = 'failed';
  201. error = result.error;
  202. }
  203. } catch (e) {
  204. error = `${e.name}: ${e.message}`;
  205. status = 'failed';
  206. }
  207. if (error) {
  208. console.error(`ERROR: ${error}`);
  209. }
  210. finalStatus = status;
  211. finalOutput = output || '';
  212. finalError = error || '';
  213. cmdVersion = version || '';
  214. if (status !== 'succeeded') {
  215. emitResult(status);
  216. process.exit(1);
  217. }
  218. console.log('[*] Chrome tab created, waiting for cleanup signal...');
  219. await new Promise(() => {}); // Keep alive until SIGTERM
  220. }
  221. main().catch(e => {
  222. console.error(`Fatal error: ${e.message}`);
  223. process.exit(1);
  224. });