on_Crawl__90_chrome_launch.bg.js 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427
  1. #!/usr/bin/env node
  2. /**
  3. * Launch a shared Chromium browser session for the entire crawl.
  4. *
  5. * This runs once per crawl and keeps Chromium alive for all snapshots to share.
  6. * Each snapshot creates its own tab via on_Snapshot__10_chrome_tab.bg.js.
  7. *
  8. * NOTE: We use Chromium instead of Chrome because Chrome 137+ removed support for
  9. * --load-extension and --disable-extensions-except flags.
  10. *
  11. * Usage: on_Crawl__90_chrome_launch.bg.js --crawl-id=<uuid> --source-url=<url>
  12. * Output: Writes to current directory (executor creates chrome/ dir):
  13. * - cdp_url.txt: WebSocket URL for CDP connection
  14. * - chrome.pid: Chromium process ID (for cleanup)
  15. * - port.txt: Debug port number
  16. * - extensions.json: Loaded extensions metadata
  17. *
  18. * Environment variables:
  19. * NODE_MODULES_DIR: Path to node_modules directory for module resolution
  20. * CHROME_BINARY: Path to Chromium binary (falls back to auto-detection)
  21. * CHROME_RESOLUTION: Page resolution (default: 1440,2000)
  22. * CHROME_HEADLESS: Run in headless mode (default: true)
  23. * CHROME_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: true)
  24. * CHROME_EXTENSIONS_DIR: Directory containing Chrome extensions
  25. */
  // When NODE_MODULES_DIR is set, put it at the front of this module's
  // resolution paths so the require() calls below search it first.
  const { NODE_MODULES_DIR: extraModulesDir } = process.env;
  if (extraModulesDir) {
    module.paths.unshift(extraModulesDir);
  }
  30. const fs = require('fs');
  31. const path = require('path');
  32. const http = require('http');
  33. const puppeteer = require('puppeteer');
  34. const {
  35. findChromium,
  36. launchChromium,
  37. killChrome,
  38. getEnv,
  39. getEnvBool,
  40. getExtensionId,
  41. writePidWithMtime,
  42. getExtensionsDir,
  43. } = require('./chrome_utils.js');
  // Mutable state read by the signal cleanup handler.
  // chromePid: PID of the launched Chromium process (null until launch succeeds).
  let chromePid = null;
  // browserInstance: puppeteer connection, set only while one is open.
  let browserInstance = null;

  // Static metadata for this hook.
  const PLUGIN_NAME = 'chrome_launch';
  // Output files are written relative to the current working directory.
  const OUTPUT_DIR = '.';
  /**
   * Parse the text of a Netscape-format cookies.txt file.
   *
   * Every data line carries seven tab-separated fields:
   * domain, include-subdomains flag, path, secure flag, expiry, name, value.
   * A line prefixed with `#HttpOnly_` is a cookie flagged httpOnly; any other
   * line starting with `#` is a comment. Blank lines are ignored.
   *
   * @param {string} contents - Raw text of the cookies file.
   * @returns {{cookies: Array<Object>, skipped: number}} The parsed cookies
   *   (name, value, domain, path, secure, httpOnly and, when the expiry is a
   *   positive integer, expires) and the number of malformed lines skipped.
   */
  function parseCookiesTxt(contents) {
    const cookies = [];
    let skipped = 0;

    for (const rawLine of contents.split(/\r?\n/)) {
      if (!rawLine.trim()) continue;

      // Only strip LEADING whitespace before splitting. A full trim() would
      // also remove the trailing tab that precedes an empty value, turning a
      // valid seven-field line into six fields and dropping the cookie.
      let dataLine = rawLine.trimStart();
      let httpOnly = false;
      if (dataLine.startsWith('#HttpOnly_')) {
        httpOnly = true;
        dataLine = dataLine.slice('#HttpOnly_'.length);
      } else if (dataLine.startsWith('#')) {
        continue;
      }

      const parts = dataLine.split('\t');
      if (parts.length < 7) {
        skipped += 1;
        continue;
      }

      const [domainRaw, includeSubdomainsRaw, pathRaw, secureRaw, expiryRaw, name, valueRaw] = parts;
      if (!name || !domainRaw) {
        skipped += 1;
        continue;
      }

      // Trailing whitespace (including a stray \r) is not part of the value.
      const value = valueRaw.trimEnd();

      // Normalise the leading dot so it agrees with the include-subdomains flag.
      const includeSubdomains = (includeSubdomainsRaw || '').toUpperCase() === 'TRUE';
      let domain = domainRaw;
      if (includeSubdomains && !domain.startsWith('.')) domain = `.${domain}`;
      if (!includeSubdomains && domain.startsWith('.')) domain = domain.slice(1);

      const cookie = {
        name,
        value,
        domain,
        path: pathRaw || '/',
        secure: (secureRaw || '').toUpperCase() === 'TRUE',
        httpOnly,
      };

      // An expiry of 0 (or anything non-numeric) leaves `expires` unset.
      const expires = Number.parseInt(expiryRaw, 10);
      if (!Number.isNaN(expires) && expires > 0) {
        cookie.expires = expires;
      }

      cookies.push(cookie);
    }

    return { cookies, skipped };
  }
  /**
   * Import cookies from a Netscape-format cookies.txt file into the browser.
   *
   * Cookies are sent over a CDP session with `Network.setCookies` in batches
   * of 200. If a batch is rejected, its cookies are retried one at a time so
   * that a single bad cookie does not discard the rest of the batch. The
   * temporary page opened for the CDP session is always closed, even when the
   * import fails part-way.
   *
   * @param {Object} browser - Connected puppeteer browser (must provide newPage()).
   * @param {string} cookiesFile - Path to the cookies file; falsy means "nothing to do".
   * @param {string} userDataDir - Profile directory; only used to warn when unset.
   * @returns {Promise<void>}
   */
  async function importCookiesFromFile(browser, cookiesFile, userDataDir) {
    if (!cookiesFile) return;
    if (!fs.existsSync(cookiesFile)) {
      console.error(`[!] Cookies file not found: ${cookiesFile}`);
      return;
    }

    let contents = '';
    try {
      contents = fs.readFileSync(cookiesFile, 'utf-8');
    } catch (e) {
      console.error(`[!] Failed to read COOKIES_TXT_FILE: ${e.message}`);
      return;
    }

    const { cookies, skipped } = parseCookiesTxt(contents);
    if (cookies.length === 0) {
      console.error('[!] No cookies found to import');
      return;
    }

    console.error(`[*] Importing ${cookies.length} cookies from ${cookiesFile}...`);
    if (skipped) {
      console.error(`[*] Skipped ${skipped} malformed cookie line(s)`);
    }
    if (!userDataDir) {
      console.error('[!] CHROME_USER_DATA_DIR not set; cookies will not persist beyond this session');
    }

    const chunkSize = 200;
    let imported = 0;
    const page = await browser.newPage();
    try {
      const client = await page.target().createCDPSession();
      await client.send('Network.enable');

      for (let i = 0; i < cookies.length; i += chunkSize) {
        const chunk = cookies.slice(i, i + chunkSize);
        try {
          await client.send('Network.setCookies', { cookies: chunk });
          imported += chunk.length;
        } catch (e) {
          console.error(`[!] Failed to import cookies ${i + 1}-${i + chunk.length}: ${e.message}`);
          // A single-cookie batch has nothing left to salvage.
          if (chunk.length === 1) continue;
          // The batch call failed as a whole; retry each cookie on its own so
          // the valid ones are still imported.
          for (const cookie of chunk) {
            try {
              await client.send('Network.setCookies', { cookies: [cookie] });
              imported += 1;
            } catch (inner) {
              console.error(`[!] Skipping cookie "${cookie.name}" for ${cookie.domain}: ${inner.message}`);
            }
          }
        }
      }
    } finally {
      // Best-effort close: never let a close failure mask the original error.
      try {
        await page.close();
      } catch (e) {
        console.error(`[!] Failed to close cookie import page: ${e.message}`);
      }
    }

    console.error(`[+] Imported ${imported}/${cookies.length} cookies`);
  }
  /**
   * Extract the debug port from a CDP WebSocket URL.
   *
   * @param {string} cdpUrl - URL containing `:<port>/devtools/`.
   * @returns {?string} The port digits as a string, or null when the URL is
   *   empty or contains no `:<port>/devtools/` segment.
   */
  function getPortFromCdpUrl(cdpUrl) {
    if (!cdpUrl) {
      return null;
    }
    const portPattern = /:(\d+)\/devtools\//;
    const found = cdpUrl.match(portPattern);
    if (found === null) {
      return null;
    }
    return found[1];
  }
  /**
   * Fetch the DevTools target list from `http://127.0.0.1:<port>/json/list`,
   * where the port is taken from the given CDP URL.
   *
   * @param {string} cdpUrl - CDP WebSocket URL the port is extracted from.
   * @param {number} [timeoutMs=2000] - Socket idle timeout; the request is
   *   aborted and the promise rejects if the endpoint stalls for this long.
   * @returns {Promise<Array<Object>>} The parsed target list. Resolves to []
   *   when no port can be extracted or the response is not a JSON array.
   *   Rejects on connection errors, timeouts, or invalid JSON.
   */
  async function fetchDevtoolsTargets(cdpUrl, timeoutMs = 2000) {
    const port = getPortFromCdpUrl(cdpUrl);
    if (!port) return [];

    const urlPath = '/json/list';
    return new Promise((resolve, reject) => {
      const req = http.get(
        { hostname: '127.0.0.1', port, path: urlPath, timeout: timeoutMs },
        (res) => {
          // Decode as a stream so multi-byte characters split across chunks
          // are not corrupted by per-chunk Buffer-to-string conversion.
          res.setEncoding('utf8');
          let data = '';
          res.on('data', (chunk) => {
            data += chunk;
          });
          res.on('error', reject);
          res.on('end', () => {
            try {
              const targets = JSON.parse(data);
              resolve(Array.isArray(targets) ? targets : []);
            } catch (e) {
              reject(e);
            }
          });
        }
      );
      // The 'timeout' event only signals idleness; the request has to be
      // destroyed explicitly, which then surfaces through the 'error' handler.
      req.on('timeout', () => {
        req.destroy(new Error(`Timed out after ${timeoutMs}ms waiting for ${urlPath} on port ${port}`));
      });
      req.on('error', reject);
    });
  }
  /**
   * Check which installed extensions show up as DevTools targets.
   *
   * Polls the target list (up to 10 attempts, 500 ms apart) until it is
   * non-empty, keeps the `chrome-extension://` targets whose ID is not in the
   * hard-coded built-in list, logs them, and sets `loaded` on every installed
   * extension that has an `id`.
   *
   * @param {string} cdpUrl - CDP WebSocket URL of the running browser.
   * @param {Array<Object>} installedExtensions - Extension metadata; mutated in place.
   * @returns {Promise<void>}
   */
  async function discoverExtensionTargets(cdpUrl, installedExtensions) {
    const MAX_ATTEMPTS = 10;
    const RETRY_DELAY_MS = 500;
    const EXTENSION_SCHEME = 'chrome-extension://';
    // IDs treated as browser built-ins, which are not reported as custom extensions.
    const builtinIds = new Set([
      'nkeimhogjdpnpccoofpliimaahmaaome',
      'fignfifoniblkonapihmkfakmlgkbkcf',
      'ahfgeienlihckogmohjhadlkjgocpleb',
      'mhjfbmdgcfjbbpaeojofohoefgiehjai',
    ]);
    // Only called on URLs already known to start with the extension scheme.
    const extensionIdOf = (target) => (target.url || '').split('://')[1].split('/')[0];
    const pause = (ms) => new Promise((wake) => setTimeout(wake, ms));

    let targets = [];
    let attempt = 0;
    while (attempt < MAX_ATTEMPTS) {
      attempt += 1;
      try {
        targets = await fetchDevtoolsTargets(cdpUrl);
        if (targets.length > 0) break;
      } catch (e) {
        // Ignore and retry
      }
      await pause(RETRY_DELAY_MS);
    }

    const customExtTargets = [];
    for (const candidate of targets) {
      const candidateUrl = candidate.url || '';
      if (!candidateUrl.startsWith(EXTENSION_SCHEME)) continue;
      if (builtinIds.has(extensionIdOf(candidate))) continue;
      customExtTargets.push(candidate);
    }

    console.error(`[+] Found ${customExtTargets.length} custom extension target(s) via /json/list`);
    const runtimeIds = new Set();
    for (const target of customExtTargets) {
      const extId = extensionIdOf(target);
      console.error(`[+] Extension target: ${extId} (${target.type || 'unknown'})`);
      runtimeIds.add(extId);
    }

    // Extensions without an id cannot be matched, so they are left untouched.
    installedExtensions
      .filter((ext) => ext.id)
      .forEach((ext) => {
        ext.loaded = runtimeIds.has(ext.id);
      });

    const nothingDetected = customExtTargets.length === 0;
    if (nothingDetected && installedExtensions.length > 0) {
      console.error(`[!] Warning: No custom extensions detected. Extension loading may have failed.`);
      console.error(`[!] Make sure you are using Chromium, not Chrome (Chrome 137+ removed --load-extension support)`);
    }
  }
  /**
   * Collect `--key=value` and bare `--flag` options from process.argv.
   *
   * Dashes in the key become underscores. Everything after the first `=` is
   * the value; a flag with no value (or an empty one) maps to `true`.
   * Arguments that do not start with `--` are ignored.
   *
   * @returns {Object} Map of option name to string value or `true`.
   */
  function parseArgs() {
    const parsed = {};
    for (const token of process.argv.slice(2)) {
      if (!token.startsWith('--')) {
        continue;
      }
      const option = token.slice(2);
      const eqAt = option.indexOf('=');
      const rawKey = eqAt === -1 ? option : option.slice(0, eqAt);
      const rawValue = eqAt === -1 ? '' : option.slice(eqAt + 1);
      parsed[rawKey.replace(/-/g, '_')] = rawValue || true;
    }
    return parsed;
  }
  // Set once teardown begins so that a second signal (e.g. SIGINT followed by
  // SIGTERM) does not run the teardown concurrently a second time.
  let cleanupStarted = false;

  /**
   * Signal handler: shut the browser down and exit the process.
   *
   * First tries a graceful close through the puppeteer connection (when one is
   * open), then kills the Chromium process by PID. Failures in either step are
   * logged rather than thrown so that the process always exits.
   *
   * Exits with code 0, or 1 when killing the Chromium process failed.
   *
   * @returns {Promise<void>}
   */
  async function cleanup() {
    if (cleanupStarted) return;
    cleanupStarted = true;

    console.error('[*] Cleaning up Chrome session...');
    let exitCode = 0;

    // Try graceful browser close first
    if (browserInstance) {
      try {
        console.error('[*] Closing browser gracefully...');
        await browserInstance.close();
        browserInstance = null;
        console.error('[+] Browser closed gracefully');
      } catch (e) {
        console.error(`[!] Graceful close failed: ${e.message}`);
      }
    }

    // Kill Chrome process
    if (chromePid) {
      try {
        await killChrome(chromePid, OUTPUT_DIR);
      } catch (e) {
        // Without this catch a kill failure would skip process.exit entirely.
        console.error(`[!] Failed to kill Chrome process ${chromePid}: ${e.message}`);
        exitCode = 1;
      }
    }

    process.exit(exitCode);
  }

  // Register signal handlers
  process.on('SIGTERM', cleanup);
  process.on('SIGINT', cleanup);
  /**
   * Launch the shared Chromium session for a crawl and keep this hook alive.
   *
   * Steps:
   *  1. Locate the Chromium binary (exits with dependency hints if missing).
   *  2. Read `*.extension.json` entries from the extensions directory and
   *     collect the unpacked extension paths that exist on disk.
   *  3. Launch Chromium through launchChromium() and record its PID.
   *  4. When extensions were requested, look for them in the DevTools target list.
   *  5. When a cookies file is configured, connect puppeteer just long enough
   *     to import the cookies, then disconnect.
   *  6. Write extensions.json and stay alive so the signal handlers can clean up.
   *
   * Any error after Chromium has been launched kills the browser before
   * exiting with code 1, so a failed hook does not leave an orphaned process.
   *
   * @returns {Promise<void>} Never resolves to a meaningful value; the process
   *   is kept alive by a timer on success and exits with code 1 on failure.
   */
  async function main() {
    const args = parseArgs();
    const crawlId = args.crawl_id;

    try {
      const binary = findChromium();
      if (!binary) {
        console.error('ERROR: Chromium binary not found');
        console.error('DEPENDENCY_NEEDED=chromium');
        console.error('BIN_PROVIDERS=puppeteer,env,playwright,apt,brew');
        console.error('INSTALL_HINT=npx @puppeteer/browsers install chromium@latest');
        process.exit(1);
      }

      // Get Chromium version
      let version = '';
      try {
        // execFileSync runs the binary directly (no shell), so a path that
        // contains quotes, spaces or `$` cannot break or alter the command.
        const { execFileSync } = require('child_process');
        version = execFileSync(binary, ['--version'], { encoding: 'utf8', timeout: 5000 })
          .trim()
          .slice(0, 64);
      } catch (e) {
        // The version is informational only; carry on without it.
      }

      console.error(`[*] Using browser: ${binary}`);
      if (version) console.error(`[*] Version: ${version}`);

      // Load installed extensions
      const extensionsDir = getExtensionsDir();
      const userDataDir = getEnv('CHROME_USER_DATA_DIR');
      const cookiesFile = getEnv('COOKIES_TXT_FILE') || getEnv('COOKIES_FILE');

      if (userDataDir) {
        console.error(`[*] Using user data dir: ${userDataDir}`);
      }
      if (cookiesFile) {
        console.error(`[*] Using cookies file: ${cookiesFile}`);
      }

      const installedExtensions = [];
      const extensionPaths = [];
      if (fs.existsSync(extensionsDir)) {
        const files = fs.readdirSync(extensionsDir);
        for (const file of files) {
          if (file.endsWith('.extension.json')) {
            try {
              const extPath = path.join(extensionsDir, file);
              const extData = JSON.parse(fs.readFileSync(extPath, 'utf-8'));
              // Only extensions whose unpacked directory exists can be loaded.
              if (extData.unpacked_path && fs.existsSync(extData.unpacked_path)) {
                installedExtensions.push(extData);
                extensionPaths.push(extData.unpacked_path);
                console.error(`[*] Loading extension: ${extData.name || file}`);
              }
            } catch (e) {
              console.warn(`[!] Skipping invalid extension cache: ${file}`);
            }
          }
        }
      }

      if (installedExtensions.length > 0) {
        console.error(`[+] Found ${installedExtensions.length} extension(s) to load`);
      }

      // Ensure extension IDs are available without chrome://extensions
      for (const ext of installedExtensions) {
        if (!ext.id && ext.unpacked_path) {
          try {
            ext.id = getExtensionId(ext.unpacked_path);
          } catch (e) {
            console.error(`[!] Failed to compute extension id for ${ext.name}: ${e.message}`);
          }
        }
      }

      // Note: PID file is written by run_hook() with hook-specific name
      // Snapshot.cleanup() kills all *.pid processes when done
      if (!fs.existsSync(OUTPUT_DIR)) {
        fs.mkdirSync(OUTPUT_DIR, { recursive: true });
      }

      // Launch Chromium using consolidated function
      // userDataDir is derived from ACTIVE_PERSONA by get_config() if not explicitly set
      const result = await launchChromium({
        binary,
        outputDir: OUTPUT_DIR,
        userDataDir,
        extensionPaths,
      });

      if (!result.success) {
        console.error(`ERROR: ${result.error}`);
        process.exit(1);
      }

      chromePid = result.pid;
      const cdpUrl = result.cdpUrl;

      // Discover extension targets at launch (no chrome://extensions)
      if (extensionPaths.length > 0) {
        // Fixed delay before the first target query.
        await new Promise(r => setTimeout(r, 2000));
        console.error('[*] Discovering extension targets via devtools /json/list...');
        await discoverExtensionTargets(cdpUrl, installedExtensions);
      }

      // Only connect to CDP when cookies import is needed to reduce crash risk.
      if (cookiesFile) {
        console.error(`[*] Connecting puppeteer to CDP for cookie import...`);
        const browser = await puppeteer.connect({
          browserWSEndpoint: cdpUrl,
          defaultViewport: null,
        });
        browserInstance = browser;

        try {
          // Import cookies into Chrome profile at crawl start
          await importCookiesFromFile(browser, cookiesFile, userDataDir);
        } finally {
          // Always drop the puppeteer connection, even when the import threw.
          // Awaited so an asynchronous disconnect failure is caught here too.
          try {
            await browser.disconnect();
          } catch (e) {}
          browserInstance = null;
        }
      } else {
        console.error('[*] Skipping puppeteer CDP connection (no cookies to import)');
      }

      // Write extensions metadata with actual IDs
      if (installedExtensions.length > 0) {
        fs.writeFileSync(
          path.join(OUTPUT_DIR, 'extensions.json'),
          JSON.stringify(installedExtensions, null, 2)
        );
      }

      console.error(`[+] Chromium session started for crawl ${crawlId}`);
      console.error(`[+] CDP URL: ${cdpUrl}`);
      console.error(`[+] PID: ${chromePid}`);

      // Stay alive to handle cleanup on SIGTERM
      console.log('[*] Chromium launch hook staying alive to handle cleanup...');
      setInterval(() => {}, 1000000);
    } catch (e) {
      console.error(`ERROR: ${e.name}: ${e.message}`);
      // chromePid is only set after a successful launch. If we got that far,
      // the browser is running and must not outlive this failed hook.
      if (chromePid) {
        try {
          await killChrome(chromePid, OUTPUT_DIR);
        } catch (killError) {
          console.error(`[!] Failed to kill Chrome process ${chromePid}: ${killError.message}`);
        }
      }
      process.exit(1);
    }
  }
  // Script entry point: run main() and turn any rejection that escapes it
  // into a logged fatal error with a non-zero exit code.
  main().catch((fatal) => {
    const { message } = fatal;
    console.error(`Fatal error: ${message}`);
    process.exit(1);
  });