chrome_utils.js 63 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758
  1. #!/usr/bin/env node
  2. /**
  3. * Chrome Extension Management Utilities
  4. *
  5. * Handles downloading, installing, and managing Chrome extensions for browser automation.
  6. * Ported from the TypeScript implementation in archivebox.ts
  7. */
  8. const fs = require('fs');
  9. const path = require('path');
  10. const crypto = require('crypto');
  11. const http = require('http');
  12. const net = require('net');
  13. const { exec, spawn } = require('child_process');
  14. const { promisify } = require('util');
  15. const { Readable } = require('stream');
  16. const { finished } = require('stream/promises');
  17. const execAsync = promisify(exec);
  18. // ============================================================================
  19. // Environment helpers
  20. // ============================================================================
  /**
   * Read an environment variable as a trimmed string.
   * An unset or empty variable falls back to the default value.
   * @param {string} name - Environment variable name
   * @param {string} [defaultValue=''] - Fallback used when the variable is unset or empty
   * @returns {string} - The chosen value with surrounding whitespace removed
   */
  function getEnv(name, defaultValue = '') {
    const raw = process.env[name];
    // Truthiness (not a null check) is intentional: an empty string also selects the fallback.
    const chosen = raw ? raw : defaultValue;
    return chosen.trim();
  }
  /**
   * Read an environment variable as a boolean.
   * Recognised (case-insensitive) spellings are true/1/yes/on and false/0/no/off;
   * anything else, including an unset variable, yields the default.
   * @param {string} name - Environment variable name
   * @param {boolean} [defaultValue=false] - Value returned for unset or unrecognised input
   * @returns {boolean} - Parsed boolean value
   */
  function getEnvBool(name, defaultValue = false) {
    switch (getEnv(name, '').toLowerCase()) {
      case 'true':
      case '1':
      case 'yes':
      case 'on':
        return true;
      case 'false':
      case '0':
      case 'no':
      case 'off':
        return false;
      default:
        return defaultValue;
    }
  }
  /**
   * Read an environment variable as a base-10 integer.
   * @param {string} name - Environment variable name
   * @param {number} [defaultValue=0] - Value returned when the variable is unset or not numeric
   * @returns {number} - Parsed integer, or the default when parsing yields NaN
   */
  function getEnvInt(name, defaultValue = 0) {
    // The default is passed through as text so an unset variable parses back to it.
    const rawText = getEnv(name, String(defaultValue));
    const parsed = Number.parseInt(rawText, 10);
    if (Number.isNaN(parsed)) {
      return defaultValue;
    }
    return parsed;
  }
  /**
   * Get array environment variable (JSON array or comma-separated string).
   *
   * Parsing strategy:
   * - If the value starts with '[', parse it as a JSON array
   * - Otherwise (or if the JSON is invalid), parse it as comma-separated values
   *
   * This prevents incorrect splitting of arguments that contain internal commas.
   * For arguments with commas, use JSON format:
   * CHROME_ARGS='["--user-data-dir=/path/with,comma", "--window-size=1440,900"]'
   *
   * JSON entries are converted to strings (null entries are dropped) so the result
   * always honours the string[] contract, e.g. '["--foo", 9222]' gives ['--foo', '9222'].
   *
   * @param {string} name - Environment variable name
   * @param {string[]} [defaultValue=[]] - Returned as-is when the variable is unset or empty
   * @returns {string[]} - Array of strings
   */
  function getEnvArray(name, defaultValue = []) {
    const val = getEnv(name, '');
    if (!val) return defaultValue;
    // If starts with '[', parse as JSON array
    if (val.startsWith('[')) {
      try {
        const parsed = JSON.parse(val);
        if (Array.isArray(parsed)) {
          // Consumers call string methods on every entry, so normalise non-string JSON values.
          return parsed.filter((item) => item != null).map(String);
        }
      } catch (e) {
        console.error(`[!] Failed to parse ${name} as JSON array: ${e.message}`);
        // Fall through to comma-separated parsing
      }
    }
    // Parse as comma-separated values, discarding empty segments
    return val.split(',').map((s) => s.trim()).filter(Boolean);
  }
  /**
   * Parse a resolution string into width/height.
   *
   * Accepts "WIDTH,HEIGHT" as well as the common "WIDTHxHEIGHT" spelling. Any dimension
   * that is missing, non-numeric, zero or negative falls back to its default
   * (1440 wide, 2000 high). Non-string input (including null/undefined) is tolerated.
   *
   * @param {string} resolution - Resolution string like "1440,2000" or "1920x1080"
   * @returns {{width: number, height: number}} - Parsed dimensions
   */
  function parseResolution(resolution) {
    const DEFAULT_WIDTH = 1440;
    const DEFAULT_HEIGHT = 2000;
    // String() + ?? keeps a missing or non-string value from throwing on .split().
    const parts = String(resolution ?? '').split(/[,x×]/i);
    const toDimension = (text, fallback) => {
      const value = parseInt((text || '').trim(), 10);
      // NaN > 0 is false, so unparsable text also selects the fallback.
      return value > 0 ? value : fallback;
    };
    return {
      width: toDimension(parts[0], DEFAULT_WIDTH),
      height: toDimension(parts[1], DEFAULT_HEIGHT),
    };
  }
  92. // ============================================================================
  93. // PID file management
  94. // ============================================================================
  /**
   * Write a PID file and stamp it with the process start time.
   * Both the access time and the modification time of the file are set to the start time.
   * @param {string} filePath - Path to PID file
   * @param {number} pid - Process ID, written as decimal text
   * @param {number} startTimeSeconds - Process start time in seconds since the Unix epoch
   */
  function writePidWithMtime(filePath, pid, startTimeSeconds) {
    const startedAt = new Date(startTimeSeconds * 1000);
    fs.writeFileSync(filePath, String(pid));
    fs.utimesSync(filePath, startedAt, startedAt);
  }
  /**
   * Write an executable bash script that can re-run the Chrome command.
   *
   * The binary path and every argument are shell-quoted: words made only of
   * characters that are inert in bash are written verbatim, everything else is
   * wrapped in single quotes so spaces, $, backticks, quotes and backslashes are
   * taken literally when the script runs.
   *
   * @param {string} filePath - Path to script file (created/overwritten, mode 0755)
   * @param {string} binary - Chrome binary path
   * @param {string[]} args - Chrome arguments
   */
  function writeCmdScript(filePath, binary, args) {
    const SAFE_WORD = /^[A-Za-z0-9_@%+=:,.\/-]+$/;
    const shellQuote = (value) => {
      const text = String(value);
      if (SAFE_WORD.test(text)) return text;
      // Inside single quotes nothing is special; an embedded ' must close the
      // quote, emit an escaped quote, and reopen: it's -> 'it'\''s'
      const escaped = text.replace(/'/g, "'\\''");
      return `'${escaped}'`;
    };
    const commandLine = [binary, ...args].map(shellQuote).join(' ');
    fs.writeFileSync(filePath, `#!/bin/bash\n${commandLine}\n`);
    fs.chmodSync(filePath, 0o755);
  }
  123. // ============================================================================
  124. // Port management
  125. // ============================================================================
  /**
   * Ask the operating system for a currently unused TCP port.
   * A throwaway server listens on port 0, the assigned port is read back, and the
   * server is closed before the promise resolves.
   * @returns {Promise<number>} - Available port number (rejects if the server errors)
   */
  function findFreePort() {
    return new Promise((resolve, reject) => {
      const probe = net.createServer();
      const reportAssignedPort = () => {
        const { port } = probe.address();
        // Resolve only after the probe is closed so the port is released again.
        probe.close(() => resolve(port));
      };
      probe.unref();
      probe.on('error', reject);
      probe.listen(0, reportAssignedPort);
    });
  }
  /**
   * Wait for Chrome's DevTools port to be ready.
   *
   * Polls http://127.0.0.1:<port>/json/version until it returns parseable JSON.
   * Each attempt schedules at most one retry, no matter how many failure events
   * (request error, request timeout, response error, bad JSON) it produces.
   *
   * @param {number} port - Debug port number
   * @param {number} [timeout=30000] - Overall timeout in milliseconds
   * @returns {Promise<Object>} - Parsed /json/version payload; rejects with an Error on timeout
   */
  function waitForDebugPort(port, timeout = 30000) {
    const RETRY_DELAY_MS = 100;
    const REQUEST_TIMEOUT_MS = 1000;
    const startTime = Date.now();
    return new Promise((resolve, reject) => {
      const tryConnect = () => {
        if (Date.now() - startTime > timeout) {
          reject(new Error(`Timeout waiting for Chrome debug port ${port}`));
          return;
        }
        // Destroying a timed-out request also emits 'error'; without this guard both
        // handlers would schedule a retry and the number of polling chains would grow.
        let attemptFinished = false;
        const retry = () => {
          if (attemptFinished) return;
          attemptFinished = true;
          setTimeout(tryConnect, RETRY_DELAY_MS);
        };
        const req = http.get(`http://127.0.0.1:${port}/json/version`, (res) => {
          let data = '';
          res.on('data', (chunk) => (data += chunk));
          res.on('error', retry);
          res.on('end', () => {
            let info;
            try {
              info = JSON.parse(data);
            } catch (e) {
              // Chrome may answer before the endpoint serves valid JSON; try again.
              retry();
              return;
            }
            // Mark the attempt done so late timeout/error events cannot restart polling.
            attemptFinished = true;
            resolve(info);
          });
        });
        req.on('error', retry);
        req.setTimeout(REQUEST_TIMEOUT_MS, () => {
          req.destroy();
          retry();
        });
      };
      tryConnect();
    });
  }
  178. // ============================================================================
  179. // Zombie process cleanup
  180. // ============================================================================
  /**
   * Kill zombie Chrome processes from stale crawls.
   *
   * Recursively scans the data directory for "<crawl dir>/chrome/<name>.pid" files at any
   * nesting depth (no specific directory structure is assumed). A crawl directory whose
   * mtime is older than five minutes is treated as stale: PID files of dead processes are
   * deleted, and processes that are still alive are sent SIGKILL (process group first,
   * then the single process). Afterwards any SingletonLock found under
   * "<data dir>/personas/<persona>/chrome_user_data" is removed.
   *
   * Progress is logged to stderr.
   *
   * @param {string} [dataDir] - Data directory (defaults to DATA_DIR env or '.')
   * @returns {number} - Number of zombies killed
   */
  function killZombieChrome(dataDir = null) {
    // NOTE: never put a glob such as "star-slash" in the JSDoc above; that character
    // pair terminates the block comment and turns the rest of the line into code.
    const rootDir = dataDir || getEnv('DATA_DIR', '.');
    const STALE_AFTER_MS = 300000; // five minutes
    const staleCutoff = Date.now() - STALE_AFTER_MS;
    let killed = 0;
    console.error('[*] Checking for zombie Chrome processes...');
    if (!fs.existsSync(rootDir)) {
      console.error('[+] No data directory found');
      return 0;
    }

    /**
     * Recursively find all .pid files that sit directly inside a directory named "chrome".
     * @param {string} dir - Directory to search
     * @param {number} depth - Current recursion depth (limit to 10)
     * @returns {Array<{pidFile: string, crawlDir: string}>} - Array of PID file info
     */
    function findChromePidFiles(dir, depth = 0) {
      if (depth > 10) return []; // Prevent runaway recursion
      const results = [];
      try {
        const entries = fs.readdirSync(dir, { withFileTypes: true });
        for (const entry of entries) {
          if (!entry.isDirectory()) continue;
          const fullPath = path.join(dir, entry.name);
          if (entry.name === 'chrome') {
            // Found a chrome directory - collect its .pid files
            try {
              const pidFiles = fs.readdirSync(fullPath).filter((f) => f.endsWith('.pid'));
              for (const pidFileName of pidFiles) {
                results.push({
                  pidFile: path.join(fullPath, pidFileName),
                  crawlDir: dir, // Parent of chrome/ is the crawl dir
                });
              }
            } catch (e) {
              // Skip if can't read chrome dir
            }
          } else if (!entry.name.startsWith('.') && entry.name !== 'node_modules') {
            // Recurse into subdirectory (skip hidden dirs and node_modules)
            results.push(...findChromePidFiles(fullPath, depth + 1));
          }
        }
      } catch (e) {
        // Skip if can't read directory
      }
      return results;
    }

    try {
      for (const { pidFile, crawlDir } of findChromePidFiles(rootDir)) {
        // A recently modified crawl directory is considered active and left alone
        try {
          if (fs.statSync(crawlDir).mtimeMs > staleCutoff) continue;
        } catch (e) {
          continue;
        }
        // Crawl is stale, check PID
        try {
          const pid = parseInt(fs.readFileSync(pidFile, 'utf8').trim(), 10);
          // pid 1 is rejected too: the group kill below would become kill(-1),
          // which signals every process this user is allowed to signal.
          if (isNaN(pid) || pid <= 1) continue;
          // Check if process exists
          try {
            process.kill(pid, 0);
          } catch (e) {
            // Process dead, remove stale PID file
            try { fs.unlinkSync(pidFile); } catch (unlinkErr) {}
            continue;
          }
          // Process alive and crawl is stale - zombie!
          console.error(`[!] Found zombie (PID ${pid}) from stale crawl ${path.basename(crawlDir)}`);
          try {
            try {
              process.kill(-pid, 'SIGKILL');
            } catch (groupErr) {
              process.kill(pid, 'SIGKILL');
            }
            killed++;
            console.error(`[+] Killed zombie (PID ${pid})`);
            try { fs.unlinkSync(pidFile); } catch (unlinkErr) {}
          } catch (e) {
            console.error(`[!] Failed to kill PID ${pid}: ${e.message}`);
          }
        } catch (e) {
          // Skip invalid PID files
        }
      }
    } catch (e) {
      console.error(`[!] Error scanning for Chrome processes: ${e.message}`);
    }
    if (killed > 0) {
      console.error(`[+] Killed ${killed} zombie process(es)`);
    } else {
      console.error('[+] No zombies found');
    }

    // Clean up stale SingletonLock files from persona chrome_user_data directories
    const personasDir = path.join(rootDir, 'personas');
    if (fs.existsSync(personasDir)) {
      try {
        for (const persona of fs.readdirSync(personasDir, { withFileTypes: true })) {
          if (!persona.isDirectory()) continue;
          const singletonLock = path.join(personasDir, persona.name, 'chrome_user_data', 'SingletonLock');
          // Unlink directly instead of testing fs.existsSync first: existsSync follows
          // symlinks and reports false for a dangling one, so a lock stored as a
          // symlink to a non-existent target would never be removed.
          try {
            fs.unlinkSync(singletonLock);
            console.error(`[+] Removed stale SingletonLock: ${singletonLock}`);
          } catch (e) {
            // No lock present (ENOENT) or it could not be removed - ignore
          }
        }
      } catch (e) {
        // Ignore errors scanning personas directory
      }
    }
    return killed;
  }
  308. // ============================================================================
  309. // Chrome launching
  310. // ============================================================================
  /**
   * Launch Chromium with extensions and return connection info.
   *
   * Steps: optionally kill zombie Chrome processes, prepare the output and user-data
   * directories, pick a free debug port, assemble the argument list
   * (CHROME_ARGS + runtime flags + CHROME_ARGS_EXTRA + extension flags), write
   * cmd.sh for debugging, spawn the browser detached, and wait up to 30 seconds for the
   * DevTools endpoint. On success chrome.pid, cdp_url.txt and port.txt are written
   * to the output directory.
   *
   * Failures after the spawn attempt (spawn error, debug port timeout, file write error)
   * do not throw: the spawned browser, if any, is killed, chrome.pid is removed and a
   * {success: false, error} result is returned.
   *
   * @param {Object} options - Launch options
   * @param {string} [options.binary] - Chrome binary path (auto-detected if not provided)
   * @param {string} [options.outputDir='chrome'] - Directory for output files
   * @param {string} [options.userDataDir] - Chrome user data directory for persistent sessions
   * @param {string} [options.resolution='1440,2000'] - Window resolution
   * @param {boolean} [options.headless=true] - Run in headless mode
   * @param {boolean} [options.sandbox=true] - Enable Chrome sandbox
   * @param {boolean} [options.checkSsl=true] - Check SSL certificates
   * @param {string[]} [options.extensionPaths=[]] - Paths to unpacked extensions
   * @param {boolean} [options.killZombies=true] - Kill zombie processes first
   * @returns {Promise<Object>} - {success, cdpUrl, pid, port, process, error}
   */
  async function launchChromium(options = {}) {
    const {
      binary = findChromium(),
      outputDir = 'chrome',
      userDataDir = getEnv('CHROME_USER_DATA_DIR'),
      resolution = getEnv('CHROME_RESOLUTION') || getEnv('RESOLUTION', '1440,2000'),
      headless = getEnvBool('CHROME_HEADLESS', true),
      sandbox = getEnvBool('CHROME_SANDBOX', true),
      checkSsl = getEnvBool('CHROME_CHECK_SSL_VALIDITY', getEnvBool('CHECK_SSL_VALIDITY', true)),
      extensionPaths = [],
      killZombies = true,
    } = options;
    if (!binary) {
      return { success: false, error: 'Chrome binary not found' };
    }
    // Kill zombies first
    if (killZombies) {
      killZombieChrome();
    }
    const { width, height } = parseResolution(resolution);
    // Create output directory
    if (!fs.existsSync(outputDir)) {
      fs.mkdirSync(outputDir, { recursive: true });
    }
    // Create user data directory if specified and doesn't exist
    if (userDataDir) {
      if (!fs.existsSync(userDataDir)) {
        fs.mkdirSync(userDataDir, { recursive: true });
        console.error(`[*] Created user data directory: ${userDataDir}`);
      }
      // Clean up any stale SingletonLock from previous crashed sessions.
      // Unlink directly rather than testing fs.existsSync first: existsSync follows
      // symlinks and reports false for a dangling one, so a lock stored as a symlink
      // to a non-existent target would otherwise never be removed.
      const singletonLock = path.join(userDataDir, 'SingletonLock');
      try {
        fs.unlinkSync(singletonLock);
        console.error(`[*] Removed stale SingletonLock: ${singletonLock}`);
      } catch (e) {
        if (e.code !== 'ENOENT') {
          console.error(`[!] Failed to remove SingletonLock: ${e.message}`);
        }
      }
    }
    // Find a free port
    const debugPort = await findFreePort();
    console.error(`[*] Using debug port: ${debugPort}`);
    // Base Chrome args come from the CHROME_ARGS env var (static flags from config)
    const baseArgs = getEnvArray('CHROME_ARGS', []);
    // Extra user-provided args
    const extraArgs = getEnvArray('CHROME_ARGS_EXTRA', []);
    // Build dynamic Chrome arguments (these must be computed at runtime)
    const dynamicArgs = [
      // Remote debugging setup
      `--remote-debugging-port=${debugPort}`,
      '--remote-debugging-address=127.0.0.1',
      // Sandbox settings (disable in Docker)
      ...(sandbox ? [] : ['--no-sandbox', '--disable-setuid-sandbox']),
      // Docker-specific workarounds
      '--disable-dev-shm-usage',
      '--disable-gpu',
      // Window size
      `--window-size=${width},${height}`,
      // User data directory (for persistent sessions with persona)
      ...(userDataDir ? [`--user-data-dir=${userDataDir}`] : []),
      // Headless mode
      ...(headless ? ['--headless=new'] : []),
      // SSL certificate checking
      ...(checkSsl ? [] : ['--ignore-certificate-errors']),
    ];
    // Combine all args: base (from config) + dynamic (runtime) + extra (user overrides)
    // Dynamic args come after base so they can override if needed
    const chromiumArgs = [...baseArgs, ...dynamicArgs, ...extraArgs];
    // Add extension loading flags
    if (extensionPaths.length > 0) {
      const extPathsArg = extensionPaths.join(',');
      chromiumArgs.push(`--load-extension=${extPathsArg}`);
      chromiumArgs.push('--enable-unsafe-extension-debugging');
      chromiumArgs.push('--disable-features=DisableLoadExtensionCommandLineSwitch,ExtensionManifestV2Unsupported,ExtensionManifestV2Disabled');
      console.error(`[*] Loading ${extensionPaths.length} extension(s) via --load-extension`);
    }
    chromiumArgs.push('about:blank');
    // Write command script for debugging
    writeCmdScript(path.join(outputDir, 'cmd.sh'), binary, chromiumArgs);

    // Declared outside the try block so the failure path can clean up the child.
    let chromePid;
    try {
      console.error(`[*] Spawning Chromium (headless=${headless})...`);
      const chromiumProcess = spawn(binary, chromiumArgs, {
        stdio: ['ignore', 'pipe', 'pipe'],
        detached: true,
      });
      // A failed spawn (e.g. missing binary) is reported through an asynchronous
      // 'error' event; with no listener attached that event would crash the process.
      const spawnFailed = new Promise((_, reject) => {
        chromiumProcess.on('error', reject);
      });
      // Mark the rejection as handled even if we bail out before racing on it.
      spawnFailed.catch(() => {});

      chromePid = chromiumProcess.pid;
      const chromeStartTime = Date.now() / 1000;
      if (chromePid) {
        console.error(`[*] Chromium spawned (PID: ${chromePid})`);
        writePidWithMtime(path.join(outputDir, 'chrome.pid'), chromePid, chromeStartTime);
      }
      // Pipe Chrome output to stderr (the streams can be absent if spawning failed early)
      chromiumProcess.stdout?.on('data', (data) => {
        process.stderr.write(`[chromium:stdout] ${data}`);
      });
      chromiumProcess.stderr?.on('data', (data) => {
        process.stderr.write(`[chromium:stderr] ${data}`);
      });
      // Wait for debug port, but give up immediately if the spawn itself failed
      console.error(`[*] Waiting for debug port ${debugPort}...`);
      const versionInfo = await Promise.race([
        waitForDebugPort(debugPort, 30000),
        spawnFailed,
      ]);
      const wsUrl = versionInfo.webSocketDebuggerUrl;
      console.error(`[+] Chromium ready: ${wsUrl}`);
      fs.writeFileSync(path.join(outputDir, 'cdp_url.txt'), wsUrl);
      fs.writeFileSync(path.join(outputDir, 'port.txt'), String(debugPort));
      return {
        success: true,
        cdpUrl: wsUrl,
        pid: chromePid,
        port: debugPort,
        process: chromiumProcess,
      };
    } catch (e) {
      // The failure result carries no pid, so the caller cannot clean up:
      // stop the half-started browser here instead of leaking it.
      if (chromePid) {
        try {
          process.kill(-chromePid, 'SIGKILL');
        } catch (groupErr) {
          try { process.kill(chromePid, 'SIGKILL'); } catch (singleErr) {}
        }
        try { fs.unlinkSync(path.join(outputDir, 'chrome.pid')); } catch (unlinkErr) {}
      }
      return { success: false, error: `${e.name}: ${e.message}` };
    }
  }
  /**
   * Check if a process is still running.
   *
   * Only positive integer PIDs are probed: 0 and negative values address process
   * groups rather than a single process, so they are reported as not alive.
   * A permission error (EPERM) means the process exists but belongs to someone
   * else, so it counts as alive.
   *
   * @param {number} pid - Process ID to check
   * @returns {boolean} - True if process exists
   */
  function isProcessAlive(pid) {
    const target = Number(pid);
    if (!Number.isInteger(target) || target <= 0) return false;
    try {
      process.kill(target, 0); // Signal 0 checks existence without killing
      return true;
    } catch (e) {
      return e.code === 'EPERM';
    }
  }
  /**
   * Find all Chrome processes whose command line carries a given debug port.
   *
   * Runs a `ps aux | grep | awk` pipeline and collects the PID column of every
   * process whose command line matches "chrome ... --remote-debugging-port=<port>".
   * The port must be an integer in 1-65535; anything else returns an empty list
   * without running a command, so no unvalidated text reaches the shell.
   *
   * @param {number} port - Debug port number
   * @returns {Array<number>} - Array of PIDs (empty if none found or the command failed)
   */
  function findChromeProcessesByPort(port) {
    const portNumber = Number(port);
    if (!Number.isInteger(portNumber) || portNumber <= 0 || portNumber > 65535) {
      return [];
    }
    const pids = [];
    try {
      const { execSync } = require('child_process');
      // ([^0-9]|$) anchors the end of the port so that 9222 does not also match 92221.
      const output = execSync(
        `ps aux | grep -i -E 'chrome.*--remote-debugging-port=${portNumber}([^0-9]|$)' | grep -v grep | awk '{print $2}'`,
        { encoding: 'utf8', timeout: 5000 }
      );
      for (const line of output.split('\n')) {
        const pid = parseInt(line.trim(), 10);
        if (!isNaN(pid) && pid > 0) {
          pids.push(pid);
        }
      }
    } catch (e) {
      // Command failed or no processes found
    }
    return pids;
  }
  /**
   * Kill a Chrome process by PID.
   * Always sends SIGTERM before SIGKILL, then verifies death.
   *
   * Signals go to the process group (-pid) first and to the single process as a
   * fallback. If the process survives SIGKILL and port.txt in the output directory
   * names a debug port, every other Chrome process found for that port is killed
   * as a last resort. Finally chrome.pid is removed from the output directory.
   *
   * A falsy pid is a silent no-op. PIDs that are not integers greater than 1 are
   * refused, because -pid would then address an unintended set of processes
   * (for pid 1 it would be kill(-1): every process the user may signal).
   *
   * @param {number} pid - Process ID to kill
   * @param {string} [outputDir] - Directory containing PID files to clean up
   * @returns {Promise<void>}
   */
  async function killChrome(pid, outputDir = null) {
    if (!pid) return;
    const targetPid = Number(pid);
    if (!Number.isInteger(targetPid) || targetPid <= 1) {
      console.error(`[!] Refusing to kill invalid Chrome PID: ${pid}`);
      return;
    }
    console.error(`[*] Killing Chrome process tree (PID ${targetPid})...`);

    // Poll instead of sleeping for a fixed time so a fast exit returns quickly.
    const waitForExit = async (maxWaitMs) => {
      const deadline = Date.now() + maxWaitMs;
      while (Date.now() < deadline) {
        if (!isProcessAlive(targetPid)) return true;
        await new Promise((resolve) => setTimeout(resolve, 100));
      }
      return !isProcessAlive(targetPid);
    };

    // Get debug port for finding child processes
    let debugPort = null;
    if (outputDir) {
      try {
        const portFile = path.join(outputDir, 'port.txt');
        if (fs.existsSync(portFile)) {
          debugPort = parseInt(fs.readFileSync(portFile, 'utf8').trim(), 10);
        }
      } catch (e) {
        // Port file unreadable - continue without the port-based fallback
      }
    }

    // Step 1: SIGTERM to process group (graceful shutdown)
    console.error(`[*] Sending SIGTERM to process group -${targetPid}...`);
    try {
      process.kill(-targetPid, 'SIGTERM');
    } catch (e) {
      try {
        console.error(`[*] Process group kill failed, trying single process...`);
        process.kill(targetPid, 'SIGTERM');
      } catch (e2) {
        console.error(`[!] SIGTERM failed: ${e2.message}`);
      }
    }

    // Step 2: Wait (up to 2s) for graceful shutdown
    const exitedGracefully = await waitForExit(2000);
    if (exitedGracefully) {
      console.error('[+] Chrome process terminated gracefully');
    } else {
      // Step 3: Force kill ENTIRE process group with SIGKILL
      console.error(`[*] Process still alive, sending SIGKILL to process group -${targetPid}...`);
      try {
        process.kill(-targetPid, 'SIGKILL'); // Kill entire process group
      } catch (e) {
        console.error(`[!] Process group SIGKILL failed, trying single process: ${e.message}`);
        try {
          process.kill(targetPid, 'SIGKILL');
        } catch (e2) {
          console.error(`[!] SIGKILL failed: ${e2.message}`);
        }
      }

      // Step 4: Wait briefly (up to 1s) and verify death
      const exitedAfterKill = await waitForExit(1000);
      if (exitedAfterKill) {
        console.error('[+] Chrome process group killed successfully');
      } else {
        console.error(`[!] WARNING: Process ${targetPid} is unkillable (likely in UNE state)`);
        console.error(`[!] This typically happens when Chrome crashes in kernel syscall`);
        console.error(`[!] Process will remain as zombie until system reboot`);
        console.error(`[!] macOS IOSurface crash creates unkillable processes in UNE state`);
        // Step 5: Last resort - kill every other Chrome process tied to the debug port
        if (debugPort) {
          const relatedPids = findChromeProcessesByPort(debugPort);
          const otherPids = [...new Set(relatedPids)].filter(
            (relatedPid) => relatedPid !== targetPid && relatedPid > 1
          );
          if (otherPids.length > 0) {
            console.error(`[*] Found ${relatedPids.length} Chrome processes still running on port ${debugPort}`);
            console.error(`[*] Attempting final process group SIGKILL...`);
            for (const relatedPid of otherPids) {
              try {
                process.kill(-relatedPid, 'SIGKILL');
              } catch (e) {
                // Most helpers are not process-group leaders, so the group signal
                // fails for them; signal the individual process instead.
                try { process.kill(relatedPid, 'SIGKILL'); } catch (e2) {}
              }
            }
          }
        }
      }
    }

    // Step 6: Clean up PID files
    // Note: hook-specific .pid files are cleaned up by run_hook() and Snapshot.cleanup()
    if (outputDir) {
      try { fs.unlinkSync(path.join(outputDir, 'chrome.pid')); } catch (e) {}
    }
    console.error('[*] Chrome cleanup completed');
  }
  /**
   * Install Chromium using @puppeteer/browsers programmatic API.
   *
   * If CHROME_BINARY points at an existing file it is returned untouched (version null).
   * Otherwise @puppeteer/browsers is loaded (NODE_MODULES_DIR, when set, is added to
   * this module's search paths), the 'latest' tag is resolved to a concrete build id,
   * and that build is installed into the cache directory.
   *
   * NOTE(review): the resolve-then-install sequence and the required cacheDir follow the
   * published @puppeteer/browsers API - confirm against the version installed here.
   *
   * @param {Object} options - Install options
   * @param {string} [options.cacheDir] - Download/cache directory (defaults to
   *   PUPPETEER_CACHE_DIR env or ~/.cache/puppeteer)
   * @returns {Promise<Object>} - {success, binary, version, error}
   */
  async function installChromium(options = {}) {
    // Check if CHROME_BINARY is already set and valid
    const configuredBinary = getEnv('CHROME_BINARY');
    if (configuredBinary && fs.existsSync(configuredBinary)) {
      console.error(`[+] Using configured CHROME_BINARY: ${configuredBinary}`);
      return { success: true, binary: configuredBinary, version: null };
    }
    // Try to load @puppeteer/browsers from NODE_MODULES_DIR or system
    let puppeteerBrowsers;
    try {
      const extraModulesDir = process.env.NODE_MODULES_DIR;
      // Guard against duplicates so repeated calls do not keep growing module.paths.
      if (extraModulesDir && !module.paths.includes(extraModulesDir)) {
        module.paths.unshift(extraModulesDir);
      }
      puppeteerBrowsers = require('@puppeteer/browsers');
    } catch (e) {
      console.error(`[!] @puppeteer/browsers not found. Install it first with installPuppeteerCore.`);
      return { success: false, error: '@puppeteer/browsers not installed' };
    }
    console.error(`[*] Installing Chromium via @puppeteer/browsers...`);
    try {
      const cacheDir =
        (options && options.cacheDir) ||
        getEnv('PUPPETEER_CACHE_DIR') ||
        path.join(require('os').homedir(), '.cache', 'puppeteer');
      // install() expects a concrete build id, so translate the 'latest' tag first
      // when the loaded package exposes the resolver.
      let buildId = 'latest';
      if (typeof puppeteerBrowsers.resolveBuildId === 'function') {
        const platform = puppeteerBrowsers.detectBrowserPlatform();
        buildId = await puppeteerBrowsers.resolveBuildId('chromium', platform, 'latest');
      }
      const result = await puppeteerBrowsers.install({
        browser: 'chromium',
        buildId,
        cacheDir,
      });
      const binary = result.executablePath;
      const version = result.buildId;
      if (!binary || !fs.existsSync(binary)) {
        console.error(`[!] Chromium binary not found at: ${binary}`);
        return { success: false, error: `Chromium binary not found at: ${binary}` };
      }
      console.error(`[+] Chromium installed: ${binary}`);
      return { success: true, binary, version };
    } catch (e) {
      console.error(`[!] Failed to install Chromium: ${e.message}`);
      return { success: false, error: e.message };
    }
  }
  /**
   * Install puppeteer-core npm package.
   *
   * Runs `npm install --prefix <npmPrefix> puppeteer-core` synchronously unless
   * <npmPrefix>/node_modules/puppeteer-core already exists. The prefix directory is
   * created if missing.
   *
   * @param {Object} options - Install options
   * @param {string} [options.npmPrefix] - npm prefix directory
   *   (default: "<LIB_DIR>/npm", where LIB_DIR falls back to DATA_DIR, then '.')
   * @param {number} [options.timeout=60000] - Timeout in milliseconds
   * @returns {Promise<Object>} - {success, path, error}
   */
  async function installPuppeteerCore(options = {}) {
    const defaultPrefix = path.join(getEnv('LIB_DIR', getEnv('DATA_DIR', '.')), 'npm');
    const {
      npmPrefix = defaultPrefix,
      timeout = 60000,
    } = options;
    const nodeModulesDir = path.join(npmPrefix, 'node_modules');
    const puppeteerPath = path.join(nodeModulesDir, 'puppeteer-core');
    // Check if already installed
    if (fs.existsSync(puppeteerPath)) {
      console.error(`[+] puppeteer-core already installed: ${puppeteerPath}`);
      return { success: true, path: puppeteerPath };
    }
    console.error(`[*] Installing puppeteer-core to ${npmPrefix}...`);
    // Create directory
    if (!fs.existsSync(npmPrefix)) {
      fs.mkdirSync(npmPrefix, { recursive: true });
    }
    try {
      const { execSync } = require('child_process');
      // Inside double quotes a POSIX shell still expands ", $, ` and \, so escape them.
      // Windows paths are left untouched because backslash is their separator there.
      const posixEscapedPrefix = npmPrefix.replace(/["$`\\]/g, '\\$&');
      const quotedPrefix = process.platform === 'win32'
        ? `"${npmPrefix}"`
        : `"${posixEscapedPrefix}"`;
      execSync(
        `npm install --prefix ${quotedPrefix} puppeteer-core`,
        { encoding: 'utf8', timeout, stdio: ['pipe', 'pipe', 'pipe'] }
      );
      console.error(`[+] puppeteer-core installed successfully`);
      return { success: true, path: puppeteerPath };
    } catch (e) {
      console.error(`[!] Failed to install puppeteer-core: ${e.message}`);
      return { success: false, error: e.message };
    }
  }
  // Optional dependency: when the `unzipper` package is installed, `unzip` is an
  // async (sourcePath, destPath) extractor; otherwise it stays null and callers
  // rely on the system unzip command instead.
  let unzip = null;
  try {
    const unzipper = require('unzipper');
    unzip = (sourcePath, destPath) => new Promise((resolve, reject) => {
      const source = fs.createReadStream(sourcePath);
      // pipe() does not forward errors from the source stream; without this
      // listener an unreadable archive would surface as an unhandled 'error'
      // event instead of a rejected promise.
      source.once('error', reject);
      source
        .pipe(unzipper.Extract({ path: destPath }))
        .promise()
        .then(resolve, reject);
    });
  } catch (err) {
    // unzipper is not available: keep `unzip` null so the system unzip
    // command is the only extraction path.
  }
  /**
   * Derive the extension ID for an unpacked extension directory.
   *
   * The path is hashed with SHA-256 and the first 32 hex digits of the digest
   * are translated into the letters 'a'..'p' (0 -> 'a', 15 -> 'p').
   *
   * @param {string} unpacked_path - Path to the unpacked extension directory
   * @returns {string} - 32-character extension ID
   */
  function getExtensionId(unpacked_path) {
    const digestHex = crypto
      .createHash('sha256')
      .update(Buffer.from(unpacked_path, 'utf-8'))
      .digest('hex');

    const baseCode = 'a'.charCodeAt(0);
    let extensionId = '';
    // Each hex digit (0-15) becomes one letter offset from 'a'
    for (const hexDigit of digestHex.slice(0, 32)) {
      extensionId += String.fromCharCode(baseCode + parseInt(hexDigit, 16));
    }
    return extensionId;
  }
  /**
   * Download and install a Chrome extension from the Chrome Web Store.
   *
   * The CRX is downloaded only when neither the unpacked manifest nor the CRX
   * file exists yet. The download is written to "<crx_path>.part" and renamed
   * into place once complete, so an interrupted download is never mistaken for
   * a finished CRX on a later run. The archive is then extracted with the
   * system unzip, falling back to the optional `unzip` helper when available.
   *
   * @param {Object} extension - Extension metadata object
   * @param {string} extension.webstore_id - Chrome Web Store extension ID
   * @param {string} extension.name - Human-readable extension name
   * @param {string} extension.crx_url - URL to download the CRX file
   * @param {string} extension.crx_path - Local path to save the CRX file
   * @param {string} extension.unpacked_path - Path to extract the extension
   * @returns {Promise<boolean>} - True if manifest.json exists in unpacked_path afterwards
   */
  async function installExtension(extension) {
    const manifest_path = path.join(extension.unpacked_path, 'manifest.json');

    // Download CRX file if not already downloaded
    if (!fs.existsSync(manifest_path) && !fs.existsSync(extension.crx_path)) {
      console.log(`[🛠️] Downloading missing extension ${extension.name} ${extension.webstore_id} -> ${extension.crx_path}`);
      const partial_path = `${extension.crx_path}.part`;
      try {
        // Ensure parent directory exists
        await fs.promises.mkdir(path.dirname(extension.crx_path), { recursive: true });

        // Download CRX file from Chrome Web Store
        const response = await fetch(extension.crx_url);
        if (!response.ok) {
          console.warn(`[⚠️] Failed to download extension ${extension.name}: HTTP ${response.status}`);
          return false;
        }
        if (!response.body) {
          console.warn(`[⚠️] Failed to download extension ${extension.name}: No response body`);
          return false;
        }

        // Buffer the whole body so network errors reject here, then publish the
        // file atomically via rename.
        const crx_bytes = Buffer.from(await response.arrayBuffer());
        await fs.promises.writeFile(partial_path, crx_bytes);
        await fs.promises.rename(partial_path, extension.crx_path);
      } catch (err) {
        // Best-effort cleanup of a half-written temp file
        await fs.promises.rm(partial_path, { force: true }).catch(() => {});
        console.error(`[❌] Failed to download extension ${extension.name}:`, err);
        return false;
      }
    }

    // Unzip CRX file to unpacked_path (CRX files have extra header bytes but unzip handles it)
    await fs.promises.mkdir(extension.unpacked_path, { recursive: true });
    try {
      // Use -q to suppress warnings about extra bytes in CRX header
      await execAsync(`/usr/bin/unzip -q -o "${extension.crx_path}" -d "${extension.unpacked_path}"`);
    } catch (err1) {
      // unzip may return non-zero even on success due to CRX header warning,
      // so only treat it as a failure when no manifest was produced
      if (!fs.existsSync(manifest_path)) {
        if (!unzip) {
          console.error(`[❌] Failed to unzip ${extension.crx_path}:`, err1.message);
          return false;
        }
        // Fallback to unzipper library
        try {
          await unzip(extension.crx_path, extension.unpacked_path);
        } catch (err2) {
          console.error(`[❌] Failed to unzip ${extension.crx_path}:`, err2.message);
          return false;
        }
      }
    }

    if (!fs.existsSync(manifest_path)) {
      console.error(`[❌] Failed to install ${extension.crx_path}: could not find manifest.json in unpacked_path`);
      return false;
    }
    return true;
  }
  /**
   * Load or install a Chrome extension, computing all metadata.
   *
   * Mutates and returns `ext`: fills in webstore_id, name, webstore_url,
   * crx_url, crx_path and unpacked_path when missing, attaches read_manifest()
   * and read_version() helpers, installs the extension when no version can be
   * read from its manifest, then sets `id` (from the unpacked path) and `version`.
   *
   * @param {Object} ext - Partial extension metadata (at minimum: webstore_id or unpacked_path)
   * @param {string} [ext.webstore_id] - Chrome Web Store extension ID
   * @param {string} [ext.name] - Human-readable extension name
   * @param {string} [ext.unpacked_path] - Path to unpacked extension
   * @param {string} [extensions_dir] - Directory to store extensions (default: getExtensionsDir())
   * @returns {Promise<Object>} - Complete extension metadata object (`version` is null if installation failed)
   * @throws {Error} If neither webstore_id nor unpacked_path is provided
   */
  async function loadOrInstallExtension(ext, extensions_dir = null) {
    if (!(ext.webstore_id || ext.unpacked_path)) {
      throw new Error('Extension must have either {webstore_id} or {unpacked_path}');
    }

    // Use provided dir, or fall back to getExtensionsDir() which handles env vars and defaults
    const EXTENSIONS_DIR = extensions_dir || getExtensionsDir();

    // Set statically computable extension metadata
    ext.webstore_id = ext.webstore_id || ext.id;
    ext.name = ext.name || ext.webstore_id;
    ext.webstore_url = ext.webstore_url || `https://chromewebstore.google.com/detail/${ext.webstore_id}`;
    ext.crx_url = ext.crx_url || `https://clients2.google.com/service/update2/crx?response=redirect&prodversion=1230&acceptformat=crx3&x=id%3D${ext.webstore_id}%26uc`;
    ext.crx_path = ext.crx_path || path.join(EXTENSIONS_DIR, `${ext.webstore_id}__${ext.name}.crx`);
    ext.unpacked_path = ext.unpacked_path || path.join(EXTENSIONS_DIR, `${ext.webstore_id}__${ext.name}`);

    const manifest_path = path.join(ext.unpacked_path, 'manifest.json');
    // read_manifest() throws on a missing or malformed manifest
    ext.read_manifest = () => JSON.parse(fs.readFileSync(manifest_path, 'utf-8'));
    // read_version() never throws: an unreadable or corrupt manifest counts as
    // "not installed", which lets the install step below run and repair it
    ext.read_version = () => {
      try {
        return (fs.existsSync(manifest_path) && ext.read_manifest()?.version) || null;
      } catch (err) {
        return null;
      }
    };

    // If extension is not installed, download and unpack it
    if (!ext.read_version()) {
      await installExtension(ext);
    }

    // Autodetect ID from filesystem path (unpacked extensions don't have stable IDs)
    ext.id = getExtensionId(ext.unpacked_path);
    ext.version = ext.read_version();
    if (!ext.version) {
      console.warn(`[❌] Unable to detect ID and version of installed extension ${ext.unpacked_path}`);
    } else {
      console.log(`[➕] Installed extension ${ext.name} (${ext.version})... ${ext.unpacked_path}`);
    }
    return ext;
  }
  /**
   * Classify a Puppeteer target: is it an extension target, and is it an
   * extension background page / service worker?
   *
   * A target that disappears while being inspected ("No target with given id
   * found") is reported as type 'closed' with URL 'about:closed'; any other
   * error is rethrown.
   *
   * @param {Object} target - Puppeteer target object
   * @returns {Promise<Object>} - {target_is_extension, target_is_bg, target_type,
   *   target_ctx, target_url, extension_id, manifest_version}
   */
  async function isTargetExtension(target) {
    let target_type;
    let target_ctx;
    let target_url;
    try {
      target_type = target.type();
      target_ctx = (await target.worker()) || (await target.page()) || null;
      target_url = target.url() || target_ctx?.url() || null;
    } catch (err) {
      const closed_during_check = String(err).includes('No target with given id found');
      if (!closed_during_check) {
        throw err;
      }
      // Harmless race: the target went away while we were looking at it
      target_type = 'closed';
      target_ctx = null;
      target_url = 'about:closed';
    }

    const is_chrome_extension = target_url?.startsWith('chrome-extension://');
    const has_background_type = ['background_page', 'service_worker'].includes(target_type);
    const target_is_bg = is_chrome_extension && has_background_type;
    const target_is_extension = is_chrome_extension || target_is_bg;

    const info = {
      target_is_extension,
      target_is_bg,
      target_type,
      target_ctx,
      target_url,
      extension_id: null,
      manifest_version: null,
    };
    if (!target_is_extension) {
      return info;
    }

    try {
      // chrome-extension://<id>/path -> <id>
      info.extension_id = target_url?.split('://')[1]?.split('/')[0] || null;
      if (target_ctx) {
        const manifest = await target_ctx.evaluate(() => chrome.runtime.getManifest());
        info.manifest_version = manifest?.manifest_version || null;
      }
    } catch (err) {
      // Extension metadata is best-effort; keep whatever was collected so far
    }
    return info;
  }
  /**
   * Attach runtime information and dispatch helpers to the known extension
   * that owns a given browser target.
   *
   * Only background targets (as classified by isTargetExtension) with a usable
   * context are considered. The matching entry in `extensions` is updated in
   * place; the returned object is a separate copy with the same fields.
   *
   * @param {Array} extensions - Array of extension metadata objects to update
   * @param {Object} target - Puppeteer target object
   * @returns {Promise<Object|null>} - Updated extension object, or null when the target is
   *   not an extension background target, is not in `extensions`, or its manifest cannot be read
   */
  async function loadExtensionFromTarget(extensions, target) {
    const {
      target_is_bg,
      target_type,
      target_ctx,
      target_url,
      extension_id,
      manifest_version,
    } = await isTargetExtension(target);

    if (!target_is_bg || !extension_id || !target_ctx) {
      return null;
    }

    // Find matching extension in our list
    const extension = extensions.find((candidate) => candidate.id === extension_id);
    if (!extension) {
      console.warn(`[⚠️] Found loaded extension ${extension_id} that's not in CHROME_EXTENSIONS list`);
      return null;
    }

    // Load manifest from the extension context
    let manifest = null;
    try {
      manifest = await target_ctx.evaluate(() => chrome.runtime.getManifest());
    } catch (err) {
      console.error(`[❌] Failed to read manifest for extension ${extension_id}:`, err);
      return null;
    }

    // The callbacks handed to evaluate() run inside the extension context, so
    // they must stay self-contained (no references to outer variables).

    // Trigger extension toolbar button click
    const dispatchAction = async (tab) => {
      return await target_ctx.evaluate((tabId) => {
        return new Promise((resolve) => {
          chrome.action.onClicked.addListener((tab) => {
            resolve({ success: true, tab });
          });
          chrome.action.openPopup();
        });
      }, tab?.id || null);
    };

    // Send message to extension
    const dispatchMessage = async (message, options = {}) => {
      return await target_ctx.evaluate((msg, opts) => {
        return new Promise((resolve) => {
          chrome.runtime.sendMessage(msg, opts, (response) => {
            resolve(response);
          });
        });
      }, message, options);
    };

    // Trigger extension command (keyboard shortcut)
    const dispatchCommand = async (command) => {
      return await target_ctx.evaluate((cmd) => {
        return new Promise((resolve) => {
          chrome.commands.onCommand.addListener((receivedCommand) => {
            if (receivedCommand === cmd) {
              resolve({ success: true, command: receivedCommand });
            }
          });
          // Note: Actually triggering commands programmatically is not directly supported
          // This would need to be done via CDP or keyboard simulation
        });
      }, command);
    };

    const new_extension = {
      ...extension,
      target,
      target_type,
      target_url,
      manifest,
      manifest_version,
      dispatchAction,
      dispatchMessage,
      dispatchCommand,
    };

    // Update the extension in the array
    Object.assign(extension, new_extension);
    console.log(`[🔌] Connected to extension ${extension.name} (${extension.version})`);
    return new_extension;
  }
  /**
   * Run loadOrInstallExtension() for every entry, one after another.
   *
   * @param {Array} extensions - Array of extension metadata objects
   * @param {string} [extensions_dir] - Directory to store extensions
   * @returns {Promise<Array>} - The same `extensions` array that was passed in
   */
  async function installAllExtensions(extensions, extensions_dir = null) {
    const total = extensions.length;
    console.log(`[⚙️] Installing ${total} chrome extensions...`);

    // Deliberately sequential: each install finishes before the next begins
    for (const entry of extensions) {
      await loadOrInstallExtension(entry, extensions_dir);
    }

    return extensions;
  }
  /**
   * Inspect every current browser target and connect the ones that belong to
   * known extensions (via loadExtensionFromTarget).
   *
   * @param {Object} browser - Puppeteer browser instance
   * @param {Array} extensions - Array of extension metadata objects
   * @returns {Promise<Array>} - The same `extensions` array that was passed in
   */
  async function loadAllExtensionsFromBrowser(browser, extensions) {
    console.log(`[⚙️] Loading ${extensions.length} chrome extensions from browser...`);

    // Snapshot of the targets that exist right now; handled one at a time
    const currentTargets = browser.targets();
    for (const candidate of currentTargets) {
      await loadExtensionFromTarget(extensions, candidate);
    }

    return extensions;
  }
  /**
   * Read and parse manifest.json from an unpacked extension directory.
   *
   * @param {string} unpacked_path - Path to unpacked extension directory
   * @returns {object|null} - Parsed manifest, or null if the file is missing, unreadable or not valid JSON
   */
  function loadExtensionManifest(unpacked_path) {
    const manifestFile = path.join(unpacked_path, 'manifest.json');
    try {
      // A missing file fails the read and is handled like any other read error
      return JSON.parse(fs.readFileSync(manifestFile, 'utf-8'));
    } catch (error) {
      return null;
    }
  }
  /**
   * @deprecated Use puppeteer's enableExtensions option instead.
   *
   * Generate Chrome launch arguments for loading extensions.
   * NOTE: This is deprecated. Use puppeteer.launch({ pipe: true, enableExtensions: [paths] }) instead.
   *
   * Entries without an `unpacked_path` are ignored. When nothing loadable
   * remains, an empty array is returned rather than flags with empty values.
   *
   * @param {Array} extensions - Array of extension metadata objects
   * @returns {Array<string>} - Chrome CLI arguments for loading extensions
   */
  function getExtensionLaunchArgs(extensions) {
    console.warn('[DEPRECATED] getExtensionLaunchArgs is deprecated. Use puppeteer enableExtensions option instead.');

    // Only extensions with an unpacked directory can be loaded
    const validExtensions = (extensions || []).filter((ext) => ext.unpacked_path);
    if (validExtensions.length === 0) {
      return [];
    }

    const unpacked_paths = validExtensions.map((ext) => ext.unpacked_path);
    // Allowlist by the path-derived id, since that is what identifies an unpacked
    // extension; compute it from the path when the entry has no id yet
    const extension_ids = validExtensions.map((ext) => ext.id || getExtensionId(ext.unpacked_path));

    return [
      `--load-extension=${unpacked_paths.join(',')}`,
      `--allowlisted-extension-id=${extension_ids.join(',')}`,
      '--allow-legacy-extension-manifests',
      '--disable-extensions-auto-update',
    ];
  }
  /**
   * Collect the unpacked directories of the given extensions, in order, for
   * use with puppeteer's enableExtensions option.
   * See https://pptr.dev/guides/chrome-extensions
   *
   * @param {Array} extensions - Array of extension metadata objects
   * @returns {Array<string>} - unpacked_path of every entry that has one
   */
  function getExtensionPaths(extensions) {
    const nothingToLoad = !extensions || extensions.length === 0;
    if (nothingToLoad) {
      return [];
    }

    const unpackedPaths = [];
    extensions.forEach((ext) => {
      // Entries that were never unpacked have nothing to load
      if (ext.unpacked_path) {
        unpackedPaths.push(ext.unpacked_path);
      }
    });
    return unpackedPaths;
  }
  /**
   * Wait for an extension target to be available in the browser.
   * Following puppeteer best practices for accessing extension contexts.
   *
   * For Manifest V3 extensions (service workers):
   *   const worker = await waitForExtensionTarget(browser, extensionId);
   *   // worker is a WebWorker context
   *
   * For Manifest V2 extensions (background pages):
   *   const page = await waitForExtensionTarget(browser, extensionId);
   *   // page is a Page context
   *
   * The service-worker and background-page waits run concurrently, so a
   * Manifest V2 extension is returned as soon as its background page shows up
   * instead of after the service-worker wait has timed out. Only if neither
   * appears is any other target under the extension's URL accepted.
   *
   * @param {Object} browser - Puppeteer browser instance
   * @param {string} extensionId - Extension ID to wait for (computed from path hash)
   * @param {number} [timeout=30000] - Timeout in milliseconds applied to each wait phase
   * @returns {Promise<Object>} - Worker or Page context for the extension
   * @throws Whatever browser.waitForTarget rejects with when no extension target appears
   */
  async function waitForExtensionTarget(browser, extensionId, timeout = 30000) {
    const extensionOrigin = `chrome-extension://${extensionId}`;
    const belongsToExtension = (target) => target.url().startsWith(extensionOrigin);

    // Wait for a background target of one type and resolve to its context;
    // a target without a usable context counts as "not found".
    const waitForBackground = async (type, getContext) => {
      const found = await browser.waitForTarget(
        (candidate) => candidate.type() === type && belongsToExtension(candidate),
        { timeout }
      );
      const context = await getContext(found);
      if (!context) {
        throw new Error(`No ${type} context available for extension ${extensionId}`);
      }
      return context;
    };

    try {
      // An extension has either a service worker (MV3) or a background page
      // (MV2), so whichever wait succeeds first is the right one.
      return await Promise.any([
        waitForBackground('service_worker', (target) => target.worker()),
        waitForBackground('background_page', (target) => target.page()),
      ]);
    } catch (err) {
      // Neither background context appeared; fall back to any extension target
    }

    // Try any extension page as fallback
    const extTarget = await browser.waitForTarget(belongsToExtension, { timeout });

    // Return worker or page depending on target type
    if (extTarget.type() === 'service_worker') {
      return await extTarget.worker();
    }
    return await extTarget.page();
  }
  /**
   * Summarise the browser targets that look extension-related.
   *
   * A target is included when its URL starts with chrome-extension:// or when
   * its type is 'service_worker' or 'background_page' (regardless of URL).
   *
   * @param {Object} browser - Puppeteer browser instance
   * @returns {Array<Object>} - {type, url, extensionId} per target; extensionId is null
   *   when the URL contains no chrome-extension:// part
   */
  function getExtensionTargets(browser) {
    const EXTENSION_SCHEME = 'chrome-extension://';
    const summaries = [];

    for (const target of browser.targets()) {
      const type = target.type();
      const url = target.url();

      const looksLikeExtension =
        url.startsWith(EXTENSION_SCHEME) ||
        type === 'service_worker' ||
        type === 'background_page';
      if (!looksLikeExtension) {
        continue;
      }

      let extensionId = null;
      if (url.includes(EXTENSION_SCHEME)) {
        // Text between the scheme and the next '/' is the extension id
        const [, afterScheme] = url.split(EXTENSION_SCHEME);
        extensionId = afterScheme?.split('/')[0];
      }
      summaries.push({ type, url, extensionId });
    }

    return summaries;
  }
  /**
   * Find Chromium/Chrome binary path.
   * Checks CHROME_BINARY env var first, then falls back to system locations
   * and the puppeteer cache under $HOME/.cache/puppeteer.
   *
   * A candidate is accepted only if it exists and `<binary> --version` exits
   * successfully within 5 seconds.
   *
   * @returns {string|null} - Path to browser binary (resolved to absolute when it
   *   comes from CHROME_BINARY) or null if not found
   */
  function findChromium() {
    const { execFileSync } = require('child_process');

    // Helper to validate a binary by running --version
    const validateBinary = (binaryPath) => {
      if (!binaryPath || !fs.existsSync(binaryPath)) return false;
      try {
        // Executed directly (no shell) so quotes, spaces or `$` in the path
        // cannot break or alter the command
        execFileSync(binaryPath, ['--version'], { encoding: 'utf8', timeout: 5000, stdio: 'pipe' });
        return true;
      } catch (e) {
        return false;
      }
    };

    // Helper to find Chromium in @puppeteer/browsers directory structure
    const findInPuppeteerDir = (baseDir) => {
      if (!fs.existsSync(baseDir)) return null;
      let versions;
      try {
        versions = fs.readdirSync(baseDir);
      } catch (e) {
        // Unreadable cache directory: treat as "nothing installed here"
        return null;
      }
      // Newest first. Numeric-aware comparison so that e.g. "999999" does not
      // outrank "1000000" the way a plain string sort would.
      versions.sort((a, b) => b.localeCompare(a, undefined, { numeric: true }));
      for (const version of versions) {
        const versionDir = path.join(baseDir, version);
        const candidates = [
          path.join(versionDir, 'chrome-mac-arm64/Chromium.app/Contents/MacOS/Chromium'),
          path.join(versionDir, 'chrome-mac/Chromium.app/Contents/MacOS/Chromium'),
          path.join(versionDir, 'chrome-mac-x64/Chromium.app/Contents/MacOS/Chromium'),
          path.join(versionDir, 'chrome-linux64/chrome'),
          path.join(versionDir, 'chrome-linux/chrome'),
        ];
        const existing = candidates.find((candidate) => fs.existsSync(candidate));
        if (existing) return existing;
      }
      return null;
    };

    // 1. Check CHROME_BINARY env var first
    const chromeBinary = getEnv('CHROME_BINARY');
    if (chromeBinary) {
      const absPath = path.resolve(chromeBinary);
      if (validateBinary(absPath)) {
        return absPath;
      }
      console.error(`[!] Warning: CHROME_BINARY="${chromeBinary}" is not valid`);
    } else {
      // 2. Warn that no CHROME_BINARY is configured, searching fallbacks
      console.error('[!] Warning: CHROME_BINARY not set, searching system locations...');
    }

    // 3. Search fallback locations (Chromium first, then Chrome)
    const homeDir = process.env.HOME || '';
    const fallbackLocations = [
      // System Chromium
      '/Applications/Chromium.app/Contents/MacOS/Chromium',
      '/usr/bin/chromium',
      '/usr/bin/chromium-browser',
      // Puppeteer cache
      path.join(homeDir, '.cache/puppeteer/chromium'),
      path.join(homeDir, '.cache/puppeteer'),
      // Chrome (fallback - extensions may not work in 137+)
      '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
      '/usr/bin/google-chrome',
      '/usr/bin/google-chrome-stable',
    ];

    for (const loc of fallbackLocations) {
      // Puppeteer cache entries are directories to scan, not binaries
      if (loc.includes('.cache/puppeteer')) {
        const binary = findInPuppeteerDir(loc);
        if (binary && validateBinary(binary)) {
          return binary;
        }
        continue;
      }
      if (!validateBinary(loc)) {
        continue;
      }
      if (loc.includes('Google Chrome') || loc.includes('google-chrome')) {
        console.error('[!] Warning: Using Chrome instead of Chromium. Extension loading may not work in Chrome 137+');
      }
      return loc;
    }

    return null;
  }
  1176. // ============================================================================
  1177. // Shared Extension Installer Utilities
  1178. // ============================================================================
  /**
   * Resolve the directory where Chrome extensions are stored.
   * Single place for this path so installers and the Chrome launcher agree.
   *
   * Resolution order:
   *   1. CHROME_EXTENSIONS_DIR, when set (explicit override)
   *   2. <DATA_DIR>/personas/<ACTIVE_PERSONA>/chrome_extensions, with DATA_DIR
   *      defaulting to '.' and ACTIVE_PERSONA defaulting to 'Default'
   *
   * @returns {string} - Path to extensions directory (relative when DATA_DIR is relative)
   */
  function getExtensionsDir() {
    const explicitDir = getEnv('CHROME_EXTENSIONS_DIR');
    if (explicitDir) {
      return explicitDir;
    }

    const personaName = getEnv('ACTIVE_PERSONA', 'Default');
    return path.join(getEnv('DATA_DIR', '.'), 'personas', personaName, 'chrome_extensions');
  }
  /**
   * Build the "<machine>-<system>" string used for platform-specific paths.
   * Intended to mirror Python's archivebox.config.paths.get_machine_type().
   *
   * MACHINE_TYPE in the environment, when set, is returned as-is. Otherwise
   * process.arch is normalised (arm64/aarch64 -> arm64, x64/x86_64/amd64 ->
   * x86_64, ia32/x86 -> x86, anything else unchanged) and joined with
   * process.platform.
   *
   * @returns {string} - Machine type (e.g., 'x86_64-linux', 'arm64-darwin')
   */
  function getMachineType() {
    const override = process.env.MACHINE_TYPE;
    if (override) {
      return override;
    }

    // Known architecture spellings -> canonical name
    const archAliases = new Map([
      ['arm64', 'arm64'],
      ['aarch64', 'arm64'],
      ['x64', 'x86_64'],
      ['x86_64', 'x86_64'],
      ['amd64', 'x86_64'],
      ['ia32', 'x86'],
      ['x86', 'x86'],
    ]);
    const rawArch = process.arch;
    const machine = archAliases.has(rawArch) ? archAliases.get(rawArch) : rawArch;

    return `${machine}-${process.platform}`;
  }
  /**
   * Resolve the directory holding platform-specific binaries and packages.
   *
   * LIB_DIR in the environment wins when set; otherwise the result is
   * <DATA_DIR>/lib/<machine type>, with DATA_DIR defaulting to './data'.
   *
   * @returns {string} - Path to lib directory (relative when DATA_DIR is relative)
   */
  function getLibDir() {
    const override = process.env.LIB_DIR;
    return override
      ? override
      : path.join(getEnv('DATA_DIR', './data'), 'lib', getMachineType());
  }
  /**
   * Resolve the node_modules directory used for npm packages.
   *
   * NODE_MODULES_DIR in the environment wins when set; otherwise the result is
   * <lib dir>/npm/node_modules, where the lib dir comes from getLibDir().
   *
   * @returns {string} - Path to node_modules directory
   */
  function getNodeModulesDir() {
    const override = process.env.NODE_MODULES_DIR;
    return override ? override : path.join(getLibDir(), 'npm', 'node_modules');
  }
  /**
   * Collect every computed environment path into one plain object.
   * Kept as the single source of truth for path calculations so that Python
   * can call this instead of duplicating the logic.
   *
   * @returns {Object} - {DATA_DIR, MACHINE_TYPE, LIB_DIR, NODE_MODULES_DIR, NPM_BIN_DIR, CHROME_EXTENSIONS_DIR}
   */
  function getTestEnv() {
    // Needed twice: reported as-is and used as the base of NPM_BIN_DIR
    const libDir = getLibDir();

    return {
      DATA_DIR: getEnv('DATA_DIR', './data'),
      MACHINE_TYPE: getMachineType(),
      LIB_DIR: libDir,
      NODE_MODULES_DIR: getNodeModulesDir(),
      NPM_BIN_DIR: path.join(libDir, 'npm', '.bin'),
      CHROME_EXTENSIONS_DIR: getExtensionsDir(),
    };
  }
  /**
   * Install a Chrome extension, reusing a cached metadata file when possible.
   *
   * Main entry point for extension installer hooks:
   *   1. If <extensionsDir>/<name>.extension.json exists and the unpacked
   *      directory it points to still contains manifest.json, the cached
   *      metadata is returned without installing anything.
   *   2. Otherwise the extension is installed via loadOrInstallExtension().
   *   3. On success the metadata is written back to the cache file (a failed
   *      write only produces a warning).
   *
   * @param {Object} extension - Extension metadata object
   * @param {string} extension.webstore_id - Chrome Web Store extension ID
   * @param {string} extension.name - Human-readable extension name (used for cache file)
   * @param {Object} [options] - Options
   * @param {string} [options.extensionsDir] - Override extensions directory
   * @param {boolean} [options.quiet=false] - Suppress info logging (warnings and errors are still printed)
   * @returns {Promise<Object|null>} - Installed extension metadata or null on failure
   */
  async function installExtensionWithCache(extension, options = {}) {
    const { extensionsDir = getExtensionsDir(), quiet = false } = options;
    const cacheFile = path.join(extensionsDir, `${extension.name}.extension.json`);

    // Informational output goes through here so `quiet` is honoured in one place
    const info = (message) => {
      if (!quiet) {
        console.log(message);
      }
    };

    // Reuse the cache only if it parses and still points at an unpacked extension
    if (fs.existsSync(cacheFile)) {
      try {
        const cached = JSON.parse(fs.readFileSync(cacheFile, 'utf-8'));
        if (fs.existsSync(path.join(cached.unpacked_path, 'manifest.json'))) {
          info(`[*] ${extension.name} extension already installed (using cache)`);
          return cached;
        }
      } catch (e) {
        // Cache file corrupted, re-install
        console.warn(`[⚠️] Extension cache corrupted for ${extension.name}, re-installing...`);
      }
    }

    info(`[*] Installing ${extension.name} extension...`);
    const installedExt = await loadOrInstallExtension(extension, extensionsDir);
    if (!installedExt?.version) {
      console.error(`[❌] Failed to install ${extension.name} extension`);
      return null;
    }

    // Persist metadata for future runs; function-valued fields are dropped by JSON
    try {
      await fs.promises.mkdir(extensionsDir, { recursive: true });
      await fs.promises.writeFile(cacheFile, JSON.stringify(installedExt, null, 2));
      info(`[+] Extension metadata written to ${cacheFile}`);
    } catch (e) {
      console.warn(`[⚠️] Failed to write cache file: ${e.message}`);
    }

    info(`[+] ${extension.name} extension installed`);
    return installedExt;
  }
  1326. // Export all functions
  1327. module.exports = {
  1328. // Environment helpers
  1329. getEnv,
  1330. getEnvBool,
  1331. getEnvInt,
  1332. getEnvArray,
  1333. parseResolution,
  1334. // PID file management
  1335. writePidWithMtime,
  1336. writeCmdScript,
  1337. // Port management
  1338. findFreePort,
  1339. waitForDebugPort,
  1340. // Zombie cleanup
  1341. killZombieChrome,
  1342. // Chrome launching
  1343. launchChromium,
  1344. killChrome,
  1345. // Chrome/Chromium install
  1346. installChromium,
  1347. installPuppeteerCore,
  1348. // Chrome/Chromium binary finding
  1349. findChromium,
  1350. // Extension utilities
  1351. getExtensionId,
  1352. loadExtensionManifest,
  1353. installExtension,
  1354. loadOrInstallExtension,
  1355. isTargetExtension,
  1356. loadExtensionFromTarget,
  1357. installAllExtensions,
  1358. loadAllExtensionsFromBrowser,
  1359. // New puppeteer best-practices helpers
  1360. getExtensionPaths,
  1361. waitForExtensionTarget,
  1362. getExtensionTargets,
  1363. // Shared path utilities (single source of truth for Python/JS)
  1364. getMachineType,
  1365. getLibDir,
  1366. getNodeModulesDir,
  1367. getExtensionsDir,
  1368. getTestEnv,
  1369. // Shared extension installer utilities
  1370. installExtensionWithCache,
  1371. // Deprecated - use enableExtensions option instead
  1372. getExtensionLaunchArgs,
  1373. };
  // CLI usage: `node chrome_utils.js <command> [args...]`
  //
  // This block runs only when the file is executed directly (not when it is
  // require()d). It dispatches the first CLI argument to one of the helpers
  // defined in this file, prints the result on stdout (plain text or JSON) and
  // reports failures on stderr with exit status 1.
  if (require.main === module) {
    // Help text, one entry per output line. Shown when no command is given.
    const USAGE_LINES = [
      'Usage: chrome_utils.js <command> [args...]',
      '',
      'Commands:',
      ' findChromium Find Chrome/Chromium binary',
      ' installChromium Install Chromium via @puppeteer/browsers',
      ' installPuppeteerCore Install puppeteer-core npm package',
      ' launchChromium Launch Chrome with CDP debugging',
      ' killChrome <pid> Kill Chrome process by PID',
      ' killZombieChrome Clean up zombie Chrome processes',
      '',
      ' getMachineType Get machine type (e.g., x86_64-linux)',
      ' getLibDir Get LIB_DIR path',
      ' getNodeModulesDir Get NODE_MODULES_DIR path',
      ' getExtensionsDir Get Chrome extensions directory',
      ' getTestEnv Get all paths as JSON (for tests)',
      '',
      ' getExtensionId <path> Get extension ID from unpacked path',
      ' loadExtensionManifest Load extension manifest.json',
      ' loadOrInstallExtension Load or install an extension',
      ' installExtensionWithCache Install extension with caching',
      '',
      'Environment variables:',
      ' DATA_DIR Base data directory',
      ' LIB_DIR Library directory (computed if not set)',
      ' MACHINE_TYPE Machine type override',
      ' NODE_MODULES_DIR Node modules directory',
      ' CHROME_BINARY Chrome binary path',
      ' CHROME_EXTENSIONS_DIR Extensions directory',
    ];

    /**
     * Write a message to stderr and terminate the process with exit status 1.
     * @param {*} message - Text (or value) to report.
     */
    const fail = (message) => {
      console.error(message);
      process.exit(1);
    };

    /**
     * Strictly parse a PID given on the command line.
     *
     * Unlike a bare parseInt(), this rejects partially numeric input such as
     * "12abc" as well as zero and negative values.
     * NOTE(review): killChrome's implementation is not visible in this block;
     * if it forwards the value to process.kill(), a PID of 0 or below would
     * address a process group instead of a single process - confirm.
     *
     * @param {string|undefined} raw - Raw CLI argument.
     * @returns {number} A positive integer PID, or NaN when the input is invalid.
     */
    const parsePid = (raw) => {
      const text = typeof raw === 'string' ? raw.trim() : '';
      if (!/^\d+$/.test(text)) {
        return NaN;
      }
      const pid = Number.parseInt(text, 10);
      return Number.isSafeInteger(pid) && pid > 0 ? pid : NaN;
    };

    const args = process.argv.slice(2);
    if (args.length === 0) {
      for (const line of USAGE_LINES) {
        console.log(line);
      }
      process.exit(1);
    }

    const [command, ...commandArgs] = args;

    // Async wrapper so that commands can await the promise-returning helpers.
    (async () => {
      try {
        switch (command) {
          case 'findChromium': {
            const binary = findChromium();
            if (binary) {
              console.log(binary);
            } else {
              fail('Chromium binary not found');
            }
            break;
          }

          case 'installChromium': {
            const result = await installChromium();
            if (result.success) {
              console.log(JSON.stringify({
                binary: result.binary,
                version: result.version,
              }));
            } else {
              fail(result.error);
            }
            break;
          }

          case 'installPuppeteerCore': {
            const [npmPrefix] = commandArgs;
            // A missing or empty prefix is passed on as undefined.
            const result = await installPuppeteerCore({ npmPrefix: npmPrefix || undefined });
            if (result.success) {
              console.log(JSON.stringify({ path: result.path }));
            } else {
              fail(result.error);
            }
            break;
          }

          case 'launchChromium': {
            // Args: [outputDir] [extensionPaths as a JSON string]
            const [outputDir, extensionPathsJson] = commandArgs;
            const extensionPaths = extensionPathsJson ? JSON.parse(extensionPathsJson) : [];
            const result = await launchChromium({
              outputDir: outputDir || 'chrome',
              extensionPaths,
            });
            if (result.success) {
              console.log(JSON.stringify({
                cdpUrl: result.cdpUrl,
                pid: result.pid,
                port: result.port,
              }));
            } else {
              fail(result.error);
            }
            break;
          }

          case 'killChrome': {
            // Args: <pid> [outputDir]
            const [pidStr, outputDir] = commandArgs;
            const pid = parsePid(pidStr);
            if (Number.isNaN(pid)) {
              fail('Invalid PID');
            }
            await killChrome(pid, outputDir);
            break;
          }

          case 'killZombieChrome': {
            const [dataDir] = commandArgs;
            const killed = killZombieChrome(dataDir);
            console.log(killed);
            break;
          }

          case 'getExtensionId': {
            const [unpackedPath] = commandArgs;
            if (!unpackedPath) {
              fail('Usage: getExtensionId <path>');
            }
            console.log(getExtensionId(unpackedPath));
            break;
          }

          case 'loadExtensionManifest': {
            const [unpackedPath] = commandArgs;
            if (!unpackedPath) {
              fail('Usage: loadExtensionManifest <path>');
            }
            console.log(JSON.stringify(loadExtensionManifest(unpackedPath)));
            break;
          }

          case 'getExtensionLaunchArgs': {
            // Deprecated command, kept for backward compatibility.
            const [extensionsJson] = commandArgs;
            if (!extensionsJson) {
              fail('Usage: getExtensionLaunchArgs <extensions_json>');
            }
            const extensions = JSON.parse(extensionsJson);
            console.log(JSON.stringify(getExtensionLaunchArgs(extensions)));
            break;
          }

          case 'loadOrInstallExtension': {
            // Args: <webstore_id> <name> [extensions_dir]
            const [webstore_id, name, extensions_dir] = commandArgs;
            const ext = await loadOrInstallExtension({ webstore_id, name }, extensions_dir);
            console.log(JSON.stringify(ext, null, 2));
            break;
          }

          case 'getMachineType': {
            console.log(getMachineType());
            break;
          }

          case 'getLibDir': {
            console.log(getLibDir());
            break;
          }

          case 'getNodeModulesDir': {
            console.log(getNodeModulesDir());
            break;
          }

          case 'getExtensionsDir': {
            console.log(getExtensionsDir());
            break;
          }

          case 'getTestEnv': {
            console.log(JSON.stringify(getTestEnv(), null, 2));
            break;
          }

          case 'installExtensionWithCache': {
            const [webstore_id, name] = commandArgs;
            if (!webstore_id || !name) {
              fail('Usage: installExtensionWithCache <webstore_id> <name>');
            }
            const ext = await installExtensionWithCache({ webstore_id, name });
            if (ext) {
              console.log(JSON.stringify(ext, null, 2));
            } else {
              // A falsy result exits with status 1 without printing anything here.
              process.exit(1);
            }
            break;
          }

          default:
            fail(`Unknown command: ${command}`);
        }
      } catch (error) {
        // Thrown values are not guaranteed to be Error instances; fall back to
        // their string form instead of printing "Error: undefined".
        const detail = error && typeof error.message === 'string' ? error.message : String(error);
        fail(`Error: ${detail}`);
      }
    })();
  }