chrome_test_helpers.py 33 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002
  1. """
  2. Shared Chrome test helpers for plugin integration tests.
  3. This module provides common utilities for Chrome-based plugin tests, reducing
  4. duplication across test files. Functions delegate to chrome_utils.js (the single
  5. source of truth) with Python fallbacks.
  6. Function names match the JS equivalents in snake_case:
  7. JS: getMachineType() -> Python: get_machine_type()
  8. JS: getLibDir() -> Python: get_lib_dir()
  9. JS: getNodeModulesDir() -> Python: get_node_modules_dir()
  10. JS: getExtensionsDir() -> Python: get_extensions_dir()
  11. JS: findChromium() -> Python: find_chromium()
  12. JS: killChrome() -> Python: kill_chrome()
  13. JS: getTestEnv() -> Python: get_test_env()
  14. Usage:
  15. # Path helpers (delegate to chrome_utils.js):
  16. from archivebox.plugins.chrome.tests.chrome_test_helpers import (
  17. get_test_env, # env dict with LIB_DIR, NODE_MODULES_DIR, MACHINE_TYPE
  18. get_machine_type, # e.g., 'x86_64-linux', 'arm64-darwin'
  19. get_lib_dir, # Path to lib dir
  20. get_node_modules_dir, # Path to node_modules
  21. get_extensions_dir, # Path to chrome extensions
  22. find_chromium, # Find Chrome/Chromium binary
  23. kill_chrome, # Kill Chrome process by PID
  24. )
  25. # Test file helpers:
  26. from archivebox.plugins.chrome.tests.chrome_test_helpers import (
  27. get_plugin_dir, # get_plugin_dir(__file__) -> plugin dir Path
  28. get_hook_script, # Find hook script by glob pattern
  29. PLUGINS_ROOT, # Path to plugins root
  30. LIB_DIR, # Path to lib dir (lazy-loaded)
  31. NODE_MODULES_DIR, # Path to node_modules (lazy-loaded)
  32. )
  33. # For Chrome session tests:
  34. from archivebox.plugins.chrome.tests.chrome_test_helpers import (
  35. chrome_session, # Context manager (Full Chrome + tab setup with automatic cleanup)
  36. cleanup_chrome, # Manual cleanup by PID (rarely needed)
  37. )
  38. # For extension tests:
  39. from archivebox.plugins.chrome.tests.chrome_test_helpers import (
  40. setup_test_env, # Full dir structure + Chrome install
  41. launch_chromium_session, # Launch Chrome, return CDP URL
  42. kill_chromium_session, # Cleanup Chrome
  43. )
  44. # Run hooks and parse JSONL:
  45. from archivebox.plugins.chrome.tests.chrome_test_helpers import (
  46. run_hook, # Run hook, return (returncode, stdout, stderr)
  47. parse_jsonl_output, # Parse JSONL from stdout
  48. )
  49. """
  50. import json
  51. import os
  52. import platform
  53. import signal
  54. import subprocess
  55. import sys
  56. import time
  57. from datetime import datetime
  58. from pathlib import Path
  59. from typing import Tuple, Optional, List, Dict, Any
  60. from contextlib import contextmanager
  61. # Plugin directory locations
  62. CHROME_PLUGIN_DIR = Path(__file__).parent.parent
  63. PLUGINS_ROOT = CHROME_PLUGIN_DIR.parent
  64. # Hook script locations
  65. CHROME_INSTALL_HOOK = CHROME_PLUGIN_DIR / 'on_Crawl__70_chrome_install.py'
  66. CHROME_LAUNCH_HOOK = CHROME_PLUGIN_DIR / 'on_Crawl__90_chrome_launch.bg.js'
  67. CHROME_TAB_HOOK = CHROME_PLUGIN_DIR / 'on_Snapshot__10_chrome_tab.bg.js'
  68. CHROME_NAVIGATE_HOOK = next(CHROME_PLUGIN_DIR.glob('on_Snapshot__*_chrome_navigate.*'), None)
  69. CHROME_UTILS = CHROME_PLUGIN_DIR / 'chrome_utils.js'
  70. PUPPETEER_BINARY_HOOK = PLUGINS_ROOT / 'puppeteer' / 'on_Binary__12_puppeteer_install.py'
  71. PUPPETEER_CRAWL_HOOK = PLUGINS_ROOT / 'puppeteer' / 'on_Crawl__60_puppeteer_install.py'
  72. NPM_BINARY_HOOK = PLUGINS_ROOT / 'npm' / 'on_Binary__10_npm_install.py'
  73. # =============================================================================
  74. # Path Helpers - delegates to chrome_utils.js with Python fallback
  75. # Function names match JS: getMachineType -> get_machine_type, etc.
  76. # =============================================================================
  77. def _call_chrome_utils(command: str, *args: str, env: Optional[dict] = None) -> Tuple[int, str, str]:
  78. """Call chrome_utils.js CLI command (internal helper).
  79. This is the central dispatch for calling the JS utilities from Python.
  80. All path calculations and Chrome operations are centralized in chrome_utils.js
  81. to ensure consistency between Python and JavaScript code.
  82. Args:
  83. command: The CLI command (e.g., 'findChromium', 'getTestEnv')
  84. *args: Additional command arguments
  85. env: Environment dict (default: current env)
  86. Returns:
  87. Tuple of (returncode, stdout, stderr)
  88. """
  89. cmd = ['node', str(CHROME_UTILS), command] + list(args)
  90. result = subprocess.run(
  91. cmd,
  92. capture_output=True,
  93. text=True,
  94. timeout=30,
  95. env=env or os.environ.copy()
  96. )
  97. return result.returncode, result.stdout, result.stderr
  98. def get_plugin_dir(test_file: str) -> Path:
  99. """Get the plugin directory from a test file path.
  100. Usage:
  101. PLUGIN_DIR = get_plugin_dir(__file__)
  102. Args:
  103. test_file: The __file__ of the test module (e.g., test_screenshot.py)
  104. Returns:
  105. Path to the plugin directory (e.g., plugins/screenshot/)
  106. """
  107. return Path(test_file).parent.parent
  108. def get_hook_script(plugin_dir: Path, pattern: str) -> Optional[Path]:
  109. """Find a hook script in a plugin directory by pattern.
  110. Usage:
  111. HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_screenshot.*')
  112. Args:
  113. plugin_dir: Path to the plugin directory
  114. pattern: Glob pattern to match
  115. Returns:
  116. Path to the hook script or None if not found
  117. """
  118. matches = list(plugin_dir.glob(pattern))
  119. return matches[0] if matches else None
  120. def get_machine_type() -> str:
  121. """Get machine type string (e.g., 'x86_64-linux', 'arm64-darwin').
  122. Matches JS: getMachineType()
  123. Tries chrome_utils.js first, falls back to Python computation.
  124. """
  125. # Try JS first (single source of truth)
  126. returncode, stdout, stderr = _call_chrome_utils('getMachineType')
  127. if returncode == 0 and stdout.strip():
  128. return stdout.strip()
  129. # Fallback to Python computation
  130. if os.environ.get('MACHINE_TYPE'):
  131. return os.environ['MACHINE_TYPE']
  132. machine = platform.machine().lower()
  133. system = platform.system().lower()
  134. if machine in ('arm64', 'aarch64'):
  135. machine = 'arm64'
  136. elif machine in ('x86_64', 'amd64'):
  137. machine = 'x86_64'
  138. return f"{machine}-{system}"
  139. def get_lib_dir() -> Path:
  140. """Get LIB_DIR path for platform-specific binaries.
  141. Matches JS: getLibDir()
  142. Tries chrome_utils.js first, falls back to Python computation.
  143. """
  144. # Try JS first
  145. returncode, stdout, stderr = _call_chrome_utils('getLibDir')
  146. if returncode == 0 and stdout.strip():
  147. return Path(stdout.strip())
  148. # Fallback to Python
  149. if os.environ.get('LIB_DIR'):
  150. return Path(os.environ['LIB_DIR'])
  151. raise Exception('LIB_DIR env var must be set!')
  152. def get_node_modules_dir() -> Path:
  153. """Get NODE_MODULES_DIR path for npm packages.
  154. Matches JS: getNodeModulesDir()
  155. Tries chrome_utils.js first, falls back to Python computation.
  156. """
  157. # Try JS first
  158. returncode, stdout, stderr = _call_chrome_utils('getNodeModulesDir')
  159. if returncode == 0 and stdout.strip():
  160. return Path(stdout.strip())
  161. # Fallback to Python
  162. if os.environ.get('NODE_MODULES_DIR'):
  163. return Path(os.environ['NODE_MODULES_DIR'])
  164. lib_dir = get_lib_dir()
  165. return lib_dir / 'npm' / 'node_modules'
  166. def get_extensions_dir() -> str:
  167. """Get the Chrome extensions directory path.
  168. Matches JS: getExtensionsDir()
  169. Tries chrome_utils.js first, falls back to Python computation.
  170. """
  171. try:
  172. returncode, stdout, stderr = _call_chrome_utils('getExtensionsDir')
  173. if returncode == 0 and stdout.strip():
  174. return stdout.strip()
  175. except subprocess.TimeoutExpired:
  176. pass # Fall through to default computation
  177. # Fallback to default computation if JS call fails
  178. data_dir = os.environ.get('DATA_DIR', '.')
  179. persona = os.environ.get('ACTIVE_PERSONA', 'Default')
  180. return str(Path(data_dir) / 'personas' / persona / 'chrome_extensions')
  181. def link_puppeteer_cache(lib_dir: Path) -> None:
  182. """Best-effort symlink from system Puppeteer cache into test lib_dir.
  183. Avoids repeated Chromium downloads across tests by reusing the
  184. default Puppeteer cache directory.
  185. """
  186. cache_dir = lib_dir / 'puppeteer'
  187. cache_dir.mkdir(parents=True, exist_ok=True)
  188. candidates = [
  189. Path.home() / 'Library' / 'Caches' / 'puppeteer',
  190. Path.home() / '.cache' / 'puppeteer',
  191. ]
  192. for src_root in candidates:
  193. if not src_root.exists():
  194. continue
  195. for item in src_root.iterdir():
  196. dst = cache_dir / item.name
  197. if dst.exists():
  198. continue
  199. try:
  200. os.symlink(item, dst, target_is_directory=item.is_dir())
  201. except Exception:
  202. # Best-effort only; if symlink fails, leave as-is.
  203. pass
  204. def find_chromium(data_dir: Optional[str] = None) -> Optional[str]:
  205. """Find the Chromium binary path.
  206. Matches JS: findChromium()
  207. Uses chrome_utils.js which checks:
  208. - CHROME_BINARY env var
  209. - @puppeteer/browsers install locations
  210. - System Chromium locations
  211. - Falls back to Chrome (with warning)
  212. Args:
  213. data_dir: Optional DATA_DIR override
  214. Returns:
  215. Path to Chromium binary or None if not found
  216. """
  217. env = os.environ.copy()
  218. if data_dir:
  219. env['DATA_DIR'] = str(data_dir)
  220. returncode, stdout, stderr = _call_chrome_utils('findChromium', env=env)
  221. if returncode == 0 and stdout.strip():
  222. return stdout.strip()
  223. return None
  224. def kill_chrome(pid: int, output_dir: Optional[str] = None) -> bool:
  225. """Kill a Chrome process by PID.
  226. Matches JS: killChrome()
  227. Uses chrome_utils.js which handles:
  228. - SIGTERM then SIGKILL
  229. - Process group killing
  230. - Zombie process cleanup
  231. Args:
  232. pid: Process ID to kill
  233. output_dir: Optional chrome output directory for PID file cleanup
  234. Returns:
  235. True if the kill command succeeded
  236. """
  237. args = [str(pid)]
  238. if output_dir:
  239. args.append(str(output_dir))
  240. returncode, stdout, stderr = _call_chrome_utils('killChrome', *args)
  241. return returncode == 0
  242. def get_test_env() -> dict:
  243. """Get environment dict with all paths set correctly for tests.
  244. Matches JS: getTestEnv()
  245. Tries chrome_utils.js first for path values, builds env dict.
  246. Use this for all subprocess calls in plugin tests.
  247. """
  248. env = os.environ.copy()
  249. # Try to get all paths from JS (single source of truth)
  250. returncode, stdout, stderr = _call_chrome_utils('getTestEnv')
  251. if returncode == 0 and stdout.strip():
  252. try:
  253. js_env = json.loads(stdout)
  254. env.update(js_env)
  255. return env
  256. except json.JSONDecodeError:
  257. pass
  258. # Fallback to Python computation
  259. lib_dir = get_lib_dir()
  260. env['LIB_DIR'] = str(lib_dir)
  261. env['NODE_MODULES_DIR'] = str(get_node_modules_dir())
  262. env['MACHINE_TYPE'] = get_machine_type()
  263. return env
  264. # Backward compatibility aliases (deprecated, use new names)
  265. find_chromium_binary = find_chromium
  266. kill_chrome_via_js = kill_chrome
  267. get_machine_type_from_js = get_machine_type
  268. get_test_env_from_js = get_test_env
  269. # =============================================================================
  270. # Module-level constants (lazy-loaded on first access)
  271. # Import these directly: from chrome_test_helpers import LIB_DIR, NODE_MODULES_DIR
  272. # =============================================================================
  273. # These are computed once when first accessed
  274. _LIB_DIR: Optional[Path] = None
  275. _NODE_MODULES_DIR: Optional[Path] = None
  276. def _get_lib_dir_cached() -> Path:
  277. global _LIB_DIR
  278. if _LIB_DIR is None:
  279. _LIB_DIR = get_lib_dir()
  280. return _LIB_DIR
  281. def _get_node_modules_dir_cached() -> Path:
  282. global _NODE_MODULES_DIR
  283. if _NODE_MODULES_DIR is None:
  284. _NODE_MODULES_DIR = get_node_modules_dir()
  285. return _NODE_MODULES_DIR
  286. # Module-level constants that can be imported directly
  287. # Usage: from chrome_test_helpers import LIB_DIR, NODE_MODULES_DIR
  288. class _LazyPath:
  289. """Lazy path that computes value on first access."""
  290. def __init__(self, getter):
  291. self._getter = getter
  292. self._value = None
  293. def __fspath__(self):
  294. if self._value is None:
  295. self._value = self._getter()
  296. return str(self._value)
  297. def __truediv__(self, other):
  298. if self._value is None:
  299. self._value = self._getter()
  300. return self._value / other
  301. def __str__(self):
  302. return self.__fspath__()
  303. def __repr__(self):
  304. return f"<LazyPath: {self.__fspath__()}>"
  305. LIB_DIR = _LazyPath(_get_lib_dir_cached)
  306. NODE_MODULES_DIR = _LazyPath(_get_node_modules_dir_cached)
  307. # =============================================================================
  308. # Hook Execution Helpers
  309. # =============================================================================
  310. def run_hook(
  311. hook_script: Path,
  312. url: str,
  313. snapshot_id: str,
  314. cwd: Optional[Path] = None,
  315. env: Optional[dict] = None,
  316. timeout: int = 60,
  317. extra_args: Optional[List[str]] = None,
  318. ) -> Tuple[int, str, str]:
  319. """Run a hook script and return (returncode, stdout, stderr).
  320. Usage:
  321. returncode, stdout, stderr = run_hook(
  322. HOOK_SCRIPT, 'https://example.com', 'test-snap-123',
  323. cwd=tmpdir, env=get_test_env()
  324. )
  325. Args:
  326. hook_script: Path to the hook script
  327. url: URL to process
  328. snapshot_id: Snapshot ID
  329. cwd: Working directory (default: current dir)
  330. env: Environment dict (default: get_test_env())
  331. timeout: Timeout in seconds
  332. extra_args: Additional arguments to pass
  333. Returns:
  334. Tuple of (returncode, stdout, stderr)
  335. """
  336. if env is None:
  337. env = get_test_env()
  338. # Determine interpreter based on file extension
  339. if hook_script.suffix == '.py':
  340. cmd = [sys.executable, str(hook_script)]
  341. elif hook_script.suffix == '.js':
  342. cmd = ['node', str(hook_script)]
  343. else:
  344. cmd = [str(hook_script)]
  345. cmd.extend([f'--url={url}', f'--snapshot-id={snapshot_id}'])
  346. if extra_args:
  347. cmd.extend(extra_args)
  348. result = subprocess.run(
  349. cmd,
  350. cwd=str(cwd) if cwd else None,
  351. capture_output=True,
  352. text=True,
  353. env=env,
  354. timeout=timeout
  355. )
  356. return result.returncode, result.stdout, result.stderr
  357. def parse_jsonl_output(stdout: str, record_type: str = 'ArchiveResult') -> Optional[Dict[str, Any]]:
  358. """Parse JSONL output from hook stdout and return the specified record type.
  359. Usage:
  360. result = parse_jsonl_output(stdout)
  361. if result and result['status'] == 'succeeded':
  362. print("Success!")
  363. Args:
  364. stdout: The stdout from a hook execution
  365. record_type: The 'type' field to look for (default: 'ArchiveResult')
  366. Returns:
  367. The parsed JSON dict or None if not found
  368. """
  369. for line in stdout.strip().split('\n'):
  370. line = line.strip()
  371. if not line.startswith('{'):
  372. continue
  373. try:
  374. record = json.loads(line)
  375. if record.get('type') == record_type:
  376. return record
  377. except json.JSONDecodeError:
  378. continue
  379. return None
  380. def parse_jsonl_records(stdout: str) -> List[Dict[str, Any]]:
  381. """Parse all JSONL records from stdout."""
  382. records: List[Dict[str, Any]] = []
  383. for line in stdout.strip().split('\n'):
  384. line = line.strip()
  385. if not line.startswith('{'):
  386. continue
  387. try:
  388. records.append(json.loads(line))
  389. except json.JSONDecodeError:
  390. continue
  391. return records
  392. def apply_machine_updates(records: List[Dict[str, Any]], env: dict) -> None:
  393. """Apply Machine update records to env dict in-place."""
  394. for record in records:
  395. if record.get('type') != 'Machine':
  396. continue
  397. config = record.get('config')
  398. if not isinstance(config, dict):
  399. continue
  400. env.update(config)
  401. def install_chromium_with_hooks(env: dict, timeout: int = 300) -> str:
  402. """Install Chromium via chrome crawl hook + puppeteer/npm hooks.
  403. Returns absolute path to Chromium binary.
  404. """
  405. puppeteer_result = subprocess.run(
  406. [sys.executable, str(PUPPETEER_CRAWL_HOOK)],
  407. capture_output=True,
  408. text=True,
  409. timeout=timeout,
  410. env=env,
  411. )
  412. if puppeteer_result.returncode != 0:
  413. raise RuntimeError(f"Puppeteer crawl hook failed: {puppeteer_result.stderr}")
  414. puppeteer_record = parse_jsonl_output(puppeteer_result.stdout, record_type='Binary') or {}
  415. if not puppeteer_record or puppeteer_record.get('name') != 'puppeteer':
  416. raise RuntimeError("Puppeteer Binary record not emitted by crawl hook")
  417. npm_cmd = [
  418. sys.executable,
  419. str(NPM_BINARY_HOOK),
  420. '--machine-id=test-machine',
  421. '--binary-id=test-puppeteer',
  422. '--name=puppeteer',
  423. f"--binproviders={puppeteer_record.get('binproviders', '*')}",
  424. ]
  425. puppeteer_overrides = puppeteer_record.get('overrides')
  426. if puppeteer_overrides:
  427. npm_cmd.append(f'--overrides={json.dumps(puppeteer_overrides)}')
  428. npm_result = subprocess.run(
  429. npm_cmd,
  430. capture_output=True,
  431. text=True,
  432. timeout=timeout,
  433. env=env,
  434. )
  435. if npm_result.returncode != 0:
  436. raise RuntimeError(f"Npm install failed: {npm_result.stderr}")
  437. apply_machine_updates(parse_jsonl_records(npm_result.stdout), env)
  438. chrome_result = subprocess.run(
  439. [sys.executable, str(CHROME_INSTALL_HOOK)],
  440. capture_output=True,
  441. text=True,
  442. timeout=timeout,
  443. env=env,
  444. )
  445. if chrome_result.returncode != 0:
  446. raise RuntimeError(f"Chrome install hook failed: {chrome_result.stderr}")
  447. chrome_record = parse_jsonl_output(chrome_result.stdout, record_type='Binary') or {}
  448. if not chrome_record or chrome_record.get('name') not in ('chromium', 'chrome'):
  449. raise RuntimeError("Chrome Binary record not emitted by crawl hook")
  450. chromium_cmd = [
  451. sys.executable,
  452. str(PUPPETEER_BINARY_HOOK),
  453. '--machine-id=test-machine',
  454. '--binary-id=test-chromium',
  455. f"--name={chrome_record.get('name', 'chromium')}",
  456. f"--binproviders={chrome_record.get('binproviders', '*')}",
  457. ]
  458. chrome_overrides = chrome_record.get('overrides')
  459. if chrome_overrides:
  460. chromium_cmd.append(f'--overrides={json.dumps(chrome_overrides)}')
  461. result = subprocess.run(
  462. chromium_cmd,
  463. capture_output=True,
  464. text=True,
  465. timeout=timeout,
  466. env=env,
  467. )
  468. if result.returncode != 0:
  469. raise RuntimeError(f"Puppeteer chromium install failed: {result.stderr}")
  470. records = parse_jsonl_records(result.stdout)
  471. chromium_record = None
  472. for record in records:
  473. if record.get('type') == 'Binary' and record.get('name') in ('chromium', 'chrome'):
  474. chromium_record = record
  475. break
  476. if not chromium_record:
  477. chromium_record = parse_jsonl_output(result.stdout, record_type='Binary')
  478. chromium_path = chromium_record.get('abspath')
  479. if not chromium_path or not Path(chromium_path).exists():
  480. raise RuntimeError(f"Chromium binary not found after install: {chromium_path}")
  481. env['CHROME_BINARY'] = chromium_path
  482. apply_machine_updates(records, env)
  483. return chromium_path
  484. def run_hook_and_parse(
  485. hook_script: Path,
  486. url: str,
  487. snapshot_id: str,
  488. cwd: Optional[Path] = None,
  489. env: Optional[dict] = None,
  490. timeout: int = 60,
  491. extra_args: Optional[List[str]] = None,
  492. ) -> Tuple[int, Optional[Dict[str, Any]], str]:
  493. """Run a hook and parse its JSONL output.
  494. Convenience function combining run_hook() and parse_jsonl_output().
  495. Returns:
  496. Tuple of (returncode, parsed_result_or_none, stderr)
  497. """
  498. returncode, stdout, stderr = run_hook(
  499. hook_script, url, snapshot_id,
  500. cwd=cwd, env=env, timeout=timeout, extra_args=extra_args
  501. )
  502. result = parse_jsonl_output(stdout)
  503. return returncode, result, stderr
  504. # =============================================================================
  505. # Extension Test Helpers
  506. # Used by extension tests (ublock, istilldontcareaboutcookies, twocaptcha)
  507. # =============================================================================
  508. def setup_test_env(tmpdir: Path) -> dict:
  509. """Set up isolated data/lib directory structure for extension tests.
  510. Creates structure matching real ArchiveBox data dir:
  511. <tmpdir>/data/
  512. lib/
  513. arm64-darwin/ (or x86_64-linux, etc.)
  514. npm/
  515. .bin/
  516. node_modules/
  517. personas/
  518. Default/
  519. chrome_extensions/
  520. users/
  521. testuser/
  522. crawls/
  523. snapshots/
  524. Calls chrome install hook + puppeteer/npm hooks for Chromium installation.
  525. Returns env dict with DATA_DIR, LIB_DIR, NPM_BIN_DIR, NODE_MODULES_DIR, CHROME_BINARY, etc.
  526. Args:
  527. tmpdir: Base temporary directory for the test
  528. Returns:
  529. Environment dict with all paths set.
  530. """
  531. # Determine machine type (matches archivebox.config.paths.get_machine_type())
  532. machine = platform.machine().lower()
  533. system = platform.system().lower()
  534. if machine in ('arm64', 'aarch64'):
  535. machine = 'arm64'
  536. elif machine in ('x86_64', 'amd64'):
  537. machine = 'x86_64'
  538. machine_type = f"{machine}-{system}"
  539. # Create proper directory structure matching real ArchiveBox layout
  540. data_dir = tmpdir / 'data'
  541. lib_dir = data_dir / 'lib' / machine_type
  542. npm_dir = lib_dir / 'npm'
  543. npm_bin_dir = npm_dir / '.bin'
  544. node_modules_dir = npm_dir / 'node_modules'
  545. # Extensions go under personas/Default/
  546. chrome_extensions_dir = data_dir / 'personas' / 'Default' / 'chrome_extensions'
  547. # User data goes under users/{username}/
  548. date_str = datetime.now().strftime('%Y%m%d')
  549. users_dir = data_dir / 'users' / 'testuser'
  550. crawls_dir = users_dir / 'crawls' / date_str
  551. snapshots_dir = users_dir / 'snapshots' / date_str
  552. # Create all directories
  553. node_modules_dir.mkdir(parents=True, exist_ok=True)
  554. npm_bin_dir.mkdir(parents=True, exist_ok=True)
  555. chrome_extensions_dir.mkdir(parents=True, exist_ok=True)
  556. crawls_dir.mkdir(parents=True, exist_ok=True)
  557. snapshots_dir.mkdir(parents=True, exist_ok=True)
  558. # Build complete env dict
  559. env = os.environ.copy()
  560. env.update({
  561. 'DATA_DIR': str(data_dir),
  562. 'LIB_DIR': str(lib_dir),
  563. 'MACHINE_TYPE': machine_type,
  564. 'NPM_BIN_DIR': str(npm_bin_dir),
  565. 'NODE_MODULES_DIR': str(node_modules_dir),
  566. 'CHROME_EXTENSIONS_DIR': str(chrome_extensions_dir),
  567. 'CRAWLS_DIR': str(crawls_dir),
  568. 'SNAPSHOTS_DIR': str(snapshots_dir),
  569. })
  570. # Only set headless if not already in environment (allow override for debugging)
  571. if 'CHROME_HEADLESS' not in os.environ:
  572. env['CHROME_HEADLESS'] = 'true'
  573. try:
  574. install_chromium_with_hooks(env)
  575. except RuntimeError as e:
  576. raise RuntimeError(str(e))
  577. return env
  578. def launch_chromium_session(env: dict, chrome_dir: Path, crawl_id: str) -> Tuple[subprocess.Popen, str]:
  579. """Launch Chromium and return (process, cdp_url).
  580. This launches Chrome using the chrome launch hook and waits for the CDP URL
  581. to become available. Use this for extension tests that need direct CDP access.
  582. Args:
  583. env: Environment dict (from setup_test_env)
  584. chrome_dir: Directory for Chrome to write its files (cdp_url.txt, chrome.pid, etc.)
  585. crawl_id: ID for the crawl
  586. Returns:
  587. Tuple of (chrome_launch_process, cdp_url)
  588. Raises:
  589. RuntimeError: If Chrome fails to launch or CDP URL not available after 20s
  590. """
  591. chrome_dir.mkdir(parents=True, exist_ok=True)
  592. chrome_launch_process = subprocess.Popen(
  593. ['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'],
  594. cwd=str(chrome_dir),
  595. stdout=subprocess.PIPE,
  596. stderr=subprocess.PIPE,
  597. text=True,
  598. env=env
  599. )
  600. # Wait for Chromium to launch and CDP URL to be available
  601. cdp_url = None
  602. for i in range(20):
  603. if chrome_launch_process.poll() is not None:
  604. stdout, stderr = chrome_launch_process.communicate()
  605. raise RuntimeError(f"Chromium launch failed:\nStdout: {stdout}\nStderr: {stderr}")
  606. cdp_file = chrome_dir / 'cdp_url.txt'
  607. if cdp_file.exists():
  608. cdp_url = cdp_file.read_text().strip()
  609. break
  610. time.sleep(1)
  611. if not cdp_url:
  612. chrome_launch_process.kill()
  613. raise RuntimeError("Chromium CDP URL not found after 20s")
  614. return chrome_launch_process, cdp_url
  615. def kill_chromium_session(chrome_launch_process: subprocess.Popen, chrome_dir: Path) -> None:
  616. """Clean up Chromium process launched by launch_chromium_session.
  617. Uses chrome_utils.js killChrome for proper process group handling.
  618. Args:
  619. chrome_launch_process: The Popen object from launch_chromium_session
  620. chrome_dir: The chrome directory containing chrome.pid
  621. """
  622. # First try to terminate the launch process gracefully
  623. try:
  624. chrome_launch_process.send_signal(signal.SIGTERM)
  625. chrome_launch_process.wait(timeout=5)
  626. except Exception:
  627. pass
  628. # Read PID and use JS to kill with proper cleanup
  629. chrome_pid_file = chrome_dir / 'chrome.pid'
  630. if chrome_pid_file.exists():
  631. try:
  632. chrome_pid = int(chrome_pid_file.read_text().strip())
  633. kill_chrome(chrome_pid, str(chrome_dir))
  634. except (ValueError, FileNotFoundError):
  635. pass
  636. @contextmanager
  637. def chromium_session(env: dict, chrome_dir: Path, crawl_id: str):
  638. """Context manager for Chromium sessions with automatic cleanup.
  639. Usage:
  640. with chromium_session(env, chrome_dir, 'test-crawl') as (process, cdp_url):
  641. # Use cdp_url to connect with puppeteer
  642. pass
  643. # Chromium automatically cleaned up
  644. Args:
  645. env: Environment dict (from setup_test_env)
  646. chrome_dir: Directory for Chrome files
  647. crawl_id: ID for the crawl
  648. Yields:
  649. Tuple of (chrome_launch_process, cdp_url)
  650. """
  651. chrome_launch_process = None
  652. try:
  653. chrome_launch_process, cdp_url = launch_chromium_session(env, chrome_dir, crawl_id)
  654. yield chrome_launch_process, cdp_url
  655. finally:
  656. if chrome_launch_process:
  657. kill_chromium_session(chrome_launch_process, chrome_dir)
  658. # =============================================================================
  659. # Tab-based Test Helpers
  660. # Used by tab-based tests (infiniscroll, modalcloser)
  661. # =============================================================================
  662. def cleanup_chrome(chrome_launch_process: subprocess.Popen, chrome_pid: int, chrome_dir: Optional[Path] = None) -> None:
  663. """Clean up Chrome processes using chrome_utils.js killChrome.
  664. Uses the centralized kill logic from chrome_utils.js which handles:
  665. - SIGTERM then SIGKILL
  666. - Process group killing
  667. - Zombie process cleanup
  668. Args:
  669. chrome_launch_process: The Popen object for the chrome launch hook
  670. chrome_pid: The PID of the Chrome process
  671. chrome_dir: Optional path to chrome output directory
  672. """
  673. # First try to terminate the launch process gracefully
  674. try:
  675. chrome_launch_process.send_signal(signal.SIGTERM)
  676. chrome_launch_process.wait(timeout=5)
  677. except Exception:
  678. pass
  679. # Use JS to kill Chrome with proper process group handling
  680. kill_chrome(chrome_pid, str(chrome_dir) if chrome_dir else None)
  681. @contextmanager
  682. def chrome_session(
  683. tmpdir: Path,
  684. crawl_id: str = 'test-crawl',
  685. snapshot_id: str = 'test-snapshot',
  686. test_url: str = 'about:blank',
  687. navigate: bool = True,
  688. timeout: int = 15,
  689. ):
  690. """Context manager for Chrome sessions with automatic cleanup.
  691. Creates the directory structure, launches Chrome, creates a tab,
  692. and optionally navigates to the test URL. Automatically cleans up
  693. Chrome on exit.
  694. Usage:
  695. with chrome_session(tmpdir, test_url='https://example.com') as (process, pid, chrome_dir, env):
  696. # Run tests with chrome session
  697. pass
  698. # Chrome automatically cleaned up
  699. Args:
  700. tmpdir: Temporary directory for test files
  701. crawl_id: ID to use for the crawl
  702. snapshot_id: ID to use for the snapshot
  703. test_url: URL to navigate to (if navigate=True)
  704. navigate: Whether to navigate to the URL after creating tab
  705. timeout: Seconds to wait for Chrome to start
  706. Yields:
  707. Tuple of (chrome_launch_process, chrome_pid, snapshot_chrome_dir, env)
  708. Raises:
  709. RuntimeError: If Chrome fails to start or tab creation fails
  710. """
  711. chrome_launch_process = None
  712. chrome_pid = None
  713. try:
  714. # Create proper directory structure in tmpdir
  715. machine = platform.machine().lower()
  716. system = platform.system().lower()
  717. if machine in ('arm64', 'aarch64'):
  718. machine = 'arm64'
  719. elif machine in ('x86_64', 'amd64'):
  720. machine = 'x86_64'
  721. machine_type = f"{machine}-{system}"
  722. data_dir = Path(tmpdir) / 'data'
  723. lib_dir = data_dir / 'lib' / machine_type
  724. npm_dir = lib_dir / 'npm'
  725. node_modules_dir = npm_dir / 'node_modules'
  726. puppeteer_cache_dir = lib_dir / 'puppeteer'
  727. # Create lib structure for puppeteer installation
  728. node_modules_dir.mkdir(parents=True, exist_ok=True)
  729. # Create crawl and snapshot directories
  730. crawl_dir = Path(tmpdir) / 'crawl'
  731. crawl_dir.mkdir(exist_ok=True)
  732. chrome_dir = crawl_dir / 'chrome'
  733. chrome_dir.mkdir(exist_ok=True)
  734. # Build env with tmpdir-specific paths
  735. env = os.environ.copy()
  736. env.update({
  737. 'DATA_DIR': str(data_dir),
  738. 'LIB_DIR': str(lib_dir),
  739. 'MACHINE_TYPE': machine_type,
  740. 'NODE_MODULES_DIR': str(node_modules_dir),
  741. 'NODE_PATH': str(node_modules_dir),
  742. 'NPM_BIN_DIR': str(npm_dir / '.bin'),
  743. 'CHROME_HEADLESS': 'true',
  744. 'PUPPETEER_CACHE_DIR': str(puppeteer_cache_dir),
  745. })
  746. # Reuse system Puppeteer cache to avoid redundant Chromium downloads
  747. link_puppeteer_cache(lib_dir)
  748. # Install Chromium via npm + puppeteer hooks using normal Binary flow
  749. install_chromium_with_hooks(env)
  750. # Launch Chrome at crawl level
  751. chrome_launch_process = subprocess.Popen(
  752. ['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'],
  753. cwd=str(chrome_dir),
  754. stdout=subprocess.PIPE,
  755. stderr=subprocess.PIPE,
  756. text=True,
  757. env=env
  758. )
  759. # Wait for Chrome to launch
  760. for i in range(timeout):
  761. if chrome_launch_process.poll() is not None:
  762. stdout, stderr = chrome_launch_process.communicate()
  763. raise RuntimeError(f"Chrome launch failed:\nStdout: {stdout}\nStderr: {stderr}")
  764. if (chrome_dir / 'cdp_url.txt').exists():
  765. break
  766. time.sleep(1)
  767. if not (chrome_dir / 'cdp_url.txt').exists():
  768. raise RuntimeError(f"Chrome CDP URL not found after {timeout}s")
  769. chrome_pid = int((chrome_dir / 'chrome.pid').read_text().strip())
  770. # Create snapshot directory structure
  771. snapshot_dir = Path(tmpdir) / 'snapshot'
  772. snapshot_dir.mkdir(exist_ok=True)
  773. snapshot_chrome_dir = snapshot_dir / 'chrome'
  774. snapshot_chrome_dir.mkdir(exist_ok=True)
  775. # Create tab
  776. tab_env = env.copy()
  777. tab_env['CRAWL_OUTPUT_DIR'] = str(crawl_dir)
  778. try:
  779. result = subprocess.run(
  780. ['node', str(CHROME_TAB_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}', f'--crawl-id={crawl_id}'],
  781. cwd=str(snapshot_chrome_dir),
  782. capture_output=True,
  783. text=True,
  784. timeout=60,
  785. env=tab_env
  786. )
  787. if result.returncode != 0:
  788. cleanup_chrome(chrome_launch_process, chrome_pid)
  789. raise RuntimeError(f"Tab creation failed: {result.stderr}")
  790. except subprocess.TimeoutExpired:
  791. cleanup_chrome(chrome_launch_process, chrome_pid)
  792. raise RuntimeError("Tab creation timed out after 60s")
  793. # Navigate to URL if requested
  794. if navigate and CHROME_NAVIGATE_HOOK and test_url != 'about:blank':
  795. try:
  796. result = subprocess.run(
  797. ['node', str(CHROME_NAVIGATE_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
  798. cwd=str(snapshot_chrome_dir),
  799. capture_output=True,
  800. text=True,
  801. timeout=120,
  802. env=env
  803. )
  804. if result.returncode != 0:
  805. cleanup_chrome(chrome_launch_process, chrome_pid)
  806. raise RuntimeError(f"Navigation failed: {result.stderr}")
  807. except subprocess.TimeoutExpired:
  808. cleanup_chrome(chrome_launch_process, chrome_pid)
  809. raise RuntimeError("Navigation timed out after 120s")
  810. yield chrome_launch_process, chrome_pid, snapshot_chrome_dir, env
  811. finally:
  812. if chrome_launch_process and chrome_pid:
  813. cleanup_chrome(chrome_launch_process, chrome_pid)