test_chrome.py 25 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722
  1. """
  2. Integration tests for chrome plugin
  3. Tests verify:
  4. 1. Chromium install via @puppeteer/browsers
  5. 2. Verify deps with abx-pkg
  6. 3. Chrome hooks exist
  7. 4. Chromium launches at crawl level
  8. 5. Tab creation at snapshot level
  9. 6. Tab navigation works
  10. 7. Tab cleanup on SIGTERM
  11. 8. Chromium cleanup on crawl end
  12. NOTE: We use Chromium instead of Chrome because Chrome 137+ removed support for
  13. --load-extension and --disable-extensions-except flags, which are needed for
  14. loading unpacked extensions in headless mode.
  15. """
  16. import json
  17. import os
  18. import signal
  19. import subprocess
  20. import sys
  21. import time
  22. from pathlib import Path
  23. import pytest
  24. import tempfile
  25. import shutil
  26. import platform
  27. from archivebox.plugins.chrome.tests.chrome_test_helpers import (
  28. get_test_env,
  29. find_chromium_binary,
  30. install_chromium_with_hooks,
  31. CHROME_PLUGIN_DIR as PLUGIN_DIR,
  32. CHROME_LAUNCH_HOOK,
  33. CHROME_TAB_HOOK,
  34. CHROME_NAVIGATE_HOOK,
  35. )
  36. def _get_cookies_via_cdp(port: int, env: dict) -> list[dict]:
  37. node_script = r"""
  38. const http = require('http');
  39. const WebSocket = require('ws');
  40. const port = process.env.CDP_PORT;
  41. function getTargets() {
  42. return new Promise((resolve, reject) => {
  43. const req = http.get(`http://127.0.0.1:${port}/json/list`, (res) => {
  44. let data = '';
  45. res.on('data', (chunk) => (data += chunk));
  46. res.on('end', () => {
  47. try {
  48. resolve(JSON.parse(data));
  49. } catch (e) {
  50. reject(e);
  51. }
  52. });
  53. });
  54. req.on('error', reject);
  55. });
  56. }
  57. (async () => {
  58. const targets = await getTargets();
  59. const pageTarget = targets.find(t => t.type === 'page') || targets[0];
  60. if (!pageTarget) {
  61. console.error('No page target found');
  62. process.exit(2);
  63. }
  64. const ws = new WebSocket(pageTarget.webSocketDebuggerUrl);
  65. const timer = setTimeout(() => {
  66. console.error('Timeout waiting for cookies');
  67. process.exit(3);
  68. }, 10000);
  69. ws.on('open', () => {
  70. ws.send(JSON.stringify({ id: 1, method: 'Network.getAllCookies' }));
  71. });
  72. ws.on('message', (data) => {
  73. const msg = JSON.parse(data);
  74. if (msg.id === 1) {
  75. clearTimeout(timer);
  76. ws.close();
  77. if (!msg.result || !msg.result.cookies) {
  78. console.error('No cookies in response');
  79. process.exit(4);
  80. }
  81. process.stdout.write(JSON.stringify(msg.result.cookies));
  82. process.exit(0);
  83. }
  84. });
  85. ws.on('error', (err) => {
  86. console.error(String(err));
  87. process.exit(5);
  88. });
  89. })().catch((err) => {
  90. console.error(String(err));
  91. process.exit(1);
  92. });
  93. """
  94. result = subprocess.run(
  95. ['node', '-e', node_script],
  96. capture_output=True,
  97. text=True,
  98. timeout=30,
  99. env=env | {'CDP_PORT': str(port)},
  100. )
  101. assert result.returncode == 0, f"Failed to read cookies via CDP: {result.stderr}\nStdout: {result.stdout}"
  102. return json.loads(result.stdout or '[]')
  103. @pytest.fixture(scope="session", autouse=True)
  104. def ensure_chromium_and_puppeteer_installed(tmp_path_factory):
  105. """Ensure Chromium and puppeteer are installed before running tests."""
  106. if not os.environ.get('DATA_DIR'):
  107. test_data_dir = tmp_path_factory.mktemp('chrome_test_data')
  108. os.environ['DATA_DIR'] = str(test_data_dir)
  109. env = get_test_env()
  110. try:
  111. chromium_binary = install_chromium_with_hooks(env)
  112. except RuntimeError as e:
  113. raise RuntimeError(str(e))
  114. if not chromium_binary:
  115. raise RuntimeError("Chromium not found after install")
  116. os.environ['CHROME_BINARY'] = chromium_binary
  117. for key in ('NODE_MODULES_DIR', 'NODE_PATH', 'PATH'):
  118. if env.get(key):
  119. os.environ[key] = env[key]
  120. def test_hook_scripts_exist():
  121. """Verify chrome hooks exist."""
  122. assert CHROME_LAUNCH_HOOK.exists(), f"Hook not found: {CHROME_LAUNCH_HOOK}"
  123. assert CHROME_TAB_HOOK.exists(), f"Hook not found: {CHROME_TAB_HOOK}"
  124. assert CHROME_NAVIGATE_HOOK.exists(), f"Hook not found: {CHROME_NAVIGATE_HOOK}"
  125. def test_verify_chromium_available():
  126. """Verify Chromium is available via CHROME_BINARY env var."""
  127. chromium_binary = os.environ.get('CHROME_BINARY') or find_chromium_binary()
  128. assert chromium_binary, "Chromium binary should be available (set by fixture or found)"
  129. assert Path(chromium_binary).exists(), f"Chromium binary should exist at {chromium_binary}"
  130. # Verify it's actually Chromium by checking version
  131. result = subprocess.run(
  132. [chromium_binary, '--version'],
  133. capture_output=True,
  134. text=True,
  135. timeout=10
  136. )
  137. assert result.returncode == 0, f"Failed to get Chromium version: {result.stderr}"
  138. assert 'Chromium' in result.stdout or 'Chrome' in result.stdout, f"Unexpected version output: {result.stdout}"
  139. def test_chrome_launch_and_tab_creation():
  140. """Integration test: Launch Chrome at crawl level and create tab at snapshot level."""
  141. with tempfile.TemporaryDirectory() as tmpdir:
  142. crawl_dir = Path(tmpdir) / 'crawl'
  143. crawl_dir.mkdir()
  144. chrome_dir = crawl_dir / 'chrome'
  145. chrome_dir.mkdir()
  146. # Get test environment with NODE_MODULES_DIR set
  147. env = get_test_env()
  148. env['CHROME_HEADLESS'] = 'true'
  149. # Launch Chrome at crawl level (background process)
  150. chrome_launch_process = subprocess.Popen(
  151. ['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-crawl-123'],
  152. cwd=str(chrome_dir),
  153. stdout=subprocess.PIPE,
  154. stderr=subprocess.PIPE,
  155. text=True,
  156. env=env
  157. )
  158. # Wait for Chrome to launch (check process isn't dead and files exist)
  159. for i in range(15): # Wait up to 15 seconds for Chrome to start
  160. if chrome_launch_process.poll() is not None:
  161. stdout, stderr = chrome_launch_process.communicate()
  162. pytest.fail(f"Chrome launch process exited early:\nStdout: {stdout}\nStderr: {stderr}")
  163. if (chrome_dir / 'cdp_url.txt').exists():
  164. break
  165. time.sleep(1)
  166. # Verify Chrome launch outputs - if it failed, get the error from the process
  167. if not (chrome_dir / 'cdp_url.txt').exists():
  168. # Try to get output from the process
  169. try:
  170. stdout, stderr = chrome_launch_process.communicate(timeout=1)
  171. except subprocess.TimeoutExpired:
  172. # Process still running, try to read available output
  173. stdout = stderr = "(process still running)"
  174. # Check what files exist
  175. if chrome_dir.exists():
  176. files = list(chrome_dir.iterdir())
  177. # Check if Chrome process is still alive
  178. if (chrome_dir / 'chrome.pid').exists():
  179. chrome_pid = int((chrome_dir / 'chrome.pid').read_text().strip())
  180. try:
  181. os.kill(chrome_pid, 0)
  182. chrome_alive = "yes"
  183. except OSError:
  184. chrome_alive = "no"
  185. pytest.fail(f"cdp_url.txt missing after 15s. Chrome dir files: {files}. Chrome process {chrome_pid} alive: {chrome_alive}\nLaunch stdout: {stdout}\nLaunch stderr: {stderr}")
  186. else:
  187. pytest.fail(f"cdp_url.txt missing. Chrome dir exists with files: {files}\nLaunch stdout: {stdout}\nLaunch stderr: {stderr}")
  188. else:
  189. pytest.fail(f"Chrome dir {chrome_dir} doesn't exist\nLaunch stdout: {stdout}\nLaunch stderr: {stderr}")
  190. assert (chrome_dir / 'cdp_url.txt').exists(), "cdp_url.txt should exist"
  191. assert (chrome_dir / 'chrome.pid').exists(), "chrome.pid should exist"
  192. assert (chrome_dir / 'port.txt').exists(), "port.txt should exist"
  193. cdp_url = (chrome_dir / 'cdp_url.txt').read_text().strip()
  194. chrome_pid = int((chrome_dir / 'chrome.pid').read_text().strip())
  195. assert cdp_url.startswith('ws://'), f"CDP URL should be WebSocket URL: {cdp_url}"
  196. assert chrome_pid > 0, "Chrome PID should be valid"
  197. # Verify Chrome process is running
  198. try:
  199. os.kill(chrome_pid, 0)
  200. except OSError:
  201. pytest.fail(f"Chrome process {chrome_pid} is not running")
  202. # Create snapshot directory and tab
  203. snapshot_dir = Path(tmpdir) / 'snapshot1'
  204. snapshot_dir.mkdir()
  205. snapshot_chrome_dir = snapshot_dir / 'chrome'
  206. snapshot_chrome_dir.mkdir()
  207. # Launch tab at snapshot level
  208. env['CRAWL_OUTPUT_DIR'] = str(crawl_dir)
  209. result = subprocess.run(
  210. ['node', str(CHROME_TAB_HOOK), '--url=https://example.com', '--snapshot-id=snap-123', '--crawl-id=test-crawl-123'],
  211. cwd=str(snapshot_chrome_dir),
  212. capture_output=True,
  213. text=True,
  214. timeout=60,
  215. env=env
  216. )
  217. assert result.returncode == 0, f"Tab creation failed: {result.stderr}\nStdout: {result.stdout}"
  218. # Verify tab creation outputs
  219. assert (snapshot_chrome_dir / 'cdp_url.txt').exists(), "Snapshot cdp_url.txt should exist"
  220. assert (snapshot_chrome_dir / 'target_id.txt').exists(), "target_id.txt should exist"
  221. assert (snapshot_chrome_dir / 'url.txt').exists(), "url.txt should exist"
  222. target_id = (snapshot_chrome_dir / 'target_id.txt').read_text().strip()
  223. assert len(target_id) > 0, "Target ID should not be empty"
  224. # Cleanup: Kill Chrome and launch process
  225. try:
  226. chrome_launch_process.send_signal(signal.SIGTERM)
  227. chrome_launch_process.wait(timeout=5)
  228. except:
  229. pass
  230. try:
  231. os.kill(chrome_pid, signal.SIGKILL)
  232. except OSError:
  233. pass
  234. def test_cookies_imported_on_launch():
  235. """Integration test: COOKIES_TXT_FILE is imported at crawl start."""
  236. with tempfile.TemporaryDirectory() as tmpdir:
  237. crawl_dir = Path(tmpdir) / 'crawl'
  238. crawl_dir.mkdir()
  239. chrome_dir = crawl_dir / 'chrome'
  240. chrome_dir.mkdir()
  241. cookies_file = Path(tmpdir) / 'cookies.txt'
  242. cookies_file.write_text(
  243. '\n'.join([
  244. '# Netscape HTTP Cookie File',
  245. '# https://curl.se/docs/http-cookies.html',
  246. '# This file was generated by a test',
  247. '',
  248. 'example.com\tTRUE\t/\tFALSE\t2147483647\tabx_test_cookie\thello',
  249. '',
  250. ])
  251. )
  252. profile_dir = Path(tmpdir) / 'profile'
  253. env = get_test_env()
  254. env.update({
  255. 'CHROME_HEADLESS': 'true',
  256. 'CHROME_USER_DATA_DIR': str(profile_dir),
  257. 'COOKIES_TXT_FILE': str(cookies_file),
  258. })
  259. chrome_launch_process = subprocess.Popen(
  260. ['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-crawl-cookies'],
  261. cwd=str(chrome_dir),
  262. stdout=subprocess.PIPE,
  263. stderr=subprocess.PIPE,
  264. text=True,
  265. env=env
  266. )
  267. for _ in range(15):
  268. if (chrome_dir / 'port.txt').exists():
  269. break
  270. time.sleep(1)
  271. assert (chrome_dir / 'port.txt').exists(), "port.txt should exist"
  272. chrome_pid = int((chrome_dir / 'chrome.pid').read_text().strip())
  273. port = int((chrome_dir / 'port.txt').read_text().strip())
  274. cookie_found = False
  275. for _ in range(15):
  276. cookies = _get_cookies_via_cdp(port, env)
  277. cookie_found = any(
  278. c.get('name') == 'abx_test_cookie' and c.get('value') == 'hello'
  279. for c in cookies
  280. )
  281. if cookie_found:
  282. break
  283. time.sleep(1)
  284. assert cookie_found, "Imported cookie should be present in Chrome session"
  285. # Cleanup
  286. try:
  287. chrome_launch_process.send_signal(signal.SIGTERM)
  288. chrome_launch_process.wait(timeout=5)
  289. except:
  290. pass
  291. try:
  292. os.kill(chrome_pid, signal.SIGKILL)
  293. except OSError:
  294. pass
  295. def test_chrome_navigation():
  296. """Integration test: Navigate to a URL."""
  297. with tempfile.TemporaryDirectory() as tmpdir:
  298. crawl_dir = Path(tmpdir) / 'crawl'
  299. crawl_dir.mkdir()
  300. chrome_dir = crawl_dir / 'chrome'
  301. chrome_dir.mkdir()
  302. # Launch Chrome (background process)
  303. chrome_launch_process = subprocess.Popen(
  304. ['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-crawl-nav'],
  305. cwd=str(chrome_dir),
  306. stdout=subprocess.PIPE,
  307. stderr=subprocess.PIPE,
  308. text=True,
  309. env=get_test_env() | {'CHROME_HEADLESS': 'true'}
  310. )
  311. # Wait for Chrome to launch
  312. time.sleep(3)
  313. chrome_pid = int((chrome_dir / 'chrome.pid').read_text().strip())
  314. # Create snapshot and tab
  315. snapshot_dir = Path(tmpdir) / 'snapshot1'
  316. snapshot_dir.mkdir()
  317. snapshot_chrome_dir = snapshot_dir / 'chrome'
  318. snapshot_chrome_dir.mkdir()
  319. result = subprocess.run(
  320. ['node', str(CHROME_TAB_HOOK), '--url=https://example.com', '--snapshot-id=snap-nav-123', '--crawl-id=test-crawl-nav'],
  321. cwd=str(snapshot_chrome_dir),
  322. capture_output=True,
  323. text=True,
  324. timeout=60,
  325. env=get_test_env() | {'CRAWL_OUTPUT_DIR': str(crawl_dir), 'CHROME_HEADLESS': 'true'}
  326. )
  327. assert result.returncode == 0, f"Tab creation failed: {result.stderr}"
  328. # Navigate to URL
  329. result = subprocess.run(
  330. ['node', str(CHROME_NAVIGATE_HOOK), '--url=https://example.com', '--snapshot-id=snap-nav-123'],
  331. cwd=str(snapshot_chrome_dir),
  332. capture_output=True,
  333. text=True,
  334. timeout=120,
  335. env=get_test_env() | {'CHROME_PAGELOAD_TIMEOUT': '30', 'CHROME_WAIT_FOR': 'load'}
  336. )
  337. assert result.returncode == 0, f"Navigation failed: {result.stderr}\nStdout: {result.stdout}"
  338. # Verify navigation outputs
  339. assert (snapshot_chrome_dir / 'navigation.json').exists(), "navigation.json should exist"
  340. assert (snapshot_chrome_dir / 'page_loaded.txt').exists(), "page_loaded.txt should exist"
  341. nav_data = json.loads((snapshot_chrome_dir / 'navigation.json').read_text())
  342. assert nav_data.get('status') in [200, 301, 302], f"Should get valid HTTP status: {nav_data}"
  343. assert nav_data.get('finalUrl'), "Should have final URL"
  344. # Cleanup
  345. try:
  346. chrome_launch_process.send_signal(signal.SIGTERM)
  347. chrome_launch_process.wait(timeout=5)
  348. except:
  349. pass
  350. try:
  351. os.kill(chrome_pid, signal.SIGKILL)
  352. except OSError:
  353. pass
  354. def test_tab_cleanup_on_sigterm():
  355. """Integration test: Tab cleanup when receiving SIGTERM."""
  356. with tempfile.TemporaryDirectory() as tmpdir:
  357. crawl_dir = Path(tmpdir) / 'crawl'
  358. crawl_dir.mkdir()
  359. chrome_dir = crawl_dir / 'chrome'
  360. chrome_dir.mkdir()
  361. # Launch Chrome (background process)
  362. chrome_launch_process = subprocess.Popen(
  363. ['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-cleanup'],
  364. cwd=str(chrome_dir),
  365. stdout=subprocess.PIPE,
  366. stderr=subprocess.PIPE,
  367. text=True,
  368. env=get_test_env() | {'CHROME_HEADLESS': 'true'}
  369. )
  370. # Wait for Chrome to launch
  371. time.sleep(3)
  372. chrome_pid = int((chrome_dir / 'chrome.pid').read_text().strip())
  373. # Create snapshot and tab - run in background
  374. snapshot_dir = Path(tmpdir) / 'snapshot1'
  375. snapshot_dir.mkdir()
  376. snapshot_chrome_dir = snapshot_dir / 'chrome'
  377. snapshot_chrome_dir.mkdir()
  378. tab_process = subprocess.Popen(
  379. ['node', str(CHROME_TAB_HOOK), '--url=https://example.com', '--snapshot-id=snap-cleanup', '--crawl-id=test-cleanup'],
  380. cwd=str(snapshot_chrome_dir),
  381. stdout=subprocess.PIPE,
  382. stderr=subprocess.PIPE,
  383. text=True,
  384. env=get_test_env() | {'CRAWL_OUTPUT_DIR': str(crawl_dir), 'CHROME_HEADLESS': 'true'}
  385. )
  386. # Wait for tab to be created
  387. time.sleep(3)
  388. # Send SIGTERM to tab process
  389. tab_process.send_signal(signal.SIGTERM)
  390. stdout, stderr = tab_process.communicate(timeout=10)
  391. assert tab_process.returncode == 0, f"Tab process should exit cleanly: {stderr}"
  392. # Chrome should still be running
  393. try:
  394. os.kill(chrome_pid, 0)
  395. except OSError:
  396. pytest.fail("Chrome should still be running after tab cleanup")
  397. # Cleanup
  398. try:
  399. chrome_launch_process.send_signal(signal.SIGTERM)
  400. chrome_launch_process.wait(timeout=5)
  401. except:
  402. pass
  403. try:
  404. os.kill(chrome_pid, signal.SIGKILL)
  405. except OSError:
  406. pass
  407. def test_multiple_snapshots_share_chrome():
  408. """Integration test: Multiple snapshots share one Chrome instance."""
  409. with tempfile.TemporaryDirectory() as tmpdir:
  410. crawl_dir = Path(tmpdir) / 'crawl'
  411. crawl_dir.mkdir()
  412. chrome_dir = crawl_dir / 'chrome'
  413. chrome_dir.mkdir()
  414. # Launch Chrome at crawl level
  415. chrome_launch_process = subprocess.Popen(
  416. ['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-multi-crawl'],
  417. cwd=str(chrome_dir),
  418. stdout=subprocess.PIPE,
  419. stderr=subprocess.PIPE,
  420. text=True,
  421. env=get_test_env() | {'CHROME_HEADLESS': 'true'}
  422. )
  423. # Wait for Chrome to launch
  424. for i in range(15):
  425. if (chrome_dir / 'cdp_url.txt').exists():
  426. break
  427. time.sleep(1)
  428. chrome_pid = int((chrome_dir / 'chrome.pid').read_text().strip())
  429. crawl_cdp_url = (chrome_dir / 'cdp_url.txt').read_text().strip()
  430. # Create multiple snapshots that share this Chrome
  431. snapshot_dirs = []
  432. target_ids = []
  433. for snap_num in range(3):
  434. snapshot_dir = Path(tmpdir) / f'snapshot{snap_num}'
  435. snapshot_dir.mkdir()
  436. snapshot_chrome_dir = snapshot_dir / 'chrome'
  437. snapshot_chrome_dir.mkdir()
  438. snapshot_dirs.append(snapshot_chrome_dir)
  439. # Create tab for this snapshot
  440. result = subprocess.run(
  441. ['node', str(CHROME_TAB_HOOK), f'--url=https://example.com/{snap_num}', f'--snapshot-id=snap-{snap_num}', '--crawl-id=test-multi-crawl'],
  442. cwd=str(snapshot_chrome_dir),
  443. capture_output=True,
  444. text=True,
  445. timeout=60,
  446. env=get_test_env() | {'CRAWL_OUTPUT_DIR': str(crawl_dir), 'CHROME_HEADLESS': 'true'}
  447. )
  448. assert result.returncode == 0, f"Tab {snap_num} creation failed: {result.stderr}"
  449. # Verify each snapshot has its own target_id but same Chrome PID
  450. assert (snapshot_chrome_dir / 'target_id.txt').exists()
  451. assert (snapshot_chrome_dir / 'cdp_url.txt').exists()
  452. assert (snapshot_chrome_dir / 'chrome.pid').exists()
  453. target_id = (snapshot_chrome_dir / 'target_id.txt').read_text().strip()
  454. snapshot_cdp_url = (snapshot_chrome_dir / 'cdp_url.txt').read_text().strip()
  455. snapshot_pid = int((snapshot_chrome_dir / 'chrome.pid').read_text().strip())
  456. target_ids.append(target_id)
  457. # All snapshots should share same Chrome
  458. assert snapshot_pid == chrome_pid, f"Snapshot {snap_num} should use crawl Chrome PID"
  459. assert snapshot_cdp_url == crawl_cdp_url, f"Snapshot {snap_num} should use crawl CDP URL"
  460. # All target IDs should be unique (different tabs)
  461. assert len(set(target_ids)) == 3, f"All snapshots should have unique tabs: {target_ids}"
  462. # Chrome should still be running with all 3 tabs
  463. try:
  464. os.kill(chrome_pid, 0)
  465. except OSError:
  466. pytest.fail("Chrome should still be running after creating 3 tabs")
  467. # Cleanup
  468. try:
  469. chrome_launch_process.send_signal(signal.SIGTERM)
  470. chrome_launch_process.wait(timeout=5)
  471. except:
  472. pass
  473. try:
  474. os.kill(chrome_pid, signal.SIGKILL)
  475. except OSError:
  476. pass
  477. def test_chrome_cleanup_on_crawl_end():
  478. """Integration test: Chrome cleanup at end of crawl."""
  479. with tempfile.TemporaryDirectory() as tmpdir:
  480. crawl_dir = Path(tmpdir) / 'crawl'
  481. crawl_dir.mkdir()
  482. chrome_dir = crawl_dir / 'chrome'
  483. chrome_dir.mkdir()
  484. # Launch Chrome in background
  485. chrome_launch_process = subprocess.Popen(
  486. ['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-crawl-end'],
  487. cwd=str(chrome_dir),
  488. stdout=subprocess.PIPE,
  489. stderr=subprocess.PIPE,
  490. text=True,
  491. env=get_test_env() | {'CHROME_HEADLESS': 'true'}
  492. )
  493. # Wait for Chrome to launch
  494. time.sleep(3)
  495. # Verify Chrome is running
  496. assert (chrome_dir / 'chrome.pid').exists(), "Chrome PID file should exist"
  497. chrome_pid = int((chrome_dir / 'chrome.pid').read_text().strip())
  498. try:
  499. os.kill(chrome_pid, 0)
  500. except OSError:
  501. pytest.fail("Chrome should be running")
  502. # Send SIGTERM to chrome launch process
  503. chrome_launch_process.send_signal(signal.SIGTERM)
  504. stdout, stderr = chrome_launch_process.communicate(timeout=10)
  505. # Wait for cleanup
  506. time.sleep(3)
  507. # Verify Chrome process is killed
  508. try:
  509. os.kill(chrome_pid, 0)
  510. pytest.fail("Chrome should be killed after SIGTERM")
  511. except OSError:
  512. # Expected - Chrome should be dead
  513. pass
def test_zombie_prevention_hook_killed():
    """Integration test: Chrome is killed even if hook process is SIGKILL'd.

    Scenario: the launch-hook node process dies with SIGKILL (no chance to run
    its own cleanup), leaving Chrome orphaned. The test then replays the
    pid-file-based cleanup sequence that Crawl.cleanup() uses and verifies
    Chrome is dead afterwards.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        crawl_dir = Path(tmpdir) / 'crawl'
        crawl_dir.mkdir()
        chrome_dir = crawl_dir / 'chrome'
        chrome_dir.mkdir()
        # Launch Chrome
        chrome_launch_process = subprocess.Popen(
            ['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-zombie'],
            cwd=str(chrome_dir),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            env=get_test_env() | {'CHROME_HEADLESS': 'true'}
        )
        # Wait for Chrome to launch (up to 15s for the hook to write chrome.pid)
        for i in range(15):
            if (chrome_dir / 'chrome.pid').exists():
                break
            time.sleep(1)
        assert (chrome_dir / 'chrome.pid').exists(), "Chrome PID file should exist"
        chrome_pid = int((chrome_dir / 'chrome.pid').read_text().strip())
        hook_pid = chrome_launch_process.pid  # Use the Popen process PID instead of hook.pid file
        # Verify both Chrome and hook are running (signal 0 = existence probe)
        try:
            os.kill(chrome_pid, 0)
            os.kill(hook_pid, 0)
        except OSError:
            pytest.fail("Both Chrome and hook should be running")
        # Simulate hook getting SIGKILL'd (can't cleanup)
        os.kill(hook_pid, signal.SIGKILL)
        time.sleep(1)
        # Chrome should still be running (orphaned)
        try:
            os.kill(chrome_pid, 0)
        except OSError:
            pytest.fail("Chrome should still be running after hook SIGKILL")
        # Simulate Crawl.cleanup() using the actual cleanup logic
        def is_process_alive(pid: int) -> bool:
            """Check if a process exists."""
            try:
                os.kill(pid, 0)
                return True
            except (OSError, ProcessLookupError):
                return False
        # Sweep every *.pid file under the chrome dir, exactly as the real
        # cleanup does: SIGTERM first, grace period, then SIGKILL.
        for pid_file in chrome_dir.glob('**/*.pid'):
            try:
                pid = int(pid_file.read_text().strip())
                # Step 1: SIGTERM for graceful shutdown
                try:
                    try:
                        # killpg uses pid as the process-group id — assumes the
                        # hook launched Chrome as a group leader (TODO confirm)
                        os.killpg(pid, signal.SIGTERM)
                    except (OSError, ProcessLookupError):
                        # Fall back to signalling the single process
                        os.kill(pid, signal.SIGTERM)
                except ProcessLookupError:
                    # Already gone: drop the stale pid file and move on
                    pid_file.unlink(missing_ok=True)
                    continue
                # Step 2: Wait for graceful shutdown
                time.sleep(2)
                # Step 3: Check if still alive
                if not is_process_alive(pid):
                    pid_file.unlink(missing_ok=True)
                    continue
                # Step 4: Force kill ENTIRE process group with SIGKILL
                try:
                    try:
                        # Always kill entire process group with SIGKILL
                        os.killpg(pid, signal.SIGKILL)
                    except (OSError, ProcessLookupError):
                        os.kill(pid, signal.SIGKILL)
                except ProcessLookupError:
                    pid_file.unlink(missing_ok=True)
                    continue
                # Step 5: Wait and verify death
                time.sleep(1)
                if not is_process_alive(pid):
                    pid_file.unlink(missing_ok=True)
            except (ValueError, OSError):
                # Unparseable pid file or kill failure: best-effort, keep sweeping
                pass
        # Chrome should now be dead
        try:
            os.kill(chrome_pid, 0)
            pytest.fail("Chrome should be killed after cleanup")
        except OSError:
            # Expected - Chrome is dead
            pass
  601. if __name__ == '__main__':
  602. pytest.main([__file__, '-v'])