test_screenshot.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454
  1. """
  2. Integration tests for screenshot plugin
  3. Tests verify:
  4. 1. Hook script exists
  5. 2. Dependencies installed via chrome validation hooks
  6. 3. Verify deps with abx-pkg
  7. 4. Screenshot extraction works on https://example.com
  8. 5. JSONL output is correct
  9. 6. Filesystem output is valid PNG image
  10. 7. Config options work
  11. """
  12. import json
  13. import os
  14. import subprocess
  15. import sys
  16. import tempfile
  17. from pathlib import Path
  18. import pytest
  19. from archivebox.plugins.chrome.tests.chrome_test_helpers import (
  20. get_test_env,
  21. get_plugin_dir,
  22. get_hook_script,
  23. run_hook_and_parse,
  24. chrome_session,
  25. LIB_DIR,
  26. NODE_MODULES_DIR,
  27. CHROME_PLUGIN_DIR,
  28. )
  29. # Import chrome test fixture to ensure puppeteer is installed
  30. from archivebox.plugins.chrome.tests.test_chrome import ensure_chromium_and_puppeteer_installed
# Plugin directory for the screenshot plugin, derived from this test file's path
# (presumably the directory that contains the hook scripts; see get_plugin_dir).
PLUGIN_DIR = get_plugin_dir(__file__)
# Screenshot hook script under test, looked up in PLUGIN_DIR by glob pattern.
SCREENSHOT_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_screenshot.*')
# Get Chrome hooks for setting up sessions
CHROME_LAUNCH_HOOK = get_hook_script(CHROME_PLUGIN_DIR, 'on_Crawl__*_chrome_launch.*')
CHROME_TAB_HOOK = get_hook_script(CHROME_PLUGIN_DIR, 'on_Snapshot__*_chrome_tab.*')
CHROME_NAVIGATE_HOOK = get_hook_script(CHROME_PLUGIN_DIR, 'on_Snapshot__*_chrome_navigate.*')
# URL handed to the hook through its --url argument in the tests below.
TEST_URL = 'https://example.com'
  38. def test_hook_script_exists():
  39. """Verify on_Snapshot hook exists."""
  40. assert SCREENSHOT_HOOK.exists(), f"Hook not found: {SCREENSHOT_HOOK}"
  41. def test_verify_deps_with_abx_pkg():
  42. """Verify dependencies are available via abx-pkg after hook installation."""
  43. from abx_pkg import Binary, EnvProvider, BinProviderOverrides
  44. EnvProvider.model_rebuild()
  45. # Verify node is available
  46. node_binary = Binary(name='node', binproviders=[EnvProvider()])
  47. node_loaded = node_binary.load()
  48. assert node_loaded and node_loaded.abspath, "Node.js required for screenshot plugin"
  49. def test_screenshot_with_chrome_session():
  50. """Test multiple screenshot scenarios with one Chrome session to save time."""
  51. with tempfile.TemporaryDirectory() as tmpdir:
  52. test_url = 'https://example.com'
  53. snapshot_id = 'test-screenshot-snap'
  54. try:
  55. with chrome_session(
  56. Path(tmpdir),
  57. crawl_id='test-screenshot-crawl',
  58. snapshot_id=snapshot_id,
  59. test_url=test_url,
  60. navigate=True,
  61. timeout=30,
  62. ) as (chrome_process, chrome_pid, snapshot_chrome_dir, env):
  63. # Scenario 1: Basic screenshot extraction
  64. screenshot_dir = snapshot_chrome_dir.parent / 'screenshot'
  65. screenshot_dir.mkdir()
  66. result = subprocess.run(
  67. ['node', str(SCREENSHOT_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
  68. cwd=str(screenshot_dir),
  69. capture_output=True,
  70. text=True,
  71. timeout=30,
  72. env=env
  73. )
  74. assert result.returncode == 0, f"Screenshot extraction failed:\nStderr: {result.stderr}"
  75. # Parse JSONL output
  76. result_json = None
  77. for line in result.stdout.strip().split('\n'):
  78. line = line.strip()
  79. if line.startswith('{'):
  80. try:
  81. record = json.loads(line)
  82. if record.get('type') == 'ArchiveResult':
  83. result_json = record
  84. break
  85. except json.JSONDecodeError:
  86. pass
  87. assert result_json and result_json['status'] == 'succeeded'
  88. screenshot_file = screenshot_dir / 'screenshot.png'
  89. assert screenshot_file.exists() and screenshot_file.stat().st_size > 1000
  90. assert screenshot_file.read_bytes()[:8] == b'\x89PNG\r\n\x1a\n'
  91. # Scenario 2: Wrong target ID (error case)
  92. screenshot_dir3 = snapshot_chrome_dir.parent / 'screenshot3'
  93. screenshot_dir3.mkdir()
  94. (snapshot_chrome_dir / 'target_id.txt').write_text('nonexistent-target-id')
  95. result = subprocess.run(
  96. ['node', str(SCREENSHOT_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
  97. cwd=str(screenshot_dir3),
  98. capture_output=True,
  99. text=True,
  100. timeout=5,
  101. env=env
  102. )
  103. assert result.returncode != 0
  104. assert 'target' in result.stderr.lower() and 'not found' in result.stderr.lower()
  105. except RuntimeError:
  106. raise
  107. def test_skips_when_staticfile_exists():
  108. """Test that screenshot skips when staticfile extractor already handled the URL."""
  109. with tempfile.TemporaryDirectory() as tmpdir:
  110. data_dir = Path(tmpdir)
  111. snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-skip'
  112. screenshot_dir = snapshot_dir / 'screenshot'
  113. screenshot_dir.mkdir(parents=True)
  114. # Create staticfile output to simulate staticfile extractor already ran
  115. staticfile_dir = snapshot_dir / 'staticfile'
  116. staticfile_dir.mkdir()
  117. (staticfile_dir / 'stdout.log').write_text('{"type":"ArchiveResult","status":"succeeded","output_str":"index.html"}\n')
  118. env = get_test_env()
  119. result = subprocess.run(
  120. ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-skip'],
  121. cwd=str(screenshot_dir),
  122. capture_output=True,
  123. text=True,
  124. timeout=30,
  125. env=env
  126. )
  127. assert result.returncode == 0, f"Should exit successfully: {result.stderr}"
  128. # Should emit skipped status
  129. result_json = None
  130. for line in result.stdout.strip().split('\n'):
  131. line = line.strip()
  132. if line.startswith('{'):
  133. try:
  134. record = json.loads(line)
  135. if record.get('type') == 'ArchiveResult':
  136. result_json = record
  137. break
  138. except json.JSONDecodeError:
  139. pass
  140. assert result_json, "Should have ArchiveResult JSONL output"
  141. assert result_json['status'] == 'skipped', f"Should skip: {result_json}"
  142. def test_config_save_screenshot_false_skips():
  143. """Test that SCREENSHOT_ENABLED=False exits without emitting JSONL."""
  144. import os
  145. # FIRST check what Python sees
  146. print(f"\n[DEBUG PYTHON] NODE_V8_COVERAGE in os.environ: {'NODE_V8_COVERAGE' in os.environ}")
  147. print(f"[DEBUG PYTHON] Value: {os.environ.get('NODE_V8_COVERAGE', 'NOT SET')}")
  148. with tempfile.TemporaryDirectory() as tmpdir:
  149. tmpdir = Path(tmpdir)
  150. env = os.environ.copy()
  151. env['SCREENSHOT_ENABLED'] = 'False'
  152. # Check what's in the copied env
  153. print(f"[DEBUG ENV COPY] NODE_V8_COVERAGE in env: {'NODE_V8_COVERAGE' in env}")
  154. print(f"[DEBUG ENV COPY] Value: {env.get('NODE_V8_COVERAGE', 'NOT SET')}")
  155. result = subprocess.run(
  156. ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=test999'],
  157. cwd=tmpdir,
  158. capture_output=True,
  159. text=True,
  160. env=env,
  161. timeout=30
  162. )
  163. print(f"[DEBUG RESULT] Exit code: {result.returncode}")
  164. print(f"[DEBUG RESULT] Stderr: {result.stderr[:200]}")
  165. assert result.returncode == 0, f"Should exit 0 when feature disabled: {result.stderr}"
  166. # Feature disabled - temporary failure, should NOT emit JSONL
  167. assert 'Skipping' in result.stderr or 'False' in result.stderr, "Should log skip reason to stderr"
  168. # Should NOT emit any JSONL
  169. jsonl_lines = [line for line in result.stdout.strip().split('\n') if line.strip().startswith('{')]
  170. assert len(jsonl_lines) == 0, f"Should not emit JSONL when feature disabled, but got: {jsonl_lines}"
  171. def test_reports_missing_chrome():
  172. """Test that script reports error when Chrome is not found."""
  173. with tempfile.TemporaryDirectory() as tmpdir:
  174. tmpdir = Path(tmpdir)
  175. # Set CHROME_BINARY to nonexistent path
  176. env = get_test_env()
  177. env['CHROME_BINARY'] = '/nonexistent/chrome'
  178. result = subprocess.run(
  179. ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=test123'],
  180. cwd=tmpdir,
  181. capture_output=True,
  182. text=True,
  183. env=env,
  184. timeout=30
  185. )
  186. # Should fail and report missing Chrome
  187. if result.returncode != 0:
  188. combined = result.stdout + result.stderr
  189. assert 'chrome' in combined.lower() or 'browser' in combined.lower() or 'ERROR=' in combined
  190. def test_waits_for_navigation_timeout():
  191. """Test that screenshot waits for navigation.json and times out quickly if missing."""
  192. import time
  193. with tempfile.TemporaryDirectory() as tmpdir:
  194. tmpdir = Path(tmpdir)
  195. # Create chrome directory without navigation.json to trigger timeout
  196. chrome_dir = tmpdir.parent / 'chrome'
  197. chrome_dir.mkdir(parents=True, exist_ok=True)
  198. (chrome_dir / 'cdp_url.txt').write_text('ws://localhost:9222/devtools/browser/test')
  199. (chrome_dir / 'target_id.txt').write_text('test-target-id')
  200. # Intentionally NOT creating navigation.json to test timeout
  201. screenshot_dir = tmpdir / 'screenshot'
  202. screenshot_dir.mkdir()
  203. env = get_test_env()
  204. env['SCREENSHOT_TIMEOUT'] = '2' # Set 2 second timeout
  205. start_time = time.time()
  206. result = subprocess.run(
  207. ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=test-timeout'],
  208. cwd=str(screenshot_dir),
  209. capture_output=True,
  210. text=True,
  211. timeout=5, # Test timeout slightly higher than SCREENSHOT_TIMEOUT
  212. env=env
  213. )
  214. elapsed = time.time() - start_time
  215. # Should fail when navigation.json doesn't appear
  216. assert result.returncode != 0, "Should fail when navigation.json missing"
  217. assert 'not loaded' in result.stderr.lower() or 'navigate' in result.stderr.lower(), f"Should mention navigation timeout: {result.stderr}"
  218. # Should complete within 3s (2s wait + 1s overhead)
  219. assert elapsed < 3, f"Should timeout within 3s, took {elapsed:.1f}s"
  220. def test_config_timeout_honored():
  221. """Test that CHROME_TIMEOUT config is respected."""
  222. import os
  223. with tempfile.TemporaryDirectory() as tmpdir:
  224. tmpdir = Path(tmpdir)
  225. # Set very short timeout
  226. env = os.environ.copy()
  227. env['CHROME_TIMEOUT'] = '5'
  228. result = subprocess.run(
  229. ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=testtimeout'],
  230. cwd=tmpdir,
  231. capture_output=True,
  232. text=True,
  233. env=env,
  234. timeout=30
  235. )
  236. # Should complete (success or fail, but not hang)
  237. assert result.returncode in (0, 1), "Should complete without hanging"
  238. def test_missing_url_argument():
  239. """Test that hook fails gracefully when URL argument is missing."""
  240. with tempfile.TemporaryDirectory() as tmpdir:
  241. tmpdir = Path(tmpdir)
  242. env = get_test_env()
  243. result = subprocess.run(
  244. ['node', str(SCREENSHOT_HOOK), '--snapshot-id=test-missing-url'],
  245. cwd=tmpdir,
  246. capture_output=True,
  247. text=True,
  248. timeout=30,
  249. env=env
  250. )
  251. # Should exit with error
  252. assert result.returncode != 0, "Should fail when URL is missing"
  253. assert 'Usage:' in result.stderr or 'url' in result.stderr.lower()
  254. def test_missing_snapshot_id_argument():
  255. """Test that hook fails gracefully when snapshot-id argument is missing."""
  256. with tempfile.TemporaryDirectory() as tmpdir:
  257. tmpdir = Path(tmpdir)
  258. env = get_test_env()
  259. result = subprocess.run(
  260. ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}'],
  261. cwd=tmpdir,
  262. capture_output=True,
  263. text=True,
  264. timeout=30,
  265. env=env
  266. )
  267. # Should exit with error
  268. assert result.returncode != 0, "Should fail when snapshot-id is missing"
  269. assert 'Usage:' in result.stderr or 'snapshot' in result.stderr.lower()
  270. def test_no_cdp_url_fails():
  271. """Test error when chrome dir exists but no cdp_url.txt."""
  272. with tempfile.TemporaryDirectory() as tmpdir:
  273. tmpdir = Path(tmpdir)
  274. chrome_dir = tmpdir / 'chrome'
  275. chrome_dir.mkdir()
  276. # Create target_id.txt and navigation.json but NOT cdp_url.txt
  277. (chrome_dir / 'target_id.txt').write_text('test-target')
  278. (chrome_dir / 'navigation.json').write_text('{}')
  279. screenshot_dir = tmpdir / 'screenshot'
  280. screenshot_dir.mkdir()
  281. result = subprocess.run(
  282. ['node', str(SCREENSHOT_HOOK), '--url=https://example.com', '--snapshot-id=test'],
  283. cwd=str(screenshot_dir),
  284. capture_output=True,
  285. text=True,
  286. timeout=7,
  287. env=get_test_env()
  288. )
  289. assert result.returncode != 0
  290. assert 'no chrome session' in result.stderr.lower()
  291. def test_no_target_id_fails():
  292. """Test error when cdp_url exists but no target_id.txt."""
  293. with tempfile.TemporaryDirectory() as tmpdir:
  294. tmpdir = Path(tmpdir)
  295. chrome_dir = tmpdir / 'chrome'
  296. chrome_dir.mkdir()
  297. # Create cdp_url.txt and navigation.json but NOT target_id.txt
  298. (chrome_dir / 'cdp_url.txt').write_text('ws://localhost:9222/devtools/browser/test')
  299. (chrome_dir / 'navigation.json').write_text('{}')
  300. screenshot_dir = tmpdir / 'screenshot'
  301. screenshot_dir.mkdir()
  302. result = subprocess.run(
  303. ['node', str(SCREENSHOT_HOOK), '--url=https://example.com', '--snapshot-id=test'],
  304. cwd=str(screenshot_dir),
  305. capture_output=True,
  306. text=True,
  307. timeout=7,
  308. env=get_test_env()
  309. )
  310. assert result.returncode != 0
  311. assert 'target_id.txt' in result.stderr.lower()
  312. def test_invalid_cdp_url_fails():
  313. """Test error with malformed CDP URL."""
  314. with tempfile.TemporaryDirectory() as tmpdir:
  315. tmpdir = Path(tmpdir)
  316. chrome_dir = tmpdir / 'chrome'
  317. chrome_dir.mkdir()
  318. (chrome_dir / 'cdp_url.txt').write_text('invalid-url')
  319. (chrome_dir / 'target_id.txt').write_text('test-target')
  320. (chrome_dir / 'navigation.json').write_text('{}')
  321. screenshot_dir = tmpdir / 'screenshot'
  322. screenshot_dir.mkdir()
  323. result = subprocess.run(
  324. ['node', str(SCREENSHOT_HOOK), '--url=https://example.com', '--snapshot-id=test'],
  325. cwd=str(screenshot_dir),
  326. capture_output=True,
  327. text=True,
  328. timeout=7,
  329. env=get_test_env()
  330. )
  331. assert result.returncode != 0
  332. def test_invalid_timeout_uses_default():
  333. """Test that invalid SCREENSHOT_TIMEOUT falls back to default."""
  334. with tempfile.TemporaryDirectory() as tmpdir:
  335. tmpdir = Path(tmpdir)
  336. chrome_dir = tmpdir / 'chrome'
  337. chrome_dir.mkdir()
  338. # No navigation.json to trigger timeout
  339. (chrome_dir / 'cdp_url.txt').write_text('ws://localhost:9222/test')
  340. (chrome_dir / 'target_id.txt').write_text('test')
  341. screenshot_dir = tmpdir / 'screenshot'
  342. screenshot_dir.mkdir()
  343. env = get_test_env()
  344. env['SCREENSHOT_TIMEOUT'] = 'invalid' # Should fallback to default (10s becomes NaN, treated as 0)
  345. import time
  346. start = time.time()
  347. result = subprocess.run(
  348. ['node', str(SCREENSHOT_HOOK), '--url=https://example.com', '--snapshot-id=test'],
  349. cwd=str(screenshot_dir),
  350. capture_output=True,
  351. text=True,
  352. timeout=5,
  353. env=env
  354. )
  355. elapsed = time.time() - start
  356. # With invalid timeout, parseInt returns NaN, which should be handled
  357. assert result.returncode != 0
  358. assert elapsed < 2 # Should fail quickly, not wait 10s
  359. if __name__ == '__main__':
  360. pytest.main([__file__, '-v'])