test_singlefile.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304
  1. """
  2. Integration tests for singlefile plugin
  3. Tests verify:
  4. 1. Hook scripts exist with correct naming
  5. 2. CLI-based singlefile extraction works
  6. 3. Dependencies available via abx-pkg
  7. 4. Output contains valid HTML
  8. 5. Connects to Chrome session via CDP when available
  9. 6. Works with extensions loaded (ublock, etc.)
  10. """
  11. import json
  12. import os
  13. import subprocess
  14. import sys
  15. import tempfile
  16. from pathlib import Path
  17. import pytest
  18. from archivebox.plugins.chrome.tests.chrome_test_helpers import (
  19. get_test_env,
  20. get_plugin_dir,
  21. get_hook_script,
  22. chrome_session,
  23. cleanup_chrome,
  24. )
# Plugin directory, derived by the shared helper from this test file's path.
PLUGIN_DIR = get_plugin_dir(__file__)
# Snapshot hook script matched by glob inside the plugin directory
# (test_snapshot_hook_exists checks it against None, so a miss is presumably None).
SNAPSHOT_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_singlefile.py')
# Node.js script the tests below run to install the SingleFile extension.
INSTALL_SCRIPT = PLUGIN_DIR / 'on_Crawl__82_singlefile_install.js'
# URL passed to the Chrome session helper and to the snapshot hook in every test.
TEST_URL = "https://example.com"
  29. def test_snapshot_hook_exists():
  30. """Verify snapshot extraction hook exists"""
  31. assert SNAPSHOT_HOOK is not None and SNAPSHOT_HOOK.exists(), f"Snapshot hook not found in {PLUGIN_DIR}"
  32. def test_snapshot_hook_priority():
  33. """Test that snapshot hook has correct priority (50)"""
  34. filename = SNAPSHOT_HOOK.name
  35. assert "50" in filename, "SingleFile snapshot hook should have priority 50"
  36. assert filename.startswith("on_Snapshot__50_"), "Should follow priority naming convention"
  37. def test_verify_deps_with_abx_pkg():
  38. """Verify dependencies are available via abx-pkg."""
  39. from abx_pkg import Binary, EnvProvider
  40. EnvProvider.model_rebuild()
  41. # Verify node is available
  42. node_binary = Binary(name='node', binproviders=[EnvProvider()])
  43. node_loaded = node_binary.load()
  44. assert node_loaded and node_loaded.abspath, "Node.js required for singlefile plugin"
  45. def test_singlefile_cli_archives_example_com():
  46. """Test that singlefile archives example.com and produces valid HTML."""
  47. with tempfile.TemporaryDirectory() as tmpdir:
  48. tmpdir = Path(tmpdir)
  49. data_dir = tmpdir / 'data'
  50. extensions_dir = data_dir / 'personas' / 'Default' / 'chrome_extensions'
  51. downloads_dir = data_dir / 'personas' / 'Default' / 'chrome_downloads'
  52. user_data_dir = data_dir / 'personas' / 'Default' / 'chrome_user_data'
  53. extensions_dir.mkdir(parents=True, exist_ok=True)
  54. downloads_dir.mkdir(parents=True, exist_ok=True)
  55. user_data_dir.mkdir(parents=True, exist_ok=True)
  56. env_install = os.environ.copy()
  57. env_install.update({
  58. 'DATA_DIR': str(data_dir),
  59. 'CHROME_EXTENSIONS_DIR': str(extensions_dir),
  60. 'CHROME_DOWNLOADS_DIR': str(downloads_dir),
  61. })
  62. result = subprocess.run(
  63. ['node', str(INSTALL_SCRIPT)],
  64. capture_output=True,
  65. text=True,
  66. env=env_install,
  67. timeout=120,
  68. )
  69. assert result.returncode == 0, f"Extension install failed: {result.stderr}"
  70. old_env = os.environ.copy()
  71. os.environ['CHROME_USER_DATA_DIR'] = str(user_data_dir)
  72. os.environ['CHROME_DOWNLOADS_DIR'] = str(downloads_dir)
  73. os.environ['CHROME_EXTENSIONS_DIR'] = str(extensions_dir)
  74. try:
  75. with chrome_session(
  76. tmpdir=tmpdir,
  77. crawl_id='singlefile-cli-crawl',
  78. snapshot_id='singlefile-cli-snap',
  79. test_url=TEST_URL,
  80. navigate=True,
  81. timeout=30,
  82. ) as (_chrome_proc, _chrome_pid, snapshot_chrome_dir, env):
  83. env['SINGLEFILE_ENABLED'] = 'true'
  84. env['CHROME_EXTENSIONS_DIR'] = str(extensions_dir)
  85. env['CHROME_DOWNLOADS_DIR'] = str(downloads_dir)
  86. singlefile_output_dir = snapshot_chrome_dir.parent / 'singlefile'
  87. singlefile_output_dir.mkdir(parents=True, exist_ok=True)
  88. # Run singlefile snapshot hook
  89. result = subprocess.run(
  90. [sys.executable, str(SNAPSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=test789'],
  91. cwd=singlefile_output_dir,
  92. capture_output=True,
  93. text=True,
  94. env=env,
  95. timeout=120,
  96. )
  97. finally:
  98. os.environ.clear()
  99. os.environ.update(old_env)
  100. assert result.returncode == 0, f"Hook execution failed: {result.stderr}"
  101. # Verify output file exists
  102. output_file = singlefile_output_dir / 'singlefile.html'
  103. assert output_file.exists(), f"singlefile.html not created. stdout: {result.stdout}, stderr: {result.stderr}"
  104. # Verify it contains real HTML
  105. html_content = output_file.read_text()
  106. assert len(html_content) > 500, "Output file too small to be valid HTML"
  107. assert '<!DOCTYPE html>' in html_content or '<html' in html_content, "Output should contain HTML doctype or html tag"
  108. assert 'Example Domain' in html_content, "Output should contain example.com content"
  109. def test_singlefile_with_chrome_session():
  110. """Test singlefile connects to existing Chrome session via CDP.
  111. When a Chrome session exists (chrome/cdp_url.txt), singlefile should
  112. connect to it instead of launching a new Chrome instance.
  113. """
  114. with tempfile.TemporaryDirectory() as tmpdir:
  115. tmpdir = Path(tmpdir)
  116. # Set up Chrome session using shared helper
  117. with chrome_session(
  118. tmpdir=tmpdir,
  119. crawl_id='singlefile-test-crawl',
  120. snapshot_id='singlefile-test-snap',
  121. test_url=TEST_URL,
  122. navigate=False, # Don't navigate, singlefile will do that
  123. timeout=20,
  124. ) as (chrome_launch_process, chrome_pid, snapshot_chrome_dir, env):
  125. # singlefile looks for ../chrome/cdp_url.txt relative to cwd
  126. # So we need to run from a directory that has ../chrome pointing to our chrome dir
  127. singlefile_output_dir = tmpdir / 'snapshot' / 'singlefile'
  128. singlefile_output_dir.mkdir(parents=True, exist_ok=True)
  129. # Create symlink so singlefile can find the chrome session
  130. chrome_link = singlefile_output_dir.parent / 'chrome'
  131. if not chrome_link.exists():
  132. chrome_link.symlink_to(tmpdir / 'crawl' / 'chrome')
  133. # Use env from chrome_session
  134. env['SINGLEFILE_ENABLED'] = 'true'
  135. # Run singlefile - it should find and use the existing Chrome session
  136. result = subprocess.run(
  137. [sys.executable, str(SNAPSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=singlefile-test-snap'],
  138. cwd=str(singlefile_output_dir),
  139. capture_output=True,
  140. text=True,
  141. env=env,
  142. timeout=120
  143. )
  144. # Verify output
  145. output_file = singlefile_output_dir / 'singlefile.html'
  146. if output_file.exists():
  147. html_content = output_file.read_text()
  148. assert len(html_content) > 500, "Output file too small"
  149. assert 'Example Domain' in html_content, "Should contain example.com content"
  150. else:
  151. # If singlefile couldn't connect to Chrome, it may have failed
  152. # Check if it mentioned browser-server in its args (indicating it tried to use CDP)
  153. assert result.returncode == 0 or 'browser-server' in result.stderr or 'cdp' in result.stderr.lower(), \
  154. f"Singlefile should attempt CDP connection. stderr: {result.stderr}"
  155. def test_singlefile_with_extension_uses_existing_chrome():
  156. """Test SingleFile uses the Chrome extension via existing session (CLI fallback disabled)."""
  157. with tempfile.TemporaryDirectory() as tmpdir:
  158. tmpdir = Path(tmpdir)
  159. data_dir = tmpdir / 'data'
  160. extensions_dir = data_dir / 'personas' / 'Default' / 'chrome_extensions'
  161. downloads_dir = data_dir / 'personas' / 'Default' / 'chrome_downloads'
  162. user_data_dir = data_dir / 'personas' / 'Default' / 'chrome_user_data'
  163. extensions_dir.mkdir(parents=True, exist_ok=True)
  164. downloads_dir.mkdir(parents=True, exist_ok=True)
  165. user_data_dir.mkdir(parents=True, exist_ok=True)
  166. env_install = os.environ.copy()
  167. env_install.update({
  168. 'DATA_DIR': str(data_dir),
  169. 'CHROME_EXTENSIONS_DIR': str(extensions_dir),
  170. 'CHROME_DOWNLOADS_DIR': str(downloads_dir),
  171. })
  172. # Install SingleFile extension cache before launching Chrome
  173. result = subprocess.run(
  174. ['node', str(INSTALL_SCRIPT)],
  175. capture_output=True,
  176. text=True,
  177. env=env_install,
  178. timeout=120
  179. )
  180. assert result.returncode == 0, f"Extension install failed: {result.stderr}"
  181. # Launch Chrome session with extensions loaded
  182. old_env = os.environ.copy()
  183. os.environ['CHROME_USER_DATA_DIR'] = str(user_data_dir)
  184. os.environ['CHROME_DOWNLOADS_DIR'] = str(downloads_dir)
  185. os.environ['CHROME_EXTENSIONS_DIR'] = str(extensions_dir)
  186. try:
  187. with chrome_session(
  188. tmpdir=tmpdir,
  189. crawl_id='singlefile-ext-crawl',
  190. snapshot_id='singlefile-ext-snap',
  191. test_url=TEST_URL,
  192. navigate=True,
  193. timeout=30,
  194. ) as (_chrome_proc, _chrome_pid, snapshot_chrome_dir, env):
  195. singlefile_output_dir = tmpdir / 'snapshot' / 'singlefile'
  196. singlefile_output_dir.mkdir(parents=True, exist_ok=True)
  197. # Ensure ../chrome points to snapshot chrome session (contains target_id.txt)
  198. chrome_dir = singlefile_output_dir.parent / 'chrome'
  199. if not chrome_dir.exists():
  200. chrome_dir.symlink_to(snapshot_chrome_dir)
  201. env['SINGLEFILE_ENABLED'] = 'true'
  202. env['SINGLEFILE_BINARY'] = '/nonexistent/single-file' # force extension path
  203. env['CHROME_EXTENSIONS_DIR'] = str(extensions_dir)
  204. env['CHROME_DOWNLOADS_DIR'] = str(downloads_dir)
  205. env['CHROME_HEADLESS'] = 'false'
  206. # Track downloads dir state before run to ensure file is created then moved out
  207. downloads_before = set(downloads_dir.glob('*.html'))
  208. downloads_mtime_before = downloads_dir.stat().st_mtime_ns
  209. result = subprocess.run(
  210. [sys.executable, str(SNAPSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=singlefile-ext-snap'],
  211. cwd=str(singlefile_output_dir),
  212. capture_output=True,
  213. text=True,
  214. env=env,
  215. timeout=120
  216. )
  217. assert result.returncode == 0, f"SingleFile extension run failed: {result.stderr}"
  218. output_file = singlefile_output_dir / 'singlefile.html'
  219. assert output_file.exists(), f"singlefile.html not created. stdout: {result.stdout}, stderr: {result.stderr}"
  220. html_content = output_file.read_text(errors='ignore')
  221. assert 'Example Domain' in html_content, "Output should contain example.com content"
  222. # Verify download moved out of downloads dir
  223. downloads_after = set(downloads_dir.glob('*.html'))
  224. new_downloads = downloads_after - downloads_before
  225. downloads_mtime_after = downloads_dir.stat().st_mtime_ns
  226. assert downloads_mtime_after != downloads_mtime_before, "Downloads dir should be modified during extension save"
  227. assert not new_downloads, f"SingleFile download should be moved out of downloads dir, found: {new_downloads}"
  228. finally:
  229. os.environ.clear()
  230. os.environ.update(old_env)
  231. def test_singlefile_disabled_skips():
  232. """Test that SINGLEFILE_ENABLED=False exits without JSONL."""
  233. with tempfile.TemporaryDirectory() as tmpdir:
  234. tmpdir = Path(tmpdir)
  235. env = get_test_env()
  236. env['SINGLEFILE_ENABLED'] = 'False'
  237. result = subprocess.run(
  238. [sys.executable, str(SNAPSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=test-disabled'],
  239. cwd=tmpdir,
  240. capture_output=True,
  241. text=True,
  242. env=env,
  243. timeout=30
  244. )
  245. assert result.returncode == 0, f"Should exit 0 when disabled: {result.stderr}"
  246. # Should NOT emit JSONL when disabled
  247. jsonl_lines = [line for line in result.stdout.strip().split('\n') if line.strip().startswith('{')]
  248. assert len(jsonl_lines) == 0, f"Should not emit JSONL when disabled, but got: {jsonl_lines}"
# Allow running this module directly: hands this file to pytest in verbose mode.
if __name__ == '__main__':
    pytest.main([__file__, '-v'])