test_hooks.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483
  1. #!/usr/bin/env python3
  2. """
  3. Unit tests for the ArchiveBox hook architecture.
  4. Tests hook discovery, execution, JSONL parsing, background hook detection,
  5. binary lookup, and install hook XYZ_BINARY env var handling.
  6. Run with:
  7. sudo -u testuser bash -c 'source .venv/bin/activate && python -m pytest archivebox/tests/test_hooks.py -v'
  8. """
  9. import json
  10. import os
  11. import shutil
  12. import subprocess
  13. import tempfile
  14. import unittest
  15. from pathlib import Path
  16. from unittest.mock import MagicMock, patch
# Set up Django before importing any Django-dependent modules.
# setdefault() only fills in DJANGO_SETTINGS_MODULE when it is not already
# present, so a value exported by the caller's environment is left untouched.
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')
  19. class TestBackgroundHookDetection(unittest.TestCase):
  20. """Test that background hooks are detected by .bg. suffix."""
  21. def test_bg_js_suffix_detected(self):
  22. """Hooks with .bg.js suffix should be detected as background."""
  23. script = Path('/path/to/on_Snapshot__21_consolelog.bg.js')
  24. is_background = '.bg.' in script.name or '__background' in script.stem
  25. self.assertTrue(is_background)
  26. def test_bg_py_suffix_detected(self):
  27. """Hooks with .bg.py suffix should be detected as background."""
  28. script = Path('/path/to/on_Snapshot__24_responses.bg.py')
  29. is_background = '.bg.' in script.name or '__background' in script.stem
  30. self.assertTrue(is_background)
  31. def test_bg_sh_suffix_detected(self):
  32. """Hooks with .bg.sh suffix should be detected as background."""
  33. script = Path('/path/to/on_Snapshot__23_ssl.bg.sh')
  34. is_background = '.bg.' in script.name or '__background' in script.stem
  35. self.assertTrue(is_background)
  36. def test_legacy_background_suffix_detected(self):
  37. """Hooks with __background in stem should be detected (backwards compat)."""
  38. script = Path('/path/to/on_Snapshot__21_consolelog__background.js')
  39. is_background = '.bg.' in script.name or '__background' in script.stem
  40. self.assertTrue(is_background)
  41. def test_foreground_hook_not_detected(self):
  42. """Hooks without .bg. or __background should NOT be detected as background."""
  43. script = Path('/path/to/on_Snapshot__11_favicon.js')
  44. is_background = '.bg.' in script.name or '__background' in script.stem
  45. self.assertFalse(is_background)
  46. def test_foreground_py_hook_not_detected(self):
  47. """Python hooks without .bg. should NOT be detected as background."""
  48. script = Path('/path/to/on_Snapshot__50_wget.py')
  49. is_background = '.bg.' in script.name or '__background' in script.stem
  50. self.assertFalse(is_background)
  51. class TestJSONLParsing(unittest.TestCase):
  52. """Test JSONL parsing in run_hook() output processing."""
  53. def test_parse_clean_jsonl(self):
  54. """Clean JSONL format should be parsed correctly."""
  55. stdout = '{"type": "ArchiveResult", "status": "succeeded", "output_str": "Done"}'
  56. from archivebox.machine.models import Process
  57. records = Process.parse_records_from_text(stdout)
  58. self.assertEqual(len(records), 1)
  59. self.assertEqual(records[0]['type'], 'ArchiveResult')
  60. self.assertEqual(records[0]['status'], 'succeeded')
  61. self.assertEqual(records[0]['output_str'], 'Done')
  62. def test_parse_multiple_jsonl_records(self):
  63. """Multiple JSONL records should all be parsed."""
  64. stdout = '''{"type": "ArchiveResult", "status": "succeeded", "output_str": "Done"}
  65. {"type": "Binary", "name": "wget", "abspath": "/usr/bin/wget"}'''
  66. from archivebox.machine.models import Process
  67. records = Process.parse_records_from_text(stdout)
  68. self.assertEqual(len(records), 2)
  69. self.assertEqual(records[0]['type'], 'ArchiveResult')
  70. self.assertEqual(records[1]['type'], 'Binary')
  71. def test_parse_jsonl_with_log_output(self):
  72. """JSONL should be extracted from mixed stdout with log lines."""
  73. stdout = '''Starting hook execution...
  74. Processing URL: https://example.com
  75. {"type": "ArchiveResult", "status": "succeeded", "output_str": "Downloaded"}
  76. Hook completed successfully'''
  77. from archivebox.machine.models import Process
  78. records = Process.parse_records_from_text(stdout)
  79. self.assertEqual(len(records), 1)
  80. self.assertEqual(records[0]['status'], 'succeeded')
  81. def test_ignore_invalid_json(self):
  82. """Invalid JSON should be silently ignored."""
  83. stdout = '''{"type": "ArchiveResult", "status": "succeeded"}
  84. {invalid json here}
  85. not json at all
  86. {"type": "Binary", "name": "wget"}'''
  87. from archivebox.machine.models import Process
  88. records = Process.parse_records_from_text(stdout)
  89. self.assertEqual(len(records), 2)
  90. def test_json_without_type_ignored(self):
  91. """JSON objects without 'type' field should be ignored."""
  92. stdout = '''{"status": "succeeded", "output_str": "Done"}
  93. {"type": "ArchiveResult", "status": "succeeded"}'''
  94. from archivebox.machine.models import Process
  95. records = Process.parse_records_from_text(stdout)
  96. self.assertEqual(len(records), 1)
  97. self.assertEqual(records[0]['type'], 'ArchiveResult')
  98. class TestInstallHookEnvVarHandling(unittest.TestCase):
  99. """Test that install hooks respect XYZ_BINARY env vars."""
  100. def setUp(self):
  101. """Set up test environment."""
  102. self.work_dir = Path(tempfile.mkdtemp())
  103. self.test_hook = self.work_dir / 'test_hook.py'
  104. def tearDown(self):
  105. """Clean up test environment."""
  106. shutil.rmtree(self.work_dir, ignore_errors=True)
  107. def test_binary_env_var_absolute_path_handling(self):
  108. """Install hooks should handle absolute paths in XYZ_BINARY."""
  109. # Test the logic that install hooks use
  110. configured_binary = '/custom/path/to/wget2'
  111. if '/' in configured_binary:
  112. bin_name = Path(configured_binary).name
  113. else:
  114. bin_name = configured_binary
  115. self.assertEqual(bin_name, 'wget2')
  116. def test_binary_env_var_name_only_handling(self):
  117. """Install hooks should handle binary names in XYZ_BINARY."""
  118. # Test the logic that install hooks use
  119. configured_binary = 'wget2'
  120. if '/' in configured_binary:
  121. bin_name = Path(configured_binary).name
  122. else:
  123. bin_name = configured_binary
  124. self.assertEqual(bin_name, 'wget2')
  125. def test_binary_env_var_empty_default(self):
  126. """Install hooks should use default when XYZ_BINARY is empty."""
  127. configured_binary = ''
  128. if configured_binary:
  129. if '/' in configured_binary:
  130. bin_name = Path(configured_binary).name
  131. else:
  132. bin_name = configured_binary
  133. else:
  134. bin_name = 'wget' # default
  135. self.assertEqual(bin_name, 'wget')
  136. class TestHookDiscovery(unittest.TestCase):
  137. """Test hook discovery functions."""
  138. def setUp(self):
  139. """Set up test plugin directory."""
  140. self.test_dir = Path(tempfile.mkdtemp())
  141. self.plugins_dir = self.test_dir / 'plugins'
  142. self.plugins_dir.mkdir()
  143. # Create test plugin structure
  144. wget_dir = self.plugins_dir / 'wget'
  145. wget_dir.mkdir()
  146. (wget_dir / 'on_Snapshot__50_wget.py').write_text('# test hook')
  147. (wget_dir / 'on_Crawl__00_install_wget.py').write_text('# install hook')
  148. chrome_dir = self.plugins_dir / 'chrome'
  149. chrome_dir.mkdir()
  150. (chrome_dir / 'on_Snapshot__20_chrome_tab.bg.js').write_text('// background hook')
  151. consolelog_dir = self.plugins_dir / 'consolelog'
  152. consolelog_dir.mkdir()
  153. (consolelog_dir / 'on_Snapshot__21_consolelog.bg.js').write_text('// background hook')
  154. def tearDown(self):
  155. """Clean up test directory."""
  156. shutil.rmtree(self.test_dir, ignore_errors=True)
  157. def test_discover_hooks_by_event(self):
  158. """discover_hooks() should find all hooks for an event."""
  159. # Use the local implementation since we can't easily mock BUILTIN_PLUGINS_DIR
  160. hooks = []
  161. for ext in ('sh', 'py', 'js'):
  162. pattern = f'*/on_Snapshot__*.{ext}'
  163. hooks.extend(self.plugins_dir.glob(pattern))
  164. hooks = sorted(set(hooks), key=lambda p: p.name)
  165. self.assertEqual(len(hooks), 3)
  166. hook_names = [h.name for h in hooks]
  167. self.assertIn('on_Snapshot__20_chrome_tab.bg.js', hook_names)
  168. self.assertIn('on_Snapshot__21_consolelog.bg.js', hook_names)
  169. self.assertIn('on_Snapshot__50_wget.py', hook_names)
  170. def test_discover_hooks_sorted_by_name(self):
  171. """Hooks should be sorted by filename (numeric prefix ordering)."""
  172. hooks = []
  173. for ext in ('sh', 'py', 'js'):
  174. pattern = f'*/on_Snapshot__*.{ext}'
  175. hooks.extend(self.plugins_dir.glob(pattern))
  176. hooks = sorted(set(hooks), key=lambda p: p.name)
  177. # Check numeric ordering
  178. self.assertEqual(hooks[0].name, 'on_Snapshot__20_chrome_tab.bg.js')
  179. self.assertEqual(hooks[1].name, 'on_Snapshot__21_consolelog.bg.js')
  180. self.assertEqual(hooks[2].name, 'on_Snapshot__50_wget.py')
  181. class TestGetExtractorName(unittest.TestCase):
  182. """Test get_extractor_name() function."""
  183. def test_strip_numeric_prefix(self):
  184. """Numeric prefix should be stripped from extractor name."""
  185. # Inline implementation of get_extractor_name
  186. def get_extractor_name(extractor: str) -> str:
  187. parts = extractor.split('_', 1)
  188. if len(parts) == 2 and parts[0].isdigit():
  189. return parts[1]
  190. return extractor
  191. self.assertEqual(get_extractor_name('10_title'), 'title')
  192. self.assertEqual(get_extractor_name('26_readability'), 'readability')
  193. self.assertEqual(get_extractor_name('50_parse_html_urls'), 'parse_html_urls')
  194. def test_no_prefix_unchanged(self):
  195. """Extractor without numeric prefix should be unchanged."""
  196. def get_extractor_name(extractor: str) -> str:
  197. parts = extractor.split('_', 1)
  198. if len(parts) == 2 and parts[0].isdigit():
  199. return parts[1]
  200. return extractor
  201. self.assertEqual(get_extractor_name('title'), 'title')
  202. self.assertEqual(get_extractor_name('readability'), 'readability')
  203. class TestHookExecution(unittest.TestCase):
  204. """Test hook execution with real subprocesses."""
  205. def setUp(self):
  206. """Set up test environment."""
  207. self.work_dir = Path(tempfile.mkdtemp())
  208. def tearDown(self):
  209. """Clean up test environment."""
  210. shutil.rmtree(self.work_dir, ignore_errors=True)
  211. def test_python_hook_execution(self):
  212. """Python hook should execute and output JSONL."""
  213. hook_path = self.work_dir / 'test_hook.py'
  214. hook_path.write_text('''#!/usr/bin/env python3
  215. import json
  216. print(json.dumps({"type": "ArchiveResult", "status": "succeeded", "output_str": "Test passed"}))
  217. ''')
  218. result = subprocess.run(
  219. ['python3', str(hook_path)],
  220. cwd=str(self.work_dir),
  221. capture_output=True,
  222. text=True,
  223. )
  224. self.assertEqual(result.returncode, 0)
  225. from archivebox.machine.models import Process
  226. records = Process.parse_records_from_text(result.stdout)
  227. self.assertTrue(records)
  228. self.assertEqual(records[0]['type'], 'ArchiveResult')
  229. self.assertEqual(records[0]['status'], 'succeeded')
  230. def test_js_hook_execution(self):
  231. """JavaScript hook should execute and output JSONL."""
  232. # Skip if node not available
  233. if shutil.which('node') is None:
  234. self.skipTest('Node.js not available')
  235. hook_path = self.work_dir / 'test_hook.js'
  236. hook_path.write_text('''#!/usr/bin/env node
  237. console.log(JSON.stringify({type: 'ArchiveResult', status: 'succeeded', output_str: 'JS test'}));
  238. ''')
  239. result = subprocess.run(
  240. ['node', str(hook_path)],
  241. cwd=str(self.work_dir),
  242. capture_output=True,
  243. text=True,
  244. )
  245. self.assertEqual(result.returncode, 0)
  246. from archivebox.machine.models import Process
  247. records = Process.parse_records_from_text(result.stdout)
  248. self.assertTrue(records)
  249. self.assertEqual(records[0]['type'], 'ArchiveResult')
  250. self.assertEqual(records[0]['status'], 'succeeded')
  251. def test_hook_receives_cli_args(self):
  252. """Hook should receive CLI arguments."""
  253. hook_path = self.work_dir / 'test_hook.py'
  254. hook_path.write_text('''#!/usr/bin/env python3
  255. import sys
  256. import json
  257. # Simple arg parsing
  258. args = {}
  259. for arg in sys.argv[1:]:
  260. if arg.startswith('--') and '=' in arg:
  261. key, val = arg[2:].split('=', 1)
  262. args[key.replace('-', '_')] = val
  263. print(json.dumps({"type": "ArchiveResult", "status": "succeeded", "url": args.get("url", "")}))
  264. ''')
  265. result = subprocess.run(
  266. ['python3', str(hook_path), '--url=https://example.com'],
  267. cwd=str(self.work_dir),
  268. capture_output=True,
  269. text=True,
  270. )
  271. self.assertEqual(result.returncode, 0)
  272. from archivebox.machine.models import Process
  273. records = Process.parse_records_from_text(result.stdout)
  274. self.assertTrue(records)
  275. self.assertEqual(records[0]['url'], 'https://example.com')
  276. class TestInstallHookOutput(unittest.TestCase):
  277. """Test install hook output format compliance."""
  278. def setUp(self):
  279. """Set up test environment."""
  280. self.work_dir = Path(tempfile.mkdtemp())
  281. def tearDown(self):
  282. """Clean up test environment."""
  283. shutil.rmtree(self.work_dir, ignore_errors=True)
  284. def test_install_hook_outputs_binary(self):
  285. """Install hook should output Binary JSONL when binary found."""
  286. hook_output = json.dumps({
  287. 'type': 'Binary',
  288. 'name': 'wget',
  289. 'abspath': '/usr/bin/wget',
  290. 'version': '1.21.3',
  291. 'sha256': None,
  292. 'binprovider': 'apt',
  293. })
  294. from archivebox.machine.models import Process
  295. data = Process.parse_records_from_text(hook_output)[0]
  296. self.assertEqual(data['type'], 'Binary')
  297. self.assertEqual(data['name'], 'wget')
  298. self.assertTrue(data['abspath'].startswith('/'))
  299. def test_install_hook_outputs_machine_config(self):
  300. """Install hook should output Machine config update JSONL."""
  301. hook_output = json.dumps({
  302. 'type': 'Machine',
  303. 'config': {
  304. 'WGET_BINARY': '/usr/bin/wget',
  305. },
  306. })
  307. from archivebox.machine.models import Process
  308. data = Process.parse_records_from_text(hook_output)[0]
  309. self.assertEqual(data['type'], 'Machine')
  310. self.assertIn('config', data)
  311. self.assertEqual(data['config']['WGET_BINARY'], '/usr/bin/wget')
  312. class TestSnapshotHookOutput(unittest.TestCase):
  313. """Test snapshot hook output format compliance."""
  314. def test_snapshot_hook_basic_output(self):
  315. """Snapshot hook should output clean ArchiveResult JSONL."""
  316. hook_output = json.dumps({
  317. 'type': 'ArchiveResult',
  318. 'status': 'succeeded',
  319. 'output_str': 'Downloaded 5 files',
  320. })
  321. from archivebox.machine.models import Process
  322. data = Process.parse_records_from_text(hook_output)[0]
  323. self.assertEqual(data['type'], 'ArchiveResult')
  324. self.assertEqual(data['status'], 'succeeded')
  325. self.assertIn('output_str', data)
  326. def test_snapshot_hook_with_cmd(self):
  327. """Snapshot hook should include cmd for binary FK lookup."""
  328. hook_output = json.dumps({
  329. 'type': 'ArchiveResult',
  330. 'status': 'succeeded',
  331. 'output_str': 'Archived with wget',
  332. 'cmd': ['/usr/bin/wget', '-p', '-k', 'https://example.com'],
  333. })
  334. from archivebox.machine.models import Process
  335. data = Process.parse_records_from_text(hook_output)[0]
  336. self.assertEqual(data['type'], 'ArchiveResult')
  337. self.assertIsInstance(data['cmd'], list)
  338. self.assertEqual(data['cmd'][0], '/usr/bin/wget')
  339. def test_snapshot_hook_with_output_json(self):
  340. """Snapshot hook can include structured metadata in output_json."""
  341. hook_output = json.dumps({
  342. 'type': 'ArchiveResult',
  343. 'status': 'succeeded',
  344. 'output_str': 'Got headers',
  345. 'output_json': {
  346. 'content-type': 'text/html',
  347. 'server': 'nginx',
  348. 'status-code': 200,
  349. },
  350. })
  351. from archivebox.machine.models import Process
  352. data = Process.parse_records_from_text(hook_output)[0]
  353. self.assertEqual(data['type'], 'ArchiveResult')
  354. self.assertIsInstance(data['output_json'], dict)
  355. self.assertEqual(data['output_json']['status-code'], 200)
  356. def test_snapshot_hook_skipped_status(self):
  357. """Snapshot hook should support skipped status."""
  358. hook_output = json.dumps({
  359. 'type': 'ArchiveResult',
  360. 'status': 'skipped',
  361. 'output_str': 'SAVE_WGET=False',
  362. })
  363. from archivebox.machine.models import Process
  364. data = Process.parse_records_from_text(hook_output)[0]
  365. self.assertEqual(data['status'], 'skipped')
  366. def test_snapshot_hook_failed_status(self):
  367. """Snapshot hook should support failed status."""
  368. hook_output = json.dumps({
  369. 'type': 'ArchiveResult',
  370. 'status': 'failed',
  371. 'output_str': '404 Not Found',
  372. })
  373. from archivebox.machine.models import Process
  374. data = Process.parse_records_from_text(hook_output)[0]
  375. self.assertEqual(data['status'], 'failed')
  376. class TestPluginMetadata(unittest.TestCase):
  377. """Test that plugin metadata is added to JSONL records."""
  378. def test_plugin_name_added(self):
  379. """run_hook() should add plugin name to records."""
  380. # Simulate what run_hook() does
  381. script = Path('/archivebox/plugins/wget/on_Snapshot__50_wget.py')
  382. plugin_name = script.parent.name
  383. record = {'type': 'ArchiveResult', 'status': 'succeeded'}
  384. record['plugin'] = plugin_name
  385. record['plugin_hook'] = str(script)
  386. self.assertEqual(record['plugin'], 'wget')
  387. self.assertIn('on_Snapshot__50_wget.py', record['plugin_hook'])
# Allow running this file directly as a script; unittest.main() then collects
# and runs the TestCase classes defined in this module.
if __name__ == '__main__':
    unittest.main()