# test_hooks.py (19 KB)
#!/usr/bin/env python3
"""
Unit tests for the ArchiveBox hook architecture.

Tests hook discovery, execution, JSONL parsing, background hook detection,
binary lookup, and install hook XYZ_BINARY env var handling.

Run with:
    sudo -u testuser bash -c 'source .venv/bin/activate && python -m pytest archivebox/tests/test_hooks.py -v'
"""
import json
import os
import shutil
import subprocess
import sys
import tempfile
import unittest
from pathlib import Path
from unittest.mock import MagicMock, patch
  17. # Set up Django before importing any Django-dependent modules
  18. os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')
  19. class TestBackgroundHookDetection(unittest.TestCase):
  20. """Test that background hooks are detected by .bg. suffix."""
  21. def test_bg_js_suffix_detected(self):
  22. """Hooks with .bg.js suffix should be detected as background."""
  23. script = Path('/path/to/on_Snapshot__21_consolelog.bg.js')
  24. is_background = '.bg.' in script.name or '__background' in script.stem
  25. self.assertTrue(is_background)
  26. def test_bg_py_suffix_detected(self):
  27. """Hooks with .bg.py suffix should be detected as background."""
  28. script = Path('/path/to/on_Snapshot__24_responses.bg.py')
  29. is_background = '.bg.' in script.name or '__background' in script.stem
  30. self.assertTrue(is_background)
  31. def test_bg_sh_suffix_detected(self):
  32. """Hooks with .bg.sh suffix should be detected as background."""
  33. script = Path('/path/to/on_Snapshot__23_ssl.bg.sh')
  34. is_background = '.bg.' in script.name or '__background' in script.stem
  35. self.assertTrue(is_background)
  36. def test_legacy_background_suffix_detected(self):
  37. """Hooks with __background in stem should be detected (backwards compat)."""
  38. script = Path('/path/to/on_Snapshot__21_consolelog__background.js')
  39. is_background = '.bg.' in script.name or '__background' in script.stem
  40. self.assertTrue(is_background)
  41. def test_foreground_hook_not_detected(self):
  42. """Hooks without .bg. or __background should NOT be detected as background."""
  43. script = Path('/path/to/on_Snapshot__11_favicon.js')
  44. is_background = '.bg.' in script.name or '__background' in script.stem
  45. self.assertFalse(is_background)
  46. def test_foreground_py_hook_not_detected(self):
  47. """Python hooks without .bg. should NOT be detected as background."""
  48. script = Path('/path/to/on_Snapshot__50_wget.py')
  49. is_background = '.bg.' in script.name or '__background' in script.stem
  50. self.assertFalse(is_background)
  51. class TestJSONLParsing(unittest.TestCase):
  52. """Test JSONL parsing in run_hook() output processing."""
  53. def test_parse_clean_jsonl(self):
  54. """Clean JSONL format should be parsed correctly."""
  55. stdout = '{"type": "ArchiveResult", "status": "succeeded", "output_str": "Done"}'
  56. records = []
  57. for line in stdout.splitlines():
  58. line = line.strip()
  59. if not line or not line.startswith('{'):
  60. continue
  61. try:
  62. data = json.loads(line)
  63. if 'type' in data:
  64. records.append(data)
  65. except json.JSONDecodeError:
  66. pass
  67. self.assertEqual(len(records), 1)
  68. self.assertEqual(records[0]['type'], 'ArchiveResult')
  69. self.assertEqual(records[0]['status'], 'succeeded')
  70. self.assertEqual(records[0]['output_str'], 'Done')
  71. def test_parse_multiple_jsonl_records(self):
  72. """Multiple JSONL records should all be parsed."""
  73. stdout = '''{"type": "ArchiveResult", "status": "succeeded", "output_str": "Done"}
  74. {"type": "Binary", "name": "wget", "abspath": "/usr/bin/wget"}'''
  75. records = []
  76. for line in stdout.splitlines():
  77. line = line.strip()
  78. if not line or not line.startswith('{'):
  79. continue
  80. try:
  81. data = json.loads(line)
  82. if 'type' in data:
  83. records.append(data)
  84. except json.JSONDecodeError:
  85. pass
  86. self.assertEqual(len(records), 2)
  87. self.assertEqual(records[0]['type'], 'ArchiveResult')
  88. self.assertEqual(records[1]['type'], 'Binary')
  89. def test_parse_jsonl_with_log_output(self):
  90. """JSONL should be extracted from mixed stdout with log lines."""
  91. stdout = '''Starting hook execution...
  92. Processing URL: https://example.com
  93. {"type": "ArchiveResult", "status": "succeeded", "output_str": "Downloaded"}
  94. Hook completed successfully'''
  95. records = []
  96. for line in stdout.splitlines():
  97. line = line.strip()
  98. if not line or not line.startswith('{'):
  99. continue
  100. try:
  101. data = json.loads(line)
  102. if 'type' in data:
  103. records.append(data)
  104. except json.JSONDecodeError:
  105. pass
  106. self.assertEqual(len(records), 1)
  107. self.assertEqual(records[0]['status'], 'succeeded')
  108. def test_parse_legacy_result_json_format(self):
  109. """Legacy RESULT_JSON= format should be parsed for backwards compat."""
  110. stdout = 'RESULT_JSON={"status": "succeeded", "output": "Done"}'
  111. output_json = None
  112. records = []
  113. for line in stdout.splitlines():
  114. line = line.strip()
  115. if line.startswith('RESULT_JSON='):
  116. try:
  117. data = json.loads(line[len('RESULT_JSON='):])
  118. if output_json is None:
  119. output_json = data
  120. data['type'] = 'ArchiveResult'
  121. records.append(data)
  122. except json.JSONDecodeError:
  123. pass
  124. self.assertEqual(len(records), 1)
  125. self.assertEqual(records[0]['type'], 'ArchiveResult')
  126. self.assertEqual(records[0]['status'], 'succeeded')
  127. def test_ignore_invalid_json(self):
  128. """Invalid JSON should be silently ignored."""
  129. stdout = '''{"type": "ArchiveResult", "status": "succeeded"}
  130. {invalid json here}
  131. not json at all
  132. {"type": "Binary", "name": "wget"}'''
  133. records = []
  134. for line in stdout.splitlines():
  135. line = line.strip()
  136. if not line or not line.startswith('{'):
  137. continue
  138. try:
  139. data = json.loads(line)
  140. if 'type' in data:
  141. records.append(data)
  142. except json.JSONDecodeError:
  143. pass
  144. self.assertEqual(len(records), 2)
  145. def test_json_without_type_ignored(self):
  146. """JSON objects without 'type' field should be ignored."""
  147. stdout = '''{"status": "succeeded", "output_str": "Done"}
  148. {"type": "ArchiveResult", "status": "succeeded"}'''
  149. records = []
  150. for line in stdout.splitlines():
  151. line = line.strip()
  152. if not line or not line.startswith('{'):
  153. continue
  154. try:
  155. data = json.loads(line)
  156. if 'type' in data:
  157. records.append(data)
  158. except json.JSONDecodeError:
  159. pass
  160. self.assertEqual(len(records), 1)
  161. self.assertEqual(records[0]['type'], 'ArchiveResult')
  162. class TestInstallHookEnvVarHandling(unittest.TestCase):
  163. """Test that install hooks respect XYZ_BINARY env vars."""
  164. def setUp(self):
  165. """Set up test environment."""
  166. self.work_dir = Path(tempfile.mkdtemp())
  167. self.test_hook = self.work_dir / 'test_hook.py'
  168. def tearDown(self):
  169. """Clean up test environment."""
  170. shutil.rmtree(self.work_dir, ignore_errors=True)
  171. def test_binary_env_var_absolute_path_handling(self):
  172. """Install hooks should handle absolute paths in XYZ_BINARY."""
  173. # Test the logic that install hooks use
  174. configured_binary = '/custom/path/to/wget2'
  175. if '/' in configured_binary:
  176. bin_name = Path(configured_binary).name
  177. else:
  178. bin_name = configured_binary
  179. self.assertEqual(bin_name, 'wget2')
  180. def test_binary_env_var_name_only_handling(self):
  181. """Install hooks should handle binary names in XYZ_BINARY."""
  182. # Test the logic that install hooks use
  183. configured_binary = 'wget2'
  184. if '/' in configured_binary:
  185. bin_name = Path(configured_binary).name
  186. else:
  187. bin_name = configured_binary
  188. self.assertEqual(bin_name, 'wget2')
  189. def test_binary_env_var_empty_default(self):
  190. """Install hooks should use default when XYZ_BINARY is empty."""
  191. configured_binary = ''
  192. if configured_binary:
  193. if '/' in configured_binary:
  194. bin_name = Path(configured_binary).name
  195. else:
  196. bin_name = configured_binary
  197. else:
  198. bin_name = 'wget' # default
  199. self.assertEqual(bin_name, 'wget')
  200. class TestHookDiscovery(unittest.TestCase):
  201. """Test hook discovery functions."""
  202. def setUp(self):
  203. """Set up test plugin directory."""
  204. self.test_dir = Path(tempfile.mkdtemp())
  205. self.plugins_dir = self.test_dir / 'plugins'
  206. self.plugins_dir.mkdir()
  207. # Create test plugin structure
  208. wget_dir = self.plugins_dir / 'wget'
  209. wget_dir.mkdir()
  210. (wget_dir / 'on_Snapshot__50_wget.py').write_text('# test hook')
  211. (wget_dir / 'on_Crawl__00_install_wget.py').write_text('# install hook')
  212. chrome_dir = self.plugins_dir / 'chrome_session'
  213. chrome_dir.mkdir()
  214. (chrome_dir / 'on_Snapshot__20_chrome_session.bg.js').write_text('// background hook')
  215. consolelog_dir = self.plugins_dir / 'consolelog'
  216. consolelog_dir.mkdir()
  217. (consolelog_dir / 'on_Snapshot__21_consolelog.bg.js').write_text('// background hook')
  218. def tearDown(self):
  219. """Clean up test directory."""
  220. shutil.rmtree(self.test_dir, ignore_errors=True)
  221. def test_discover_hooks_by_event(self):
  222. """discover_hooks() should find all hooks for an event."""
  223. # Use the local implementation since we can't easily mock BUILTIN_PLUGINS_DIR
  224. hooks = []
  225. for ext in ('sh', 'py', 'js'):
  226. pattern = f'*/on_Snapshot__*.{ext}'
  227. hooks.extend(self.plugins_dir.glob(pattern))
  228. hooks = sorted(set(hooks), key=lambda p: p.name)
  229. self.assertEqual(len(hooks), 3)
  230. hook_names = [h.name for h in hooks]
  231. self.assertIn('on_Snapshot__20_chrome_session.bg.js', hook_names)
  232. self.assertIn('on_Snapshot__21_consolelog.bg.js', hook_names)
  233. self.assertIn('on_Snapshot__50_wget.py', hook_names)
  234. def test_discover_hooks_sorted_by_name(self):
  235. """Hooks should be sorted by filename (numeric prefix ordering)."""
  236. hooks = []
  237. for ext in ('sh', 'py', 'js'):
  238. pattern = f'*/on_Snapshot__*.{ext}'
  239. hooks.extend(self.plugins_dir.glob(pattern))
  240. hooks = sorted(set(hooks), key=lambda p: p.name)
  241. # Check numeric ordering
  242. self.assertEqual(hooks[0].name, 'on_Snapshot__20_chrome_session.js')
  243. self.assertEqual(hooks[1].name, 'on_Snapshot__21_consolelog.bg.js')
  244. self.assertEqual(hooks[2].name, 'on_Snapshot__50_wget.py')
  245. class TestGetExtractorName(unittest.TestCase):
  246. """Test get_extractor_name() function."""
  247. def test_strip_numeric_prefix(self):
  248. """Numeric prefix should be stripped from extractor name."""
  249. # Inline implementation of get_extractor_name
  250. def get_extractor_name(extractor: str) -> str:
  251. parts = extractor.split('_', 1)
  252. if len(parts) == 2 and parts[0].isdigit():
  253. return parts[1]
  254. return extractor
  255. self.assertEqual(get_extractor_name('10_title'), 'title')
  256. self.assertEqual(get_extractor_name('26_readability'), 'readability')
  257. self.assertEqual(get_extractor_name('50_parse_html_urls'), 'parse_html_urls')
  258. def test_no_prefix_unchanged(self):
  259. """Extractor without numeric prefix should be unchanged."""
  260. def get_extractor_name(extractor: str) -> str:
  261. parts = extractor.split('_', 1)
  262. if len(parts) == 2 and parts[0].isdigit():
  263. return parts[1]
  264. return extractor
  265. self.assertEqual(get_extractor_name('title'), 'title')
  266. self.assertEqual(get_extractor_name('readability'), 'readability')
  267. class TestHookExecution(unittest.TestCase):
  268. """Test hook execution with real subprocesses."""
  269. def setUp(self):
  270. """Set up test environment."""
  271. self.work_dir = Path(tempfile.mkdtemp())
  272. def tearDown(self):
  273. """Clean up test environment."""
  274. shutil.rmtree(self.work_dir, ignore_errors=True)
  275. def test_python_hook_execution(self):
  276. """Python hook should execute and output JSONL."""
  277. hook_path = self.work_dir / 'test_hook.py'
  278. hook_path.write_text('''#!/usr/bin/env python3
  279. import json
  280. print(json.dumps({"type": "ArchiveResult", "status": "succeeded", "output_str": "Test passed"}))
  281. ''')
  282. result = subprocess.run(
  283. ['python3', str(hook_path)],
  284. cwd=str(self.work_dir),
  285. capture_output=True,
  286. text=True,
  287. )
  288. self.assertEqual(result.returncode, 0)
  289. output = json.loads(result.stdout.strip())
  290. self.assertEqual(output['type'], 'ArchiveResult')
  291. self.assertEqual(output['status'], 'succeeded')
  292. def test_js_hook_execution(self):
  293. """JavaScript hook should execute and output JSONL."""
  294. # Skip if node not available
  295. if shutil.which('node') is None:
  296. self.skipTest('Node.js not available')
  297. hook_path = self.work_dir / 'test_hook.js'
  298. hook_path.write_text('''#!/usr/bin/env node
  299. console.log(JSON.stringify({type: 'ArchiveResult', status: 'succeeded', output_str: 'JS test'}));
  300. ''')
  301. result = subprocess.run(
  302. ['node', str(hook_path)],
  303. cwd=str(self.work_dir),
  304. capture_output=True,
  305. text=True,
  306. )
  307. self.assertEqual(result.returncode, 0)
  308. output = json.loads(result.stdout.strip())
  309. self.assertEqual(output['type'], 'ArchiveResult')
  310. self.assertEqual(output['status'], 'succeeded')
  311. def test_hook_receives_cli_args(self):
  312. """Hook should receive CLI arguments."""
  313. hook_path = self.work_dir / 'test_hook.py'
  314. hook_path.write_text('''#!/usr/bin/env python3
  315. import sys
  316. import json
  317. # Simple arg parsing
  318. args = {}
  319. for arg in sys.argv[1:]:
  320. if arg.startswith('--') and '=' in arg:
  321. key, val = arg[2:].split('=', 1)
  322. args[key.replace('-', '_')] = val
  323. print(json.dumps({"type": "ArchiveResult", "status": "succeeded", "url": args.get("url", "")}))
  324. ''')
  325. result = subprocess.run(
  326. ['python3', str(hook_path), '--url=https://example.com'],
  327. cwd=str(self.work_dir),
  328. capture_output=True,
  329. text=True,
  330. )
  331. self.assertEqual(result.returncode, 0)
  332. output = json.loads(result.stdout.strip())
  333. self.assertEqual(output['url'], 'https://example.com')
  334. class TestInstallHookOutput(unittest.TestCase):
  335. """Test install hook output format compliance."""
  336. def setUp(self):
  337. """Set up test environment."""
  338. self.work_dir = Path(tempfile.mkdtemp())
  339. def tearDown(self):
  340. """Clean up test environment."""
  341. shutil.rmtree(self.work_dir, ignore_errors=True)
  342. def test_install_hook_outputs_binary(self):
  343. """Install hook should output Binary JSONL when binary found."""
  344. hook_output = json.dumps({
  345. 'type': 'Binary',
  346. 'name': 'wget',
  347. 'abspath': '/usr/bin/wget',
  348. 'version': '1.21.3',
  349. 'sha256': None,
  350. 'binprovider': 'apt',
  351. })
  352. data = json.loads(hook_output)
  353. self.assertEqual(data['type'], 'Binary')
  354. self.assertEqual(data['name'], 'wget')
  355. self.assertTrue(data['abspath'].startswith('/'))
  356. def test_install_hook_outputs_machine_config(self):
  357. """Install hook should output Machine config update JSONL."""
  358. hook_output = json.dumps({
  359. 'type': 'Machine',
  360. '_method': 'update',
  361. 'key': 'config/WGET_BINARY',
  362. 'value': '/usr/bin/wget',
  363. })
  364. data = json.loads(hook_output)
  365. self.assertEqual(data['type'], 'Machine')
  366. self.assertEqual(data['_method'], 'update')
  367. self.assertEqual(data['key'], 'config/WGET_BINARY')
  368. class TestSnapshotHookOutput(unittest.TestCase):
  369. """Test snapshot hook output format compliance."""
  370. def test_snapshot_hook_basic_output(self):
  371. """Snapshot hook should output clean ArchiveResult JSONL."""
  372. hook_output = json.dumps({
  373. 'type': 'ArchiveResult',
  374. 'status': 'succeeded',
  375. 'output_str': 'Downloaded 5 files',
  376. })
  377. data = json.loads(hook_output)
  378. self.assertEqual(data['type'], 'ArchiveResult')
  379. self.assertEqual(data['status'], 'succeeded')
  380. self.assertIn('output_str', data)
  381. def test_snapshot_hook_with_cmd(self):
  382. """Snapshot hook should include cmd for binary FK lookup."""
  383. hook_output = json.dumps({
  384. 'type': 'ArchiveResult',
  385. 'status': 'succeeded',
  386. 'output_str': 'Archived with wget',
  387. 'cmd': ['/usr/bin/wget', '-p', '-k', 'https://example.com'],
  388. })
  389. data = json.loads(hook_output)
  390. self.assertEqual(data['type'], 'ArchiveResult')
  391. self.assertIsInstance(data['cmd'], list)
  392. self.assertEqual(data['cmd'][0], '/usr/bin/wget')
  393. def test_snapshot_hook_with_output_json(self):
  394. """Snapshot hook can include structured metadata in output_json."""
  395. hook_output = json.dumps({
  396. 'type': 'ArchiveResult',
  397. 'status': 'succeeded',
  398. 'output_str': 'Got headers',
  399. 'output_json': {
  400. 'content-type': 'text/html',
  401. 'server': 'nginx',
  402. 'status-code': 200,
  403. },
  404. })
  405. data = json.loads(hook_output)
  406. self.assertEqual(data['type'], 'ArchiveResult')
  407. self.assertIsInstance(data['output_json'], dict)
  408. self.assertEqual(data['output_json']['status-code'], 200)
  409. def test_snapshot_hook_skipped_status(self):
  410. """Snapshot hook should support skipped status."""
  411. hook_output = json.dumps({
  412. 'type': 'ArchiveResult',
  413. 'status': 'skipped',
  414. 'output_str': 'SAVE_WGET=False',
  415. })
  416. data = json.loads(hook_output)
  417. self.assertEqual(data['status'], 'skipped')
  418. def test_snapshot_hook_failed_status(self):
  419. """Snapshot hook should support failed status."""
  420. hook_output = json.dumps({
  421. 'type': 'ArchiveResult',
  422. 'status': 'failed',
  423. 'output_str': '404 Not Found',
  424. })
  425. data = json.loads(hook_output)
  426. self.assertEqual(data['status'], 'failed')
  427. class TestPluginMetadata(unittest.TestCase):
  428. """Test that plugin metadata is added to JSONL records."""
  429. def test_plugin_name_added(self):
  430. """run_hook() should add plugin name to records."""
  431. # Simulate what run_hook() does
  432. script = Path('/archivebox/plugins/wget/on_Snapshot__50_wget.py')
  433. plugin_name = script.parent.name
  434. record = {'type': 'ArchiveResult', 'status': 'succeeded'}
  435. record['plugin'] = plugin_name
  436. record['plugin_hook'] = str(script)
  437. self.assertEqual(record['plugin'], 'wget')
  438. self.assertIn('on_Snapshot__50_wget.py', record['plugin_hook'])
  439. if __name__ == '__main__':
  440. unittest.main()