tests_piping.py 38 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057
  1. #!/usr/bin/env python3
  2. """
  3. Tests for CLI piping workflow: crawl | snapshot | archiveresult | run
  4. This module tests the JSONL-based piping between CLI commands as described in:
  5. https://github.com/ArchiveBox/ArchiveBox/issues/1363
  6. Workflows tested:
  7. archivebox crawl create URL -> Crawl JSONL
  8. archivebox snapshot create -> Snapshot JSONL (accepts Crawl or URL input)
  9. archivebox archiveresult create -> ArchiveResult JSONL (accepts Snapshot input)
  10. archivebox run -> Process queued records (accepts any JSONL)
  11. Pipeline:
  12. archivebox crawl create URL | archivebox snapshot create | archivebox archiveresult create | archivebox run
  13. Each command should:
  14. - Accept URLs, IDs, or JSONL as input (args or stdin)
  15. - Output JSONL to stdout when piped (not TTY)
  16. - Output human-readable to stderr when TTY
  17. """
  18. __package__ = 'archivebox.cli'
  19. import os
  20. import sys
  21. import json
  22. import shutil
  23. import tempfile
  24. import unittest
  25. from io import StringIO
  26. from pathlib import Path
  27. # Test configuration - disable slow extractors
  28. TEST_CONFIG = {
  29. 'USE_COLOR': 'False',
  30. 'SHOW_PROGRESS': 'False',
  31. 'SAVE_ARCHIVEDOTORG': 'False',
  32. 'SAVE_TITLE': 'True', # Fast extractor
  33. 'SAVE_FAVICON': 'False',
  34. 'SAVE_WGET': 'False',
  35. 'SAVE_WARC': 'False',
  36. 'SAVE_PDF': 'False',
  37. 'SAVE_SCREENSHOT': 'False',
  38. 'SAVE_DOM': 'False',
  39. 'SAVE_SINGLEFILE': 'False',
  40. 'SAVE_READABILITY': 'False',
  41. 'SAVE_MERCURY': 'False',
  42. 'SAVE_GIT': 'False',
  43. 'SAVE_YTDLP': 'False',
  44. 'SAVE_HEADERS': 'False',
  45. 'USE_CURL': 'False',
  46. 'USE_WGET': 'False',
  47. 'USE_GIT': 'False',
  48. 'USE_CHROME': 'False',
  49. 'USE_YOUTUBEDL': 'False',
  50. 'USE_NODE': 'False',
  51. }
  52. os.environ.update(TEST_CONFIG)
  53. # =============================================================================
  54. # JSONL Utility Tests
  55. # =============================================================================
  56. class TestJSONLParsing(unittest.TestCase):
  57. """Test JSONL input parsing utilities."""
  58. def test_parse_plain_url(self):
  59. """Plain URLs should be parsed as Snapshot records."""
  60. from archivebox.misc.jsonl import parse_line, TYPE_SNAPSHOT
  61. result = parse_line('https://example.com')
  62. self.assertIsNotNone(result)
  63. self.assertEqual(result['type'], TYPE_SNAPSHOT)
  64. self.assertEqual(result['url'], 'https://example.com')
  65. def test_parse_jsonl_snapshot(self):
  66. """JSONL Snapshot records should preserve all fields."""
  67. from archivebox.misc.jsonl import parse_line, TYPE_SNAPSHOT
  68. line = '{"type": "Snapshot", "url": "https://example.com", "tags": "test,demo"}'
  69. result = parse_line(line)
  70. self.assertIsNotNone(result)
  71. self.assertEqual(result['type'], TYPE_SNAPSHOT)
  72. self.assertEqual(result['url'], 'https://example.com')
  73. self.assertEqual(result['tags'], 'test,demo')
  74. def test_parse_jsonl_crawl(self):
  75. """JSONL Crawl records should be parsed correctly."""
  76. from archivebox.misc.jsonl import parse_line, TYPE_CRAWL
  77. line = '{"type": "Crawl", "id": "abc123", "urls": "https://example.com", "max_depth": 1}'
  78. result = parse_line(line)
  79. self.assertIsNotNone(result)
  80. self.assertEqual(result['type'], TYPE_CRAWL)
  81. self.assertEqual(result['id'], 'abc123')
  82. self.assertEqual(result['urls'], 'https://example.com')
  83. self.assertEqual(result['max_depth'], 1)
  84. def test_parse_jsonl_with_id(self):
  85. """JSONL with id field should be recognized."""
  86. from archivebox.misc.jsonl import parse_line, TYPE_SNAPSHOT
  87. line = '{"type": "Snapshot", "id": "abc123", "url": "https://example.com"}'
  88. result = parse_line(line)
  89. self.assertIsNotNone(result)
  90. self.assertEqual(result['id'], 'abc123')
  91. self.assertEqual(result['url'], 'https://example.com')
  92. def test_parse_uuid_as_snapshot_id(self):
  93. """Bare UUIDs should be parsed as snapshot IDs."""
  94. from archivebox.misc.jsonl import parse_line, TYPE_SNAPSHOT
  95. uuid = '01234567-89ab-cdef-0123-456789abcdef'
  96. result = parse_line(uuid)
  97. self.assertIsNotNone(result)
  98. self.assertEqual(result['type'], TYPE_SNAPSHOT)
  99. self.assertEqual(result['id'], uuid)
  100. def test_parse_empty_line(self):
  101. """Empty lines should return None."""
  102. from archivebox.misc.jsonl import parse_line
  103. self.assertIsNone(parse_line(''))
  104. self.assertIsNone(parse_line(' '))
  105. self.assertIsNone(parse_line('\n'))
  106. def test_parse_comment_line(self):
  107. """Comment lines should return None."""
  108. from archivebox.misc.jsonl import parse_line
  109. self.assertIsNone(parse_line('# This is a comment'))
  110. self.assertIsNone(parse_line(' # Indented comment'))
  111. def test_parse_invalid_url(self):
  112. """Invalid URLs should return None."""
  113. from archivebox.misc.jsonl import parse_line
  114. self.assertIsNone(parse_line('not-a-url'))
  115. self.assertIsNone(parse_line('ftp://example.com')) # Only http/https/file
  116. def test_parse_file_url(self):
  117. """file:// URLs should be parsed."""
  118. from archivebox.misc.jsonl import parse_line, TYPE_SNAPSHOT
  119. result = parse_line('file:///path/to/file.txt')
  120. self.assertIsNotNone(result)
  121. self.assertEqual(result['type'], TYPE_SNAPSHOT)
  122. self.assertEqual(result['url'], 'file:///path/to/file.txt')
  123. # Note: JSONL output serialization is tested in TestPipingWorkflowIntegration
  124. # using real model instances, not mocks.
  125. class TestReadArgsOrStdin(unittest.TestCase):
  126. """Test reading from args or stdin."""
  127. def test_read_from_args(self):
  128. """Should read URLs from command line args."""
  129. from archivebox.misc.jsonl import read_args_or_stdin
  130. args = ('https://example1.com', 'https://example2.com')
  131. records = list(read_args_or_stdin(args))
  132. self.assertEqual(len(records), 2)
  133. self.assertEqual(records[0]['url'], 'https://example1.com')
  134. self.assertEqual(records[1]['url'], 'https://example2.com')
  135. def test_read_from_stdin(self):
  136. """Should read URLs from stdin when no args provided."""
  137. from archivebox.misc.jsonl import read_args_or_stdin
  138. stdin_content = 'https://example1.com\nhttps://example2.com\n'
  139. stream = StringIO(stdin_content)
  140. # Mock isatty to return False (simulating piped input)
  141. stream.isatty = lambda: False
  142. records = list(read_args_or_stdin((), stream=stream))
  143. self.assertEqual(len(records), 2)
  144. self.assertEqual(records[0]['url'], 'https://example1.com')
  145. self.assertEqual(records[1]['url'], 'https://example2.com')
  146. def test_read_jsonl_from_stdin(self):
  147. """Should read JSONL from stdin."""
  148. from archivebox.misc.jsonl import read_args_or_stdin
  149. stdin_content = '{"type": "Snapshot", "url": "https://example.com", "tags": "test"}\n'
  150. stream = StringIO(stdin_content)
  151. stream.isatty = lambda: False
  152. records = list(read_args_or_stdin((), stream=stream))
  153. self.assertEqual(len(records), 1)
  154. self.assertEqual(records[0]['url'], 'https://example.com')
  155. self.assertEqual(records[0]['tags'], 'test')
  156. def test_read_crawl_jsonl_from_stdin(self):
  157. """Should read Crawl JSONL from stdin."""
  158. from archivebox.misc.jsonl import read_args_or_stdin, TYPE_CRAWL
  159. stdin_content = '{"type": "Crawl", "id": "abc123", "urls": "https://example.com\\nhttps://foo.com"}\n'
  160. stream = StringIO(stdin_content)
  161. stream.isatty = lambda: False
  162. records = list(read_args_or_stdin((), stream=stream))
  163. self.assertEqual(len(records), 1)
  164. self.assertEqual(records[0]['type'], TYPE_CRAWL)
  165. self.assertEqual(records[0]['id'], 'abc123')
  166. def test_skip_tty_stdin(self):
  167. """Should not read from TTY stdin (would block)."""
  168. from archivebox.misc.jsonl import read_args_or_stdin
  169. stream = StringIO('https://example.com')
  170. stream.isatty = lambda: True # Simulate TTY
  171. records = list(read_args_or_stdin((), stream=stream))
  172. self.assertEqual(len(records), 0)
  173. # =============================================================================
  174. # Unit Tests for Individual Commands
  175. # =============================================================================
  176. class TestCrawlCommand(unittest.TestCase):
  177. """Unit tests for archivebox crawl command."""
  178. def setUp(self):
  179. """Set up test environment."""
  180. self.test_dir = tempfile.mkdtemp()
  181. os.environ['DATA_DIR'] = self.test_dir
  182. def tearDown(self):
  183. """Clean up test environment."""
  184. shutil.rmtree(self.test_dir, ignore_errors=True)
  185. def test_crawl_accepts_url(self):
  186. """crawl should accept URLs as input."""
  187. from archivebox.misc.jsonl import read_args_or_stdin
  188. args = ('https://example.com',)
  189. records = list(read_args_or_stdin(args))
  190. self.assertEqual(len(records), 1)
  191. self.assertEqual(records[0]['url'], 'https://example.com')
  192. def test_crawl_output_format(self):
  193. """crawl should output Crawl JSONL records."""
  194. from archivebox.misc.jsonl import TYPE_CRAWL
  195. # Mock crawl output
  196. crawl_output = {
  197. 'type': TYPE_CRAWL,
  198. 'schema_version': '0.9.0',
  199. 'id': 'test-crawl-id',
  200. 'urls': 'https://example.com',
  201. 'status': 'queued',
  202. 'max_depth': 0,
  203. }
  204. self.assertEqual(crawl_output['type'], TYPE_CRAWL)
  205. self.assertIn('id', crawl_output)
  206. self.assertIn('urls', crawl_output)
  207. class TestSnapshotCommand(unittest.TestCase):
  208. """Unit tests for archivebox snapshot command."""
  209. def setUp(self):
  210. """Set up test environment."""
  211. self.test_dir = tempfile.mkdtemp()
  212. os.environ['DATA_DIR'] = self.test_dir
  213. def tearDown(self):
  214. """Clean up test environment."""
  215. shutil.rmtree(self.test_dir, ignore_errors=True)
  216. def test_snapshot_accepts_url(self):
  217. """snapshot should accept URLs as input."""
  218. from archivebox.misc.jsonl import read_args_or_stdin
  219. args = ('https://example.com',)
  220. records = list(read_args_or_stdin(args))
  221. self.assertEqual(len(records), 1)
  222. self.assertEqual(records[0]['url'], 'https://example.com')
  223. def test_snapshot_accepts_crawl_jsonl(self):
  224. """snapshot should accept Crawl JSONL as input."""
  225. from archivebox.misc.jsonl import read_args_or_stdin, TYPE_CRAWL
  226. stdin = StringIO('{"type": "Crawl", "id": "abc123", "urls": "https://example.com"}\n')
  227. stdin.isatty = lambda: False
  228. records = list(read_args_or_stdin((), stream=stdin))
  229. self.assertEqual(len(records), 1)
  230. self.assertEqual(records[0]['type'], TYPE_CRAWL)
  231. self.assertEqual(records[0]['id'], 'abc123')
  232. self.assertEqual(records[0]['urls'], 'https://example.com')
  233. def test_snapshot_accepts_jsonl_with_metadata(self):
  234. """snapshot should accept JSONL with tags and other metadata."""
  235. from archivebox.misc.jsonl import read_args_or_stdin
  236. stdin = StringIO('{"type": "Snapshot", "url": "https://example.com", "tags": "tag1,tag2", "title": "Test"}\n')
  237. stdin.isatty = lambda: False
  238. records = list(read_args_or_stdin((), stream=stdin))
  239. self.assertEqual(len(records), 1)
  240. self.assertEqual(records[0]['url'], 'https://example.com')
  241. self.assertEqual(records[0]['tags'], 'tag1,tag2')
  242. self.assertEqual(records[0]['title'], 'Test')
  243. # Note: Snapshot output format is tested in integration tests
  244. # (TestPipingWorkflowIntegration.test_snapshot_creates_and_outputs_jsonl)
  245. # using real Snapshot instances.
  246. class TestArchiveResultCommand(unittest.TestCase):
  247. """Unit tests for archivebox archiveresult command."""
  248. def setUp(self):
  249. """Set up test environment."""
  250. self.test_dir = tempfile.mkdtemp()
  251. os.environ['DATA_DIR'] = self.test_dir
  252. def tearDown(self):
  253. """Clean up test environment."""
  254. shutil.rmtree(self.test_dir, ignore_errors=True)
  255. def test_archiveresult_accepts_snapshot_id(self):
  256. """archiveresult should accept snapshot IDs as input."""
  257. from archivebox.misc.jsonl import read_args_or_stdin
  258. uuid = '01234567-89ab-cdef-0123-456789abcdef'
  259. args = (uuid,)
  260. records = list(read_args_or_stdin(args))
  261. self.assertEqual(len(records), 1)
  262. self.assertEqual(records[0]['id'], uuid)
  263. def test_archiveresult_accepts_jsonl_snapshot(self):
  264. """archiveresult should accept JSONL Snapshot records."""
  265. from archivebox.misc.jsonl import read_args_or_stdin, TYPE_SNAPSHOT
  266. stdin = StringIO('{"type": "Snapshot", "id": "abc123", "url": "https://example.com"}\n')
  267. stdin.isatty = lambda: False
  268. records = list(read_args_or_stdin((), stream=stdin))
  269. self.assertEqual(len(records), 1)
  270. self.assertEqual(records[0]['type'], TYPE_SNAPSHOT)
  271. self.assertEqual(records[0]['id'], 'abc123')
  272. def test_archiveresult_gathers_snapshot_ids(self):
  273. """archiveresult should gather snapshot IDs from various input formats."""
  274. from archivebox.misc.jsonl import TYPE_SNAPSHOT, TYPE_ARCHIVERESULT
  275. records = [
  276. {'type': TYPE_SNAPSHOT, 'id': 'snap-1'},
  277. {'type': TYPE_SNAPSHOT, 'id': 'snap-2', 'url': 'https://example.com'},
  278. {'type': TYPE_ARCHIVERESULT, 'snapshot_id': 'snap-3'},
  279. {'id': 'snap-4'}, # Bare id
  280. ]
  281. snapshot_ids = set()
  282. for record in records:
  283. record_type = record.get('type')
  284. if record_type == TYPE_SNAPSHOT:
  285. snapshot_id = record.get('id')
  286. if snapshot_id:
  287. snapshot_ids.add(snapshot_id)
  288. elif record_type == TYPE_ARCHIVERESULT:
  289. snapshot_id = record.get('snapshot_id')
  290. if snapshot_id:
  291. snapshot_ids.add(snapshot_id)
  292. elif 'id' in record:
  293. snapshot_ids.add(record['id'])
  294. self.assertEqual(len(snapshot_ids), 4)
  295. self.assertIn('snap-1', snapshot_ids)
  296. self.assertIn('snap-2', snapshot_ids)
  297. self.assertIn('snap-3', snapshot_ids)
  298. self.assertIn('snap-4', snapshot_ids)
  299. # =============================================================================
  300. # URL Collection Tests
  301. # =============================================================================
  302. class TestURLCollection(unittest.TestCase):
  303. """Test collecting urls.jsonl from extractor output."""
  304. def setUp(self):
  305. """Create test directory structure."""
  306. self.test_dir = Path(tempfile.mkdtemp())
  307. # Create fake extractor output directories with urls.jsonl
  308. (self.test_dir / 'wget').mkdir()
  309. (self.test_dir / 'wget' / 'urls.jsonl').write_text(
  310. '{"url": "https://wget-link-1.com"}\n'
  311. '{"url": "https://wget-link-2.com"}\n'
  312. )
  313. (self.test_dir / 'parse_html_urls').mkdir()
  314. (self.test_dir / 'parse_html_urls' / 'urls.jsonl').write_text(
  315. '{"url": "https://html-link-1.com"}\n'
  316. '{"url": "https://html-link-2.com", "title": "HTML Link 2"}\n'
  317. )
  318. (self.test_dir / 'screenshot').mkdir()
  319. # No urls.jsonl in screenshot dir - not a parser
  320. def tearDown(self):
  321. """Clean up test directory."""
  322. shutil.rmtree(self.test_dir, ignore_errors=True)
  323. def test_collect_urls_from_plugins(self):
  324. """Should collect urls.jsonl from all parser plugin subdirectories."""
  325. from archivebox.hooks import collect_urls_from_plugins
  326. urls = collect_urls_from_plugins(self.test_dir)
  327. self.assertEqual(len(urls), 4)
  328. # Check that plugin is set
  329. plugins = {u['plugin'] for u in urls}
  330. self.assertIn('wget', plugins)
  331. self.assertIn('parse_html_urls', plugins)
  332. self.assertNotIn('screenshot', plugins) # No urls.jsonl
  333. def test_collect_urls_preserves_metadata(self):
  334. """Should preserve metadata from urls.jsonl entries."""
  335. from archivebox.hooks import collect_urls_from_plugins
  336. urls = collect_urls_from_plugins(self.test_dir)
  337. # Find the entry with title
  338. titled = [u for u in urls if u.get('title') == 'HTML Link 2']
  339. self.assertEqual(len(titled), 1)
  340. self.assertEqual(titled[0]['url'], 'https://html-link-2.com')
  341. def test_collect_urls_empty_dir(self):
  342. """Should handle empty or non-existent directories."""
  343. from archivebox.hooks import collect_urls_from_plugins
  344. empty_dir = self.test_dir / 'nonexistent'
  345. urls = collect_urls_from_plugins(empty_dir)
  346. self.assertEqual(len(urls), 0)
  347. # =============================================================================
  348. # Integration Tests
  349. # =============================================================================
class TestPipingWorkflowIntegration(unittest.TestCase):
    """
    Integration tests for the complete piping workflow.

    These tests require Django to be set up and use the actual database.
    Each test mirrors one stage (or all) of the pipeline:
        archivebox crawl URL | archivebox snapshot | archivebox extract
    """

    @classmethod
    def setUpClass(cls):
        """Set up Django and test database."""
        # DATA_DIR must be exported before Django settings are loaded.
        cls.test_dir = tempfile.mkdtemp()
        os.environ['DATA_DIR'] = cls.test_dir
        # Initialize Django
        from archivebox.config.django import setup_django
        setup_django()
        # Initialize the archive collection inside the temp DATA_DIR
        from archivebox.cli.archivebox_init import init
        init()

    @classmethod
    def tearDownClass(cls):
        """Clean up test database."""
        shutil.rmtree(cls.test_dir, ignore_errors=True)

    def test_crawl_creates_and_outputs_jsonl(self):
        """
        Test: archivebox crawl URL1 URL2 URL3
        Should create a single Crawl with all URLs and output JSONL when piped.
        """
        from archivebox.crawls.models import Crawl
        from archivebox.misc.jsonl import TYPE_CRAWL
        from archivebox.base_models.models import get_or_create_system_user_pk
        created_by_id = get_or_create_system_user_pk()
        # Create crawl with multiple URLs (as newline-separated string)
        urls = 'https://test-crawl-1.example.com\nhttps://test-crawl-2.example.com'
        crawl = Crawl.from_json({'urls': urls}, overrides={'created_by_id': created_by_id})
        self.assertIsNotNone(crawl)
        self.assertIsNotNone(crawl.id)
        self.assertEqual(crawl.urls, urls)
        self.assertEqual(crawl.status, 'queued')
        # Verify URLs list is split back out of the newline-separated string
        urls_list = crawl.get_urls_list()
        self.assertEqual(len(urls_list), 2)
        self.assertIn('https://test-crawl-1.example.com', urls_list)
        self.assertIn('https://test-crawl-2.example.com', urls_list)
        # Verify the JSONL output format that would be piped downstream
        output = crawl.to_json()
        self.assertEqual(output['type'], TYPE_CRAWL)
        self.assertIn('id', output)
        self.assertEqual(output['urls'], urls)
        self.assertIn('schema_version', output)

    def test_snapshot_accepts_crawl_jsonl(self):
        """
        Test: archivebox crawl URL | archivebox snapshot
        Snapshot should accept Crawl JSONL and create Snapshots for each URL.
        """
        from archivebox.crawls.models import Crawl
        from archivebox.core.models import Snapshot
        from archivebox.misc.jsonl import (
            read_args_or_stdin,
            TYPE_CRAWL, TYPE_SNAPSHOT
        )
        from archivebox.base_models.models import get_or_create_system_user_pk
        created_by_id = get_or_create_system_user_pk()
        # Step 1: Create crawl (simulating 'archivebox crawl')
        urls = 'https://crawl-to-snap-1.example.com\nhttps://crawl-to-snap-2.example.com'
        crawl = Crawl.from_json({'urls': urls}, overrides={'created_by_id': created_by_id})
        crawl_output = crawl.to_json()
        # Step 2: Parse crawl output as snapshot input (simulating the pipe)
        stdin = StringIO(json.dumps(crawl_output) + '\n')
        stdin.isatty = lambda: False  # piped, not interactive
        records = list(read_args_or_stdin((), stream=stdin))
        self.assertEqual(len(records), 1)
        self.assertEqual(records[0]['type'], TYPE_CRAWL)
        # Step 3: Create snapshots from crawl URLs
        created_snapshots = []
        for url in crawl.get_urls_list():
            snapshot = Snapshot.from_json({'url': url}, overrides={'created_by_id': created_by_id})
            if snapshot:
                created_snapshots.append(snapshot)
        self.assertEqual(len(created_snapshots), 2)
        # Verify snapshot JSONL output for the next pipe stage
        for snapshot in created_snapshots:
            output = snapshot.to_json()
            self.assertEqual(output['type'], TYPE_SNAPSHOT)
            self.assertIn(output['url'], [
                'https://crawl-to-snap-1.example.com',
                'https://crawl-to-snap-2.example.com'
            ])

    def test_snapshot_creates_and_outputs_jsonl(self):
        """
        Test: archivebox snapshot URL
        Should create a Snapshot and output JSONL when piped.
        """
        from archivebox.core.models import Snapshot
        # NOTE(review): write_record is imported but unused in this test.
        from archivebox.misc.jsonl import (
            read_args_or_stdin, write_record,
            TYPE_SNAPSHOT
        )
        from archivebox.base_models.models import get_or_create_system_user_pk
        created_by_id = get_or_create_system_user_pk()
        # Simulate input arriving as a CLI argument
        url = 'https://test-snapshot-1.example.com'
        records = list(read_args_or_stdin((url,)))
        self.assertEqual(len(records), 1)
        self.assertEqual(records[0]['url'], url)
        # Create snapshot
        overrides = {'created_by_id': created_by_id}
        snapshot = Snapshot.from_json(records[0], overrides=overrides)
        self.assertIsNotNone(snapshot.id)
        self.assertEqual(snapshot.url, url)
        # Verify the JSONL output format
        output = snapshot.to_json()
        self.assertEqual(output['type'], TYPE_SNAPSHOT)
        self.assertIn('id', output)
        self.assertEqual(output['url'], url)

    def test_extract_accepts_snapshot_from_previous_command(self):
        """
        Test: archivebox snapshot URL | archivebox extract
        Extract should accept JSONL output from snapshot command.
        """
        # NOTE(review): ArchiveResult is imported but unused in this test.
        from archivebox.core.models import Snapshot, ArchiveResult
        from archivebox.misc.jsonl import (
            read_args_or_stdin,
            TYPE_SNAPSHOT
        )
        from archivebox.base_models.models import get_or_create_system_user_pk
        created_by_id = get_or_create_system_user_pk()
        # Step 1: Create snapshot (simulating 'archivebox snapshot')
        url = 'https://test-extract-1.example.com'
        overrides = {'created_by_id': created_by_id}
        snapshot = Snapshot.from_json({'url': url}, overrides=overrides)
        snapshot_output = snapshot.to_json()
        # Step 2: Parse snapshot output as extract input (simulating the pipe)
        stdin = StringIO(json.dumps(snapshot_output) + '\n')
        stdin.isatty = lambda: False
        records = list(read_args_or_stdin((), stream=stdin))
        self.assertEqual(len(records), 1)
        self.assertEqual(records[0]['type'], TYPE_SNAPSHOT)
        self.assertEqual(records[0]['id'], str(snapshot.id))
        # Step 3: Gather snapshot IDs (as the extract command does)
        snapshot_ids = set()
        for record in records:
            if record.get('type') == TYPE_SNAPSHOT and record.get('id'):
                snapshot_ids.add(record['id'])
        self.assertIn(str(snapshot.id), snapshot_ids)

    def test_full_pipeline_crawl_snapshot_extract(self):
        """
        Test: archivebox crawl URL | archivebox snapshot | archivebox extract
        This is equivalent to: archivebox add --depth=0 URL
        """
        from archivebox.crawls.models import Crawl
        from archivebox.core.models import Snapshot
        from archivebox.misc.jsonl import (
            read_args_or_stdin,
            TYPE_CRAWL, TYPE_SNAPSHOT
        )
        from archivebox.base_models.models import get_or_create_system_user_pk
        created_by_id = get_or_create_system_user_pk()
        # === archivebox crawl https://example.com ===
        url = 'https://test-pipeline-full.example.com'
        # NOTE(review): passes 'url' (singular) where other tests pass 'urls' —
        # presumably Crawl.from_json accepts both keys; confirm against the model.
        crawl = Crawl.from_json({'url': url}, overrides={'created_by_id': created_by_id})
        crawl_jsonl = json.dumps(crawl.to_json())
        # === | archivebox snapshot ===
        stdin = StringIO(crawl_jsonl + '\n')
        stdin.isatty = lambda: False
        records = list(read_args_or_stdin((), stream=stdin))
        self.assertEqual(len(records), 1)
        self.assertEqual(records[0]['type'], TYPE_CRAWL)
        # Create snapshots from the crawl: re-fetch the Crawl row by id and
        # fan out one Snapshot per URL it contains.
        created_snapshots = []
        for record in records:
            if record.get('type') == TYPE_CRAWL:
                crawl_id = record.get('id')
                if crawl_id:
                    db_crawl = Crawl.objects.get(id=crawl_id)
                    for crawl_url in db_crawl.get_urls_list():
                        snapshot = Snapshot.from_json({'url': crawl_url}, overrides={'created_by_id': created_by_id})
                        if snapshot:
                            created_snapshots.append(snapshot)
        self.assertEqual(len(created_snapshots), 1)
        self.assertEqual(created_snapshots[0].url, url)
        # === | archivebox extract ===
        snapshot_jsonl_lines = [json.dumps(s.to_json()) for s in created_snapshots]
        stdin = StringIO('\n'.join(snapshot_jsonl_lines) + '\n')
        stdin.isatty = lambda: False
        records = list(read_args_or_stdin((), stream=stdin))
        self.assertEqual(len(records), 1)
        self.assertEqual(records[0]['type'], TYPE_SNAPSHOT)
        self.assertEqual(records[0]['id'], str(created_snapshots[0].id))
class TestDepthWorkflows(unittest.TestCase):
    """Test various depth crawl workflows."""

    @classmethod
    def setUpClass(cls):
        """Set up Django and test database."""
        # DATA_DIR must be exported before Django settings are loaded.
        cls.test_dir = tempfile.mkdtemp()
        os.environ['DATA_DIR'] = cls.test_dir
        from archivebox.config.django import setup_django
        setup_django()
        from archivebox.cli.archivebox_init import init
        init()

    @classmethod
    def tearDownClass(cls):
        """Clean up test database."""
        shutil.rmtree(cls.test_dir, ignore_errors=True)

    def test_depth_0_workflow(self):
        """
        Test: archivebox crawl URL | archivebox snapshot | archivebox extract
        Depth 0: Only archive the specified URL, no recursive crawling.
        """
        from archivebox.crawls.models import Crawl
        from archivebox.core.models import Snapshot
        from archivebox.base_models.models import get_or_create_system_user_pk
        created_by_id = get_or_create_system_user_pk()
        # Create crawl with depth 0 (no link recursion)
        url = 'https://depth0-test.example.com'
        crawl = Crawl.from_json({'url': url, 'max_depth': 0}, overrides={'created_by_id': created_by_id})
        self.assertEqual(crawl.max_depth, 0)
        # Create the single snapshot for the root URL
        snapshot = Snapshot.from_json({'url': url}, overrides={'created_by_id': created_by_id})
        self.assertEqual(snapshot.url, url)

    def test_depth_metadata_in_crawl(self):
        """Test that depth metadata is stored in Crawl."""
        from archivebox.crawls.models import Crawl
        from archivebox.base_models.models import get_or_create_system_user_pk
        created_by_id = get_or_create_system_user_pk()
        # Create crawl with a non-zero depth
        crawl = Crawl.from_json(
            {'url': 'https://depth-meta-test.example.com', 'max_depth': 2},
            overrides={'created_by_id': created_by_id}
        )
        self.assertEqual(crawl.max_depth, 2)
        # Verify max_depth round-trips through the JSONL output
        output = crawl.to_json()
        self.assertEqual(output['max_depth'], 2)
  581. class TestParserPluginWorkflows(unittest.TestCase):
  582. """Test workflows with specific parser plugins."""
  583. @classmethod
  584. def setUpClass(cls):
  585. """Set up Django and test database."""
  586. cls.test_dir = tempfile.mkdtemp()
  587. os.environ['DATA_DIR'] = cls.test_dir
  588. from archivebox.config.django import setup_django
  589. setup_django()
  590. from archivebox.cli.archivebox_init import init
  591. init()
  592. @classmethod
  593. def tearDownClass(cls):
  594. """Clean up test database."""
  595. shutil.rmtree(cls.test_dir, ignore_errors=True)
  596. def test_html_parser_workflow(self):
  597. """
  598. Test: archivebox crawl --plugin=parse_html_urls URL | archivebox snapshot | archivebox extract
  599. """
  600. from archivebox.hooks import collect_urls_from_plugins
  601. from archivebox.misc.jsonl import TYPE_SNAPSHOT
  602. # Create mock output directory
  603. snapshot_dir = Path(self.test_dir) / 'archive' / 'html-parser-test'
  604. snapshot_dir.mkdir(parents=True, exist_ok=True)
  605. (snapshot_dir / 'parse_html_urls').mkdir(exist_ok=True)
  606. (snapshot_dir / 'parse_html_urls' / 'urls.jsonl').write_text(
  607. '{"url": "https://html-discovered.com", "title": "HTML Link"}\n'
  608. )
  609. # Collect URLs
  610. discovered = collect_urls_from_plugins(snapshot_dir)
  611. self.assertEqual(len(discovered), 1)
  612. self.assertEqual(discovered[0]['url'], 'https://html-discovered.com')
  613. self.assertEqual(discovered[0]['plugin'], 'parse_html_urls')
  614. def test_rss_parser_workflow(self):
  615. """
  616. Test: archivebox crawl --plugin=parse_rss_urls URL | archivebox snapshot | archivebox extract
  617. """
  618. from archivebox.hooks import collect_urls_from_plugins
  619. # Create mock output directory
  620. snapshot_dir = Path(self.test_dir) / 'archive' / 'rss-parser-test'
  621. snapshot_dir.mkdir(parents=True, exist_ok=True)
  622. (snapshot_dir / 'parse_rss_urls').mkdir(exist_ok=True)
  623. (snapshot_dir / 'parse_rss_urls' / 'urls.jsonl').write_text(
  624. '{"url": "https://rss-item-1.com", "title": "RSS Item 1"}\n'
  625. '{"url": "https://rss-item-2.com", "title": "RSS Item 2"}\n'
  626. )
  627. # Collect URLs
  628. discovered = collect_urls_from_plugins(snapshot_dir)
  629. self.assertEqual(len(discovered), 2)
  630. self.assertTrue(all(d['plugin'] == 'parse_rss_urls' for d in discovered))
  631. def test_multiple_parsers_dedupe(self):
  632. """
  633. Multiple parsers may discover the same URL - should be deduplicated.
  634. """
  635. from archivebox.hooks import collect_urls_from_plugins
  636. # Create mock output with duplicate URLs from different parsers
  637. snapshot_dir = Path(self.test_dir) / 'archive' / 'dedupe-test'
  638. snapshot_dir.mkdir(parents=True, exist_ok=True)
  639. (snapshot_dir / 'parse_html_urls').mkdir(exist_ok=True)
  640. (snapshot_dir / 'parse_html_urls' / 'urls.jsonl').write_text(
  641. '{"url": "https://same-url.com"}\n'
  642. )
  643. (snapshot_dir / 'wget').mkdir(exist_ok=True)
  644. (snapshot_dir / 'wget' / 'urls.jsonl').write_text(
  645. '{"url": "https://same-url.com"}\n' # Same URL, different extractor
  646. )
  647. # Collect URLs
  648. all_discovered = collect_urls_from_plugins(snapshot_dir)
  649. # Both entries are returned (deduplication happens at the crawl command level)
  650. self.assertEqual(len(all_discovered), 2)
  651. # Verify both extractors found the same URL
  652. urls = {d['url'] for d in all_discovered}
  653. self.assertEqual(urls, {'https://same-url.com'})
  654. class TestEdgeCases(unittest.TestCase):
  655. """Test edge cases and error handling."""
  656. def test_empty_input(self):
  657. """Commands should handle empty input gracefully."""
  658. from archivebox.misc.jsonl import read_args_or_stdin
  659. # Empty args, TTY stdin (should not block)
  660. stdin = StringIO('')
  661. stdin.isatty = lambda: True
  662. records = list(read_args_or_stdin((), stream=stdin))
  663. self.assertEqual(len(records), 0)
  664. def test_malformed_jsonl(self):
  665. """Should skip malformed JSONL lines."""
  666. from archivebox.misc.jsonl import read_args_or_stdin
  667. stdin = StringIO(
  668. '{"url": "https://good.com"}\n'
  669. 'not valid json\n'
  670. '{"url": "https://also-good.com"}\n'
  671. )
  672. stdin.isatty = lambda: False
  673. records = list(read_args_or_stdin((), stream=stdin))
  674. self.assertEqual(len(records), 2)
  675. urls = {r['url'] for r in records}
  676. self.assertEqual(urls, {'https://good.com', 'https://also-good.com'})
  677. def test_mixed_input_formats(self):
  678. """Should handle mixed URLs and JSONL."""
  679. from archivebox.misc.jsonl import read_args_or_stdin
  680. stdin = StringIO(
  681. 'https://plain-url.com\n'
  682. '{"type": "Snapshot", "url": "https://jsonl-url.com", "tags": "test"}\n'
  683. '01234567-89ab-cdef-0123-456789abcdef\n' # UUID
  684. )
  685. stdin.isatty = lambda: False
  686. records = list(read_args_or_stdin((), stream=stdin))
  687. self.assertEqual(len(records), 3)
  688. # Plain URL
  689. self.assertEqual(records[0]['url'], 'https://plain-url.com')
  690. # JSONL with metadata
  691. self.assertEqual(records[1]['url'], 'https://jsonl-url.com')
  692. self.assertEqual(records[1]['tags'], 'test')
  693. # UUID
  694. self.assertEqual(records[2]['id'], '01234567-89ab-cdef-0123-456789abcdef')
  695. def test_crawl_with_multiple_urls(self):
  696. """Crawl should handle multiple URLs in a single crawl."""
  697. from archivebox.misc.jsonl import TYPE_CRAWL
  698. # Test crawl JSONL with multiple URLs
  699. crawl_output = {
  700. 'type': TYPE_CRAWL,
  701. 'id': 'test-multi-url-crawl',
  702. 'urls': 'https://url1.com\nhttps://url2.com\nhttps://url3.com',
  703. 'max_depth': 0,
  704. }
  705. # Parse the URLs
  706. urls = [u.strip() for u in crawl_output['urls'].split('\n') if u.strip()]
  707. self.assertEqual(len(urls), 3)
  708. self.assertEqual(urls[0], 'https://url1.com')
  709. self.assertEqual(urls[1], 'https://url2.com')
  710. self.assertEqual(urls[2], 'https://url3.com')
  711. # =============================================================================
  712. # Pass-Through Behavior Tests
  713. # =============================================================================
  714. class TestPassThroughBehavior(unittest.TestCase):
  715. """Test pass-through behavior in CLI commands."""
  716. def test_crawl_passes_through_other_types(self):
  717. """crawl create should pass through records with other types."""
  718. from archivebox.misc.jsonl import TYPE_CRAWL
  719. # Input: a Tag record (not a Crawl or URL)
  720. tag_record = {'type': 'Tag', 'id': 'test-tag', 'name': 'example'}
  721. url_record = {'url': 'https://example.com'}
  722. # Mock stdin with both records
  723. stdin = StringIO(
  724. json.dumps(tag_record) + '\n' +
  725. json.dumps(url_record)
  726. )
  727. stdin.isatty = lambda: False
  728. # The Tag should be passed through, the URL should create a Crawl
  729. # (This is a unit test of the pass-through logic)
  730. from archivebox.misc.jsonl import read_args_or_stdin
  731. records = list(read_args_or_stdin((), stream=stdin))
  732. self.assertEqual(len(records), 2)
  733. # First record is a Tag (other type)
  734. self.assertEqual(records[0]['type'], 'Tag')
  735. # Second record has a URL
  736. self.assertIn('url', records[1])
  737. def test_snapshot_passes_through_crawl(self):
  738. """snapshot create should pass through Crawl records."""
  739. from archivebox.misc.jsonl import TYPE_CRAWL, TYPE_SNAPSHOT
  740. crawl_record = {
  741. 'type': TYPE_CRAWL,
  742. 'id': 'test-crawl',
  743. 'urls': 'https://example.com',
  744. }
  745. # Crawl records should be passed through AND create snapshots
  746. # This tests the accumulation behavior
  747. self.assertEqual(crawl_record['type'], TYPE_CRAWL)
  748. self.assertIn('urls', crawl_record)
  749. def test_archiveresult_passes_through_snapshot(self):
  750. """archiveresult create should pass through Snapshot records."""
  751. from archivebox.misc.jsonl import TYPE_SNAPSHOT
  752. snapshot_record = {
  753. 'type': TYPE_SNAPSHOT,
  754. 'id': 'test-snapshot',
  755. 'url': 'https://example.com',
  756. }
  757. # Snapshot records should be passed through
  758. self.assertEqual(snapshot_record['type'], TYPE_SNAPSHOT)
  759. self.assertIn('url', snapshot_record)
  760. def test_run_passes_through_unknown_types(self):
  761. """run should pass through records with unknown types."""
  762. unknown_record = {'type': 'Unknown', 'id': 'test', 'data': 'value'}
  763. # Unknown types should be passed through unchanged
  764. self.assertEqual(unknown_record['type'], 'Unknown')
  765. self.assertIn('data', unknown_record)
  766. class TestPipelineAccumulation(unittest.TestCase):
  767. """Test that pipelines accumulate records correctly."""
  768. def test_full_pipeline_output_types(self):
  769. """Full pipeline should output all record types."""
  770. from archivebox.misc.jsonl import TYPE_CRAWL, TYPE_SNAPSHOT, TYPE_ARCHIVERESULT
  771. # Simulated pipeline output after: crawl | snapshot | archiveresult | run
  772. # Should contain Crawl, Snapshot, and ArchiveResult records
  773. pipeline_output = [
  774. {'type': TYPE_CRAWL, 'id': 'c1', 'urls': 'https://example.com'},
  775. {'type': TYPE_SNAPSHOT, 'id': 's1', 'url': 'https://example.com'},
  776. {'type': TYPE_ARCHIVERESULT, 'id': 'ar1', 'plugin': 'title'},
  777. ]
  778. types = {r['type'] for r in pipeline_output}
  779. self.assertIn(TYPE_CRAWL, types)
  780. self.assertIn(TYPE_SNAPSHOT, types)
  781. self.assertIn(TYPE_ARCHIVERESULT, types)
  782. def test_pipeline_preserves_ids(self):
  783. """Pipeline should preserve record IDs through all stages."""
  784. records = [
  785. {'type': 'Crawl', 'id': 'c1', 'urls': 'https://example.com'},
  786. {'type': 'Snapshot', 'id': 's1', 'url': 'https://example.com'},
  787. ]
  788. # All records should have IDs
  789. for record in records:
  790. self.assertIn('id', record)
  791. self.assertTrue(record['id'])
  792. def test_jq_transform_pattern(self):
  793. """Test pattern for jq transforms in pipeline."""
  794. # Simulated: archiveresult list --status=failed | jq 'del(.id) | .status = "queued"'
  795. failed_record = {
  796. 'type': 'ArchiveResult',
  797. 'id': 'ar1',
  798. 'status': 'failed',
  799. 'plugin': 'wget',
  800. }
  801. # Transform: delete id, set status to queued
  802. transformed = {
  803. 'type': failed_record['type'],
  804. 'status': 'queued',
  805. 'plugin': failed_record['plugin'],
  806. }
  807. self.assertNotIn('id', transformed)
  808. self.assertEqual(transformed['status'], 'queued')
# Allow running this test module directly: `python tests_piping.py`.
if __name__ == '__main__':
    unittest.main()