test_cli_run.py 8.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254
  1. """
  2. Tests for archivebox run CLI command.
  3. Tests cover:
  4. - run with stdin JSONL (Crawl, Snapshot, ArchiveResult)
  5. - create-or-update behavior (records with/without id)
  6. - pass-through output (for chaining)
  7. """
  8. import json
  9. import pytest
  10. from archivebox.tests.conftest import (
  11. run_archivebox_cmd,
  12. parse_jsonl_output,
  13. create_test_url,
  14. create_test_crawl_json,
  15. create_test_snapshot_json,
  16. )
  17. class TestRunWithCrawl:
  18. """Tests for `archivebox run` with Crawl input."""
  19. def test_run_with_new_crawl(self, initialized_archive):
  20. """Run creates and processes a new Crawl (no id)."""
  21. crawl_record = create_test_crawl_json()
  22. stdout, stderr, code = run_archivebox_cmd(
  23. ['run'],
  24. stdin=json.dumps(crawl_record),
  25. data_dir=initialized_archive,
  26. timeout=120,
  27. )
  28. assert code == 0, f"Command failed: {stderr}"
  29. # Should output the created Crawl
  30. records = parse_jsonl_output(stdout)
  31. crawl_records = [r for r in records if r.get('type') == 'Crawl']
  32. assert len(crawl_records) >= 1
  33. assert crawl_records[0].get('id') # Should have an id now
  34. def test_run_with_existing_crawl(self, initialized_archive):
  35. """Run re-queues an existing Crawl (with id)."""
  36. url = create_test_url()
  37. # First create a crawl
  38. stdout1, _, _ = run_archivebox_cmd(['crawl', 'create', url], data_dir=initialized_archive)
  39. crawl = parse_jsonl_output(stdout1)[0]
  40. # Run with the existing crawl
  41. stdout2, stderr, code = run_archivebox_cmd(
  42. ['run'],
  43. stdin=json.dumps(crawl),
  44. data_dir=initialized_archive,
  45. timeout=120,
  46. )
  47. assert code == 0
  48. records = parse_jsonl_output(stdout2)
  49. assert len(records) >= 1
  50. class TestRunWithSnapshot:
  51. """Tests for `archivebox run` with Snapshot input."""
  52. def test_run_with_new_snapshot(self, initialized_archive):
  53. """Run creates and processes a new Snapshot (no id, just url)."""
  54. snapshot_record = create_test_snapshot_json()
  55. stdout, stderr, code = run_archivebox_cmd(
  56. ['run'],
  57. stdin=json.dumps(snapshot_record),
  58. data_dir=initialized_archive,
  59. timeout=120,
  60. )
  61. assert code == 0, f"Command failed: {stderr}"
  62. records = parse_jsonl_output(stdout)
  63. snapshot_records = [r for r in records if r.get('type') == 'Snapshot']
  64. assert len(snapshot_records) >= 1
  65. assert snapshot_records[0].get('id')
  66. def test_run_with_existing_snapshot(self, initialized_archive):
  67. """Run re-queues an existing Snapshot (with id)."""
  68. url = create_test_url()
  69. # First create a snapshot
  70. stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], data_dir=initialized_archive)
  71. snapshot = parse_jsonl_output(stdout1)[0]
  72. # Run with the existing snapshot
  73. stdout2, stderr, code = run_archivebox_cmd(
  74. ['run'],
  75. stdin=json.dumps(snapshot),
  76. data_dir=initialized_archive,
  77. timeout=120,
  78. )
  79. assert code == 0
  80. records = parse_jsonl_output(stdout2)
  81. assert len(records) >= 1
  82. def test_run_with_plain_url(self, initialized_archive):
  83. """Run accepts plain URL records (no type field)."""
  84. url = create_test_url()
  85. url_record = {'url': url}
  86. stdout, stderr, code = run_archivebox_cmd(
  87. ['run'],
  88. stdin=json.dumps(url_record),
  89. data_dir=initialized_archive,
  90. timeout=120,
  91. )
  92. assert code == 0
  93. records = parse_jsonl_output(stdout)
  94. assert len(records) >= 1
  95. class TestRunWithArchiveResult:
  96. """Tests for `archivebox run` with ArchiveResult input."""
  97. def test_run_requeues_failed_archiveresult(self, initialized_archive):
  98. """Run re-queues a failed ArchiveResult."""
  99. url = create_test_url()
  100. # Create snapshot and archive result
  101. stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], data_dir=initialized_archive)
  102. snapshot = parse_jsonl_output(stdout1)[0]
  103. stdout2, _, _ = run_archivebox_cmd(
  104. ['archiveresult', 'create', '--plugin=title'],
  105. stdin=json.dumps(snapshot),
  106. data_dir=initialized_archive,
  107. )
  108. ar = next(r for r in parse_jsonl_output(stdout2) if r.get('type') == 'ArchiveResult')
  109. # Update to failed
  110. ar['status'] = 'failed'
  111. run_archivebox_cmd(
  112. ['archiveresult', 'update', '--status=failed'],
  113. stdin=json.dumps(ar),
  114. data_dir=initialized_archive,
  115. )
  116. # Now run should re-queue it
  117. stdout3, stderr, code = run_archivebox_cmd(
  118. ['run'],
  119. stdin=json.dumps(ar),
  120. data_dir=initialized_archive,
  121. timeout=120,
  122. )
  123. assert code == 0
  124. records = parse_jsonl_output(stdout3)
  125. ar_records = [r for r in records if r.get('type') == 'ArchiveResult']
  126. assert len(ar_records) >= 1
  127. class TestRunPassThrough:
  128. """Tests for pass-through behavior in `archivebox run`."""
  129. def test_run_passes_through_unknown_types(self, initialized_archive):
  130. """Run passes through records with unknown types."""
  131. unknown_record = {'type': 'Unknown', 'id': 'fake-id', 'data': 'test'}
  132. stdout, stderr, code = run_archivebox_cmd(
  133. ['run'],
  134. stdin=json.dumps(unknown_record),
  135. data_dir=initialized_archive,
  136. )
  137. assert code == 0
  138. records = parse_jsonl_output(stdout)
  139. unknown_records = [r for r in records if r.get('type') == 'Unknown']
  140. assert len(unknown_records) == 1
  141. assert unknown_records[0]['data'] == 'test'
  142. def test_run_outputs_all_processed_records(self, initialized_archive):
  143. """Run outputs all processed records for chaining."""
  144. url = create_test_url()
  145. crawl_record = create_test_crawl_json(urls=[url])
  146. stdout, stderr, code = run_archivebox_cmd(
  147. ['run'],
  148. stdin=json.dumps(crawl_record),
  149. data_dir=initialized_archive,
  150. timeout=120,
  151. )
  152. assert code == 0
  153. records = parse_jsonl_output(stdout)
  154. # Should have at least the Crawl in output
  155. assert len(records) >= 1
  156. class TestRunMixedInput:
  157. """Tests for `archivebox run` with mixed record types."""
  158. def test_run_handles_mixed_types(self, initialized_archive):
  159. """Run handles mixed Crawl/Snapshot/ArchiveResult input."""
  160. crawl = create_test_crawl_json()
  161. snapshot = create_test_snapshot_json()
  162. unknown = {'type': 'Tag', 'id': 'fake', 'name': 'test'}
  163. stdin = '\n'.join([
  164. json.dumps(crawl),
  165. json.dumps(snapshot),
  166. json.dumps(unknown),
  167. ])
  168. stdout, stderr, code = run_archivebox_cmd(
  169. ['run'],
  170. stdin=stdin,
  171. data_dir=initialized_archive,
  172. timeout=120,
  173. )
  174. assert code == 0
  175. records = parse_jsonl_output(stdout)
  176. types = set(r.get('type') for r in records)
  177. # Should have processed Crawl and Snapshot, passed through Tag
  178. assert 'Crawl' in types or 'Snapshot' in types or 'Tag' in types
  179. class TestRunEmpty:
  180. """Tests for `archivebox run` edge cases."""
  181. def test_run_empty_stdin(self, initialized_archive):
  182. """Run with empty stdin returns success."""
  183. stdout, stderr, code = run_archivebox_cmd(
  184. ['run'],
  185. stdin='',
  186. data_dir=initialized_archive,
  187. )
  188. assert code == 0
  189. def test_run_no_records_to_process(self, initialized_archive):
  190. """Run with only pass-through records shows message."""
  191. unknown = {'type': 'Unknown', 'id': 'fake'}
  192. stdout, stderr, code = run_archivebox_cmd(
  193. ['run'],
  194. stdin=json.dumps(unknown),
  195. data_dir=initialized_archive,
  196. )
  197. assert code == 0
  198. assert 'No records to process' in stderr