test_cli_archiveresult.py 9.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264
  1. """
  2. Tests for archivebox archiveresult CLI command.
  3. Tests cover:
  4. - archiveresult create (from Snapshot JSONL, with --plugin, pass-through)
  5. - archiveresult list (with filters)
  6. - archiveresult update
  7. - archiveresult delete
  8. """
  9. import json
  10. import pytest
  11. from archivebox.tests.conftest import (
  12. run_archivebox_cmd,
  13. parse_jsonl_output,
  14. create_test_url,
  15. )
  class TestArchiveResultCreate:
      """Tests for `archivebox archiveresult create`."""

      def test_create_from_snapshot_jsonl(self, initialized_archive):
          """Create archive results from Snapshot JSONL input."""
          url = create_test_url()

          # Create a snapshot first
          stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], data_dir=initialized_archive)
          snapshot = parse_jsonl_output(stdout1)[0]

          # Pipe snapshot to archiveresult create
          stdout2, stderr, code = run_archivebox_cmd(
              ['archiveresult', 'create', '--plugin=title'],
              stdin=json.dumps(snapshot),
              data_dir=initialized_archive,
          )
          assert code == 0, f"Command failed: {stderr}"

          records = parse_jsonl_output(stdout2)

          # Should have the Snapshot passed through and ArchiveResult created
          types = [r.get('type') for r in records]
          assert 'Snapshot' in types
          assert 'ArchiveResult' in types

          # .get() keeps this lookup consistent with the `types` list above and
          # avoids a KeyError on any record that carries no 'type' key.
          ar = next(r for r in records if r.get('type') == 'ArchiveResult')
          assert ar['plugin'] == 'title'

      def test_create_with_specific_plugin(self, initialized_archive):
          """Create archive result for specific plugin."""
          url = create_test_url()
          stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], data_dir=initialized_archive)
          snapshot = parse_jsonl_output(stdout1)[0]

          stdout2, stderr, code = run_archivebox_cmd(
              ['archiveresult', 'create', '--plugin=screenshot'],
              stdin=json.dumps(snapshot),
              data_dir=initialized_archive,
          )
          assert code == 0, f"Command failed: {stderr}"

          records = parse_jsonl_output(stdout2)
          ar_records = [r for r in records if r.get('type') == 'ArchiveResult']
          assert len(ar_records) >= 1, f"No ArchiveResult records in output: {records}"
          assert ar_records[0]['plugin'] == 'screenshot'

      def test_create_pass_through_crawl(self, initialized_archive):
          """Pass-through Crawl records unchanged."""
          url = create_test_url()

          # Create crawl and snapshot
          stdout1, _, _ = run_archivebox_cmd(['crawl', 'create', url], data_dir=initialized_archive)
          crawl = parse_jsonl_output(stdout1)[0]
          stdout2, _, _ = run_archivebox_cmd(
              ['snapshot', 'create'],
              stdin=json.dumps(crawl),
              data_dir=initialized_archive,
          )

          # Now pipe all to archiveresult create
          stdout3, stderr, code = run_archivebox_cmd(
              ['archiveresult', 'create', '--plugin=title'],
              stdin=stdout2,
              data_dir=initialized_archive,
          )
          assert code == 0, f"Command failed: {stderr}"

          records = parse_jsonl_output(stdout3)
          types = [r.get('type') for r in records]
          assert 'Crawl' in types
          assert 'Snapshot' in types
          assert 'ArchiveResult' in types

      def test_create_pass_through_only_when_no_snapshots(self, initialized_archive):
          """Only pass-through records but no new snapshots returns success."""
          crawl_record = {'type': 'Crawl', 'id': 'fake-id', 'urls': 'https://example.com'}

          # stdout is not inspected here; only the exit code and stderr matter.
          _, stderr, code = run_archivebox_cmd(
              ['archiveresult', 'create'],
              stdin=json.dumps(crawl_record),
              data_dir=initialized_archive,
          )
          assert code == 0, f"Command failed: {stderr}"
          assert 'Passed through' in stderr
  class TestArchiveResultList:
      """Tests for `archivebox archiveresult list`."""

      def test_list_empty(self, initialized_archive):
          """List with no archive results returns empty."""
          # stdout is not inspected here; the count is reported on stderr.
          _, stderr, code = run_archivebox_cmd(
              ['archiveresult', 'list'],
              data_dir=initialized_archive,
          )
          assert code == 0, f"Command failed: {stderr}"
          assert 'Listed 0 archive results' in stderr

      def test_list_filter_by_status(self, initialized_archive):
          """Filter archive results by status."""
          # Create snapshot and archive result
          url = create_test_url()
          stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], data_dir=initialized_archive)
          snapshot = parse_jsonl_output(stdout1)[0]
          run_archivebox_cmd(
              ['archiveresult', 'create', '--plugin=title'],
              stdin=json.dumps(snapshot),
              data_dir=initialized_archive,
          )

          stdout, stderr, code = run_archivebox_cmd(
              ['archiveresult', 'list', '--status=queued'],
              data_dir=initialized_archive,
          )
          assert code == 0, f"Command failed: {stderr}"

          records = parse_jsonl_output(stdout)
          # NOTE(review): this loop passes vacuously when no records are returned;
          # consider asserting `records` is non-empty once the status assigned by
          # `archiveresult create` is confirmed.
          for r in records:
              assert r['status'] == 'queued', f"Unexpected status in record: {r}"

      def test_list_filter_by_plugin(self, initialized_archive):
          """Filter archive results by plugin."""
          url = create_test_url()
          stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], data_dir=initialized_archive)
          snapshot = parse_jsonl_output(stdout1)[0]
          run_archivebox_cmd(
              ['archiveresult', 'create', '--plugin=title'],
              stdin=json.dumps(snapshot),
              data_dir=initialized_archive,
          )

          stdout, stderr, code = run_archivebox_cmd(
              ['archiveresult', 'list', '--plugin=title'],
              data_dir=initialized_archive,
          )
          assert code == 0, f"Command failed: {stderr}"

          records = parse_jsonl_output(stdout)
          # NOTE(review): passes vacuously when the filter returns nothing.
          for r in records:
              assert r['plugin'] == 'title', f"Unexpected plugin in record: {r}"

      def test_list_with_limit(self, initialized_archive):
          """Limit number of results."""
          # Create multiple archive results
          for _ in range(3):
              url = create_test_url()
              stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], data_dir=initialized_archive)
              snapshot = parse_jsonl_output(stdout1)[0]
              run_archivebox_cmd(
                  ['archiveresult', 'create', '--plugin=title'],
                  stdin=json.dumps(snapshot),
                  data_dir=initialized_archive,
              )

          stdout, stderr, code = run_archivebox_cmd(
              ['archiveresult', 'list', '--limit=2'],
              data_dir=initialized_archive,
          )
          assert code == 0, f"Command failed: {stderr}"

          records = parse_jsonl_output(stdout)
          assert len(records) == 2, f"Expected 2 records, got {len(records)}"
  class TestArchiveResultUpdate:
      """Tests for `archivebox archiveresult update`."""

      def test_update_status(self, initialized_archive):
          """Update archive result status."""
          url = create_test_url()
          stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], data_dir=initialized_archive)
          snapshot = parse_jsonl_output(stdout1)[0]

          stdout2, _, _ = run_archivebox_cmd(
              ['archiveresult', 'create', '--plugin=title'],
              stdin=json.dumps(snapshot),
              data_dir=initialized_archive,
          )
          ar = next(r for r in parse_jsonl_output(stdout2) if r.get('type') == 'ArchiveResult')

          # Feed the created ArchiveResult record back in as JSON on stdin.
          stdout3, stderr, code = run_archivebox_cmd(
              ['archiveresult', 'update', '--status=failed'],
              stdin=json.dumps(ar),
              data_dir=initialized_archive,
          )
          assert code == 0, f"Command failed: {stderr}"
          assert 'Updated 1 archive results' in stderr

          records = parse_jsonl_output(stdout3)
          # Guard the index so an empty output fails with a readable message
          # instead of a bare IndexError.
          assert records, f"No records in update output; stderr: {stderr}"
          assert records[0]['status'] == 'failed'
  class TestArchiveResultDelete:
      """Tests for `archivebox archiveresult delete`."""

      def test_delete_requires_yes(self, initialized_archive):
          """Delete requires --yes flag."""
          url = create_test_url()
          stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], data_dir=initialized_archive)
          snapshot = parse_jsonl_output(stdout1)[0]

          stdout2, _, _ = run_archivebox_cmd(
              ['archiveresult', 'create', '--plugin=title'],
              stdin=json.dumps(snapshot),
              data_dir=initialized_archive,
          )
          ar = next(r for r in parse_jsonl_output(stdout2) if r.get('type') == 'ArchiveResult')

          # stdout is not inspected; the refusal is reported via exit code and stderr.
          _, stderr, code = run_archivebox_cmd(
              ['archiveresult', 'delete'],
              stdin=json.dumps(ar),
              data_dir=initialized_archive,
          )
          assert code == 1, f"Expected exit code 1 without --yes, got {code}: {stderr}"
          assert '--yes' in stderr

      def test_delete_with_yes(self, initialized_archive):
          """Delete with --yes flag works."""
          url = create_test_url()
          stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], data_dir=initialized_archive)
          snapshot = parse_jsonl_output(stdout1)[0]

          stdout2, _, _ = run_archivebox_cmd(
              ['archiveresult', 'create', '--plugin=title'],
              stdin=json.dumps(snapshot),
              data_dir=initialized_archive,
          )
          ar = next(r for r in parse_jsonl_output(stdout2) if r.get('type') == 'ArchiveResult')

          # stdout is not inspected; the deletion count is reported on stderr.
          _, stderr, code = run_archivebox_cmd(
              ['archiveresult', 'delete', '--yes'],
              stdin=json.dumps(ar),
              data_dir=initialized_archive,
          )
          assert code == 0, f"Command failed: {stderr}"
          assert 'Deleted 1 archive results' in stderr
  211. assert 'Deleted 1 archive results' in stderr