test_cli_snapshot.py 8.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274
  1. """
  2. Tests for archivebox snapshot CLI command.
  3. Tests cover:
  4. - snapshot create (from URLs, from Crawl JSONL, pass-through)
  5. - snapshot list (with filters)
  6. - snapshot update
  7. - snapshot delete
  8. """
  9. import json
  10. import pytest
  11. from archivebox.tests.conftest import (
  12. run_archivebox_cmd,
  13. parse_jsonl_output,
  14. assert_jsonl_contains_type,
  15. create_test_url,
  16. )
  17. class TestSnapshotCreate:
  18. """Tests for `archivebox snapshot create`."""
  19. def test_create_from_url_args(self, initialized_archive):
  20. """Create snapshot from URL arguments."""
  21. url = create_test_url()
  22. stdout, stderr, code = run_archivebox_cmd(
  23. ['snapshot', 'create', url],
  24. data_dir=initialized_archive,
  25. )
  26. assert code == 0, f"Command failed: {stderr}"
  27. assert 'Created' in stderr
  28. records = parse_jsonl_output(stdout)
  29. assert len(records) == 1
  30. assert records[0]['type'] == 'Snapshot'
  31. assert records[0]['url'] == url
  32. def test_create_from_crawl_jsonl(self, initialized_archive):
  33. """Create snapshots from Crawl JSONL input."""
  34. url = create_test_url()
  35. # First create a crawl
  36. stdout1, _, _ = run_archivebox_cmd(['crawl', 'create', url], data_dir=initialized_archive)
  37. crawl = parse_jsonl_output(stdout1)[0]
  38. # Pipe crawl to snapshot create
  39. stdout2, stderr, code = run_archivebox_cmd(
  40. ['snapshot', 'create'],
  41. stdin=json.dumps(crawl),
  42. data_dir=initialized_archive,
  43. )
  44. assert code == 0, f"Command failed: {stderr}"
  45. records = parse_jsonl_output(stdout2)
  46. # Should have the Crawl passed through and the Snapshot created
  47. types = [r.get('type') for r in records]
  48. assert 'Crawl' in types
  49. assert 'Snapshot' in types
  50. snapshot = next(r for r in records if r['type'] == 'Snapshot')
  51. assert snapshot['url'] == url
  52. def test_create_with_tag(self, initialized_archive):
  53. """Create snapshot with --tag flag."""
  54. url = create_test_url()
  55. stdout, stderr, code = run_archivebox_cmd(
  56. ['snapshot', 'create', '--tag=test-tag', url],
  57. data_dir=initialized_archive,
  58. )
  59. assert code == 0
  60. records = parse_jsonl_output(stdout)
  61. assert 'test-tag' in records[0].get('tags_str', '')
  62. def test_create_pass_through_other_types(self, initialized_archive):
  63. """Pass-through records of other types unchanged."""
  64. tag_record = {'type': 'Tag', 'id': 'fake-tag-id', 'name': 'test'}
  65. url = create_test_url()
  66. stdin = json.dumps(tag_record) + '\n' + json.dumps({'url': url})
  67. stdout, stderr, code = run_archivebox_cmd(
  68. ['snapshot', 'create'],
  69. stdin=stdin,
  70. data_dir=initialized_archive,
  71. )
  72. assert code == 0
  73. records = parse_jsonl_output(stdout)
  74. types = [r.get('type') for r in records]
  75. assert 'Tag' in types
  76. assert 'Snapshot' in types
  77. def test_create_multiple_urls(self, initialized_archive):
  78. """Create snapshots from multiple URLs."""
  79. urls = [create_test_url() for _ in range(3)]
  80. stdout, stderr, code = run_archivebox_cmd(
  81. ['snapshot', 'create'] + urls,
  82. data_dir=initialized_archive,
  83. )
  84. assert code == 0
  85. records = parse_jsonl_output(stdout)
  86. assert len(records) == 3
  87. created_urls = {r['url'] for r in records}
  88. for url in urls:
  89. assert url in created_urls
  90. class TestSnapshotList:
  91. """Tests for `archivebox snapshot list`."""
  92. def test_list_empty(self, initialized_archive):
  93. """List with no snapshots returns empty."""
  94. stdout, stderr, code = run_archivebox_cmd(
  95. ['snapshot', 'list'],
  96. data_dir=initialized_archive,
  97. )
  98. assert code == 0
  99. assert 'Listed 0 snapshots' in stderr
  100. def test_list_returns_created(self, initialized_archive):
  101. """List returns previously created snapshots."""
  102. url = create_test_url()
  103. run_archivebox_cmd(['snapshot', 'create', url], data_dir=initialized_archive)
  104. stdout, stderr, code = run_archivebox_cmd(
  105. ['snapshot', 'list'],
  106. data_dir=initialized_archive,
  107. )
  108. assert code == 0
  109. records = parse_jsonl_output(stdout)
  110. assert len(records) >= 1
  111. assert any(r.get('url') == url for r in records)
  112. def test_list_filter_by_status(self, initialized_archive):
  113. """Filter snapshots by status."""
  114. url = create_test_url()
  115. run_archivebox_cmd(['snapshot', 'create', url], data_dir=initialized_archive)
  116. stdout, stderr, code = run_archivebox_cmd(
  117. ['snapshot', 'list', '--status=queued'],
  118. data_dir=initialized_archive,
  119. )
  120. assert code == 0
  121. records = parse_jsonl_output(stdout)
  122. for r in records:
  123. assert r['status'] == 'queued'
  124. def test_list_filter_by_url_contains(self, initialized_archive):
  125. """Filter snapshots by URL contains."""
  126. url = create_test_url(domain='unique-domain-12345.com')
  127. run_archivebox_cmd(['snapshot', 'create', url], data_dir=initialized_archive)
  128. stdout, stderr, code = run_archivebox_cmd(
  129. ['snapshot', 'list', '--url__icontains=unique-domain-12345'],
  130. data_dir=initialized_archive,
  131. )
  132. assert code == 0
  133. records = parse_jsonl_output(stdout)
  134. assert len(records) == 1
  135. assert 'unique-domain-12345' in records[0]['url']
  136. def test_list_with_limit(self, initialized_archive):
  137. """Limit number of results."""
  138. for _ in range(3):
  139. run_archivebox_cmd(['snapshot', 'create', create_test_url()], data_dir=initialized_archive)
  140. stdout, stderr, code = run_archivebox_cmd(
  141. ['snapshot', 'list', '--limit=2'],
  142. data_dir=initialized_archive,
  143. )
  144. assert code == 0
  145. records = parse_jsonl_output(stdout)
  146. assert len(records) == 2
  147. class TestSnapshotUpdate:
  148. """Tests for `archivebox snapshot update`."""
  149. def test_update_status(self, initialized_archive):
  150. """Update snapshot status."""
  151. url = create_test_url()
  152. stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], data_dir=initialized_archive)
  153. snapshot = parse_jsonl_output(stdout1)[0]
  154. stdout2, stderr, code = run_archivebox_cmd(
  155. ['snapshot', 'update', '--status=started'],
  156. stdin=json.dumps(snapshot),
  157. data_dir=initialized_archive,
  158. )
  159. assert code == 0
  160. assert 'Updated 1 snapshots' in stderr
  161. records = parse_jsonl_output(stdout2)
  162. assert records[0]['status'] == 'started'
  163. def test_update_add_tag(self, initialized_archive):
  164. """Update snapshot by adding tag."""
  165. url = create_test_url()
  166. stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], data_dir=initialized_archive)
  167. snapshot = parse_jsonl_output(stdout1)[0]
  168. stdout2, stderr, code = run_archivebox_cmd(
  169. ['snapshot', 'update', '--tag=new-tag'],
  170. stdin=json.dumps(snapshot),
  171. data_dir=initialized_archive,
  172. )
  173. assert code == 0
  174. assert 'Updated 1 snapshots' in stderr
  175. class TestSnapshotDelete:
  176. """Tests for `archivebox snapshot delete`."""
  177. def test_delete_requires_yes(self, initialized_archive):
  178. """Delete requires --yes flag."""
  179. url = create_test_url()
  180. stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], data_dir=initialized_archive)
  181. snapshot = parse_jsonl_output(stdout1)[0]
  182. stdout, stderr, code = run_archivebox_cmd(
  183. ['snapshot', 'delete'],
  184. stdin=json.dumps(snapshot),
  185. data_dir=initialized_archive,
  186. )
  187. assert code == 1
  188. assert '--yes' in stderr
  189. def test_delete_with_yes(self, initialized_archive):
  190. """Delete with --yes flag works."""
  191. url = create_test_url()
  192. stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], data_dir=initialized_archive)
  193. snapshot = parse_jsonl_output(stdout1)[0]
  194. stdout, stderr, code = run_archivebox_cmd(
  195. ['snapshot', 'delete', '--yes'],
  196. stdin=json.dumps(snapshot),
  197. data_dir=initialized_archive,
  198. )
  199. assert code == 0
  200. assert 'Deleted 1 snapshots' in stderr
  201. def test_delete_dry_run(self, initialized_archive):
  202. """Dry run shows what would be deleted."""
  203. url = create_test_url()
  204. stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], data_dir=initialized_archive)
  205. snapshot = parse_jsonl_output(stdout1)[0]
  206. stdout, stderr, code = run_archivebox_cmd(
  207. ['snapshot', 'delete', '--dry-run'],
  208. stdin=json.dumps(snapshot),
  209. data_dir=initialized_archive,
  210. )
  211. assert code == 0
  212. assert 'Would delete' in stderr