# test_cli_crawl.py
  1. """
  2. Tests for archivebox crawl CLI command.
  3. Tests cover:
  4. - crawl create (with URLs, from stdin, pass-through)
  5. - crawl list (with filters)
  6. - crawl update
  7. - crawl delete
  8. """
  9. import json
  10. import pytest
  11. from archivebox.tests.conftest import (
  12. run_archivebox_cmd,
  13. parse_jsonl_output,
  14. assert_jsonl_contains_type,
  15. create_test_url,
  16. create_test_crawl_json,
  17. )
  18. class TestCrawlCreate:
  19. """Tests for `archivebox crawl create`."""
  20. def test_create_from_url_args(self, initialized_archive):
  21. """Create crawl from URL arguments."""
  22. url = create_test_url()
  23. stdout, stderr, code = run_archivebox_cmd(
  24. ['crawl', 'create', url],
  25. data_dir=initialized_archive,
  26. )
  27. assert code == 0, f"Command failed: {stderr}"
  28. assert 'Created crawl' in stderr
  29. # Check JSONL output
  30. records = parse_jsonl_output(stdout)
  31. assert len(records) == 1
  32. assert records[0]['type'] == 'Crawl'
  33. assert url in records[0]['urls']
  34. def test_create_from_stdin_urls(self, initialized_archive):
  35. """Create crawl from stdin URLs (one per line)."""
  36. urls = [create_test_url() for _ in range(3)]
  37. stdin = '\n'.join(urls)
  38. stdout, stderr, code = run_archivebox_cmd(
  39. ['crawl', 'create'],
  40. stdin=stdin,
  41. data_dir=initialized_archive,
  42. )
  43. assert code == 0, f"Command failed: {stderr}"
  44. records = parse_jsonl_output(stdout)
  45. assert len(records) == 1
  46. crawl = records[0]
  47. assert crawl['type'] == 'Crawl'
  48. # All URLs should be in the crawl
  49. for url in urls:
  50. assert url in crawl['urls']
  51. def test_create_with_depth(self, initialized_archive):
  52. """Create crawl with --depth flag."""
  53. url = create_test_url()
  54. stdout, stderr, code = run_archivebox_cmd(
  55. ['crawl', 'create', '--depth=2', url],
  56. data_dir=initialized_archive,
  57. )
  58. assert code == 0
  59. records = parse_jsonl_output(stdout)
  60. assert records[0]['max_depth'] == 2
  61. def test_create_with_tag(self, initialized_archive):
  62. """Create crawl with --tag flag."""
  63. url = create_test_url()
  64. stdout, stderr, code = run_archivebox_cmd(
  65. ['crawl', 'create', '--tag=test-tag', url],
  66. data_dir=initialized_archive,
  67. )
  68. assert code == 0
  69. records = parse_jsonl_output(stdout)
  70. assert 'test-tag' in records[0].get('tags_str', '')
  71. def test_create_pass_through_other_types(self, initialized_archive):
  72. """Pass-through records of other types unchanged."""
  73. tag_record = {'type': 'Tag', 'id': 'fake-tag-id', 'name': 'test'}
  74. url = create_test_url()
  75. stdin = json.dumps(tag_record) + '\n' + json.dumps({'url': url})
  76. stdout, stderr, code = run_archivebox_cmd(
  77. ['crawl', 'create'],
  78. stdin=stdin,
  79. data_dir=initialized_archive,
  80. )
  81. assert code == 0
  82. records = parse_jsonl_output(stdout)
  83. # Should have both the passed-through Tag and the new Crawl
  84. types = [r.get('type') for r in records]
  85. assert 'Tag' in types
  86. assert 'Crawl' in types
  87. def test_create_pass_through_existing_crawl(self, initialized_archive):
  88. """Existing Crawl records (with id) are passed through."""
  89. # First create a crawl
  90. url = create_test_url()
  91. stdout1, _, _ = run_archivebox_cmd(['crawl', 'create', url], data_dir=initialized_archive)
  92. crawl = parse_jsonl_output(stdout1)[0]
  93. # Now pipe it back - should pass through
  94. stdout2, stderr, code = run_archivebox_cmd(
  95. ['crawl', 'create'],
  96. stdin=json.dumps(crawl),
  97. data_dir=initialized_archive,
  98. )
  99. assert code == 0
  100. records = parse_jsonl_output(stdout2)
  101. assert len(records) == 1
  102. assert records[0]['id'] == crawl['id']
  103. class TestCrawlList:
  104. """Tests for `archivebox crawl list`."""
  105. def test_list_empty(self, initialized_archive):
  106. """List with no crawls returns empty."""
  107. stdout, stderr, code = run_archivebox_cmd(
  108. ['crawl', 'list'],
  109. data_dir=initialized_archive,
  110. )
  111. assert code == 0
  112. assert 'Listed 0 crawls' in stderr
  113. def test_list_returns_created(self, initialized_archive):
  114. """List returns previously created crawls."""
  115. url = create_test_url()
  116. run_archivebox_cmd(['crawl', 'create', url], data_dir=initialized_archive)
  117. stdout, stderr, code = run_archivebox_cmd(
  118. ['crawl', 'list'],
  119. data_dir=initialized_archive,
  120. )
  121. assert code == 0
  122. records = parse_jsonl_output(stdout)
  123. assert len(records) >= 1
  124. assert any(url in r.get('urls', '') for r in records)
  125. def test_list_filter_by_status(self, initialized_archive):
  126. """Filter crawls by status."""
  127. url = create_test_url()
  128. run_archivebox_cmd(['crawl', 'create', url], data_dir=initialized_archive)
  129. stdout, stderr, code = run_archivebox_cmd(
  130. ['crawl', 'list', '--status=queued'],
  131. data_dir=initialized_archive,
  132. )
  133. assert code == 0
  134. records = parse_jsonl_output(stdout)
  135. for r in records:
  136. assert r['status'] == 'queued'
  137. def test_list_with_limit(self, initialized_archive):
  138. """Limit number of results."""
  139. # Create multiple crawls
  140. for _ in range(3):
  141. run_archivebox_cmd(['crawl', 'create', create_test_url()], data_dir=initialized_archive)
  142. stdout, stderr, code = run_archivebox_cmd(
  143. ['crawl', 'list', '--limit=2'],
  144. data_dir=initialized_archive,
  145. )
  146. assert code == 0
  147. records = parse_jsonl_output(stdout)
  148. assert len(records) == 2
  149. class TestCrawlUpdate:
  150. """Tests for `archivebox crawl update`."""
  151. def test_update_status(self, initialized_archive):
  152. """Update crawl status."""
  153. # Create a crawl
  154. url = create_test_url()
  155. stdout1, _, _ = run_archivebox_cmd(['crawl', 'create', url], data_dir=initialized_archive)
  156. crawl = parse_jsonl_output(stdout1)[0]
  157. # Update it
  158. stdout2, stderr, code = run_archivebox_cmd(
  159. ['crawl', 'update', '--status=started'],
  160. stdin=json.dumps(crawl),
  161. data_dir=initialized_archive,
  162. )
  163. assert code == 0
  164. assert 'Updated 1 crawls' in stderr
  165. records = parse_jsonl_output(stdout2)
  166. assert records[0]['status'] == 'started'
  167. class TestCrawlDelete:
  168. """Tests for `archivebox crawl delete`."""
  169. def test_delete_requires_yes(self, initialized_archive):
  170. """Delete requires --yes flag."""
  171. url = create_test_url()
  172. stdout1, _, _ = run_archivebox_cmd(['crawl', 'create', url], data_dir=initialized_archive)
  173. crawl = parse_jsonl_output(stdout1)[0]
  174. stdout, stderr, code = run_archivebox_cmd(
  175. ['crawl', 'delete'],
  176. stdin=json.dumps(crawl),
  177. data_dir=initialized_archive,
  178. )
  179. assert code == 1
  180. assert '--yes' in stderr
  181. def test_delete_with_yes(self, initialized_archive):
  182. """Delete with --yes flag works."""
  183. url = create_test_url()
  184. stdout1, _, _ = run_archivebox_cmd(['crawl', 'create', url], data_dir=initialized_archive)
  185. crawl = parse_jsonl_output(stdout1)[0]
  186. stdout, stderr, code = run_archivebox_cmd(
  187. ['crawl', 'delete', '--yes'],
  188. stdin=json.dumps(crawl),
  189. data_dir=initialized_archive,
  190. )
  191. assert code == 0
  192. assert 'Deleted 1 crawls' in stderr
  193. def test_delete_dry_run(self, initialized_archive):
  194. """Dry run shows what would be deleted."""
  195. url = create_test_url()
  196. stdout1, _, _ = run_archivebox_cmd(['crawl', 'create', url], data_dir=initialized_archive)
  197. crawl = parse_jsonl_output(stdout1)[0]
  198. stdout, stderr, code = run_archivebox_cmd(
  199. ['crawl', 'delete', '--dry-run'],
  200. stdin=json.dumps(crawl),
  201. data_dir=initialized_archive,
  202. )
  203. assert code == 0
  204. assert 'Would delete' in stderr
  205. assert 'dry run' in stderr.lower()