#!/usr/bin/env python3
"""Integration tests for archivebox crawl command."""
import contextlib
import json
import os
import sqlite3
import subprocess

import pytest

from .fixtures import process, disable_extractors_dict
  9. def test_crawl_creates_crawl_object(tmp_path, process, disable_extractors_dict):
  10. """Test that crawl command creates a Crawl object."""
  11. os.chdir(tmp_path)
  12. subprocess.run(
  13. ['archivebox', 'crawl', '--no-wait', 'https://example.com'],
  14. capture_output=True,
  15. text=True,
  16. env=disable_extractors_dict,
  17. )
  18. conn = sqlite3.connect('index.sqlite3')
  19. c = conn.cursor()
  20. crawl = c.execute("SELECT id, max_depth FROM crawls_crawl ORDER BY created_at DESC LIMIT 1").fetchone()
  21. conn.close()
  22. assert crawl is not None, "Crawl object should be created"
  23. def test_crawl_depth_sets_max_depth_in_crawl(tmp_path, process, disable_extractors_dict):
  24. """Test that --depth option sets max_depth in the Crawl object."""
  25. os.chdir(tmp_path)
  26. subprocess.run(
  27. ['archivebox', 'crawl', '--depth=2', '--no-wait', 'https://example.com'],
  28. capture_output=True,
  29. text=True,
  30. env=disable_extractors_dict,
  31. )
  32. conn = sqlite3.connect('index.sqlite3')
  33. c = conn.cursor()
  34. crawl = c.execute("SELECT max_depth FROM crawls_crawl ORDER BY created_at DESC LIMIT 1").fetchone()
  35. conn.close()
  36. assert crawl is not None
  37. assert crawl[0] == 2, "Crawl max_depth should match --depth=2"
  38. def test_crawl_creates_snapshot_for_url(tmp_path, process, disable_extractors_dict):
  39. """Test that crawl creates a Snapshot for the input URL."""
  40. os.chdir(tmp_path)
  41. subprocess.run(
  42. ['archivebox', 'crawl', '--no-wait', 'https://example.com'],
  43. capture_output=True,
  44. text=True,
  45. env=disable_extractors_dict,
  46. )
  47. conn = sqlite3.connect('index.sqlite3')
  48. c = conn.cursor()
  49. snapshot = c.execute("SELECT url FROM core_snapshot WHERE url = ?",
  50. ('https://example.com',)).fetchone()
  51. conn.close()
  52. assert snapshot is not None, "Snapshot should be created for input URL"
  53. def test_crawl_links_snapshot_to_crawl(tmp_path, process, disable_extractors_dict):
  54. """Test that Snapshot is linked to Crawl via crawl_id."""
  55. os.chdir(tmp_path)
  56. subprocess.run(
  57. ['archivebox', 'crawl', '--no-wait', 'https://example.com'],
  58. capture_output=True,
  59. text=True,
  60. env=disable_extractors_dict,
  61. )
  62. conn = sqlite3.connect('index.sqlite3')
  63. c = conn.cursor()
  64. # Get the crawl ID
  65. crawl = c.execute("SELECT id FROM crawls_crawl ORDER BY created_at DESC LIMIT 1").fetchone()
  66. assert crawl is not None
  67. crawl_id = crawl[0]
  68. # Check snapshot has correct crawl_id
  69. snapshot = c.execute("SELECT crawl_id FROM core_snapshot WHERE url = ?",
  70. ('https://example.com',)).fetchone()
  71. conn.close()
  72. assert snapshot is not None
  73. assert snapshot[0] == crawl_id, "Snapshot should be linked to Crawl"
  74. def test_crawl_multiple_urls_creates_multiple_snapshots(tmp_path, process, disable_extractors_dict):
  75. """Test that crawling multiple URLs creates multiple snapshots."""
  76. os.chdir(tmp_path)
  77. subprocess.run(
  78. ['archivebox', 'crawl', '--no-wait',
  79. 'https://example.com',
  80. 'https://iana.org'],
  81. capture_output=True,
  82. text=True,
  83. env=disable_extractors_dict,
  84. )
  85. conn = sqlite3.connect('index.sqlite3')
  86. c = conn.cursor()
  87. urls = c.execute("SELECT url FROM core_snapshot ORDER BY url").fetchall()
  88. conn.close()
  89. urls = [u[0] for u in urls]
  90. assert 'https://example.com' in urls
  91. assert 'https://iana.org' in urls
  92. def test_crawl_from_file_creates_snapshot(tmp_path, process, disable_extractors_dict):
  93. """Test that crawl can create snapshots from a file of URLs."""
  94. os.chdir(tmp_path)
  95. # Write URLs to a file
  96. urls_file = tmp_path / 'urls.txt'
  97. urls_file.write_text('https://example.com\n')
  98. subprocess.run(
  99. ['archivebox', 'crawl', '--no-wait', str(urls_file)],
  100. capture_output=True,
  101. text=True,
  102. env=disable_extractors_dict,
  103. )
  104. conn = sqlite3.connect('index.sqlite3')
  105. c = conn.cursor()
  106. snapshot = c.execute("SELECT url FROM core_snapshot").fetchone()
  107. conn.close()
  108. # Should create at least one snapshot (the source file or the URL)
  109. assert snapshot is not None, "Should create at least one snapshot"
  110. def test_crawl_creates_seed_for_input(tmp_path, process, disable_extractors_dict):
  111. """Test that crawl creates a Seed object for input."""
  112. os.chdir(tmp_path)
  113. subprocess.run(
  114. ['archivebox', 'crawl', '--no-wait', 'https://example.com'],
  115. capture_output=True,
  116. text=True,
  117. env=disable_extractors_dict,
  118. )
  119. conn = sqlite3.connect('index.sqlite3')
  120. c = conn.cursor()
  121. seed = c.execute("SELECT id FROM crawls_seed").fetchone()
  122. conn.close()
  123. assert seed is not None, "Seed should be created for crawl input"
  124. class TestCrawlCLI:
  125. """Test the CLI interface for crawl command."""
  126. def test_cli_help(self, tmp_path, process):
  127. """Test that --help works for crawl command."""
  128. os.chdir(tmp_path)
  129. result = subprocess.run(
  130. ['archivebox', 'crawl', '--help'],
  131. capture_output=True,
  132. text=True,
  133. )
  134. assert result.returncode == 0
  135. assert '--depth' in result.stdout or '-d' in result.stdout
  136. if __name__ == '__main__':
  137. pytest.main([__file__, '-v'])