# test_cli_crawl.py
  1. #!/usr/bin/env python3
  2. """
  3. Tests for archivebox crawl command.
  4. Verify crawl creates snapshots with depth.
  5. """
  6. import os
  7. import subprocess
  8. import sqlite3
  9. from .fixtures import *
  10. def test_crawl_creates_snapshots(tmp_path, process, disable_extractors_dict):
  11. """Test that crawl command works on existing snapshots."""
  12. os.chdir(tmp_path)
  13. # First add a snapshot
  14. subprocess.run(
  15. ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
  16. capture_output=True,
  17. env=disable_extractors_dict,
  18. )
  19. # Then run crawl on it
  20. result = subprocess.run(
  21. ['archivebox', 'crawl', '--depth=0', 'https://example.com'],
  22. capture_output=True,
  23. env=disable_extractors_dict,
  24. timeout=30,
  25. )
  26. assert result.returncode in [0, 1, 2] # May succeed or fail depending on URL
  27. # Check snapshot was created
  28. conn = sqlite3.connect("index.sqlite3")
  29. c = conn.cursor()
  30. count = c.execute("SELECT COUNT(*) FROM core_snapshot").fetchone()[0]
  31. conn.close()
  32. assert count == 1
  33. def test_crawl_with_depth_0(tmp_path, process, disable_extractors_dict):
  34. """Test crawl with depth=0 works on existing snapshot."""
  35. os.chdir(tmp_path)
  36. # First add a snapshot
  37. subprocess.run(
  38. ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
  39. capture_output=True,
  40. env=disable_extractors_dict,
  41. )
  42. # Then crawl it
  43. subprocess.run(
  44. ['archivebox', 'crawl', '--depth=0', 'https://example.com'],
  45. capture_output=True,
  46. env=disable_extractors_dict,
  47. timeout=30,
  48. )
  49. conn = sqlite3.connect("index.sqlite3")
  50. c = conn.cursor()
  51. count = c.execute("SELECT COUNT(*) FROM core_snapshot").fetchone()[0]
  52. conn.close()
  53. # Should have at least 1 snapshot from the add command
  54. assert count >= 1
  55. def test_crawl_creates_crawl_record(tmp_path, process, disable_extractors_dict):
  56. """Test that add+crawl creates Crawl records."""
  57. os.chdir(tmp_path)
  58. # First add a snapshot (this creates a Crawl)
  59. subprocess.run(
  60. ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
  61. capture_output=True,
  62. env=disable_extractors_dict,
  63. )
  64. # Then crawl it
  65. subprocess.run(
  66. ['archivebox', 'crawl', '--depth=0', 'https://example.com'],
  67. capture_output=True,
  68. env=disable_extractors_dict,
  69. timeout=30,
  70. )
  71. conn = sqlite3.connect("index.sqlite3")
  72. c = conn.cursor()
  73. crawl_count = c.execute("SELECT COUNT(*) FROM crawls_crawl").fetchone()[0]
  74. conn.close()
  75. # Should have at least 1 crawl from the add command
  76. assert crawl_count >= 1