# Tests for the `archivebox search` CLI output formats (JSON, HTML, CSV).
  1. import json
  2. import subprocess
  3. from .fixtures import *
  4. def test_search_json(process, disable_extractors_dict):
  5. subprocess.run(["archivebox", "add", "--index-only", "https://example.com", "--depth=0"],
  6. capture_output=True, env=disable_extractors_dict)
  7. search_process = subprocess.run(["archivebox", "search", "--json"], capture_output=True)
  8. output_str = search_process.stdout.decode("utf-8").strip()
  9. # Handle potential control characters in output
  10. try:
  11. output_json = json.loads(output_str)
  12. except json.JSONDecodeError:
  13. # Try with strict=False if there are control characters
  14. import re
  15. # Remove ANSI escape sequences and control characters
  16. clean_str = re.sub(r'\x1b\[[0-9;]*m', '', output_str)
  17. clean_str = re.sub(r'[\x00-\x1f\x7f]', lambda m: ' ' if m.group(0) in '\t\n\r' else '', clean_str)
  18. output_json = json.loads(clean_str)
  19. # Verify we get at least one snapshot back
  20. assert len(output_json) >= 1
  21. # Should include the requested URL
  22. assert any("example.com" in entry.get("url", "") for entry in output_json)
  23. def test_search_json_headers(process, disable_extractors_dict):
  24. subprocess.run(["archivebox", "add", "--index-only", "https://example.com", "--depth=0"],
  25. capture_output=True, env=disable_extractors_dict)
  26. search_process = subprocess.run(["archivebox", "search", "--json", "--with-headers"], capture_output=True)
  27. output_str = search_process.stdout.decode("utf-8").strip()
  28. # Handle potential control characters in output
  29. try:
  30. output_json = json.loads(output_str)
  31. except json.JSONDecodeError:
  32. # Try with strict=False if there are control characters
  33. import re
  34. # Remove ANSI escape sequences and control characters
  35. clean_str = re.sub(r'\x1b\[[0-9;]*m', '', output_str)
  36. clean_str = re.sub(r'[\x00-\x1f\x7f]', lambda m: ' ' if m.group(0) in '\t\n\r' else '', clean_str)
  37. output_json = json.loads(clean_str)
  38. # The response should have a links key with headers mode
  39. links = output_json.get("links", output_json)
  40. assert len(links) >= 1
  41. def test_search_html(process, disable_extractors_dict):
  42. subprocess.run(["archivebox", "add", "--index-only", "https://example.com", "--depth=0"],
  43. capture_output=True, env=disable_extractors_dict)
  44. search_process = subprocess.run(["archivebox", "search", "--html"], capture_output=True)
  45. output_html = search_process.stdout.decode("utf-8")
  46. # Should contain some HTML and reference to the source file
  47. assert "sources" in output_html or "cli_add" in output_html or "<" in output_html
  48. def test_search_html_headers(process, disable_extractors_dict):
  49. subprocess.run(["archivebox", "add", "--index-only", "https://example.com", "--depth=0"],
  50. capture_output=True, env=disable_extractors_dict)
  51. search_process = subprocess.run(["archivebox", "search", "--html", "--with-headers"], capture_output=True)
  52. output_html = search_process.stdout.decode("utf-8")
  53. # Should contain HTML
  54. assert "<" in output_html
  55. def test_search_csv(process, disable_extractors_dict):
  56. subprocess.run(["archivebox", "add", "--index-only", "https://example.com", "--depth=0"],
  57. capture_output=True, env=disable_extractors_dict)
  58. search_process = subprocess.run(["archivebox", "search", "--csv", "url"], capture_output=True)
  59. output_csv = search_process.stdout.decode("utf-8")
  60. # Should contain the requested URL
  61. assert "example.com" in output_csv
  62. def test_search_csv_headers(process, disable_extractors_dict):
  63. subprocess.run(["archivebox", "add", "--index-only", "https://example.com", "--depth=0"],
  64. capture_output=True, env=disable_extractors_dict)
  65. search_process = subprocess.run(["archivebox", "search", "--csv", "url", "--with-headers"], capture_output=True)
  66. output_csv = search_process.stdout.decode("utf-8")
  67. # Should have url header and requested URL
  68. assert "url" in output_csv
  69. assert "example.com" in output_csv
  70. def test_search_with_headers_requires_format(process):
  71. search_process = subprocess.run(["archivebox", "search", "--with-headers"], capture_output=True)
  72. stderr = search_process.stderr.decode("utf-8")
  73. assert "--with-headers" in stderr and ("requires" in stderr or "can only be used" in stderr)
  74. def test_sort_by_url(process, disable_extractors_dict):
  75. # Add two URLs - they will create separate source files
  76. subprocess.run(["archivebox", "add", "--index-only", "https://iana.org", "--depth=0"],
  77. capture_output=True, env=disable_extractors_dict)
  78. subprocess.run(["archivebox", "add", "--index-only", "https://example.com", "--depth=0"],
  79. capture_output=True, env=disable_extractors_dict)
  80. # Search with sort should return results (even if they're file:// URLs)
  81. search_process = subprocess.run(["archivebox", "search", "--csv", "url", "--sort=url"], capture_output=True)
  82. output = search_process.stdout.decode("utf-8")
  83. lines = [line for line in output.strip().split("\n") if line]
  84. # Should have at least 2 snapshots (the source file snapshots)
  85. assert len(lines) >= 2