| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697 |
- #!/usr/bin/env python3
- """
- Tests for archivebox crawl command.
- Verify crawl creates snapshots with depth.
- """
- import os
- import subprocess
- import sqlite3
- from .fixtures import *
def test_crawl_creates_snapshots(tmp_path, process, disable_extractors_dict):
    """Check that `add` followed by `crawl` on one URL leaves a single snapshot.

    Runs ``archivebox add --index-only --depth=0`` and then
    ``archivebox crawl --depth=0`` for ``https://example.com`` from inside
    ``tmp_path``, and asserts that ``core_snapshot`` holds exactly one row.

    Args:
        tmp_path: Directory the test switches into before running commands.
        process: Fixture from ``.fixtures``; not referenced in the body
            (presumably prepares the archive in ``tmp_path`` -- confirm there).
        disable_extractors_dict: Environment mapping handed to both
            subprocesses via ``env=``.
    """
    os.chdir(tmp_path)

    # First add a snapshot
    subprocess.run(
        ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )

    # Then run crawl on it
    result = subprocess.run(
        ['archivebox', 'crawl', '--depth=0', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
        timeout=30,
    )
    assert result.returncode in [0, 1, 2]  # May succeed or fail depending on URL

    # Check snapshot was created. The connection is closed in ``finally`` so a
    # failing query (e.g. a missing table) does not leak the database handle.
    conn = sqlite3.connect("index.sqlite3")
    try:
        count = conn.execute("SELECT COUNT(*) FROM core_snapshot").fetchone()[0]
    finally:
        conn.close()

    assert count == 1
def test_crawl_with_depth_0(tmp_path, process, disable_extractors_dict):
    """Check that crawling at depth 0 keeps at least the snapshot made by `add`.

    Runs ``archivebox add --index-only --depth=0`` and then
    ``archivebox crawl --depth=0`` for ``https://example.com`` from inside
    ``tmp_path``. The crawl's exit status is not inspected; only the row count
    of ``core_snapshot`` is asserted (one or more).

    Args:
        tmp_path: Directory the test switches into before running commands.
        process: Fixture from ``.fixtures``; not referenced in the body
            (presumably prepares the archive in ``tmp_path`` -- confirm there).
        disable_extractors_dict: Environment mapping handed to both
            subprocesses via ``env=``.
    """
    os.chdir(tmp_path)

    # First add a snapshot
    subprocess.run(
        ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )

    # Then crawl it
    subprocess.run(
        ['archivebox', 'crawl', '--depth=0', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
        timeout=30,
    )

    # Close the connection in ``finally`` so a failing query does not leak
    # the database handle.
    conn = sqlite3.connect("index.sqlite3")
    try:
        count = conn.execute("SELECT COUNT(*) FROM core_snapshot").fetchone()[0]
    finally:
        conn.close()

    # Should have at least 1 snapshot from the add command
    assert count >= 1
def test_crawl_creates_crawl_record(tmp_path, process, disable_extractors_dict):
    """Check that `add` followed by `crawl` leaves at least one Crawl row.

    Runs ``archivebox add --index-only --depth=0`` and then
    ``archivebox crawl --depth=0`` for ``https://example.com`` from inside
    ``tmp_path``, and asserts that ``crawls_crawl`` holds one or more rows.

    Args:
        tmp_path: Directory the test switches into before running commands.
        process: Fixture from ``.fixtures``; not referenced in the body
            (presumably prepares the archive in ``tmp_path`` -- confirm there).
        disable_extractors_dict: Environment mapping handed to both
            subprocesses via ``env=``.
    """
    os.chdir(tmp_path)

    # First add a snapshot (this creates a Crawl)
    subprocess.run(
        ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )

    # Then crawl it
    subprocess.run(
        ['archivebox', 'crawl', '--depth=0', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
        timeout=30,
    )

    # Close the connection in ``finally`` so a failing query does not leak
    # the database handle.
    conn = sqlite3.connect("index.sqlite3")
    try:
        crawl_count = conn.execute("SELECT COUNT(*) FROM crawls_crawl").fetchone()[0]
    finally:
        conn.close()

    # Should have at least 1 crawl from the add command
    assert crawl_count >= 1
|