#!/usr/bin/env python3
"""
Comprehensive tests for the archivebox add command.

Verifies that adding URLs creates Snapshot and Crawl records in the database,
writes source files under sources/, and creates per-snapshot archive directories.
"""
import os
import subprocess
import sqlite3
from pathlib import Path

from .fixtures import *
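

# Every test below inspects the SQLite index the same way: open index.sqlite3
# in the current data directory and run a single query. The helper below is a
# minimal sketch of how that repeated pattern could be factored out; it is not
# used by the existing tests, and the SQL it runs is entirely up to the caller.
def _fetch_all(sql, db_path="index.sqlite3"):
    """Run a read-only query against the ArchiveBox index and return all rows."""
    conn = sqlite3.connect(db_path)
    try:
        return conn.cursor().execute(sql).fetchall()
    finally:
        conn.close()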


def test_add_single_url_creates_snapshot_in_db(tmp_path, process, disable_extractors_dict):
    """Test that adding a single URL creates a snapshot in the database."""
    os.chdir(tmp_path)
    result = subprocess.run(
        ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )
    assert result.returncode == 0

    conn = sqlite3.connect("index.sqlite3")
    c = conn.cursor()
    snapshots = c.execute("SELECT url FROM core_snapshot").fetchall()
    conn.close()

    assert len(snapshots) == 1
    assert snapshots[0][0] == 'https://example.com'


def test_add_creates_crawl_record(tmp_path, process, disable_extractors_dict):
    """Test that the add command creates a Crawl record in the database."""
    os.chdir(tmp_path)
    subprocess.run(
        ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )

    conn = sqlite3.connect("index.sqlite3")
    c = conn.cursor()
    crawl_count = c.execute("SELECT COUNT(*) FROM crawls_crawl").fetchone()[0]
    conn.close()

    assert crawl_count == 1


def test_add_creates_source_file(tmp_path, process, disable_extractors_dict):
    """Test that add creates a source file containing the URL."""
    os.chdir(tmp_path)
    subprocess.run(
        ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )

    sources_dir = tmp_path / "sources"
    assert sources_dir.exists()

    source_files = list(sources_dir.glob("*cli_add.txt"))
    assert len(source_files) >= 1

    source_content = source_files[0].read_text()
    assert "https://example.com" in source_content


def test_add_multiple_urls_single_command(tmp_path, process, disable_extractors_dict):
    """Test adding multiple URLs in a single command."""
    os.chdir(tmp_path)
    result = subprocess.run(
        ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com', 'https://example.org'],
        capture_output=True,
        env=disable_extractors_dict,
    )
    assert result.returncode == 0

    conn = sqlite3.connect("index.sqlite3")
    c = conn.cursor()
    snapshot_count = c.execute("SELECT COUNT(*) FROM core_snapshot").fetchone()[0]
    urls = c.execute("SELECT url FROM core_snapshot ORDER BY url").fetchall()
    conn.close()

    assert snapshot_count == 2
    assert urls[0][0] == 'https://example.com'
    assert urls[1][0] == 'https://example.org'


def test_add_from_file(tmp_path, process, disable_extractors_dict):
    """Test adding URLs from a file.

    With --index-only, this creates a snapshot for the file itself, not the URLs inside.
    To get snapshots for the URLs inside, you need to run without --index-only so parsers run.
    """
    os.chdir(tmp_path)

    # Create a file with URLs
    urls_file = tmp_path / "urls.txt"
    urls_file.write_text("https://example.com\nhttps://example.org\n")

    result = subprocess.run(
        ['archivebox', 'add', '--index-only', '--depth=0', str(urls_file)],
        capture_output=True,
        env=disable_extractors_dict,
    )
    assert result.returncode == 0

    conn = sqlite3.connect("index.sqlite3")
    c = conn.cursor()
    crawl_count = c.execute("SELECT COUNT(*) FROM crawls_crawl").fetchone()[0]
    snapshot_count = c.execute("SELECT COUNT(*) FROM core_snapshot").fetchone()[0]
    conn.close()

    # With --index-only, creates 1 snapshot for the file itself
    assert crawl_count == 1
    assert snapshot_count == 1
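

# The docstring of test_add_from_file notes that parsing URLs *out of* a file
# requires running without --index-only. Below is a minimal sketch of that
# flow, under the assumption that a text parser picks up both URLs from the
# file; it is deliberately not collected by pytest (leading underscore) because
# the resulting snapshot count depends on which parsers are enabled, and is not
# verified here.
def _example_add_urls_parsed_from_file(tmp_path, disable_extractors_dict):
    os.chdir(tmp_path)
    urls_file = tmp_path / "urls.txt"
    urls_file.write_text("https://example.com\nhttps://example.org\n")

    subprocess.run(
        ['archivebox', 'add', '--depth=0', str(urls_file)],  # no --index-only, so parsers run
        capture_output=True,
        env=disable_extractors_dict,
    )

    conn = sqlite3.connect("index.sqlite3")
    urls = [row[0] for row in conn.cursor().execute("SELECT url FROM core_snapshot").fetchall()]
    conn.close()

    # Expected (not asserted here): snapshots for the URLs listed inside the file.
    return urls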


def test_add_with_depth_0_flag(tmp_path, process, disable_extractors_dict):
    """Test that the --depth=0 flag is accepted and works."""
    os.chdir(tmp_path)
    result = subprocess.run(
        ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )
    assert result.returncode == 0
    assert 'unrecognized arguments: --depth' not in result.stderr.decode('utf-8')


def test_add_with_depth_1_flag(tmp_path, process, disable_extractors_dict):
    """Test that the --depth=1 flag is accepted."""
    os.chdir(tmp_path)
    result = subprocess.run(
        ['archivebox', 'add', '--index-only', '--depth=1', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )
    assert result.returncode == 0
    assert 'unrecognized arguments: --depth' not in result.stderr.decode('utf-8')


def test_add_with_tags(tmp_path, process, disable_extractors_dict):
    """Test that adding a URL with tags stores them in the crawl's tags_str field.

    With --index-only, Tag objects are not created until archiving happens;
    the tags are stored as a comma-separated string in the Crawl.tags_str field.
    """
    os.chdir(tmp_path)
    subprocess.run(
        ['archivebox', 'add', '--index-only', '--depth=0', '--tag=test,example', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )

    conn = sqlite3.connect("index.sqlite3")
    c = conn.cursor()
    tags_str = c.execute("SELECT tags_str FROM crawls_crawl").fetchone()[0]
    conn.close()

    # Tags are stored as a comma-separated string on the crawl
    assert 'test' in tags_str or 'example' in tags_str
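

# Since tags_str is stored as a plain comma-separated string on the Crawl (Tag
# rows only appear once archiving actually runs), here is a small sketch of how
# that value could be split back into individual tags for a stricter assertion,
# e.g. set(_split_tags_str(tags_str)) == {'test', 'example'}:
def _split_tags_str(tags_str):
    """Split a Crawl.tags_str value like 'test,example' into ['test', 'example']."""
    return [tag.strip() for tag in tags_str.split(',') if tag.strip()]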


def test_add_duplicate_url_creates_separate_crawls(tmp_path, process, disable_extractors_dict):
    """Test that adding the same URL twice creates separate crawls and snapshots.

    Each 'add' command creates a new Crawl. Multiple crawls can archive the same URL.
    This allows re-archiving URLs at different times.
    """
    os.chdir(tmp_path)

    # Add URL first time
    subprocess.run(
        ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )

    # Add same URL second time
    subprocess.run(
        ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )

    conn = sqlite3.connect("index.sqlite3")
    c = conn.cursor()
    snapshot_count = c.execute("SELECT COUNT(*) FROM core_snapshot WHERE url='https://example.com'").fetchone()[0]
    crawl_count = c.execute("SELECT COUNT(*) FROM crawls_crawl").fetchone()[0]
    conn.close()

    # Each add creates a new crawl with its own snapshot
    assert crawl_count == 2
    assert snapshot_count == 2
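

# A follow-up check, sketched here rather than asserted above: because each add
# creates its own Crawl, the two snapshots for the same URL should point at
# different crawl_id values (the same core_snapshot.crawl_id column used by
# test_add_links_snapshot_to_crawl below). This is a sketch only; whether the
# ids actually differ is not verified by the tests in this file.
def _example_snapshots_have_distinct_crawls(url='https://example.com'):
    conn = sqlite3.connect("index.sqlite3")
    crawl_ids = [
        row[0] for row in conn.cursor().execute(
            "SELECT crawl_id FROM core_snapshot WHERE url=?", (url,)
        ).fetchall()
    ]
    conn.close()
    return len(set(crawl_ids)) == len(crawl_ids)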


def test_add_with_overwrite_flag(tmp_path, process, disable_extractors_dict):
    """Test that the --overwrite flag forces re-archiving."""
    os.chdir(tmp_path)

    # Add URL first time
    subprocess.run(
        ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )

    # Add with overwrite
    result = subprocess.run(
        ['archivebox', 'add', '--index-only', '--overwrite', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )
    assert result.returncode == 0
    assert 'unrecognized arguments: --overwrite' not in result.stderr.decode('utf-8')


def test_add_creates_archive_subdirectory(tmp_path, process, disable_extractors_dict):
    """Test that add creates an archive subdirectory for the snapshot.

    Archive subdirectories are named by timestamp, not by snapshot ID.
    """
    os.chdir(tmp_path)
    subprocess.run(
        ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )

    # Get the snapshot timestamp from the database
    conn = sqlite3.connect("index.sqlite3")
    c = conn.cursor()
    timestamp = c.execute("SELECT timestamp FROM core_snapshot").fetchone()[0]
    conn.close()

    # Check that the archive subdirectory was created using the timestamp
    archive_dir = tmp_path / "archive" / str(timestamp)
    assert archive_dir.exists()
    assert archive_dir.is_dir()


def test_add_index_only_skips_extraction(tmp_path, process, disable_extractors_dict):
    """Test that the --index-only flag skips extraction (fast)."""
    os.chdir(tmp_path)
    result = subprocess.run(
        ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
        timeout=30,  # Should be fast
    )
    assert result.returncode == 0

    # Snapshot should exist but archive results should be minimal
    conn = sqlite3.connect("index.sqlite3")
    c = conn.cursor()
    snapshot_count = c.execute("SELECT COUNT(*) FROM core_snapshot").fetchone()[0]
    conn.close()

    assert snapshot_count == 1
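

# The comment above says archive results "should be minimal", but the test only
# counts snapshots. A stricter check is sketched here; it assumes the
# ArchiveResult model's table is named core_archiveresult, which is an
# assumption not verified by anything else in this file.
def _example_count_archive_results():
    conn = sqlite3.connect("index.sqlite3")
    count = conn.cursor().execute("SELECT COUNT(*) FROM core_archiveresult").fetchone()[0]
    conn.close()
    return count  # expected to be 0 (or near 0) when --index-only is used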


def test_add_links_snapshot_to_crawl(tmp_path, process, disable_extractors_dict):
    """Test that add links the snapshot to the crawl via crawl_id."""
    os.chdir(tmp_path)
    subprocess.run(
        ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )

    conn = sqlite3.connect("index.sqlite3")
    c = conn.cursor()
    # Get crawl id
    crawl_id = c.execute("SELECT id FROM crawls_crawl").fetchone()[0]
    # Get snapshot's crawl_id
    snapshot_crawl = c.execute("SELECT crawl_id FROM core_snapshot").fetchone()[0]
    conn.close()

    assert snapshot_crawl == crawl_id


def test_add_sets_snapshot_timestamp(tmp_path, process, disable_extractors_dict):
    """Test that add sets a timestamp on the snapshot."""
    os.chdir(tmp_path)
    subprocess.run(
        ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )

    conn = sqlite3.connect("index.sqlite3")
    c = conn.cursor()
    timestamp = c.execute("SELECT timestamp FROM core_snapshot").fetchone()[0]
    conn.close()

    assert timestamp is not None
    assert len(str(timestamp)) > 0