| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407 |
- """archivebox/tests/conftest.py - Pytest fixtures for CLI tests."""
- import os
- import shutil
- import sys
- import subprocess
- import textwrap
- from pathlib import Path
- from typing import List, Dict, Any, Optional, Tuple
- import pytest
- # =============================================================================
- # CLI Helpers (defined before fixtures that use them)
- # =============================================================================
def run_archivebox_cmd(
    args: List[str],
    data_dir: Path,
    stdin: Optional[str] = None,
    timeout: int = 60,
    env: Optional[Dict[str, str]] = None,
) -> Tuple[str, str, int]:
    """
    Invoke ``python -m archivebox <args>`` in a child process.

    The child runs with ``data_dir`` as both its working directory and its
    ``DATA_DIR`` environment variable, with colour/progress output turned off
    and every ``SAVE_*`` extractor listed below switched off.

    Args:
        args: Command arguments (e.g., ['crawl', 'create', 'https://example.com'])
        data_dir: The DATA_DIR to use
        stdin: Optional text piped to the child's stdin
        timeout: Seconds to wait before subprocess raises TimeoutExpired
        env: Extra environment variables; applied last, so they override
            every default set here

    Returns:
        Tuple of (stdout, stderr, returncode)
    """
    child_env = {
        **os.environ,
        'DATA_DIR': str(data_dir),
        'USE_COLOR': 'False',
        'SHOW_PROGRESS': 'False',
        # Disable slow extractors for faster tests
        'SAVE_ARCHIVEDOTORG': 'False',
        'SAVE_TITLE': 'False',
        'SAVE_FAVICON': 'False',
        'SAVE_WGET': 'False',
        'SAVE_WARC': 'False',
        'SAVE_PDF': 'False',
        'SAVE_SCREENSHOT': 'False',
        'SAVE_DOM': 'False',
        'SAVE_SINGLEFILE': 'False',
        'SAVE_READABILITY': 'False',
        'SAVE_MERCURY': 'False',
        'SAVE_GIT': 'False',
        'SAVE_YTDLP': 'False',
        'SAVE_HEADERS': 'False',
        'SAVE_HTMLTOTEXT': 'False',
        # Caller-supplied overrides win over everything above
        **(env or {}),
    }

    completed = subprocess.run(
        [sys.executable, '-m', 'archivebox', *args],
        input=stdin,
        capture_output=True,
        text=True,
        cwd=data_dir,
        env=child_env,
        timeout=timeout,
    )
    return completed.stdout, completed.stderr, completed.returncode
- # =============================================================================
- # Fixtures
- # =============================================================================
@pytest.fixture
def isolated_data_dir(tmp_path):
    """
    Provide a fresh, empty ``archivebox_data`` directory for a single test.

    The directory is created under pytest's per-test ``tmp_path``, so tests
    never share a DATA_DIR.
    """
    fresh_dir = tmp_path.joinpath('archivebox_data')
    fresh_dir.mkdir()
    return fresh_dir
@pytest.fixture
def initialized_archive(isolated_data_dir):
    """
    Return an isolated DATA_DIR on which ``archivebox init --quick`` has run.

    The init command is executed in a subprocess; the fixture fails the test
    immediately (with the command's stderr) if it exits non-zero.
    """
    init_result = run_archivebox_cmd(
        ['init', '--quick'],
        data_dir=isolated_data_dir,
        timeout=60,
    )
    _stdout, stderr, returncode = init_result
    assert returncode == 0, f"archivebox init failed: {stderr}"
    return isolated_data_dir
- # =============================================================================
- # CWD-based CLI Helpers (no DATA_DIR env)
- # =============================================================================
def run_archivebox_cmd_cwd(
    args: List[str],
    cwd: Path,
    stdin: Optional[str] = None,
    timeout: int = 60,
    env: Optional[Dict[str, str]] = None,
) -> Tuple[str, str, int]:
    """
    Invoke ``python -m archivebox <args>`` with ``cwd`` as the working directory.

    Unlike ``run_archivebox_cmd``, any inherited ``DATA_DIR`` variable is
    removed from the child's environment, so the command has to locate its
    data directory from the working directory alone.

    Args:
        args: Command arguments
        cwd: Working directory for the child process
        stdin: Optional text piped to the child's stdin
        timeout: Seconds to wait before subprocess raises TimeoutExpired
        env: Extra environment variables; applied last so they override defaults

    Returns:
        Tuple of (stdout, stderr, returncode)
    """
    # Inherit everything except DATA_DIR from the parent process
    child_env = {key: value for key, value in os.environ.items() if key != 'DATA_DIR'}
    child_env.update({'USE_COLOR': 'False', 'SHOW_PROGRESS': 'False'})
    if env:
        child_env.update(env)

    completed = subprocess.run(
        [sys.executable, '-m', 'archivebox', *args],
        input=stdin,
        capture_output=True,
        text=True,
        cwd=cwd,
        env=child_env,
        timeout=timeout,
    )
    return completed.stdout, completed.stderr, completed.returncode
def run_python_cwd(
    script: str,
    cwd: Path,
    timeout: int = 60,
) -> Tuple[str, str, int]:
    """
    Execute a Python source string in a child interpreter started in ``cwd``.

    The source is fed to ``python -`` on stdin. Any inherited ``DATA_DIR``
    variable is removed from the child's environment first.

    Args:
        script: Python source code to execute
        cwd: Working directory for the child interpreter
        timeout: Seconds to wait before subprocess raises TimeoutExpired

    Returns:
        Tuple of (stdout, stderr, returncode)
    """
    child_env = {key: value for key, value in os.environ.items() if key != 'DATA_DIR'}
    completed = subprocess.run(
        [sys.executable, '-'],
        input=script,
        capture_output=True,
        text=True,
        cwd=cwd,
        env=child_env,
        timeout=timeout,
    )
    return completed.stdout, completed.stderr, completed.returncode
def _get_machine_type() -> str:
    """
    Build the ``<arch>-<os>[-docker]`` label for the current machine.

    Architecture and OS name come from ``platform`` (lower-cased). The
    ``-docker`` suffix is appended when the ``IN_DOCKER`` environment variable
    is ``1``, ``true`` or ``yes`` (case-insensitive).
    """
    import platform

    docker_flag = os.environ.get('IN_DOCKER', '').lower()
    label = '-'.join([platform.machine().lower(), platform.system().lower()])
    if docker_flag in ('1', 'true', 'yes'):
        label += '-docker'
    return label
def _find_cached_chromium(lib_dir: Path) -> Optional[Path]:
    """
    Search a lib directory for a previously downloaded Chromium executable.

    Two roots are scanned in order: ``<lib_dir>/puppeteer`` and
    ``<lib_dir>/npm/node_modules/puppeteer/.local-chromium``. Within each
    existing root the macOS app-bundle layout is tried first, then the
    ``chrome-linux`` and ``chrome-linux64`` layouts.

    Args:
        lib_dir: Directory that may contain a puppeteer browser cache

    Returns:
        Path of the first matching executable, or None when nothing matches
    """
    search_roots = (
        lib_dir / 'puppeteer',
        lib_dir / 'npm' / 'node_modules' / 'puppeteer' / '.local-chromium',
    )
    binary_patterns = (
        'Chromium.app/Contents/MacOS/Chromium',
        'chrome-linux/chrome',
        'chrome-linux64/chrome',
    )
    for root in search_roots:
        if not root.exists():
            continue
        for pattern in binary_patterns:
            first_hit = next(root.rglob(pattern), None)
            if first_hit is not None:
                return first_hit
    return None
def _find_system_browser() -> Optional[Path]:
    """
    Return the first system-wide Chromium install found at a known location.

    The macOS application bundle is checked first, then ``/usr/bin/chromium``
    and ``/usr/bin/chromium-browser``.

    Returns:
        Path of the first existing candidate, or None if none exist
    """
    known_locations = (
        '/Applications/Chromium.app/Contents/MacOS/Chromium',
        '/usr/bin/chromium',
        '/usr/bin/chromium-browser',
    )
    existing = (Path(location) for location in known_locations if Path(location).exists())
    return next(existing, None)
def _ensure_puppeteer(shared_lib: Path) -> None:
    """
    Install the ``puppeteer`` npm package under ``<shared_lib>/npm`` if absent.

    Does nothing when ``<shared_lib>/npm/node_modules/puppeteer`` already
    exists. Otherwise runs ``npm install puppeteer`` in ``<shared_lib>/npm``
    with ``PUPPETEER_SKIP_DOWNLOAD=1`` set in the environment.

    Args:
        shared_lib: Shared lib cache directory that owns the ``npm`` prefix

    Raises:
        subprocess.CalledProcessError: if npm exits non-zero (``check=True``)
        subprocess.TimeoutExpired: if npm runs longer than 600 seconds
    """
    npm_root = shared_lib / 'npm'
    if (npm_root / 'node_modules' / 'puppeteer').exists():
        return

    npm_root.mkdir(parents=True, exist_ok=True)
    install_env = dict(os.environ, PUPPETEER_SKIP_DOWNLOAD='1')
    subprocess.run(
        ['npm', 'install', 'puppeteer'],
        cwd=str(npm_root),
        env=install_env,
        check=True,
        capture_output=True,
        text=True,
        timeout=600,
    )
def _ensure_symlink(link: Path, target: Path) -> None:
    """
    Make ``link`` a symlink to ``target`` unless ``link`` already resolves.

    ``Path.exists()`` follows symlinks, so a dangling link reports False and a
    direct ``symlink_to`` call on it would raise FileExistsError. A dangling
    link is therefore removed before the new one is created.
    """
    if link.exists():
        return
    if link.is_symlink():
        # Left over from an earlier run whose target has since disappeared
        link.unlink()
    link.parent.mkdir(parents=True, exist_ok=True)
    link.symlink_to(target, target_is_directory=target.is_dir())


@pytest.fixture(scope="class")
def real_archive_with_example(tmp_path_factory, request):
    """
    Initialize archive and add https://example.com using chrome+responses only.

    Uses cwd for DATA_DIR and symlinks lib dir to a shared cache.

    Steps, each of which fails the fixture if its command exits non-zero:
      1. ``archivebox init --quick`` in a fresh temp directory.
      2. ``archivebox config --set`` for the listen host and public-view flags.
      3. Symlink ``<data_dir>/lib/<machine_type>`` to the shared lib cache and
         make sure puppeteer is installed there.
      4. Pick a browser (cached Chromium first, then a system install) and,
         when one is found, set CHROME_BINARY and register it as the
         ``chromium`` Binary row for the current machine.
      5. ``archivebox add`` for https://example.com with the chrome and
         responses plugins.

    When the requesting test is a class, the data directory is also stored on
    it as ``data_dir``.

    Returns:
        Path of the populated data directory
    """
    tmp_path = tmp_path_factory.mktemp("archivebox_data")
    if getattr(request, "cls", None) is not None:
        request.cls.data_dir = tmp_path

    _stdout, stderr, returncode = run_archivebox_cmd_cwd(
        ['init', '--quick'],
        cwd=tmp_path,
        timeout=120,
    )
    assert returncode == 0, f"archivebox init failed: {stderr}"

    _stdout, stderr, returncode = run_archivebox_cmd_cwd(
        [
            'config',
            '--set',
            'LISTEN_HOST=archivebox.localhost:8000',
            'PUBLIC_INDEX=True',
            'PUBLIC_SNAPSHOTS=True',
            'PUBLIC_ADD_VIEW=True',
        ],
        cwd=tmp_path,
    )
    assert returncode == 0, f"archivebox config failed: {stderr}"

    # Share one lib cache (npm packages, browser) between all test archives
    machine_type = _get_machine_type()
    shared_root = Path(__file__).resolve().parents[3] / 'tmp' / 'test_lib_cache'
    shared_lib = shared_root / machine_type
    shared_lib.mkdir(parents=True, exist_ok=True)

    lib_target = tmp_path / 'lib' / machine_type
    if lib_target.exists() and not lib_target.is_symlink():
        # A real directory is in the way; replace it with the shared symlink
        shutil.rmtree(lib_target)
    _ensure_symlink(lib_target, shared_lib)

    _ensure_puppeteer(shared_lib)

    cached_chromium = _find_cached_chromium(shared_lib)
    if cached_chromium:
        browser_binary = cached_chromium
    else:
        browser_binary = _find_system_browser()
        if browser_binary:
            # Expose the system browser through a path inside the shared cache
            chromium_link = shared_lib / 'chromium-bin'
            _ensure_symlink(chromium_link, browser_binary)
            browser_binary = chromium_link

    if browser_binary:
        _stdout, stderr, returncode = run_archivebox_cmd_cwd(
            ['config', '--set', f'CHROME_BINARY={browser_binary}'],
            cwd=tmp_path,
        )
        assert returncode == 0, f"archivebox config CHROME_BINARY failed: {stderr}"

        # repr() yields a correctly escaped Python string literal, so a path
        # containing quotes or backslashes cannot break the generated script.
        binary_literal = repr(str(browser_binary))
        script = textwrap.dedent(f"""\
            import os
            os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.core.settings')
            import django
            django.setup()
            from django.utils import timezone
            from archivebox.machine.models import Binary, Machine
            machine = Machine.current()
            Binary.objects.filter(machine=machine, name='chromium').update(
                status='installed',
                abspath={binary_literal},
                binprovider='env',
                retry_at=timezone.now(),
            )
            Binary.objects.update_or_create(
                machine=machine,
                name='chromium',
                defaults={{
                    'status': 'installed',
                    'abspath': {binary_literal},
                    'binprovider': 'env',
                    'retry_at': timezone.now(),
                }},
            )
            print('OK')
            """
        )
        _stdout, stderr, returncode = run_python_cwd(script, cwd=tmp_path, timeout=60)
        assert returncode == 0, f"Register chromium binary failed: {stderr}"

    add_env = {
        'CHROME_ENABLED': 'True',
        'RESPONSES_ENABLED': 'True',
        'DOM_ENABLED': 'False',
        'SHOW_PROGRESS': 'False',
        'USE_COLOR': 'False',
        'CHROME_HEADLESS': 'True',
        'CHROME_PAGELOAD_TIMEOUT': '45',
        'CHROME_TIMEOUT': '60',
        'RESPONSES_TIMEOUT': '30',
    }
    if browser_binary:
        add_env['CHROME_BINARY'] = str(browser_binary)
    if cached_chromium:
        add_env['PUPPETEER_CACHE_DIR'] = str(shared_lib / 'puppeteer')

    _stdout, stderr, returncode = run_archivebox_cmd_cwd(
        ['add', '--depth=0', '--plugins=chrome,responses', 'https://example.com'],
        cwd=tmp_path,
        timeout=600,
        env=add_env,
    )
    assert returncode == 0, f"archivebox add failed: {stderr}"
    return tmp_path
- # =============================================================================
- # Output Assertions
- # =============================================================================
def parse_jsonl_output(stdout: str) -> List[Dict[str, Any]]:
    """
    Parse captured CLI output into records using the Process model's parser.

    A falsy ``stdout`` (None or empty) is parsed as the empty string.
    """
    from archivebox.machine.models import Process

    text = stdout if stdout else ''
    return Process.parse_records_from_text(text)
def assert_jsonl_contains_type(stdout: str, record_type: str, min_count: int = 1):
    """
    Assert that the output holds at least ``min_count`` records of a type.

    Args:
        stdout: Captured JSONL output to parse
        record_type: Value the records' ``type`` key must equal
        min_count: Minimum number of matching records required

    Returns:
        The list of matching records
    """
    matching = []
    for record in parse_jsonl_output(stdout):
        if record.get('type') == record_type:
            matching.append(record)
    assert len(matching) >= min_count, \
        f"Expected >= {min_count} {record_type}, got {len(matching)}"
    return matching
def assert_jsonl_pass_through(stdout: str, input_records: List[Dict[str, Any]]):
    """
    Assert that every input record with an id shows up again in the output.

    Records are matched on their ``id`` value only; input records whose id is
    missing or falsy are skipped.

    Args:
        stdout: Captured JSONL output to parse
        input_records: Records that were fed to the command
    """
    seen_ids = set()
    for output_rec in parse_jsonl_output(stdout):
        output_id = output_rec.get('id')
        if output_id:
            seen_ids.add(output_id)

    for input_rec in input_records:
        input_id = input_rec.get('id')
        if not input_id:
            continue
        assert input_id in seen_ids, \
            f"Input record {input_id} not found in output (pass-through failed)"
def assert_record_has_fields(record: Dict[str, Any], required_fields: List[str]):
    """
    Assert that each required field is present in the record and not None.

    Fields are checked in the order given; the first violation raises.

    Args:
        record: Record to inspect
        required_fields: Keys that must exist with a non-None value
    """
    for field in required_fields:
        is_present = field in record
        assert is_present, f"Record missing field: {field}"
        value = record[field]
        assert value is not None, f"Record field is None: {field}"
- # =============================================================================
- # Test Data Factories
- # =============================================================================
def create_test_url(domain: str = 'example.com', path: Optional[str] = None) -> str:
    """
    Generate a test URL, unique unless an explicit path is supplied.

    Args:
        domain: Host part of the URL
        path: Path segment; when omitted (or empty) the first 8 hex
            characters of a random UUID4 are used instead

    Returns:
        ``https://<domain>/<path>``
    """
    import uuid

    path = path or uuid.uuid4().hex[:8]
    return f'https://{domain}/{path}'
def create_test_crawl_json(urls: Optional[List[str]] = None, **kwargs) -> Dict[str, Any]:
    """
    Create Crawl JSONL record for testing.

    Args:
        urls: URLs to crawl, joined with newlines in the record; a single
            generated test URL is used when omitted or empty
        **kwargs: Extra or overriding record fields. ``max_depth``,
            ``tags_str`` and ``status`` default to ``0``, ``''`` and
            ``'queued'`` when not given.

    Returns:
        Dict with ``type`` set to ``'Crawl'`` plus the fields above
    """
    urls = urls or [create_test_url()]
    return {
        'type': 'Crawl',
        'urls': '\n'.join(urls),
        # Defaults first; any matching kwarg below replaces the value in place
        'max_depth': 0,
        'tags_str': '',
        'status': 'queued',
        **kwargs,
    }
def create_test_snapshot_json(url: Optional[str] = None, **kwargs) -> Dict[str, Any]:
    """
    Create Snapshot JSONL record for testing.

    Args:
        url: Snapshot URL; a generated test URL is used when omitted or empty
        **kwargs: Extra or overriding record fields. ``tags_str`` and
            ``status`` default to ``''`` and ``'queued'`` when not given.

    Returns:
        Dict with ``type`` set to ``'Snapshot'`` plus the fields above
    """
    return {
        'type': 'Snapshot',
        'url': url or create_test_url(),
        # Defaults first; any matching kwarg below replaces the value in place
        'tags_str': '',
        'status': 'queued',
        **kwargs,
    }
|