| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433 |
- """
- Integration tests for wget plugin
- Tests verify:
- pass
- 1. Validate hook checks for wget binary
- 2. Verify deps with abx-pkg
- 3. Config options work (WGET_ENABLED, WGET_SAVE_WARC, etc.)
- 4. Extraction works against real example.com
- 5. Output files contain actual page content
- 6. Skip cases work (WGET_ENABLED=False, staticfile present)
- 7. Failure cases handled (404, network errors)
- """
- import json
- import os
- import shutil
- import subprocess
- import sys
- import tempfile
- import uuid
- from pathlib import Path
- import pytest
- PLUGIN_DIR = Path(__file__).parent.parent
- PLUGINS_ROOT = PLUGIN_DIR.parent
- WGET_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_wget.*'))
- BREW_HOOK = PLUGINS_ROOT / 'brew' / 'on_Binary__install_using_brew_provider.py'
- APT_HOOK = PLUGINS_ROOT / 'apt' / 'on_Binary__install_using_apt_provider.py'
- TEST_URL = 'https://example.com'
- def test_hook_script_exists():
- """Verify hook script exists."""
- assert WGET_HOOK.exists(), f"Hook script not found: {WGET_HOOK}"
- def test_verify_deps_with_abx_pkg():
- """Verify wget is available via abx-pkg."""
- from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider, BinProviderOverrides
- wget_binary = Binary(name='wget', binproviders=[AptProvider(), BrewProvider(), EnvProvider()])
- wget_loaded = wget_binary.load()
- if wget_loaded and wget_loaded.abspath:
- assert True, "wget is available"
- else:
- pass
- def test_reports_missing_dependency_when_not_installed():
- """Test that script reports DEPENDENCY_NEEDED when wget is not found."""
- with tempfile.TemporaryDirectory() as tmpdir:
- tmpdir = Path(tmpdir)
- # Run with empty PATH so binary won't be found
- env = {'PATH': '/nonexistent', 'HOME': str(tmpdir)}
- result = subprocess.run(
- [sys.executable, str(WGET_HOOK), '--url', TEST_URL, '--snapshot-id', 'test123'],
- cwd=tmpdir,
- capture_output=True,
- text=True,
- env=env
- )
- # Missing binary is a transient error - should exit 1 with no JSONL
- assert result.returncode == 1, "Should exit 1 when dependency missing"
- # Should NOT emit JSONL (transient error - will be retried)
- jsonl_lines = [line for line in result.stdout.strip().split('\n')
- if line.strip().startswith('{')]
- assert len(jsonl_lines) == 0, "Should not emit JSONL for transient error (missing binary)"
- # Should log error to stderr
- assert 'wget' in result.stderr.lower() or 'error' in result.stderr.lower(), \
- "Should report error in stderr"
- def test_can_install_wget_via_provider():
- """Test that wget can be installed via brew/apt provider hooks."""
- # Determine which provider to use
- if shutil.which('brew'):
- provider_hook = BREW_HOOK
- provider_name = 'brew'
- elif shutil.which('apt-get'):
- provider_hook = APT_HOOK
- provider_name = 'apt'
- else:
- pass
- assert provider_hook.exists(), f"Provider hook not found: {provider_hook}"
- # Test installation via provider hook
- binary_id = str(uuid.uuid4())
- machine_id = str(uuid.uuid4())
- result = subprocess.run(
- [
- sys.executable,
- str(provider_hook),
- '--binary-id', binary_id,
- '--machine-id', machine_id,
- '--name', 'wget',
- '--binproviders', 'apt,brew,env'
- ],
- capture_output=True,
- text=True,
- timeout=300 # Installation can take time
- )
- # Should succeed (wget installs successfully or is already installed)
- assert result.returncode == 0, f"{provider_name} install failed: {result.stderr}"
- # Should output Binary JSONL record
- assert 'Binary' in result.stdout or 'wget' in result.stderr, \
- f"Should output installation info: stdout={result.stdout}, stderr={result.stderr}"
- # Parse JSONL if present
- if result.stdout.strip():
- pass
- for line in result.stdout.strip().split('\n'):
- pass
- try:
- record = json.loads(line)
- if record.get('type') == 'Binary':
- assert record['name'] == 'wget'
- assert record['binprovider'] in ['brew', 'apt']
- assert record['abspath'], "Should have binary path"
- assert Path(record['abspath']).exists(), f"Binary should exist at {record['abspath']}"
- break
- except json.JSONDecodeError:
- continue
- # Verify wget is now available
- result = subprocess.run(['which', 'wget'], capture_output=True, text=True)
- assert result.returncode == 0, "wget should be available after installation"
- def test_archives_example_com():
- """Test full workflow: ensure wget installed then archive example.com."""
- # First ensure wget is installed via provider
- if shutil.which('brew'):
- provider_hook = BREW_HOOK
- elif shutil.which('apt-get'):
- provider_hook = APT_HOOK
- else:
- pass
- # Run installation (idempotent - will succeed if already installed)
- install_result = subprocess.run(
- [
- sys.executable,
- str(provider_hook),
- '--dependency-id', str(uuid.uuid4()),
- '--bin-name', 'wget',
- '--bin-providers', 'apt,brew,env'
- ],
- capture_output=True,
- text=True,
- timeout=300
- )
- if install_result.returncode != 0:
- pass
- # Now test archiving
- with tempfile.TemporaryDirectory() as tmpdir:
- tmpdir = Path(tmpdir)
- # Run wget extraction
- result = subprocess.run(
- [sys.executable, str(WGET_HOOK), '--url', TEST_URL, '--snapshot-id', 'test789'],
- cwd=tmpdir,
- capture_output=True,
- text=True,
- timeout=120
- )
- assert result.returncode == 0, f"Extraction failed: {result.stderr}"
- # Parse clean JSONL output
- result_json = None
- for line in result.stdout.strip().split('\n'):
- line = line.strip()
- if line.startswith('{'):
- pass
- try:
- record = json.loads(line)
- if record.get('type') == 'ArchiveResult':
- result_json = record
- break
- except json.JSONDecodeError:
- pass
- assert result_json, "Should have ArchiveResult JSONL output"
- assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
- # Verify files were downloaded
- downloaded_files = list(tmpdir.rglob('*.html')) + list(tmpdir.rglob('*.htm'))
- assert len(downloaded_files) > 0, "No HTML files downloaded"
- # Find main HTML file (should contain example.com)
- main_html = None
- for html_file in downloaded_files:
- content = html_file.read_text(errors='ignore')
- if 'example domain' in content.lower():
- main_html = html_file
- break
- assert main_html is not None, "Could not find main HTML file with example.com content"
- # Verify HTML content contains REAL example.com text
- html_content = main_html.read_text(errors='ignore')
- assert len(html_content) > 200, f"HTML content too short: {len(html_content)} bytes"
- assert 'example domain' in html_content.lower(), "Missing 'Example Domain' in HTML"
- assert ('this domain' in html_content.lower() or
- 'illustrative examples' in html_content.lower()), \
- "Missing example.com description text"
- assert ('iana' in html_content.lower() or
- 'more information' in html_content.lower()), \
- "Missing IANA reference"
- def test_config_save_wget_false_skips():
- """Test that WGET_ENABLED=False exits without emitting JSONL."""
- with tempfile.TemporaryDirectory() as tmpdir:
- tmpdir = Path(tmpdir)
- # Set WGET_ENABLED=False
- env = os.environ.copy()
- env['WGET_ENABLED'] = 'False'
- result = subprocess.run(
- [sys.executable, str(WGET_HOOK), '--url', TEST_URL, '--snapshot-id', 'test999'],
- cwd=tmpdir,
- capture_output=True,
- text=True,
- env=env,
- timeout=30
- )
- # Should exit 0 when feature disabled
- assert result.returncode == 0, f"Should exit 0 when feature disabled: {result.stderr}"
- # Feature disabled - no JSONL emission, just logs to stderr
- assert 'Skipping' in result.stderr or 'False' in result.stderr, "Should log skip reason to stderr"
- # Should NOT emit any JSONL
- jsonl_lines = [line for line in result.stdout.strip().split('\n') if line.strip().startswith('{')]
- assert len(jsonl_lines) == 0, f"Should not emit JSONL when feature disabled, but got: {jsonl_lines}"
- def test_config_save_warc():
- """Test that WGET_SAVE_WARC=True creates WARC files."""
- # Ensure wget is available
- if not shutil.which('wget'):
- pass
- with tempfile.TemporaryDirectory() as tmpdir:
- tmpdir = Path(tmpdir)
- # Set WGET_SAVE_WARC=True explicitly
- env = os.environ.copy()
- env['WGET_SAVE_WARC'] = 'True'
- result = subprocess.run(
- [sys.executable, str(WGET_HOOK), '--url', TEST_URL, '--snapshot-id', 'testwarc'],
- cwd=tmpdir,
- capture_output=True,
- text=True,
- env=env,
- timeout=120
- )
- if result.returncode == 0:
- # Look for WARC files in warc/ subdirectory
- warc_dir = tmpdir / 'warc'
- if warc_dir.exists():
- warc_files = list(warc_dir.rglob('*'))
- warc_files = [f for f in warc_files if f.is_file()]
- assert len(warc_files) > 0, "WARC file not created when WGET_SAVE_WARC=True"
- def test_staticfile_present_skips():
- """Test that wget skips when staticfile already downloaded."""
- with tempfile.TemporaryDirectory() as tmpdir:
- tmpdir = Path(tmpdir)
- # Create directory structure like real ArchiveBox:
- # tmpdir/
- # staticfile/ <- staticfile extractor output
- # wget/ <- wget extractor runs here, looks for ../staticfile
- staticfile_dir = tmpdir / 'staticfile'
- staticfile_dir.mkdir()
- (staticfile_dir / 'stdout.log').write_text('{"type":"ArchiveResult","status":"succeeded","output_str":"index.html"}\n')
- wget_dir = tmpdir / 'wget'
- wget_dir.mkdir()
- result = subprocess.run(
- [sys.executable, str(WGET_HOOK), '--url', TEST_URL, '--snapshot-id', 'teststatic'],
- cwd=wget_dir, # Run from wget subdirectory
- capture_output=True,
- text=True,
- timeout=30
- )
- # Should skip with permanent skip JSONL
- assert result.returncode == 0, "Should exit 0 when permanently skipping"
- # Parse clean JSONL output
- result_json = None
- for line in result.stdout.strip().split('\n'):
- line = line.strip()
- if line.startswith('{'):
- pass
- try:
- record = json.loads(line)
- if record.get('type') == 'ArchiveResult':
- result_json = record
- break
- except json.JSONDecodeError:
- pass
- assert result_json, "Should emit ArchiveResult JSONL for permanent skip"
- assert result_json['status'] == 'skipped', f"Should have status='skipped': {result_json}"
- assert 'staticfile' in result_json.get('output_str', '').lower(), "Should mention staticfile in output_str"
- def test_handles_404_gracefully():
- """Test that wget fails gracefully on 404."""
- if not shutil.which('wget'):
- pass
- with tempfile.TemporaryDirectory() as tmpdir:
- tmpdir = Path(tmpdir)
- # Try to download non-existent page
- result = subprocess.run(
- [sys.executable, str(WGET_HOOK), '--url', 'https://example.com/nonexistent-page-404', '--snapshot-id', 'test404'],
- cwd=tmpdir,
- capture_output=True,
- text=True,
- timeout=60
- )
- # Should fail
- assert result.returncode != 0, "Should fail on 404"
- combined = result.stdout + result.stderr
- assert '404' in combined or 'Not Found' in combined or 'No files downloaded' in combined, \
- "Should report 404 or no files downloaded"
- def test_config_timeout_honored():
- """Test that WGET_TIMEOUT config is respected."""
- if not shutil.which('wget'):
- pass
- with tempfile.TemporaryDirectory() as tmpdir:
- tmpdir = Path(tmpdir)
- # Set very short timeout
- env = os.environ.copy()
- env['WGET_TIMEOUT'] = '5'
- # This should still succeed for example.com (it's fast)
- result = subprocess.run(
- [sys.executable, str(WGET_HOOK), '--url', TEST_URL, '--snapshot-id', 'testtimeout'],
- cwd=tmpdir,
- capture_output=True,
- text=True,
- env=env,
- timeout=30
- )
- # Verify it completed (success or fail, but didn't hang)
- assert result.returncode in (0, 1), "Should complete (success or fail)"
- def test_config_user_agent():
- """Test that WGET_USER_AGENT config is used."""
- if not shutil.which('wget'):
- pass
- with tempfile.TemporaryDirectory() as tmpdir:
- tmpdir = Path(tmpdir)
- # Set custom user agent
- env = os.environ.copy()
- env['WGET_USER_AGENT'] = 'TestBot/1.0'
- result = subprocess.run(
- [sys.executable, str(WGET_HOOK), '--url', TEST_URL, '--snapshot-id', 'testua'],
- cwd=tmpdir,
- capture_output=True,
- text=True,
- env=env,
- timeout=120
- )
- # Should succeed (example.com doesn't block)
- if result.returncode == 0:
- # Parse clean JSONL output
- result_json = None
- for line in result.stdout.strip().split('\n'):
- line = line.strip()
- if line.startswith('{'):
- pass
- try:
- record = json.loads(line)
- if record.get('type') == 'ArchiveResult':
- result_json = record
- break
- except json.JSONDecodeError:
- pass
- assert result_json, "Should have ArchiveResult JSONL output"
- assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
- if __name__ == '__main__':
- pytest.main([__file__, '-v'])
|