| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293 |
- """
- Integration tests for favicon plugin
- Tests verify:
- 1. Plugin script exists
- 2. requests library is available
- 3. Favicon extraction works for real example.com
- 4. Output file is actual image data
- 5. Tries multiple favicon URLs
- 6. Falls back to Google's favicon service
- 7. Config options work (TIMEOUT, USER_AGENT)
- 8. Handles failures gracefully
- """
- import json
- import subprocess
- import sys
- import tempfile
- from pathlib import Path
- import pytest
- from archivebox.plugins.chrome.tests.chrome_test_helpers import (
- get_plugin_dir,
- get_hook_script,
- parse_jsonl_output,
- )
# Plugin directory, obtained from the shared chrome test helper using this
# file's path (presumably the directory that contains the favicon hook -- confirm
# against get_plugin_dir).
PLUGIN_DIR = get_plugin_dir(__file__)
# Hook script under test, looked up inside PLUGIN_DIR by glob pattern. The tests
# call .exists() on it and pass str(FAVICON_HOOK) to the Python interpreter.
FAVICON_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_favicon.*')
# Default target URL handed to the hook via --url.
TEST_URL = 'https://example.com'
def test_hook_script_exists():
    """The favicon hook script located at import time must be present on disk."""
    hook_is_present = FAVICON_HOOK.exists()
    assert hook_is_present, f"Hook script not found: {FAVICON_HOOK}"
def test_requests_library_available():
    """Verify that ``requests`` can be imported by the current interpreter.

    The import is attempted in a child process started from ``sys.executable``,
    which prints ``requests.__version__`` on success.
    """
    result = subprocess.run(
        [sys.executable, '-c', 'import requests; print(requests.__version__)'],
        capture_output=True,
        text=True,
    )

    # Surface the child's traceback instead of silently falling through to the
    # stdout check, which would only report "no version printed".
    assert result.returncode == 0, (
        f"requests is not importable: {result.stderr.strip()}"
    )
    assert len(result.stdout.strip()) > 0, "Should report requests version"
def test_extracts_favicon_from_example_com():
    """Run the favicon hook against the real example.com and check its output.

    Note: example.com doesn't have a favicon and Google's service may also fail,
    so both outcomes are accepted. The hook must exit with 0 or 1 and print an
    ``ArchiveResult`` JSONL record. When that record reports ``succeeded``, the
    written ``favicon.ico`` must be a non-empty file under 1 MiB that starts
    with a known image signature; otherwise the status must be ``failed``.

    Requires network access. Skipped when ``requests`` cannot be imported.
    """
    # Skip instead of running the hook without its HTTP dependency.
    pytest.importorskip('requests')

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        # Run favicon extraction with the temp dir as working directory, which
        # is where favicon.ico is looked for below.
        result = subprocess.run(
            [sys.executable, str(FAVICON_HOOK), '--url', TEST_URL, '--snapshot-id', 'test789'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            timeout=60,
        )

        # May succeed (if Google service works) or fail (if no favicon)
        assert result.returncode in (0, 1), "Should complete extraction attempt"

        # Find the first ArchiveResult record in stdout. Only lines that look
        # like JSON objects are parsed; anything else is ignored.
        result_json = None
        for raw_line in result.stdout.strip().split('\n'):
            candidate = raw_line.strip()
            if not candidate.startswith('{'):
                continue
            try:
                record = json.loads(candidate)
            except json.JSONDecodeError:
                continue
            if record.get('type') == 'ArchiveResult':
                result_json = record
                break

        assert result_json, "Should have ArchiveResult JSONL output"

        if result_json['status'] == 'succeeded':
            favicon_file = tmpdir / 'favicon.ico'
            assert favicon_file.exists(), "favicon.ico not created"

            # Verify file is not empty and not implausibly large
            file_size = favicon_file.stat().st_size
            assert file_size > 0, "Favicon file should not be empty"
            assert file_size < 1024 * 1024, f"Favicon file suspiciously large: {file_size} bytes"

            # Check for common image magic bytes
            favicon_data = favicon_file.read_bytes()
            leading_signatures = (
                b'\x00\x00\x01\x00',    # ICO
                b'\x89PNG\r\n\x1a\n',   # PNG
                b'GIF',                 # GIF
                b'\xff\xd8',            # JPEG
            )
            is_image = (
                favicon_data.startswith(leading_signatures)
                or favicon_data[8:12] == b'WEBP'  # WebP marker sits at offset 8
            )
            assert is_image, "Favicon file should be a valid image format"
        else:
            # Failed as expected
            assert result_json['status'] == 'failed', f"Should report failure: {result_json}"
def test_config_timeout_honored():
    """Run the hook with ``TIMEOUT=5`` in its environment and check it finishes.

    Only the exit code (0 or 1) is asserted. The surrounding 30 second
    ``subprocess`` timeout turns a hang into a ``TimeoutExpired`` error.

    Requires network access. Skipped when ``requests`` cannot be imported.
    """
    import os

    # Skip instead of running the hook without its HTTP dependency.
    pytest.importorskip('requests')

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        # Set very short timeout (but example.com should still succeed)
        env = {**os.environ, 'TIMEOUT': '5'}

        result = subprocess.run(
            [sys.executable, str(FAVICON_HOOK), '--url', TEST_URL, '--snapshot-id', 'testtimeout'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            env=env,
            timeout=30,
        )

        # Should complete (success or fail, but not hang)
        assert result.returncode in (0, 1), "Should complete without hanging"
def test_config_user_agent():
    """Run the hook with a custom ``USER_AGENT`` in its environment.

    This is a smoke test: it does not inspect the outgoing request headers.
    The hook must exit with 0 or 1. When it exits 0 and prints an
    ``ArchiveResult`` JSONL record, that record must report ``succeeded``.

    Requires network access. Skipped when ``requests`` cannot be imported.
    """
    import os

    # Skip instead of running the hook without its HTTP dependency.
    pytest.importorskip('requests')

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        # Set custom user agent
        env = {**os.environ, 'USER_AGENT': 'TestBot/1.0'}

        result = subprocess.run(
            [sys.executable, str(FAVICON_HOOK), '--url', TEST_URL, '--snapshot-id', 'testua'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            env=env,
            timeout=60,
        )

        # Same completion check the sibling tests apply to this hook, so an
        # unexpected crash cannot pass unnoticed.
        assert result.returncode in (0, 1), "Should complete extraction attempt"

        if result.returncode == 0:
            # Find the first ArchiveResult record in stdout. Only lines that
            # look like JSON objects are parsed; anything else is ignored.
            result_json = None
            for raw_line in result.stdout.strip().split('\n'):
                candidate = raw_line.strip()
                if not candidate.startswith('{'):
                    continue
                try:
                    record = json.loads(candidate)
                except json.JSONDecodeError:
                    continue
                if record.get('type') == 'ArchiveResult':
                    result_json = record
                    break

            if result_json:
                assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
def test_handles_https_urls():
    """Run the hook against an HTTPS URL (https://example.org).

    The hook must exit with 0 or 1. When it exits 0 and ``favicon.ico`` exists
    in the working directory, the file must not be empty.

    Requires network access. Skipped when ``requests`` cannot be imported.
    """
    # Skip instead of running the hook without its HTTP dependency.
    pytest.importorskip('requests')

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        result = subprocess.run(
            [sys.executable, str(FAVICON_HOOK), '--url', 'https://example.org', '--snapshot-id', 'testhttps'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            timeout=60,
        )

        # Same completion check the sibling tests apply to this hook. Without
        # it, every assertion here is conditional and a crash would pass.
        assert result.returncode in (0, 1), "Should complete extraction attempt"

        if result.returncode == 0:
            favicon_file = tmpdir / 'favicon.ico'
            if favicon_file.exists():
                assert favicon_file.stat().st_size > 0
def test_handles_missing_favicon_gracefully():
    """Run the hook against a URL that likely has no favicon of its own.

    Note: The plugin falls back to Google's favicon service, which generates
    a generic icon even if the site doesn't have one, so extraction usually
    succeeds. The hook must exit with 0 or 1. On a non-zero exit, its combined
    stdout and stderr must contain ``No favicon found`` or ``ERROR=``.

    Requires network access. Skipped when ``requests`` cannot be imported.
    """
    # Skip instead of running the hook without its HTTP dependency.
    pytest.importorskip('requests')

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        # Try a URL that likely doesn't have a favicon
        result = subprocess.run(
            [sys.executable, str(FAVICON_HOOK), '--url', 'https://example.com/nonexistent', '--snapshot-id', 'test404'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            timeout=60,
        )

        # May succeed (Google fallback) or fail gracefully
        assert result.returncode in (0, 1), "Should complete (may succeed or fail)"

        if result.returncode != 0:
            combined = result.stdout + result.stderr
            assert 'No favicon found' in combined or 'ERROR=' in combined
def test_reports_missing_requests_library():
    """Run the hook in an interpreter that should not be able to import requests.

    The child interpreter is started with ``-S`` and with ``PYTHONPATH``
    pointing at a directory that does not exist. When the hook exits non-zero,
    its combined output must mention ``requests``, ``import`` or ``ERROR=``.
    A zero exit is tolerated and nothing further is asserted in that case.
    """
    import os

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        # Only PYTHONPATH is overridden; the rest of the environment (PATH
        # included) is inherited unchanged. Together with -S below, which
        # disables the `site` import, this is meant to hide third-party
        # packages from the child interpreter.
        env = os.environ.copy()
        env['PYTHONPATH'] = '/nonexistent'

        result = subprocess.run(
            [sys.executable, '-S', str(FAVICON_HOOK), '--url', TEST_URL, '--snapshot-id', 'test123'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            env=env,
            # Bound the run like the sibling tests do: if requests turns out
            # to be importable after all, the hook goes out to the network.
            timeout=60,
        )

        # Should fail and report missing requests
        if result.returncode != 0:
            combined = result.stdout + result.stderr
            # May report missing requests or other import errors
            assert 'requests' in combined.lower() or 'import' in combined.lower() or 'ERROR=' in combined
if __name__ == '__main__':
    # Propagate pytest's exit status so that running this file directly
    # reports test failures to the calling shell instead of always exiting 0.
    sys.exit(pytest.main([__file__, '-v']))
|