| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277 |
- """
- Integration tests for title plugin
- Tests verify:
- 1. Plugin script exists
- 2. Node.js is available
- 3. Title extraction works for real example.com
- 4. Output file contains actual page title
- 5. Handles various title sources (<title>, og:title, twitter:title)
- 6. Config options work (TITLE_TIMEOUT)
- """
- import json
- import shutil
- import subprocess
- import tempfile
- from pathlib import Path
- import pytest
- from archivebox.plugins.chrome.tests.chrome_test_helpers import (
- get_plugin_dir,
- get_hook_script,
- parse_jsonl_output,
- get_test_env,
- chrome_session,
- CHROME_NAVIGATE_HOOK,
- )
# Plugin directory, derived from this test file's path by the shared helper
# (helper semantics are not visible in this file).
PLUGIN_DIR = get_plugin_dir(__file__)
# Title hook script, looked up inside PLUGIN_DIR with a glob-style pattern.
# NOTE(review): test_hook_script_exists calls .exists() on this, so it is
# presumably a pathlib.Path -- confirm against get_hook_script.
TITLE_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_title.*')
# Default page used by the tests; requires real network access.
TEST_URL = 'https://example.com'
def run_title_capture(title_dir, snapshot_chrome_dir, env, url, snapshot_id):
    """Run the Chrome navigate hook, then the title hook, for one URL.

    Both hooks are launched through ``node`` with identical ``--url`` and
    ``--snapshot-id`` arguments and the same environment. The title hook is
    started regardless of the navigate hook's exit status; callers are
    expected to inspect both results.

    Args:
        title_dir: Working directory for the title hook.
        snapshot_chrome_dir: Working directory for the navigate hook.
        env: Environment mapping passed to both subprocesses.
        url: URL forwarded to both hooks.
        snapshot_id: Snapshot identifier forwarded to both hooks.

    Returns:
        A ``(nav_result, result)`` tuple of ``subprocess.CompletedProcess``
        objects (navigate hook first, title hook second), with captured
        text stdout/stderr.

    Raises:
        subprocess.TimeoutExpired: If navigation exceeds 120 seconds or
            title extraction exceeds 60 seconds.
    """
    hook_args = [f'--url={url}', f'--snapshot-id={snapshot_id}']
    shared_options = {'capture_output': True, 'text': True, 'env': env}

    navigation = subprocess.run(
        ['node', str(CHROME_NAVIGATE_HOOK), *hook_args],
        cwd=str(snapshot_chrome_dir),
        timeout=120,
        **shared_options,
    )
    extraction = subprocess.run(
        ['node', str(TITLE_HOOK), *hook_args],
        cwd=title_dir,
        timeout=60,
        **shared_options,
    )
    return navigation, extraction
def test_hook_script_exists():
    """The title hook script resolved at import time must exist on disk."""
    hook_present = TITLE_HOOK.exists()
    assert hook_present, f"Hook script not found: {TITLE_HOOK}"
def test_extracts_title_from_example_com():
    """Test full workflow: extract the title from the real example.com.

    Starts a Chrome session, navigates to TEST_URL, runs the title hook and
    then checks three things: both hooks exit with 0, the title hook prints
    an ``ArchiveResult`` JSONL record with status ``succeeded``, and
    ``title.txt`` in the hook's working directory contains "Example Domain".
    """
    # Fail with a clear message up front instead of an opaque
    # FileNotFoundError from the first subprocess call.
    assert shutil.which('node'), "node executable not found on PATH (required to run the hooks)"

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        with chrome_session(tmpdir, test_url=TEST_URL, navigate=False) as (_process, _pid, snapshot_chrome_dir, env):
            title_dir = snapshot_chrome_dir.parent / 'title'
            title_dir.mkdir(exist_ok=True)

            nav_result, result = run_title_capture(
                title_dir,
                snapshot_chrome_dir,
                env,
                TEST_URL,
                'test789',
            )
            assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}"
            assert result.returncode == 0, f"Extraction failed: {result.stderr}"

            # Pick the first ArchiveResult record out of the hook's stdout;
            # anything that is not a JSON object line is ignored.
            result_json = None
            for line in result.stdout.strip().split('\n'):
                line = line.strip()
                if not line.startswith('{'):
                    continue
                try:
                    record = json.loads(line)
                except json.JSONDecodeError:
                    # Looks like JSON but is not: treat as ordinary log output.
                    continue
                if record.get('type') == 'ArchiveResult':
                    result_json = record
                    break

            assert result_json, "Should have ArchiveResult JSONL output"
            assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"

            # Verify output file exists (hook writes to current directory)
            title_file = title_dir / 'title.txt'
            assert title_file.exists(), "title.txt not created"

            # Verify title contains the REAL example.com title
            title_text = title_file.read_text(encoding='utf-8').strip()
            assert len(title_text) > 0, "Title should not be empty"
            assert 'example' in title_text.lower(), "Title should contain 'example'"
            # example.com has title "Example Domain"
            assert 'example domain' in title_text.lower(), f"Expected 'Example Domain', got: {title_text}"
def test_fails_without_chrome_session():
    """Test that the title plugin fails when no Chrome session exists.

    The hook is run in a fresh ``snapshot/title`` directory with no sibling
    Chrome session; it must exit non-zero and report the missing session on
    stdout or stderr.
    """
    # Fail with a clear message up front instead of an opaque
    # FileNotFoundError from subprocess.run.
    assert shutil.which('node'), "node executable not found on PATH (required to run the hooks)"

    with tempfile.TemporaryDirectory() as tmpdir:
        title_dir = Path(tmpdir) / 'snapshot' / 'title'
        title_dir.mkdir(parents=True, exist_ok=True)

        # Run title extraction directly, without starting Chrome first.
        result = subprocess.run(
            ['node', str(TITLE_HOOK), f'--url={TEST_URL}', '--snapshot-id=testhttp'],
            cwd=title_dir,
            capture_output=True,
            text=True,
            timeout=60,
            env=get_test_env(),
        )

        assert result.returncode != 0, f"Should fail without chrome session: {result.stderr}"
        combined_output = result.stdout + result.stderr
        assert 'No Chrome session found (chrome plugin must run first)' in combined_output
def test_config_timeout_honored():
    """Test that the title hook completes when TITLE_TIMEOUT is set.

    Runs the normal navigate + extract flow with ``TITLE_TIMEOUT=5`` added to
    the session environment. Only completion is checked: the hook may exit
    with 0 or 1, but must not hang (the subprocess timeout would raise).
    """
    # Fail with a clear message up front instead of an opaque
    # FileNotFoundError from the first subprocess call.
    assert shutil.which('node'), "node executable not found on PATH (required to run the hooks)"

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        with chrome_session(tmpdir, test_url=TEST_URL, navigate=False) as (_process, _pid, snapshot_chrome_dir, env):
            title_dir = snapshot_chrome_dir.parent / 'title'
            title_dir.mkdir(exist_ok=True)

            # Override only the timeout, on a copy of the session environment.
            # Merging all of os.environ on top would overwrite the values the
            # chrome session prepared with whatever the ambient process has.
            # Very short timeout (but example.com should still succeed).
            timeout_env = dict(env)
            timeout_env['TITLE_TIMEOUT'] = '5'

            nav_result, result = run_title_capture(
                title_dir,
                snapshot_chrome_dir,
                timeout_env,
                TEST_URL,
                'testtimeout',
            )
            assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}"

            # Should complete (success or fail, but not hang)
            assert result.returncode in (0, 1), "Should complete without hanging"
def test_handles_https_urls():
    """Test that HTTPS URLs work correctly.

    Navigation must succeed. The title check is best-effort: it only runs
    when the title hook exits with 0 and actually wrote ``title.txt``.
    """
    # Fail with a clear message up front instead of an opaque
    # FileNotFoundError from the first subprocess call.
    assert shutil.which('node'), "node executable not found on PATH (required to run the hooks)"

    https_url = 'https://example.org'

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        with chrome_session(tmpdir, test_url=https_url, navigate=False) as (_process, _pid, snapshot_chrome_dir, env):
            title_dir = snapshot_chrome_dir.parent / 'title'
            title_dir.mkdir(exist_ok=True)

            nav_result, result = run_title_capture(
                title_dir,
                snapshot_chrome_dir,
                env,
                https_url,
                'testhttps',
            )
            assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}"

            # Hook writes to current directory
            output_title_file = title_dir / 'title.txt'
            if result.returncode == 0 and output_title_file.exists():
                title_text = output_title_file.read_text(encoding='utf-8').strip()
                assert len(title_text) > 0, "Title should not be empty"
                assert 'example' in title_text.lower()
def test_handles_404_gracefully():
    """Test that the title plugin handles 404 pages.

    Note: example.com returns valid HTML even for 404 pages, so extraction may
    succeed with the generic "Example Domain" title. Only completion of the
    title hook (exit code 0 or 1) is asserted.
    """
    # Fail with a clear message up front instead of an opaque
    # FileNotFoundError from the first subprocess call.
    assert shutil.which('node'), "node executable not found on PATH (required to run the hooks)"

    missing_page_url = 'https://example.com/nonexistent-page-404'

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        with chrome_session(tmpdir, test_url=missing_page_url, navigate=False) as (
            _process,
            _pid,
            snapshot_chrome_dir,
            env,
        ):
            title_dir = snapshot_chrome_dir.parent / 'title'
            title_dir.mkdir(exist_ok=True)

            nav_result, result = run_title_capture(
                title_dir,
                snapshot_chrome_dir,
                env,
                missing_page_url,
                'test404',
            )
            assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}"

            # May succeed or fail depending on server behavior
            # example.com returns "Example Domain" even for 404s
            assert result.returncode in (0, 1), "Should complete (may succeed or fail)"
def test_handles_redirects():
    """Test that the title plugin handles redirects correctly.

    Uses the plain-HTTP example.com URL. Navigation must succeed; the title
    check is best-effort and only runs when the title hook exits with 0 and
    actually wrote ``title.txt``.
    """
    # Fail with a clear message up front instead of an opaque
    # FileNotFoundError from the first subprocess call.
    assert shutil.which('node'), "node executable not found on PATH (required to run the hooks)"

    # http://example.com redirects to https://example.com
    redirecting_url = 'http://example.com'

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        with chrome_session(tmpdir, test_url=redirecting_url, navigate=False) as (
            _process,
            _pid,
            snapshot_chrome_dir,
            env,
        ):
            title_dir = snapshot_chrome_dir.parent / 'title'
            title_dir.mkdir(exist_ok=True)

            nav_result, result = run_title_capture(
                title_dir,
                snapshot_chrome_dir,
                env,
                redirecting_url,
                'testredirect',
            )
            assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}"

            # Should succeed and follow redirect
            # Hook writes to current directory
            output_title_file = title_dir / 'title.txt'
            if result.returncode == 0 and output_title_file.exists():
                title_text = output_title_file.read_text(encoding='utf-8').strip()
                assert 'example' in title_text.lower()
# Allow running this module directly as a script: hands this file to pytest
# in verbose mode.
if __name__ == '__main__':
    pytest.main([__file__, '-v'])
|