"""
Integration tests for title plugin
Tests verify:
1. Plugin script exists
2. Node.js is available
3. Title extraction works for real example.com
4. Output file contains actual page title
5. Handles various title sources (<title>, og:title, twitter:title)
6. Config options work (TITLE_TIMEOUT)
"""
import json
import shutil
import subprocess
import tempfile
from pathlib import Path
import pytest
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
get_plugin_dir,
get_hook_script,
parse_jsonl_output,
get_test_env,
chrome_session,
CHROME_NAVIGATE_HOOK,
)
# Directory of the plugin under test, derived from this test file's path by
# the shared helper (the exact resolution rule lives in chrome_test_helpers).
PLUGIN_DIR = get_plugin_dir(__file__)
# Title hook script looked up in PLUGIN_DIR via the given filename pattern;
# used below as a path-like object (``.exists()``, ``str()``).
TITLE_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_title.*')
# Default page used by the tests; its title is expected to be "Example Domain".
TEST_URL = 'https://example.com'
def run_title_capture(title_dir, snapshot_chrome_dir, env, url, snapshot_id):
    """Run the Chrome navigate hook, then the title hook, for one snapshot.

    Both hooks are launched with ``node`` and receive the same ``--url`` and
    ``--snapshot-id`` arguments and the same environment. The navigate hook
    runs from ``snapshot_chrome_dir`` with a 120 second limit; the title hook
    runs from ``title_dir`` with a 60 second limit. The title hook is started
    no matter how the navigate hook exited, so callers must inspect both
    results themselves.

    Args:
        title_dir: Working directory for the title hook.
        snapshot_chrome_dir: Working directory for the navigate hook.
        env: Environment mapping passed to both subprocesses.
        url: URL forwarded to both hooks.
        snapshot_id: Snapshot identifier forwarded to both hooks.

    Returns:
        A ``(nav_result, result)`` tuple of ``subprocess.CompletedProcess``
        objects for the navigate hook and the title hook, in that order.

    Raises:
        subprocess.TimeoutExpired: If either hook exceeds its time limit.
    """
    hook_args = [f'--url={url}', f'--snapshot-id={snapshot_id}']
    # Options shared by both invocations: capture stdout/stderr as text.
    shared_opts = {'capture_output': True, 'text': True, 'env': env}

    navigation = subprocess.run(
        ['node', str(CHROME_NAVIGATE_HOOK), *hook_args],
        cwd=str(snapshot_chrome_dir),
        timeout=120,
        **shared_opts,
    )
    extraction = subprocess.run(
        ['node', str(TITLE_HOOK), *hook_args],
        cwd=title_dir,
        timeout=60,
        **shared_opts,
    )
    return navigation, extraction
def test_hook_script_exists():
    """The resolved title hook path must exist on disk."""
    hook_present = TITLE_HOOK.exists()
    assert hook_present, f"Hook script not found: {TITLE_HOOK}"
def test_extracts_title_from_example_com():
    """Test full workflow: extract title from real example.com.

    Starts a Chrome session, navigates to ``TEST_URL``, runs the title hook,
    then verifies both the ``ArchiveResult`` JSONL record printed on stdout
    and the ``title.txt`` file written into the hook's working directory.
    """
    # Fail up front with a clear message; previously this check was a no-op
    # and a missing node only surfaced later as an obscure launch error.
    assert shutil.which('node'), "node executable not found on PATH (required to run the plugin hooks)"

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        with chrome_session(tmpdir, test_url=TEST_URL, navigate=False) as (_process, _pid, snapshot_chrome_dir, env):
            title_dir = snapshot_chrome_dir.parent / 'title'
            title_dir.mkdir(exist_ok=True)

            nav_result, result = run_title_capture(
                title_dir,
                snapshot_chrome_dir,
                env,
                TEST_URL,
                'test789',
            )
            assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}"
            assert result.returncode == 0, f"Extraction failed: {result.stderr}"

            # Scan stdout for the first JSON object line whose type is
            # ArchiveResult; non-JSON lines (logs, noise) are ignored.
            result_json = None
            for line in result.stdout.strip().split('\n'):
                line = line.strip()
                if not line.startswith('{'):
                    continue
                try:
                    record = json.loads(line)
                except json.JSONDecodeError:
                    # Looked like JSON but was not; keep scanning.
                    continue
                if record.get('type') == 'ArchiveResult':
                    result_json = record
                    break

            assert result_json, "Should have ArchiveResult JSONL output"
            assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"

            # Verify output file exists (hook writes to current directory)
            title_file = title_dir / 'title.txt'
            assert title_file.exists(), "title.txt not created"

            # Verify title contains REAL example.com title
            title_text = title_file.read_text().strip()
            assert len(title_text) > 0, "Title should not be empty"
            assert 'example' in title_text.lower(), "Title should contain 'example'"

            # example.com has title "Example Domain"
            assert 'example domain' in title_text.lower(), f"Expected 'Example Domain', got: {title_text}"
def test_fails_without_chrome_session():
    """Test that title plugin fails when chrome session is missing.

    Runs the title hook in a fresh directory where no Chrome session was
    started and expects a non-zero exit code plus the explanatory message
    on stdout or stderr.
    """
    # Fail up front with a clear message; previously this check was a no-op
    # and a missing node raised FileNotFoundError from subprocess.run.
    assert shutil.which('node'), "node executable not found on PATH (required to run the plugin hooks)"

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        title_dir = tmpdir / 'snapshot' / 'title'
        title_dir.mkdir(parents=True, exist_ok=True)

        # Run title extraction directly, without any prior navigate step.
        result = subprocess.run(
            ['node', str(TITLE_HOOK), f'--url={TEST_URL}', '--snapshot-id=testhttp'],
            cwd=title_dir,
            capture_output=True,
            text=True,
            timeout=60,
            env=get_test_env(),
        )

        assert result.returncode != 0, f"Should fail without chrome session: {result.stderr}"
        # The message may be printed on either stream, so search both.
        assert 'No Chrome session found (chrome plugin must run first)' in (result.stdout + result.stderr)
def test_config_timeout_honored():
    """Test that TITLE_TIMEOUT config is respected.

    Runs the normal navigate + title workflow with ``TITLE_TIMEOUT`` set to
    a short value and checks that the title hook terminates with exit code
    0 or 1 rather than hanging.
    """
    # Fail up front with a clear message; previously this check was a no-op.
    assert shutil.which('node'), "node executable not found on PATH (required to run the plugin hooks)"

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        with chrome_session(tmpdir, test_url=TEST_URL, navigate=False) as (_process, _pid, snapshot_chrome_dir, env):
            title_dir = snapshot_chrome_dir.parent / 'title'
            title_dir.mkdir(exist_ok=True)

            # Set very short timeout (but example.com should still succeed).
            # Only this one key is overridden: merging a full copy of
            # os.environ into the session environment would overwrite any
            # values the chrome session prepared for the hooks.
            env['TITLE_TIMEOUT'] = '5'

            nav_result, result = run_title_capture(
                title_dir,
                snapshot_chrome_dir,
                env,
                TEST_URL,
                'testtimeout',
            )
            assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}"

            # Should complete (success or fail, but not hang)
            assert result.returncode in (0, 1), "Should complete without hanging"
def test_handles_https_urls():
    """Test that HTTPS URLs work correctly.

    Navigation to ``https://example.org`` must succeed. The title content is
    checked only when the title hook exited with 0 and produced
    ``title.txt``; a failing hook does not fail this test.
    """
    # Fail up front with a clear message; previously this check was a no-op.
    assert shutil.which('node'), "node executable not found on PATH (required to run the plugin hooks)"

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        with chrome_session(tmpdir, test_url='https://example.org', navigate=False) as (_process, _pid, snapshot_chrome_dir, env):
            title_dir = snapshot_chrome_dir.parent / 'title'
            title_dir.mkdir(exist_ok=True)

            nav_result, result = run_title_capture(
                title_dir,
                snapshot_chrome_dir,
                env,
                'https://example.org',
                'testhttps',
            )
            assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}"

            # Hook writes to current directory
            output_title_file = title_dir / 'title.txt'
            if result.returncode == 0 and output_title_file.exists():
                title_text = output_title_file.read_text().strip()
                assert len(title_text) > 0, "Title should not be empty"
                assert 'example' in title_text.lower()
def test_handles_404_gracefully():
    """Test that title plugin handles 404 pages.

    Note: example.com returns valid HTML even for 404 pages, so extraction may succeed
    with the generic "Example Domain" title. The test therefore only requires
    that navigation succeeds and that the title hook exits with 0 or 1.
    """
    # Fail up front with a clear message; previously this check was a no-op.
    assert shutil.which('node'), "node executable not found on PATH (required to run the plugin hooks)"

    missing_page_url = 'https://example.com/nonexistent-page-404'

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        with chrome_session(tmpdir, test_url=missing_page_url, navigate=False) as (
            _process,
            _pid,
            snapshot_chrome_dir,
            env,
        ):
            title_dir = snapshot_chrome_dir.parent / 'title'
            title_dir.mkdir(exist_ok=True)

            nav_result, result = run_title_capture(
                title_dir,
                snapshot_chrome_dir,
                env,
                missing_page_url,
                'test404',
            )
            assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}"

            # May succeed or fail depending on server behavior
            # example.com returns "Example Domain" even for 404s
            assert result.returncode in (0, 1), "Should complete (may succeed or fail)"
def test_handles_redirects():
    """Test that title plugin handles redirects correctly.

    Navigation to the plain-HTTP URL must succeed. The title content is
    checked only when the title hook exited with 0 and produced
    ``title.txt``; a failing hook does not fail this test.
    """
    # Fail up front with a clear message; previously this check was a no-op.
    assert shutil.which('node'), "node executable not found on PATH (required to run the plugin hooks)"

    # Per the original test note, http://example.com is expected to redirect
    # to https://example.com.
    redirecting_url = 'http://example.com'

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        with chrome_session(tmpdir, test_url=redirecting_url, navigate=False) as (
            _process,
            _pid,
            snapshot_chrome_dir,
            env,
        ):
            title_dir = snapshot_chrome_dir.parent / 'title'
            title_dir.mkdir(exist_ok=True)

            nav_result, result = run_title_capture(
                title_dir,
                snapshot_chrome_dir,
                env,
                redirecting_url,
                'testredirect',
            )
            assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}"

            # Hook writes to current directory
            output_title_file = title_dir / 'title.txt'
            if result.returncode == 0 and output_title_file.exists():
                title_text = output_title_file.read_text().strip()
                assert 'example' in title_text.lower()
if __name__ == '__main__':
    # Run this module's tests verbosely when executed directly, and pass
    # pytest's exit status on to the shell so a failing run is not reported
    # as success.
    raise SystemExit(pytest.main([__file__, '-v']))