# archival.ArchiveBox — mirror of https://github.com/ArchiveBox/ArchiveBox.git
"""
Integration tests for title plugin

Tests verify:
1. Plugin script exists
2. Node.js is available
3. Title extraction works for real example.com
4. Output file contains actual page title
5. Handles various title sources (<title>, og:title, twitter:title)
6. Config options work (TITLE_TIMEOUT)
"""

import json
import shutil
import subprocess
import tempfile
from pathlib import Path

import pytest

from archivebox.plugins.chrome.tests.chrome_test_helpers import (
    get_plugin_dir,
    get_hook_script,
    parse_jsonl_output,
    get_test_env,
    chrome_session,
    CHROME_NAVIGATE_HOOK,
)


# Plugin directory under test, derived by the shared helper from this file's
# path (presumably the parent plugin folder of this tests/ dir -- confirm in helper).
PLUGIN_DIR = get_plugin_dir(__file__)
# Title hook script looked up inside PLUGIN_DIR; the pattern contains wildcards,
# so the helper presumably glob-matches it (numeric prefix and extension vary).
TITLE_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_title.*')
# Default page used by the tests; its title is expected to be "Example Domain".
TEST_URL = 'https://example.com'

def run_title_capture(title_dir, snapshot_chrome_dir, env, url, snapshot_id):
    """Run the chrome navigate hook, then the title hook, for one URL.

    Both hooks are launched with ``node`` and receive the same ``--url`` and
    ``--snapshot-id`` arguments. The navigate hook runs inside
    ``snapshot_chrome_dir`` (120s limit); the title hook runs inside
    ``title_dir`` (60s limit). The title hook is started regardless of how
    navigation ended, so callers must check both results.

    Returns:
        Tuple ``(nav_result, result)`` of ``subprocess.CompletedProcess``
        objects with captured text stdout/stderr.
    """
    cli_args = [f'--url={url}', f'--snapshot-id={snapshot_id}']
    shared_opts = dict(capture_output=True, text=True, env=env)

    navigate_cmd = ['node', str(CHROME_NAVIGATE_HOOK), *cli_args]
    nav_result = subprocess.run(
        navigate_cmd,
        cwd=str(snapshot_chrome_dir),
        timeout=120,
        **shared_opts,
    )

    title_cmd = ['node', str(TITLE_HOOK), *cli_args]
    result = subprocess.run(
        title_cmd,
        cwd=title_dir,
        timeout=60,
        **shared_opts,
    )

    return nav_result, result


def test_hook_script_exists():
    """Check that the title hook script resolved at import time is present on disk."""
    hook_present = TITLE_HOOK.exists()
    assert hook_present, f"Hook script not found: {TITLE_HOOK}"


def test_extracts_title_from_example_com():
    """Test full workflow: extract title from real example.com.

    Navigates a Chrome session to TEST_URL, runs the title hook, then checks
    both the ArchiveResult JSONL record printed by the hook and the
    ``title.txt`` file it writes into its working directory.
    """

    # The hooks are launched with `node`; fail with a clear message when it is
    # missing (the previous check was a no-op `pass`).
    assert shutil.which('node'), "node executable not found on PATH; it is required to run the plugin hooks"

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        with chrome_session(tmpdir, test_url=TEST_URL, navigate=False) as (_process, _pid, snapshot_chrome_dir, env):
            title_dir = snapshot_chrome_dir.parent / 'title'
            title_dir.mkdir(exist_ok=True)

            nav_result, result = run_title_capture(
                title_dir,
                snapshot_chrome_dir,
                env,
                TEST_URL,
                'test789',
            )
            assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}"

        assert result.returncode == 0, f"Extraction failed: {result.stderr}"

        # Pick the first ArchiveResult record out of the hook's stdout; lines
        # that are not JSON objects (or fail to parse) are ignored.
        result_json = None
        for line in result.stdout.strip().split('\n'):
            line = line.strip()
            if not line.startswith('{'):
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                continue
            if record.get('type') == 'ArchiveResult':
                result_json = record
                break

        assert result_json, "Should have ArchiveResult JSONL output"
        assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"

        # Verify output file exists (hook writes to current directory)
        title_file = title_dir / 'title.txt'
        assert title_file.exists(), "title.txt not created"

        # Verify title contains REAL example.com title
        title_text = title_file.read_text().strip()
        assert title_text, "Title should not be empty"
        assert 'example' in title_text.lower(), "Title should contain 'example'"

        # example.com has title "Example Domain"
        assert 'example domain' in title_text.lower(), f"Expected 'Example Domain', got: {title_text}"


def test_fails_without_chrome_session():
    """Test that title plugin fails when chrome session is missing.

    The title hook is run in a fresh directory with no chrome session set up;
    it must exit non-zero and report the missing session in its output.
    """

    # Without node, subprocess.run would raise FileNotFoundError instead of
    # exercising the hook; fail with an explicit message (the previous check
    # was a no-op `pass`).
    assert shutil.which('node'), "node executable not found on PATH; it is required to run the plugin hooks"

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        title_dir = tmpdir / 'snapshot' / 'title'
        title_dir.mkdir(parents=True, exist_ok=True)

        # Run title extraction directly, skipping the chrome navigate step
        result = subprocess.run(
            ['node', str(TITLE_HOOK), f'--url={TEST_URL}', '--snapshot-id=testhttp'],
            cwd=title_dir,
            capture_output=True,
            text=True,
            timeout=60,
            env=get_test_env(),
        )

        assert result.returncode != 0, f"Should fail without chrome session: {result.stderr}"
        # The message may be printed on either stream, so search both.
        combined_output = result.stdout + result.stderr
        assert 'No Chrome session found (chrome plugin must run first)' in combined_output


def test_config_timeout_honored():
    """Test that TITLE_TIMEOUT config is respected.

    Runs the title hook with ``TITLE_TIMEOUT=5`` and checks that it finishes
    with a normal exit code (0 or 1). A hang would surface as a
    ``subprocess.TimeoutExpired`` raised by the 60s limit in
    ``run_title_capture``.
    """

    # The hooks are launched with `node`; fail with a clear message when it is
    # missing (the previous check was a no-op `pass`).
    assert shutil.which('node'), "node executable not found on PATH; it is required to run the plugin hooks"

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        with chrome_session(tmpdir, test_url=TEST_URL, navigate=False) as (_process, _pid, snapshot_chrome_dir, env):
            title_dir = snapshot_chrome_dir.parent / 'title'
            title_dir.mkdir(exist_ok=True)

            # Set very short timeout (but example.com should still succeed).
            # Only TITLE_TIMEOUT is overridden, on a copy of the session env:
            # merging a full os.environ copy back in would overwrite whatever
            # values the chrome session configured for keys that also exist in
            # the ambient environment.
            timeout_env = dict(env)
            timeout_env['TITLE_TIMEOUT'] = '5'

            nav_result, result = run_title_capture(
                title_dir,
                snapshot_chrome_dir,
                timeout_env,
                TEST_URL,
                'testtimeout',
            )
            assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}"

        # Should complete (success or fail, but not hang)
        assert result.returncode in (0, 1), "Should complete without hanging"


def test_handles_https_urls():
    """Test that HTTPS URLs work correctly.

    Navigation to https://example.org must succeed. The title hook itself is
    allowed to fail; the title content is only checked when the hook exits 0
    and a ``title.txt`` file was written.
    """

    # The hooks are launched with `node`; fail with a clear message when it is
    # missing (the previous check was a no-op `pass`).
    assert shutil.which('node'), "node executable not found on PATH; it is required to run the plugin hooks"

    https_url = 'https://example.org'

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        with chrome_session(tmpdir, test_url=https_url, navigate=False) as (_process, _pid, snapshot_chrome_dir, env):
            title_dir = snapshot_chrome_dir.parent / 'title'
            title_dir.mkdir(exist_ok=True)

            nav_result, result = run_title_capture(
                title_dir,
                snapshot_chrome_dir,
                env,
                https_url,
                'testhttps',
            )
            assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}"

        if result.returncode == 0:
            # Hook writes to current directory
            output_title_file = title_dir / 'title.txt'
            if output_title_file.exists():
                title_text = output_title_file.read_text().strip()
                assert len(title_text) > 0, "Title should not be empty"
                assert 'example' in title_text.lower()


def test_handles_404_gracefully():
    """Test that title plugin handles 404 pages.

    Note: example.com returns valid HTML even for 404 pages, so extraction may succeed
    with the generic "Example Domain" title. The test therefore only requires
    that navigation succeeds and that the hook exits with code 0 or 1.
    """

    # The hooks are launched with `node`; fail with a clear message when it is
    # missing (the previous check was a no-op `pass`).
    assert shutil.which('node'), "node executable not found on PATH; it is required to run the plugin hooks"

    missing_page_url = 'https://example.com/nonexistent-page-404'

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        with chrome_session(tmpdir, test_url=missing_page_url, navigate=False) as (
            _process,
            _pid,
            snapshot_chrome_dir,
            env,
        ):
            title_dir = snapshot_chrome_dir.parent / 'title'
            title_dir.mkdir(exist_ok=True)

            nav_result, result = run_title_capture(
                title_dir,
                snapshot_chrome_dir,
                env,
                missing_page_url,
                'test404',
            )
            assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}"

        # May succeed or fail depending on server behavior
        # example.com returns "Example Domain" even for 404s
        assert result.returncode in (0, 1), "Should complete (may succeed or fail)"


def test_handles_redirects():
    """Test that title plugin handles redirects correctly.

    Starts from the plain-HTTP URL. Navigation must succeed; the title
    content is only checked when the hook exits 0 and a ``title.txt`` file
    was written.
    """

    # The hooks are launched with `node`; fail with a clear message when it is
    # missing (the previous check was a no-op `pass`).
    assert shutil.which('node'), "node executable not found on PATH; it is required to run the plugin hooks"

    # http://example.com redirects to https://example.com
    redirecting_url = 'http://example.com'

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        with chrome_session(tmpdir, test_url=redirecting_url, navigate=False) as (
            _process,
            _pid,
            snapshot_chrome_dir,
            env,
        ):
            title_dir = snapshot_chrome_dir.parent / 'title'
            title_dir.mkdir(exist_ok=True)

            nav_result, result = run_title_capture(
                title_dir,
                snapshot_chrome_dir,
                env,
                redirecting_url,
                'testredirect',
            )
            assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}"

        # Should succeed and follow redirect
        if result.returncode == 0:
            # Hook writes to current directory
            output_title_file = title_dir / 'title.txt'
            if output_title_file.exists():
                title_text = output_title_file.read_text().strip()
                assert 'example' in title_text.lower()


# Allow running this file directly: hands this file to pytest in verbose mode.
if __name__ == '__main__':
    pytest.main([__file__, '-v'])