software
/
archival.ArchiveBox
mirror of https://github.com/ArchiveBox/ArchiveBox.git


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254
							"""
Tests for archivebox run CLI command.

Tests cover:
- run with stdin JSONL (Crawl, Snapshot, ArchiveResult)
- create-or-update behavior (records with/without id)
- pass-through output (for chaining)
- mixed record types and empty / pass-through-only stdin
"""

import json
import pytest

from archivebox.tests.conftest import (
    run_archivebox_cmd,
    parse_jsonl_output,
    create_test_url,
    create_test_crawl_json,
    create_test_snapshot_json,
)


class TestRunWithCrawl:
    """Tests for `archivebox run` with Crawl input.

    Each test pipes a single Crawl JSON record into `archivebox run` via
    stdin and inspects the JSONL records the command writes to stdout.
    """

    def test_run_with_new_crawl(self, initialized_archive):
        """Run creates and processes a new Crawl (no id)."""
        crawl_record = create_test_crawl_json()

        stdout, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin=json.dumps(crawl_record),
            data_dir=initialized_archive,
            timeout=120,
        )

        assert code == 0, f"Command failed: {stderr}"

        # Should output the created Crawl
        records = parse_jsonl_output(stdout)
        crawl_records = [r for r in records if r.get('type') == 'Crawl']
        assert crawl_records, f"No Crawl record in output: {stdout}"
        assert crawl_records[0].get('id'), "Emitted Crawl should have an id now"

    def test_run_with_existing_crawl(self, initialized_archive):
        """Run re-queues an existing Crawl (with id)."""
        url = create_test_url()

        # First create a crawl. Guard the lookup so a failed setup step
        # reports the command's stderr instead of a bare IndexError.
        stdout1, stderr1, _ = run_archivebox_cmd(['crawl', 'create', url], data_dir=initialized_archive)
        created = parse_jsonl_output(stdout1)
        assert created, f"crawl create produced no records: {stderr1}"
        crawl = created[0]

        # Run with the existing crawl
        stdout2, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin=json.dumps(crawl),
            data_dir=initialized_archive,
            timeout=120,
        )

        assert code == 0, f"Command failed: {stderr}"
        records = parse_jsonl_output(stdout2)
        assert records, f"Run produced no output records: {stderr}"


class TestRunWithSnapshot:
    """Tests for `archivebox run` with Snapshot input.

    Covers a Snapshot record built by the test helper, a Snapshot record
    previously emitted by `snapshot create`, and a bare ``{'url': ...}``
    record with no ``type`` field.
    """

    def test_run_with_new_snapshot(self, initialized_archive):
        """Run creates and processes a new Snapshot (no id, just url)."""
        snapshot_record = create_test_snapshot_json()

        stdout, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin=json.dumps(snapshot_record),
            data_dir=initialized_archive,
            timeout=120,
        )

        assert code == 0, f"Command failed: {stderr}"

        records = parse_jsonl_output(stdout)
        snapshot_records = [r for r in records if r.get('type') == 'Snapshot']
        assert snapshot_records, f"No Snapshot record in output: {stdout}"
        assert snapshot_records[0].get('id'), "Emitted Snapshot should have an id"

    def test_run_with_existing_snapshot(self, initialized_archive):
        """Run re-queues an existing Snapshot (with id)."""
        url = create_test_url()

        # First create a snapshot. Guard the lookup so a failed setup step
        # reports the command's stderr instead of a bare IndexError.
        stdout1, stderr1, _ = run_archivebox_cmd(['snapshot', 'create', url], data_dir=initialized_archive)
        created = parse_jsonl_output(stdout1)
        assert created, f"snapshot create produced no records: {stderr1}"
        snapshot = created[0]

        # Run with the existing snapshot
        stdout2, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin=json.dumps(snapshot),
            data_dir=initialized_archive,
            timeout=120,
        )

        assert code == 0, f"Command failed: {stderr}"
        records = parse_jsonl_output(stdout2)
        assert records, f"Run produced no output records: {stderr}"

    def test_run_with_plain_url(self, initialized_archive):
        """Run accepts plain URL records (no type field)."""
        url = create_test_url()
        url_record = {'url': url}

        stdout, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin=json.dumps(url_record),
            data_dir=initialized_archive,
            timeout=120,
        )

        assert code == 0, f"Command failed: {stderr}"
        records = parse_jsonl_output(stdout)
        assert records, f"Run produced no output records: {stderr}"


class TestRunWithArchiveResult:
    """Tests for `archivebox run` with ArchiveResult input."""

    def test_run_requeues_failed_archiveresult(self, initialized_archive):
        """Run re-queues a failed ArchiveResult.

        Setup chains three CLI calls (snapshot create -> archiveresult create
        -> archiveresult update --status=failed) before feeding the record to
        `archivebox run`.
        """
        url = create_test_url()

        # Create snapshot and archive result
        stdout1, stderr1, _ = run_archivebox_cmd(['snapshot', 'create', url], data_dir=initialized_archive)
        snapshots = parse_jsonl_output(stdout1)
        assert snapshots, f"snapshot create produced no records: {stderr1}"
        snapshot = snapshots[0]

        stdout2, stderr2, _ = run_archivebox_cmd(
            ['archiveresult', 'create', '--plugin=title'],
            stdin=json.dumps(snapshot),
            data_dir=initialized_archive,
        )
        # Use a default so a missing record surfaces as a readable assertion
        # failure rather than a bare StopIteration.
        ar = next(
            (r for r in parse_jsonl_output(stdout2) if r.get('type') == 'ArchiveResult'),
            None,
        )
        assert ar is not None, f"archiveresult create emitted no ArchiveResult: {stderr2}"

        # Update to failed: both the local record and (via the CLI) the stored one.
        # The update command's own result is not inspected here.
        ar['status'] = 'failed'
        run_archivebox_cmd(
            ['archiveresult', 'update', '--status=failed'],
            stdin=json.dumps(ar),
            data_dir=initialized_archive,
        )

        # Now run should re-queue it
        stdout3, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin=json.dumps(ar),
            data_dir=initialized_archive,
            timeout=120,
        )

        assert code == 0, f"Command failed: {stderr}"
        records = parse_jsonl_output(stdout3)
        ar_records = [r for r in records if r.get('type') == 'ArchiveResult']
        assert ar_records, f"No ArchiveResult record in output: {stdout3}"


class TestRunPassThrough:
    """Tests for pass-through behavior in `archivebox run`."""

    def test_run_passes_through_unknown_types(self, initialized_archive):
        """Run passes through records with unknown types."""
        unknown_record = {'type': 'Unknown', 'id': 'fake-id', 'data': 'test'}

        stdout, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin=json.dumps(unknown_record),
            data_dir=initialized_archive,
        )

        assert code == 0, f"Command failed: {stderr}"
        records = parse_jsonl_output(stdout)
        unknown_records = [r for r in records if r.get('type') == 'Unknown']
        # Exactly one: the record must be echoed once, not dropped or duplicated.
        assert len(unknown_records) == 1, f"Expected 1 Unknown record, got: {records}"
        assert unknown_records[0]['data'] == 'test'

    def test_run_outputs_all_processed_records(self, initialized_archive):
        """Run outputs all processed records for chaining."""
        url = create_test_url()
        crawl_record = create_test_crawl_json(urls=[url])

        stdout, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin=json.dumps(crawl_record),
            data_dir=initialized_archive,
            timeout=120,
        )

        assert code == 0, f"Command failed: {stderr}"
        records = parse_jsonl_output(stdout)
        # Should have at least the Crawl in output
        assert records, f"Run produced no output records: {stderr}"


class TestRunMixedInput:
    """Tests for `archivebox run` with mixed record types."""

    def test_run_handles_mixed_types(self, initialized_archive):
        """Run handles mixed Crawl/Snapshot/Tag input supplied as JSONL."""
        crawl = create_test_crawl_json()
        snapshot = create_test_snapshot_json()
        unknown = {'type': 'Tag', 'id': 'fake', 'name': 'test'}

        # One JSON document per line (JSONL), in Crawl, Snapshot, Tag order.
        stdin_payload = '\n'.join(json.dumps(record) for record in (crawl, snapshot, unknown))

        stdout, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin=stdin_payload,
            data_dir=initialized_archive,
            timeout=120,
        )

        assert code == 0, f"Command failed: {stderr}"
        records = parse_jsonl_output(stdout)

        types = {r.get('type') for r in records}
        # Deliberately loose: passes when at least ONE of the three input
        # types shows up in the output.
        # NOTE(review): this does not verify that all three were handled;
        # tightening it needs confirmation of how `run` treats 'Tag' records.
        assert types & {'Crawl', 'Snapshot', 'Tag'}, f"No expected record types in output: {types}"


class TestRunEmpty:
    """Tests for `archivebox run` edge cases."""

    def test_run_empty_stdin(self, initialized_archive):
        """Run with empty stdin returns success."""
        _, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin='',
            data_dir=initialized_archive,
        )

        assert code == 0, f"Command failed: {stderr}"

    def test_run_no_records_to_process(self, initialized_archive):
        """Run with only pass-through records shows message."""
        unknown = {'type': 'Unknown', 'id': 'fake'}

        _, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin=json.dumps(unknown),
            data_dir=initialized_archive,
        )

        assert code == 0, f"Command failed: {stderr}"
        # The notice is expected on stderr, not stdout.
        assert 'No records to process' in stderr, f"Missing notice in stderr: {stderr!r}"