# tests_piping.py
  1. #!/usr/bin/env python3
  2. """
  3. Tests for CLI piping workflow: crawl | snapshot | extract
  4. This module tests the JSONL-based piping between CLI commands as described in:
  5. https://github.com/ArchiveBox/ArchiveBox/issues/1363
  6. Workflows tested:
  7. archivebox crawl URL -> Crawl JSONL
  8. archivebox snapshot -> Snapshot JSONL (accepts Crawl or URL input)
  9. archivebox extract -> ArchiveResult JSONL (accepts Snapshot input)
  10. Pipeline:
  11. archivebox crawl URL | archivebox snapshot | archivebox extract
  12. Each command should:
  13. - Accept URLs, IDs, or JSONL as input (args or stdin)
  14. - Output JSONL to stdout when piped (not TTY)
  15. - Output human-readable to stderr when TTY
  16. """
  17. __package__ = 'archivebox.cli'
  18. import os
  19. import sys
  20. import json
  21. import shutil
  22. import tempfile
  23. import unittest
  24. from io import StringIO
  25. from pathlib import Path
  26. from unittest.mock import patch, MagicMock
  27. # Test configuration - disable slow extractors
  28. TEST_CONFIG = {
  29. 'USE_COLOR': 'False',
  30. 'SHOW_PROGRESS': 'False',
  31. 'SAVE_ARCHIVEDOTORG': 'False',
  32. 'SAVE_TITLE': 'True', # Fast extractor
  33. 'SAVE_FAVICON': 'False',
  34. 'SAVE_WGET': 'False',
  35. 'SAVE_WARC': 'False',
  36. 'SAVE_PDF': 'False',
  37. 'SAVE_SCREENSHOT': 'False',
  38. 'SAVE_DOM': 'False',
  39. 'SAVE_SINGLEFILE': 'False',
  40. 'SAVE_READABILITY': 'False',
  41. 'SAVE_MERCURY': 'False',
  42. 'SAVE_GIT': 'False',
  43. 'SAVE_YTDLP': 'False',
  44. 'SAVE_HEADERS': 'False',
  45. 'USE_CURL': 'False',
  46. 'USE_WGET': 'False',
  47. 'USE_GIT': 'False',
  48. 'USE_CHROME': 'False',
  49. 'USE_YOUTUBEDL': 'False',
  50. 'USE_NODE': 'False',
  51. }
  52. os.environ.update(TEST_CONFIG)
  53. # =============================================================================
  54. # JSONL Utility Tests
  55. # =============================================================================
  56. class TestJSONLParsing(unittest.TestCase):
  57. """Test JSONL input parsing utilities."""
  58. def test_parse_plain_url(self):
  59. """Plain URLs should be parsed as Snapshot records."""
  60. from archivebox.misc.jsonl import parse_line, TYPE_SNAPSHOT
  61. result = parse_line('https://example.com')
  62. self.assertIsNotNone(result)
  63. self.assertEqual(result['type'], TYPE_SNAPSHOT)
  64. self.assertEqual(result['url'], 'https://example.com')
  65. def test_parse_jsonl_snapshot(self):
  66. """JSONL Snapshot records should preserve all fields."""
  67. from archivebox.misc.jsonl import parse_line, TYPE_SNAPSHOT
  68. line = '{"type": "Snapshot", "url": "https://example.com", "tags": "test,demo"}'
  69. result = parse_line(line)
  70. self.assertIsNotNone(result)
  71. self.assertEqual(result['type'], TYPE_SNAPSHOT)
  72. self.assertEqual(result['url'], 'https://example.com')
  73. self.assertEqual(result['tags'], 'test,demo')
  74. def test_parse_jsonl_crawl(self):
  75. """JSONL Crawl records should be parsed correctly."""
  76. from archivebox.misc.jsonl import parse_line, TYPE_CRAWL
  77. line = '{"type": "Crawl", "id": "abc123", "urls": "https://example.com", "max_depth": 1}'
  78. result = parse_line(line)
  79. self.assertIsNotNone(result)
  80. self.assertEqual(result['type'], TYPE_CRAWL)
  81. self.assertEqual(result['id'], 'abc123')
  82. self.assertEqual(result['urls'], 'https://example.com')
  83. self.assertEqual(result['max_depth'], 1)
  84. def test_parse_jsonl_with_id(self):
  85. """JSONL with id field should be recognized."""
  86. from archivebox.misc.jsonl import parse_line, TYPE_SNAPSHOT
  87. line = '{"type": "Snapshot", "id": "abc123", "url": "https://example.com"}'
  88. result = parse_line(line)
  89. self.assertIsNotNone(result)
  90. self.assertEqual(result['id'], 'abc123')
  91. self.assertEqual(result['url'], 'https://example.com')
  92. def test_parse_uuid_as_snapshot_id(self):
  93. """Bare UUIDs should be parsed as snapshot IDs."""
  94. from archivebox.misc.jsonl import parse_line, TYPE_SNAPSHOT
  95. uuid = '01234567-89ab-cdef-0123-456789abcdef'
  96. result = parse_line(uuid)
  97. self.assertIsNotNone(result)
  98. self.assertEqual(result['type'], TYPE_SNAPSHOT)
  99. self.assertEqual(result['id'], uuid)
  100. def test_parse_empty_line(self):
  101. """Empty lines should return None."""
  102. from archivebox.misc.jsonl import parse_line
  103. self.assertIsNone(parse_line(''))
  104. self.assertIsNone(parse_line(' '))
  105. self.assertIsNone(parse_line('\n'))
  106. def test_parse_comment_line(self):
  107. """Comment lines should return None."""
  108. from archivebox.misc.jsonl import parse_line
  109. self.assertIsNone(parse_line('# This is a comment'))
  110. self.assertIsNone(parse_line(' # Indented comment'))
  111. def test_parse_invalid_url(self):
  112. """Invalid URLs should return None."""
  113. from archivebox.misc.jsonl import parse_line
  114. self.assertIsNone(parse_line('not-a-url'))
  115. self.assertIsNone(parse_line('ftp://example.com')) # Only http/https/file
  116. def test_parse_file_url(self):
  117. """file:// URLs should be parsed."""
  118. from archivebox.misc.jsonl import parse_line, TYPE_SNAPSHOT
  119. result = parse_line('file:///path/to/file.txt')
  120. self.assertIsNotNone(result)
  121. self.assertEqual(result['type'], TYPE_SNAPSHOT)
  122. self.assertEqual(result['url'], 'file:///path/to/file.txt')
  123. class TestJSONLOutput(unittest.TestCase):
  124. """Test JSONL output formatting."""
  125. def test_crawl_to_jsonl(self):
  126. """Crawl model should serialize to JSONL correctly."""
  127. from archivebox.misc.jsonl import TYPE_CRAWL
  128. # Create a mock crawl with to_jsonl method configured
  129. mock_crawl = MagicMock()
  130. mock_crawl.to_jsonl.return_value = {
  131. 'type': TYPE_CRAWL,
  132. 'schema_version': '0.9.0',
  133. 'id': 'test-crawl-uuid',
  134. 'urls': 'https://example.com',
  135. 'status': 'queued',
  136. 'max_depth': 0,
  137. 'tags_str': 'tag1,tag2',
  138. 'label': '',
  139. 'created_at': None,
  140. }
  141. result = mock_crawl.to_jsonl()
  142. self.assertEqual(result['type'], TYPE_CRAWL)
  143. self.assertEqual(result['id'], 'test-crawl-uuid')
  144. self.assertEqual(result['urls'], 'https://example.com')
  145. self.assertEqual(result['status'], 'queued')
  146. # Note: Snapshot and ArchiveResult serialization is tested in integration tests
  147. # (TestPipingWorkflowIntegration) using real model instances, not mocks.
  148. class TestReadArgsOrStdin(unittest.TestCase):
  149. """Test reading from args or stdin."""
  150. def test_read_from_args(self):
  151. """Should read URLs from command line args."""
  152. from archivebox.misc.jsonl import read_args_or_stdin
  153. args = ('https://example1.com', 'https://example2.com')
  154. records = list(read_args_or_stdin(args))
  155. self.assertEqual(len(records), 2)
  156. self.assertEqual(records[0]['url'], 'https://example1.com')
  157. self.assertEqual(records[1]['url'], 'https://example2.com')
  158. def test_read_from_stdin(self):
  159. """Should read URLs from stdin when no args provided."""
  160. from archivebox.misc.jsonl import read_args_or_stdin
  161. stdin_content = 'https://example1.com\nhttps://example2.com\n'
  162. stream = StringIO(stdin_content)
  163. # Mock isatty to return False (simulating piped input)
  164. stream.isatty = lambda: False
  165. records = list(read_args_or_stdin((), stream=stream))
  166. self.assertEqual(len(records), 2)
  167. self.assertEqual(records[0]['url'], 'https://example1.com')
  168. self.assertEqual(records[1]['url'], 'https://example2.com')
  169. def test_read_jsonl_from_stdin(self):
  170. """Should read JSONL from stdin."""
  171. from archivebox.misc.jsonl import read_args_or_stdin
  172. stdin_content = '{"type": "Snapshot", "url": "https://example.com", "tags": "test"}\n'
  173. stream = StringIO(stdin_content)
  174. stream.isatty = lambda: False
  175. records = list(read_args_or_stdin((), stream=stream))
  176. self.assertEqual(len(records), 1)
  177. self.assertEqual(records[0]['url'], 'https://example.com')
  178. self.assertEqual(records[0]['tags'], 'test')
  179. def test_read_crawl_jsonl_from_stdin(self):
  180. """Should read Crawl JSONL from stdin."""
  181. from archivebox.misc.jsonl import read_args_or_stdin, TYPE_CRAWL
  182. stdin_content = '{"type": "Crawl", "id": "abc123", "urls": "https://example.com\\nhttps://foo.com"}\n'
  183. stream = StringIO(stdin_content)
  184. stream.isatty = lambda: False
  185. records = list(read_args_or_stdin((), stream=stream))
  186. self.assertEqual(len(records), 1)
  187. self.assertEqual(records[0]['type'], TYPE_CRAWL)
  188. self.assertEqual(records[0]['id'], 'abc123')
  189. def test_skip_tty_stdin(self):
  190. """Should not read from TTY stdin (would block)."""
  191. from archivebox.misc.jsonl import read_args_or_stdin
  192. stream = StringIO('https://example.com')
  193. stream.isatty = lambda: True # Simulate TTY
  194. records = list(read_args_or_stdin((), stream=stream))
  195. self.assertEqual(len(records), 0)
  196. # =============================================================================
  197. # Unit Tests for Individual Commands
  198. # =============================================================================
  199. class TestCrawlCommand(unittest.TestCase):
  200. """Unit tests for archivebox crawl command."""
  201. def setUp(self):
  202. """Set up test environment."""
  203. self.test_dir = tempfile.mkdtemp()
  204. os.environ['DATA_DIR'] = self.test_dir
  205. def tearDown(self):
  206. """Clean up test environment."""
  207. shutil.rmtree(self.test_dir, ignore_errors=True)
  208. def test_crawl_accepts_url(self):
  209. """crawl should accept URLs as input."""
  210. from archivebox.misc.jsonl import read_args_or_stdin
  211. args = ('https://example.com',)
  212. records = list(read_args_or_stdin(args))
  213. self.assertEqual(len(records), 1)
  214. self.assertEqual(records[0]['url'], 'https://example.com')
  215. def test_crawl_output_format(self):
  216. """crawl should output Crawl JSONL records."""
  217. from archivebox.misc.jsonl import TYPE_CRAWL
  218. # Mock crawl output
  219. crawl_output = {
  220. 'type': TYPE_CRAWL,
  221. 'schema_version': '0.9.0',
  222. 'id': 'test-crawl-id',
  223. 'urls': 'https://example.com',
  224. 'status': 'queued',
  225. 'max_depth': 0,
  226. }
  227. self.assertEqual(crawl_output['type'], TYPE_CRAWL)
  228. self.assertIn('id', crawl_output)
  229. self.assertIn('urls', crawl_output)
  230. class TestSnapshotCommand(unittest.TestCase):
  231. """Unit tests for archivebox snapshot command."""
  232. def setUp(self):
  233. """Set up test environment."""
  234. self.test_dir = tempfile.mkdtemp()
  235. os.environ['DATA_DIR'] = self.test_dir
  236. def tearDown(self):
  237. """Clean up test environment."""
  238. shutil.rmtree(self.test_dir, ignore_errors=True)
  239. def test_snapshot_accepts_url(self):
  240. """snapshot should accept URLs as input."""
  241. from archivebox.misc.jsonl import read_args_or_stdin
  242. args = ('https://example.com',)
  243. records = list(read_args_or_stdin(args))
  244. self.assertEqual(len(records), 1)
  245. self.assertEqual(records[0]['url'], 'https://example.com')
  246. def test_snapshot_accepts_crawl_jsonl(self):
  247. """snapshot should accept Crawl JSONL as input."""
  248. from archivebox.misc.jsonl import read_args_or_stdin, TYPE_CRAWL
  249. stdin = StringIO('{"type": "Crawl", "id": "abc123", "urls": "https://example.com"}\n')
  250. stdin.isatty = lambda: False
  251. records = list(read_args_or_stdin((), stream=stdin))
  252. self.assertEqual(len(records), 1)
  253. self.assertEqual(records[0]['type'], TYPE_CRAWL)
  254. self.assertEqual(records[0]['id'], 'abc123')
  255. self.assertEqual(records[0]['urls'], 'https://example.com')
  256. def test_snapshot_accepts_jsonl_with_metadata(self):
  257. """snapshot should accept JSONL with tags and other metadata."""
  258. from archivebox.misc.jsonl import read_args_or_stdin
  259. stdin = StringIO('{"type": "Snapshot", "url": "https://example.com", "tags": "tag1,tag2", "title": "Test"}\n')
  260. stdin.isatty = lambda: False
  261. records = list(read_args_or_stdin((), stream=stdin))
  262. self.assertEqual(len(records), 1)
  263. self.assertEqual(records[0]['url'], 'https://example.com')
  264. self.assertEqual(records[0]['tags'], 'tag1,tag2')
  265. self.assertEqual(records[0]['title'], 'Test')
  266. # Note: Snapshot output format is tested in integration tests
  267. # (TestPipingWorkflowIntegration.test_snapshot_creates_and_outputs_jsonl)
  268. # using real Snapshot instances.
  269. class TestExtractCommand(unittest.TestCase):
  270. """Unit tests for archivebox extract command."""
  271. def setUp(self):
  272. """Set up test environment."""
  273. self.test_dir = tempfile.mkdtemp()
  274. os.environ['DATA_DIR'] = self.test_dir
  275. def tearDown(self):
  276. """Clean up test environment."""
  277. shutil.rmtree(self.test_dir, ignore_errors=True)
  278. def test_extract_accepts_snapshot_id(self):
  279. """extract should accept snapshot IDs as input."""
  280. from archivebox.misc.jsonl import read_args_or_stdin
  281. uuid = '01234567-89ab-cdef-0123-456789abcdef'
  282. args = (uuid,)
  283. records = list(read_args_or_stdin(args))
  284. self.assertEqual(len(records), 1)
  285. self.assertEqual(records[0]['id'], uuid)
  286. def test_extract_accepts_jsonl_snapshot(self):
  287. """extract should accept JSONL Snapshot records."""
  288. from archivebox.misc.jsonl import read_args_or_stdin, TYPE_SNAPSHOT
  289. stdin = StringIO('{"type": "Snapshot", "id": "abc123", "url": "https://example.com"}\n')
  290. stdin.isatty = lambda: False
  291. records = list(read_args_or_stdin((), stream=stdin))
  292. self.assertEqual(len(records), 1)
  293. self.assertEqual(records[0]['type'], TYPE_SNAPSHOT)
  294. self.assertEqual(records[0]['id'], 'abc123')
  295. def test_extract_gathers_snapshot_ids(self):
  296. """extract should gather snapshot IDs from various input formats."""
  297. from archivebox.misc.jsonl import TYPE_SNAPSHOT, TYPE_ARCHIVERESULT
  298. records = [
  299. {'type': TYPE_SNAPSHOT, 'id': 'snap-1'},
  300. {'type': TYPE_SNAPSHOT, 'id': 'snap-2', 'url': 'https://example.com'},
  301. {'type': TYPE_ARCHIVERESULT, 'snapshot_id': 'snap-3'},
  302. {'id': 'snap-4'}, # Bare id
  303. ]
  304. snapshot_ids = set()
  305. for record in records:
  306. record_type = record.get('type')
  307. if record_type == TYPE_SNAPSHOT:
  308. snapshot_id = record.get('id')
  309. if snapshot_id:
  310. snapshot_ids.add(snapshot_id)
  311. elif record_type == TYPE_ARCHIVERESULT:
  312. snapshot_id = record.get('snapshot_id')
  313. if snapshot_id:
  314. snapshot_ids.add(snapshot_id)
  315. elif 'id' in record:
  316. snapshot_ids.add(record['id'])
  317. self.assertEqual(len(snapshot_ids), 4)
  318. self.assertIn('snap-1', snapshot_ids)
  319. self.assertIn('snap-2', snapshot_ids)
  320. self.assertIn('snap-3', snapshot_ids)
  321. self.assertIn('snap-4', snapshot_ids)
  322. # =============================================================================
  323. # URL Collection Tests
  324. # =============================================================================
  325. class TestURLCollection(unittest.TestCase):
  326. """Test collecting urls.jsonl from extractor output."""
  327. def setUp(self):
  328. """Create test directory structure."""
  329. self.test_dir = Path(tempfile.mkdtemp())
  330. # Create fake extractor output directories with urls.jsonl
  331. (self.test_dir / 'wget').mkdir()
  332. (self.test_dir / 'wget' / 'urls.jsonl').write_text(
  333. '{"url": "https://wget-link-1.com"}\n'
  334. '{"url": "https://wget-link-2.com"}\n'
  335. )
  336. (self.test_dir / 'parse_html_urls').mkdir()
  337. (self.test_dir / 'parse_html_urls' / 'urls.jsonl').write_text(
  338. '{"url": "https://html-link-1.com"}\n'
  339. '{"url": "https://html-link-2.com", "title": "HTML Link 2"}\n'
  340. )
  341. (self.test_dir / 'screenshot').mkdir()
  342. # No urls.jsonl in screenshot dir - not a parser
  343. def tearDown(self):
  344. """Clean up test directory."""
  345. shutil.rmtree(self.test_dir, ignore_errors=True)
  346. def test_collect_urls_from_plugins(self):
  347. """Should collect urls.jsonl from all parser plugin subdirectories."""
  348. from archivebox.hooks import collect_urls_from_plugins
  349. urls = collect_urls_from_plugins(self.test_dir)
  350. self.assertEqual(len(urls), 4)
  351. # Check that plugin is set
  352. plugins = {u['plugin'] for u in urls}
  353. self.assertIn('wget', plugins)
  354. self.assertIn('parse_html_urls', plugins)
  355. self.assertNotIn('screenshot', plugins) # No urls.jsonl
  356. def test_collect_urls_preserves_metadata(self):
  357. """Should preserve metadata from urls.jsonl entries."""
  358. from archivebox.hooks import collect_urls_from_plugins
  359. urls = collect_urls_from_plugins(self.test_dir)
  360. # Find the entry with title
  361. titled = [u for u in urls if u.get('title') == 'HTML Link 2']
  362. self.assertEqual(len(titled), 1)
  363. self.assertEqual(titled[0]['url'], 'https://html-link-2.com')
  364. def test_collect_urls_empty_dir(self):
  365. """Should handle empty or non-existent directories."""
  366. from archivebox.hooks import collect_urls_from_plugins
  367. empty_dir = self.test_dir / 'nonexistent'
  368. urls = collect_urls_from_plugins(empty_dir)
  369. self.assertEqual(len(urls), 0)
  370. # =============================================================================
  371. # Integration Tests
  372. # =============================================================================
  373. class TestPipingWorkflowIntegration(unittest.TestCase):
  374. """
  375. Integration tests for the complete piping workflow.
  376. These tests require Django to be set up and use the actual database.
  377. """
  378. @classmethod
  379. def setUpClass(cls):
  380. """Set up Django and test database."""
  381. cls.test_dir = tempfile.mkdtemp()
  382. os.environ['DATA_DIR'] = cls.test_dir
  383. # Initialize Django
  384. from archivebox.config.django import setup_django
  385. setup_django()
  386. # Initialize the archive
  387. from archivebox.cli.archivebox_init import init
  388. init()
  389. @classmethod
  390. def tearDownClass(cls):
  391. """Clean up test database."""
  392. shutil.rmtree(cls.test_dir, ignore_errors=True)
  393. def test_crawl_creates_and_outputs_jsonl(self):
  394. """
  395. Test: archivebox crawl URL1 URL2 URL3
  396. Should create a single Crawl with all URLs and output JSONL when piped.
  397. """
  398. from archivebox.crawls.models import Crawl
  399. from archivebox.misc.jsonl import TYPE_CRAWL
  400. from archivebox.base_models.models import get_or_create_system_user_pk
  401. created_by_id = get_or_create_system_user_pk()
  402. # Create crawl with multiple URLs (as newline-separated string)
  403. urls = 'https://test-crawl-1.example.com\nhttps://test-crawl-2.example.com'
  404. crawl = Crawl.from_jsonl({'urls': urls}, overrides={'created_by_id': created_by_id})
  405. self.assertIsNotNone(crawl)
  406. self.assertIsNotNone(crawl.id)
  407. self.assertEqual(crawl.urls, urls)
  408. self.assertEqual(crawl.status, 'queued')
  409. # Verify URLs list
  410. urls_list = crawl.get_urls_list()
  411. self.assertEqual(len(urls_list), 2)
  412. self.assertIn('https://test-crawl-1.example.com', urls_list)
  413. self.assertIn('https://test-crawl-2.example.com', urls_list)
  414. # Verify output format
  415. output = crawl.to_jsonl()
  416. self.assertEqual(output['type'], TYPE_CRAWL)
  417. self.assertIn('id', output)
  418. self.assertEqual(output['urls'], urls)
  419. self.assertIn('schema_version', output)
  420. def test_snapshot_accepts_crawl_jsonl(self):
  421. """
  422. Test: archivebox crawl URL | archivebox snapshot
  423. Snapshot should accept Crawl JSONL and create Snapshots for each URL.
  424. """
  425. from archivebox.crawls.models import Crawl
  426. from archivebox.core.models import Snapshot
  427. from archivebox.misc.jsonl import (
  428. read_args_or_stdin,
  429. TYPE_CRAWL, TYPE_SNAPSHOT
  430. )
  431. from archivebox.base_models.models import get_or_create_system_user_pk
  432. created_by_id = get_or_create_system_user_pk()
  433. # Step 1: Create crawl (simulating 'archivebox crawl')
  434. urls = 'https://crawl-to-snap-1.example.com\nhttps://crawl-to-snap-2.example.com'
  435. crawl = Crawl.from_jsonl({'urls': urls}, overrides={'created_by_id': created_by_id})
  436. crawl_output = crawl.to_jsonl()
  437. # Step 2: Parse crawl output as snapshot input
  438. stdin = StringIO(json.dumps(crawl_output) + '\n')
  439. stdin.isatty = lambda: False
  440. records = list(read_args_or_stdin((), stream=stdin))
  441. self.assertEqual(len(records), 1)
  442. self.assertEqual(records[0]['type'], TYPE_CRAWL)
  443. # Step 3: Create snapshots from crawl URLs
  444. created_snapshots = []
  445. for url in crawl.get_urls_list():
  446. snapshot = Snapshot.from_jsonl({'url': url}, overrides={'created_by_id': created_by_id})
  447. if snapshot:
  448. created_snapshots.append(snapshot)
  449. self.assertEqual(len(created_snapshots), 2)
  450. # Verify snapshot output
  451. for snapshot in created_snapshots:
  452. output = snapshot.to_jsonl()
  453. self.assertEqual(output['type'], TYPE_SNAPSHOT)
  454. self.assertIn(output['url'], [
  455. 'https://crawl-to-snap-1.example.com',
  456. 'https://crawl-to-snap-2.example.com'
  457. ])
  458. def test_snapshot_creates_and_outputs_jsonl(self):
  459. """
  460. Test: archivebox snapshot URL
  461. Should create a Snapshot and output JSONL when piped.
  462. """
  463. from archivebox.core.models import Snapshot
  464. from archivebox.misc.jsonl import (
  465. read_args_or_stdin, write_record,
  466. TYPE_SNAPSHOT
  467. )
  468. from archivebox.base_models.models import get_or_create_system_user_pk
  469. created_by_id = get_or_create_system_user_pk()
  470. # Simulate input
  471. url = 'https://test-snapshot-1.example.com'
  472. records = list(read_args_or_stdin((url,)))
  473. self.assertEqual(len(records), 1)
  474. self.assertEqual(records[0]['url'], url)
  475. # Create snapshot
  476. overrides = {'created_by_id': created_by_id}
  477. snapshot = Snapshot.from_jsonl(records[0], overrides=overrides)
  478. self.assertIsNotNone(snapshot.id)
  479. self.assertEqual(snapshot.url, url)
  480. # Verify output format
  481. output = snapshot.to_jsonl()
  482. self.assertEqual(output['type'], TYPE_SNAPSHOT)
  483. self.assertIn('id', output)
  484. self.assertEqual(output['url'], url)
  485. def test_extract_accepts_snapshot_from_previous_command(self):
  486. """
  487. Test: archivebox snapshot URL | archivebox extract
  488. Extract should accept JSONL output from snapshot command.
  489. """
  490. from archivebox.core.models import Snapshot, ArchiveResult
  491. from archivebox.misc.jsonl import (
  492. read_args_or_stdin,
  493. TYPE_SNAPSHOT
  494. )
  495. from archivebox.base_models.models import get_or_create_system_user_pk
  496. created_by_id = get_or_create_system_user_pk()
  497. # Step 1: Create snapshot (simulating 'archivebox snapshot')
  498. url = 'https://test-extract-1.example.com'
  499. overrides = {'created_by_id': created_by_id}
  500. snapshot = Snapshot.from_jsonl({'url': url}, overrides=overrides)
  501. snapshot_output = snapshot.to_jsonl()
  502. # Step 2: Parse snapshot output as extract input
  503. stdin = StringIO(json.dumps(snapshot_output) + '\n')
  504. stdin.isatty = lambda: False
  505. records = list(read_args_or_stdin((), stream=stdin))
  506. self.assertEqual(len(records), 1)
  507. self.assertEqual(records[0]['type'], TYPE_SNAPSHOT)
  508. self.assertEqual(records[0]['id'], str(snapshot.id))
  509. # Step 3: Gather snapshot IDs (as extract does)
  510. snapshot_ids = set()
  511. for record in records:
  512. if record.get('type') == TYPE_SNAPSHOT and record.get('id'):
  513. snapshot_ids.add(record['id'])
  514. self.assertIn(str(snapshot.id), snapshot_ids)
  515. def test_full_pipeline_crawl_snapshot_extract(self):
  516. """
  517. Test: archivebox crawl URL | archivebox snapshot | archivebox extract
  518. This is equivalent to: archivebox add --depth=0 URL
  519. """
  520. from archivebox.crawls.models import Crawl
  521. from archivebox.core.models import Snapshot
  522. from archivebox.misc.jsonl import (
  523. read_args_or_stdin,
  524. TYPE_CRAWL, TYPE_SNAPSHOT
  525. )
  526. from archivebox.base_models.models import get_or_create_system_user_pk
  527. created_by_id = get_or_create_system_user_pk()
  528. # === archivebox crawl https://example.com ===
  529. url = 'https://test-pipeline-full.example.com'
  530. crawl = Crawl.from_jsonl({'url': url}, overrides={'created_by_id': created_by_id})
  531. crawl_jsonl = json.dumps(crawl.to_jsonl())
  532. # === | archivebox snapshot ===
  533. stdin = StringIO(crawl_jsonl + '\n')
  534. stdin.isatty = lambda: False
  535. records = list(read_args_or_stdin((), stream=stdin))
  536. self.assertEqual(len(records), 1)
  537. self.assertEqual(records[0]['type'], TYPE_CRAWL)
  538. # Create snapshots from crawl
  539. created_snapshots = []
  540. for record in records:
  541. if record.get('type') == TYPE_CRAWL:
  542. crawl_id = record.get('id')
  543. if crawl_id:
  544. db_crawl = Crawl.objects.get(id=crawl_id)
  545. for crawl_url in db_crawl.get_urls_list():
  546. snapshot = Snapshot.from_jsonl({'url': crawl_url}, overrides={'created_by_id': created_by_id})
  547. if snapshot:
  548. created_snapshots.append(snapshot)
  549. self.assertEqual(len(created_snapshots), 1)
  550. self.assertEqual(created_snapshots[0].url, url)
  551. # === | archivebox extract ===
  552. snapshot_jsonl_lines = [json.dumps(s.to_jsonl()) for s in created_snapshots]
  553. stdin = StringIO('\n'.join(snapshot_jsonl_lines) + '\n')
  554. stdin.isatty = lambda: False
  555. records = list(read_args_or_stdin((), stream=stdin))
  556. self.assertEqual(len(records), 1)
  557. self.assertEqual(records[0]['type'], TYPE_SNAPSHOT)
  558. self.assertEqual(records[0]['id'], str(created_snapshots[0].id))
  559. class TestDepthWorkflows(unittest.TestCase):
  560. """Test various depth crawl workflows."""
  561. @classmethod
  562. def setUpClass(cls):
  563. """Set up Django and test database."""
  564. cls.test_dir = tempfile.mkdtemp()
  565. os.environ['DATA_DIR'] = cls.test_dir
  566. from archivebox.config.django import setup_django
  567. setup_django()
  568. from archivebox.cli.archivebox_init import init
  569. init()
  570. @classmethod
  571. def tearDownClass(cls):
  572. """Clean up test database."""
  573. shutil.rmtree(cls.test_dir, ignore_errors=True)
  574. def test_depth_0_workflow(self):
  575. """
  576. Test: archivebox crawl URL | archivebox snapshot | archivebox extract
  577. Depth 0: Only archive the specified URL, no recursive crawling.
  578. """
  579. from archivebox.crawls.models import Crawl
  580. from archivebox.core.models import Snapshot
  581. from archivebox.base_models.models import get_or_create_system_user_pk
  582. created_by_id = get_or_create_system_user_pk()
  583. # Create crawl with depth 0
  584. url = 'https://depth0-test.example.com'
  585. crawl = Crawl.from_jsonl({'url': url, 'max_depth': 0}, overrides={'created_by_id': created_by_id})
  586. self.assertEqual(crawl.max_depth, 0)
  587. # Create snapshot
  588. snapshot = Snapshot.from_jsonl({'url': url}, overrides={'created_by_id': created_by_id})
  589. self.assertEqual(snapshot.url, url)
  590. def test_depth_metadata_in_crawl(self):
  591. """Test that depth metadata is stored in Crawl."""
  592. from archivebox.crawls.models import Crawl
  593. from archivebox.base_models.models import get_or_create_system_user_pk
  594. created_by_id = get_or_create_system_user_pk()
  595. # Create crawl with depth
  596. crawl = Crawl.from_jsonl(
  597. {'url': 'https://depth-meta-test.example.com', 'max_depth': 2},
  598. overrides={'created_by_id': created_by_id}
  599. )
  600. self.assertEqual(crawl.max_depth, 2)
  601. # Verify in JSONL output
  602. output = crawl.to_jsonl()
  603. self.assertEqual(output['max_depth'], 2)
  604. class TestParserPluginWorkflows(unittest.TestCase):
  605. """Test workflows with specific parser plugins."""
  606. @classmethod
  607. def setUpClass(cls):
  608. """Set up Django and test database."""
  609. cls.test_dir = tempfile.mkdtemp()
  610. os.environ['DATA_DIR'] = cls.test_dir
  611. from archivebox.config.django import setup_django
  612. setup_django()
  613. from archivebox.cli.archivebox_init import init
  614. init()
  615. @classmethod
  616. def tearDownClass(cls):
  617. """Clean up test database."""
  618. shutil.rmtree(cls.test_dir, ignore_errors=True)
  619. def test_html_parser_workflow(self):
  620. """
  621. Test: archivebox crawl --plugin=parse_html_urls URL | archivebox snapshot | archivebox extract
  622. """
  623. from archivebox.hooks import collect_urls_from_plugins
  624. from archivebox.misc.jsonl import TYPE_SNAPSHOT
  625. # Create mock output directory
  626. snapshot_dir = Path(self.test_dir) / 'archive' / 'html-parser-test'
  627. snapshot_dir.mkdir(parents=True, exist_ok=True)
  628. (snapshot_dir / 'parse_html_urls').mkdir(exist_ok=True)
  629. (snapshot_dir / 'parse_html_urls' / 'urls.jsonl').write_text(
  630. '{"url": "https://html-discovered.com", "title": "HTML Link"}\n'
  631. )
  632. # Collect URLs
  633. discovered = collect_urls_from_plugins(snapshot_dir)
  634. self.assertEqual(len(discovered), 1)
  635. self.assertEqual(discovered[0]['url'], 'https://html-discovered.com')
  636. self.assertEqual(discovered[0]['plugin'], 'parse_html_urls')
  637. def test_rss_parser_workflow(self):
  638. """
  639. Test: archivebox crawl --plugin=parse_rss_urls URL | archivebox snapshot | archivebox extract
  640. """
  641. from archivebox.hooks import collect_urls_from_plugins
  642. # Create mock output directory
  643. snapshot_dir = Path(self.test_dir) / 'archive' / 'rss-parser-test'
  644. snapshot_dir.mkdir(parents=True, exist_ok=True)
  645. (snapshot_dir / 'parse_rss_urls').mkdir(exist_ok=True)
  646. (snapshot_dir / 'parse_rss_urls' / 'urls.jsonl').write_text(
  647. '{"url": "https://rss-item-1.com", "title": "RSS Item 1"}\n'
  648. '{"url": "https://rss-item-2.com", "title": "RSS Item 2"}\n'
  649. )
  650. # Collect URLs
  651. discovered = collect_urls_from_plugins(snapshot_dir)
  652. self.assertEqual(len(discovered), 2)
  653. self.assertTrue(all(d['plugin'] == 'parse_rss_urls' for d in discovered))
  654. def test_multiple_parsers_dedupe(self):
  655. """
  656. Multiple parsers may discover the same URL - should be deduplicated.
  657. """
  658. from archivebox.hooks import collect_urls_from_plugins
  659. # Create mock output with duplicate URLs from different parsers
  660. snapshot_dir = Path(self.test_dir) / 'archive' / 'dedupe-test'
  661. snapshot_dir.mkdir(parents=True, exist_ok=True)
  662. (snapshot_dir / 'parse_html_urls').mkdir(exist_ok=True)
  663. (snapshot_dir / 'parse_html_urls' / 'urls.jsonl').write_text(
  664. '{"url": "https://same-url.com"}\n'
  665. )
  666. (snapshot_dir / 'wget').mkdir(exist_ok=True)
  667. (snapshot_dir / 'wget' / 'urls.jsonl').write_text(
  668. '{"url": "https://same-url.com"}\n' # Same URL, different extractor
  669. )
  670. # Collect URLs
  671. all_discovered = collect_urls_from_plugins(snapshot_dir)
  672. # Both entries are returned (deduplication happens at the crawl command level)
  673. self.assertEqual(len(all_discovered), 2)
  674. # Verify both extractors found the same URL
  675. urls = {d['url'] for d in all_discovered}
  676. self.assertEqual(urls, {'https://same-url.com'})
  677. class TestEdgeCases(unittest.TestCase):
  678. """Test edge cases and error handling."""
  679. def test_empty_input(self):
  680. """Commands should handle empty input gracefully."""
  681. from archivebox.misc.jsonl import read_args_or_stdin
  682. # Empty args, TTY stdin (should not block)
  683. stdin = StringIO('')
  684. stdin.isatty = lambda: True
  685. records = list(read_args_or_stdin((), stream=stdin))
  686. self.assertEqual(len(records), 0)
  687. def test_malformed_jsonl(self):
  688. """Should skip malformed JSONL lines."""
  689. from archivebox.misc.jsonl import read_args_or_stdin
  690. stdin = StringIO(
  691. '{"url": "https://good.com"}\n'
  692. 'not valid json\n'
  693. '{"url": "https://also-good.com"}\n'
  694. )
  695. stdin.isatty = lambda: False
  696. records = list(read_args_or_stdin((), stream=stdin))
  697. self.assertEqual(len(records), 2)
  698. urls = {r['url'] for r in records}
  699. self.assertEqual(urls, {'https://good.com', 'https://also-good.com'})
  700. def test_mixed_input_formats(self):
  701. """Should handle mixed URLs and JSONL."""
  702. from archivebox.misc.jsonl import read_args_or_stdin
  703. stdin = StringIO(
  704. 'https://plain-url.com\n'
  705. '{"type": "Snapshot", "url": "https://jsonl-url.com", "tags": "test"}\n'
  706. '01234567-89ab-cdef-0123-456789abcdef\n' # UUID
  707. )
  708. stdin.isatty = lambda: False
  709. records = list(read_args_or_stdin((), stream=stdin))
  710. self.assertEqual(len(records), 3)
  711. # Plain URL
  712. self.assertEqual(records[0]['url'], 'https://plain-url.com')
  713. # JSONL with metadata
  714. self.assertEqual(records[1]['url'], 'https://jsonl-url.com')
  715. self.assertEqual(records[1]['tags'], 'test')
  716. # UUID
  717. self.assertEqual(records[2]['id'], '01234567-89ab-cdef-0123-456789abcdef')
  718. def test_crawl_with_multiple_urls(self):
  719. """Crawl should handle multiple URLs in a single crawl."""
  720. from archivebox.misc.jsonl import TYPE_CRAWL
  721. # Test crawl JSONL with multiple URLs
  722. crawl_output = {
  723. 'type': TYPE_CRAWL,
  724. 'id': 'test-multi-url-crawl',
  725. 'urls': 'https://url1.com\nhttps://url2.com\nhttps://url3.com',
  726. 'max_depth': 0,
  727. }
  728. # Parse the URLs
  729. urls = [u.strip() for u in crawl_output['urls'].split('\n') if u.strip()]
  730. self.assertEqual(len(urls), 3)
  731. self.assertEqual(urls[0], 'https://url1.com')
  732. self.assertEqual(urls[1], 'https://url2.com')
  733. self.assertEqual(urls[2], 'https://url3.com')
# Allow running this test module directly: `python tests_piping.py`
if __name__ == '__main__':
    unittest.main()