| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254 |
- """
- Tests for archivebox run CLI command.
- Tests cover:
- - run with stdin JSONL (Crawl, Snapshot, ArchiveResult)
- - create-or-update behavior (records with/without id)
- - pass-through output (for chaining)
- """
- import json
- import pytest
- from archivebox.tests.conftest import (
- run_archivebox_cmd,
- parse_jsonl_output,
- create_test_url,
- create_test_crawl_json,
- create_test_snapshot_json,
- )
class TestRunWithCrawl:
    """Tests for `archivebox run` with Crawl input."""

    def test_run_with_new_crawl(self, initialized_archive):
        """Run creates and processes a new Crawl (no id)."""
        payload = json.dumps(create_test_crawl_json())
        stdout, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin=payload,
            data_dir=initialized_archive,
            timeout=120,
        )
        assert code == 0, f"Command failed: {stderr}"
        # The created Crawl should be echoed back on stdout as JSONL.
        crawls = [
            rec for rec in parse_jsonl_output(stdout)
            if rec.get('type') == 'Crawl'
        ]
        assert len(crawls) >= 1
        # Once persisted, the Crawl must have been assigned an id.
        assert crawls[0].get('id')

    def test_run_with_existing_crawl(self, initialized_archive):
        """Run re-queues an existing Crawl (with id)."""
        url = create_test_url()
        # Create the crawl up front so the record already carries an id.
        created_out, _, _ = run_archivebox_cmd(
            ['crawl', 'create', url], data_dir=initialized_archive
        )
        existing_crawl = parse_jsonl_output(created_out)[0]
        # Feed the existing record back through `run`.
        stdout, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin=json.dumps(existing_crawl),
            data_dir=initialized_archive,
            timeout=120,
        )
        assert code == 0
        assert len(parse_jsonl_output(stdout)) >= 1
class TestRunWithSnapshot:
    """Tests for `archivebox run` with Snapshot input."""

    def test_run_with_new_snapshot(self, initialized_archive):
        """Run creates and processes a new Snapshot (no id, just url)."""
        payload = json.dumps(create_test_snapshot_json())
        stdout, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin=payload,
            data_dir=initialized_archive,
            timeout=120,
        )
        assert code == 0, f"Command failed: {stderr}"
        snapshots = [
            rec for rec in parse_jsonl_output(stdout)
            if rec.get('type') == 'Snapshot'
        ]
        assert len(snapshots) >= 1
        # The persisted Snapshot should now carry an id.
        assert snapshots[0].get('id')

    def test_run_with_existing_snapshot(self, initialized_archive):
        """Run re-queues an existing Snapshot (with id)."""
        url = create_test_url()
        # Create a snapshot first so the record already has an id.
        created_out, _, _ = run_archivebox_cmd(
            ['snapshot', 'create', url], data_dir=initialized_archive
        )
        existing_snapshot = parse_jsonl_output(created_out)[0]
        # Feed the existing record back through `run`.
        stdout, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin=json.dumps(existing_snapshot),
            data_dir=initialized_archive,
            timeout=120,
        )
        assert code == 0
        assert len(parse_jsonl_output(stdout)) >= 1

    def test_run_with_plain_url(self, initialized_archive):
        """Run accepts plain URL records (no type field)."""
        bare_record = {'url': create_test_url()}
        stdout, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin=json.dumps(bare_record),
            data_dir=initialized_archive,
            timeout=120,
        )
        assert code == 0
        assert len(parse_jsonl_output(stdout)) >= 1
class TestRunWithArchiveResult:
    """Tests for `archivebox run` with ArchiveResult input."""

    def test_run_requeues_failed_archiveresult(self, initialized_archive):
        """Run re-queues a failed ArchiveResult."""
        url = create_test_url()
        # Build the prerequisite Snapshot, then an ArchiveResult for it.
        snap_out, _, _ = run_archivebox_cmd(
            ['snapshot', 'create', url], data_dir=initialized_archive
        )
        snapshot = parse_jsonl_output(snap_out)[0]
        ar_out, _, _ = run_archivebox_cmd(
            ['archiveresult', 'create', '--plugin=title'],
            stdin=json.dumps(snapshot),
            data_dir=initialized_archive,
        )
        archive_result = next(
            rec for rec in parse_jsonl_output(ar_out)
            if rec.get('type') == 'ArchiveResult'
        )
        # Mark the result failed both in the local dict and via the CLI.
        archive_result['status'] = 'failed'
        run_archivebox_cmd(
            ['archiveresult', 'update', '--status=failed'],
            stdin=json.dumps(archive_result),
            data_dir=initialized_archive,
        )
        # `run` should pick the failed result back up for re-processing.
        stdout, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin=json.dumps(archive_result),
            data_dir=initialized_archive,
            timeout=120,
        )
        assert code == 0
        requeued = [
            rec for rec in parse_jsonl_output(stdout)
            if rec.get('type') == 'ArchiveResult'
        ]
        assert len(requeued) >= 1
class TestRunPassThrough:
    """Tests for pass-through behavior in `archivebox run`."""

    def test_run_passes_through_unknown_types(self, initialized_archive):
        """Run passes through records with unknown types."""
        unrecognized = {'type': 'Unknown', 'id': 'fake-id', 'data': 'test'}
        stdout, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin=json.dumps(unrecognized),
            data_dir=initialized_archive,
        )
        assert code == 0
        passed_through = [
            rec for rec in parse_jsonl_output(stdout)
            if rec.get('type') == 'Unknown'
        ]
        # Exactly one copy should come out, payload intact.
        assert len(passed_through) == 1
        assert passed_through[0]['data'] == 'test'

    def test_run_outputs_all_processed_records(self, initialized_archive):
        """Run outputs all processed records for chaining."""
        crawl_record = create_test_crawl_json(urls=[create_test_url()])
        stdout, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin=json.dumps(crawl_record),
            data_dir=initialized_archive,
            timeout=120,
        )
        assert code == 0
        # At minimum the Crawl itself must be echoed for downstream chaining.
        assert len(parse_jsonl_output(stdout)) >= 1
class TestRunMixedInput:
    """Tests for `archivebox run` with mixed record types."""

    def test_run_handles_mixed_types(self, initialized_archive):
        """Run handles mixed Crawl/Snapshot/ArchiveResult input.

        Feeds one Crawl, one Snapshot, and one unknown-type (Tag) record in a
        single JSONL stdin stream, then checks both the processing path and
        the pass-through path are reflected in the output.
        """
        crawl = create_test_crawl_json()
        snapshot = create_test_snapshot_json()
        unknown = {'type': 'Tag', 'id': 'fake', 'name': 'test'}
        stdin = '\n'.join(json.dumps(rec) for rec in (crawl, snapshot, unknown))
        stdout, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin=stdin,
            data_dir=initialized_archive,
            timeout=120,
        )
        assert code == 0, f"Command failed: {stderr}"
        types = {rec.get('type') for rec in parse_jsonl_output(stdout)}
        # The unknown Tag record must be passed through (same contract that
        # test_run_passes_through_unknown_types pins down exactly).
        assert 'Tag' in types
        # And at least one of the known record types must have been processed.
        # (The original single or-chain including 'Tag' was nearly vacuous:
        # guaranteed pass-through alone could satisfy it, so a regression that
        # dropped all Crawl/Snapshot output would still pass.)
        assert 'Crawl' in types or 'Snapshot' in types
class TestRunEmpty:
    """Tests for `archivebox run` edge cases."""

    def test_run_empty_stdin(self, initialized_archive):
        """Run with empty stdin returns success."""
        stdout, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin='',
            data_dir=initialized_archive,
        )
        assert code == 0

    def test_run_no_records_to_process(self, initialized_archive):
        """Run with only pass-through records shows message."""
        passthrough_only = {'type': 'Unknown', 'id': 'fake'}
        stdout, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin=json.dumps(passthrough_only),
            data_dir=initialized_archive,
        )
        assert code == 0
        # The "nothing to do" notice is expected on stderr, keeping stdout
        # reserved for clean JSONL output.
        assert 'No records to process' in stderr
|