1 month ago · 60422adc87
--- a/.claude/settings.local.json
+++ b/.claude/settings.local.json
@@ -26,7 +26,8 @@
 
				       "Bash(grep:*)",
			
 
				       "WebFetch(domain:python-statemachine.readthedocs.io)",
			
 
				       "Bash(./bin/run_plugin_tests.sh:*)",
			
 
				-      "Bash(done)"
			
 
				+      "Bash(done)",
			
 
				+      "Bash(coverage erase:*)"
			
 
				     ]
			
 
				   }
			
 
				 }
			
--- a/.gitignore
+++ b/.gitignore
@@ -10,6 +10,7 @@ tests/out/
 
				 .coverage
			
 
				 .coverage.*
			
 
				 coverage.json
			
 
				+coverage/
			
 
				 htmlcov/
			
 
				 
			
 
				 # Python and Node dependencies
			
--- a/archivebox/config/constants.py
+++ b/archivebox/config/constants.py
@@ -173,6 +173,8 @@ class ConstantsDict(Mapping):
 
				         CUSTOM_TEMPLATES_DIR_NAME,
			
 
				         CUSTOM_PLUGINS_DIR_NAME,
			
 
				         CRONTABS_DIR_NAME,
			
 
				+        "invalid",
			
 
				+        "users",
			
 
				         # Backwards compatibility with old directory names
			
 
				         "user_plugins",          # old name for USER_PLUGINS_DIR (now 'plugins')
			
 
				         "user_templates",        # old name for CUSTOM_TEMPLATES_DIR (now 'templates')
			
--- a/archivebox/core/migrations/0025_alter_archiveresult_options_alter_snapshot_options_and_more.py
+++ b/archivebox/core/migrations/0025_alter_archiveresult_options_alter_snapshot_options_and_more.py
@@ -57,18 +57,8 @@ class Migration(migrations.Migration):
 
				             name='snapshot',
			
 
				             options={'verbose_name': 'Snapshot', 'verbose_name_plural': 'Snapshots'},
			
 
				         ),
			
 
				-        migrations.RemoveField(
			
 
				-            model_name='archiveresult',
			
 
				-            name='cmd',
			
 
				-        ),
			
 
				-        migrations.RemoveField(
			
 
				-            model_name='archiveresult',
			
 
				-            name='cmd_version',
			
 
				-        ),
			
 
				-        migrations.RemoveField(
			
 
				-            model_name='archiveresult',
			
 
				-            name='pwd',
			
 
				-        ),
			
 
				+        # NOTE: RemoveField for cmd, cmd_version, pwd moved to migration 0027
			
 
				+        # to allow data migration to Process records first
			
 
				         migrations.AddField(
			
 
				             model_name='archiveresult',
			
 
				             name='config',
			
--- a/archivebox/core/models.py
+++ b/archivebox/core/models.py
@@ -2208,7 +2208,7 @@ class SnapshotMachine(BaseStateMachine, strict_states=True):
 
				     tick = (
			
 
				         queued.to.itself(unless='can_start') |
			
 
				         queued.to(started, cond='can_start') |
			
 
				-        started.to.itself(unless='is_finished') |
			
 
				+        started.to.itself(unless='is_finished', on='on_started_to_started') |
			
 
				         started.to(sealed, cond='is_finished')
			
 
				     )
			
 
				 
			
@@ -2243,6 +2243,13 @@ class SnapshotMachine(BaseStateMachine, strict_states=True):
 
				             status=Snapshot.StatusChoices.STARTED,
			
 
				         )
			
 
				 
			
 
				+    def on_started_to_started(self):
			
 
				+        """Called when Snapshot stays in started state (archiveresults not finished yet)."""
			
 
				+        # Bump retry_at so we check again in a few seconds
			
 
				+        self.snapshot.update_and_requeue(
			
 
				+            retry_at=timezone.now() + timedelta(seconds=5),
			
 
				+        )
			
 
				+
			
 
				     @sealed.enter
			
 
				     def enter_sealed(self):
			
 
				         # Clean up background hooks
			
--- a/archivebox/crawls/models.py
+++ b/archivebox/crawls/models.py
@@ -502,7 +502,7 @@ class CrawlMachine(BaseStateMachine, strict_states=True):
 
				     tick = (
			
 
				         queued.to.itself(unless='can_start') |
			
 
				         queued.to(started, cond='can_start') |
			
 
				-        started.to.itself(unless='is_finished') |
			
 
				+        started.to.itself(unless='is_finished', on='on_started_to_started') |
			
 
				         started.to(sealed, cond='is_finished')
			
 
				     )
			
 
				 
			
--- a/archivebox/hooks.py
+++ b/archivebox/hooks.py
@@ -1201,6 +1201,14 @@ def process_hook_records(records: List[Dict[str, Any]], overrides: Dict[str, Any
 
				             # Dispatch to appropriate model's from_json() method
			
 
				             if record_type == 'Snapshot':
			
 
				                 from archivebox.core.models import Snapshot
			
 
				+
			
 
				+                # Check if discovered snapshot exceeds crawl max_depth
			
 
				+                snapshot_depth = record.get('depth', 0)
			
 
				+                crawl = overrides.get('crawl')
			
 
				+                if crawl and snapshot_depth > crawl.max_depth:
			
 
				+                    # Skip - this URL was discovered but exceeds max crawl depth
			
 
				+                    continue
			
 
				+
			
 
				                 obj = Snapshot.from_json(record.copy(), overrides)
			
 
				                 if obj:
			
 
				                     stats['Snapshot'] = stats.get('Snapshot', 0) + 1
			
--- a/archivebox/plugins/parse_netscape_urls/on_Snapshot__73_parse_netscape_urls.py
+++ b/archivebox/plugins/parse_netscape_urls/on_Snapshot__73_parse_netscape_urls.py
@@ -163,8 +163,10 @@ def fetch_content(url: str) -> str:
 
				 
			
 
				 @click.command()
			
 
				 @click.option('--url', required=True, help='Netscape bookmark file URL to parse')
			
 
				[email protected]('--snapshot-id', required=False, help='Snapshot UUID (unused but required by hook runner)')
			
 
				-def main(url: str, snapshot_id: str = None):
			
 
				[email protected]('--snapshot-id', required=False, help='Parent Snapshot UUID')
			
 
				[email protected]('--crawl-id', required=False, help='Crawl UUID')
			
 
				[email protected]('--depth', type=int, default=0, help='Current depth level')
			
 
				+def main(url: str, snapshot_id: str = None, crawl_id: str = None, depth: int = 0):
			
 
				     """Parse Netscape bookmark HTML and extract URLs."""
			
 
				 
			
 
				     try:
			
@@ -188,7 +190,12 @@ def main(url: str, snapshot_id: str = None):
 
				                 'type': 'Snapshot',
			
 
				                 'url': unescape(bookmark_url),
			
 
				                 'plugin': PLUGIN_NAME,
			
 
				+                'depth': depth + 1,
			
 
				             }
			
 
				+            if snapshot_id:
			
 
				+                entry['parent_snapshot_id'] = snapshot_id
			
 
				+            if crawl_id:
			
 
				+                entry['crawl_id'] = crawl_id
			
 
				             if title:
			
 
				                 entry['title'] = unescape(title)
			
 
				             if tags_str:
			
--- a/archivebox/plugins/parse_txt_urls/on_Snapshot__71_parse_txt_urls.py
+++ b/archivebox/plugins/parse_txt_urls/on_Snapshot__71_parse_txt_urls.py
@@ -100,8 +100,10 @@ def fetch_content(url: str) -> str:
 
				 
			
 
				 @click.command()
			
 
				 @click.option('--url', required=True, help='URL to parse (file:// or https://)')
			
 
				[email protected]('--snapshot-id', required=False, help='Snapshot UUID (unused but required by hook runner)')
			
 
				-def main(url: str, snapshot_id: str = None):
			
 
				[email protected]('--snapshot-id', required=False, help='Parent Snapshot UUID')
			
 
				[email protected]('--crawl-id', required=False, help='Crawl UUID')
			
 
				[email protected]('--depth', type=int, default=0, help='Current depth level')
			
 
				+def main(url: str, snapshot_id: str = None, crawl_id: str = None, depth: int = 0):
			
 
				     """Parse plain text and extract URLs."""
			
 
				 
			
 
				     try:
			
@@ -123,9 +125,12 @@ def main(url: str, snapshot_id: str = None):
 
				             'type': 'Snapshot',
			
 
				             'url': found_url,
			
 
				             'plugin': PLUGIN_NAME,
			
 
				+            'depth': depth + 1,
			
 
				         }
			
 
				         if snapshot_id:
			
 
				             record['parent_snapshot_id'] = snapshot_id
			
 
				+        if crawl_id:
			
 
				+            record['crawl_id'] = crawl_id
			
 
				         print(json.dumps(record))
			
 
				 
			
 
				     # Emit ArchiveResult record to mark completion
			
--- a/archivebox/tests/test_migrations_08_to_09.py
+++ b/archivebox/tests/test_migrations_08_to_09.py
@@ -30,6 +30,7 @@ from .test_migrations_helpers import (
 
				     verify_foreign_keys,
			
 
				     verify_all_snapshots_in_output,
			
 
				     verify_crawl_count,
			
 
				+    verify_process_migration,
			
 
				 )
			
 
				 
			
 
				 
			
@@ -260,6 +261,54 @@ class TestMigrationFrom08x(unittest.TestCase):
 
				         self.assertTrue('ArchiveBox' in output or 'version' in output.lower(),
			
 
				                        f"Version output missing expected content: {output[:500]}")
			
 
				 
			
 
				+    def test_migration_creates_process_records(self):
			
 
				+        """Migration should create Process records for all ArchiveResults."""
			
 
				+        result = run_archivebox(self.work_dir, ['init'], timeout=45)
			
 
				+        self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
			
 
				+
			
 
				+        # Verify Process records created
			
 
				+        expected_count = len(self.original_data['archiveresults'])
			
 
				+        ok, msg = verify_process_migration(self.db_path, expected_count)
			
 
				+        self.assertTrue(ok, msg)
			
 
				+
			
 
				+    def test_migration_creates_binary_records(self):
			
 
				+        """Migration should create Binary records from cmd_version data."""
			
 
				+        result = run_archivebox(self.work_dir, ['init'], timeout=45)
			
 
				+        self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
			
 
				+
			
 
				+        conn = sqlite3.connect(str(self.db_path))
			
 
				+        cursor = conn.cursor()
			
 
				+
			
 
				+        # Check Binary records exist
			
 
				+        cursor.execute("SELECT COUNT(*) FROM machine_binary")
			
 
				+        binary_count = cursor.fetchone()[0]
			
 
				+
			
 
				+        # Should have at least one binary per unique extractor
			
 
				+        extractors = set(ar['extractor'] for ar in self.original_data['archiveresults'])
			
 
				+        self.assertGreaterEqual(binary_count, len(extractors),
			
 
				+                              f"Expected at least {len(extractors)} Binaries, got {binary_count}")
			
 
				+
			
 
				+        conn.close()
			
 
				+
			
 
				+    def test_migration_preserves_cmd_data(self):
			
 
				+        """Migration should preserve cmd data in Process.cmd field."""
			
 
				+        result = run_archivebox(self.work_dir, ['init'], timeout=45)
			
 
				+        self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
			
 
				+
			
 
				+        conn = sqlite3.connect(str(self.db_path))
			
 
				+        cursor = conn.cursor()
			
 
				+
			
 
				+        # Check that Process records have cmd arrays
			
 
				+        cursor.execute("SELECT cmd FROM machine_process WHERE cmd != '[]'")
			
 
				+        cmd_records = cursor.fetchall()
			
 
				+
			
 
				+        # All Processes should have non-empty cmd (test data has json.dumps([extractor, '--version']))
			
 
				+        expected_count = len(self.original_data['archiveresults'])
			
 
				+        self.assertEqual(len(cmd_records), expected_count,
			
 
				+                        f"Expected {expected_count} Processes with cmd, got {len(cmd_records)}")
			
 
				+
			
 
				+        conn.close()
			
 
				+
			
 
				 
			
 
				 class TestMigrationDataIntegrity08x(unittest.TestCase):
			
 
				     """Comprehensive data integrity tests for 0.8.x migrations."""
			
--- a/archivebox/tests/test_migrations_helpers.py
+++ b/archivebox/tests/test_migrations_helpers.py
@@ -730,44 +730,26 @@ def seed_0_8_data(db_path: Path) -> Dict[str, List[Dict]]:
 
				         tag_id = cursor.lastrowid
			
 
				         created_data['tags'].append({'id': tag_id, 'name': name, 'slug': name.lower()})
			
 
				 
			
 
				-    # Create Seeds first (required for 0.8.x Crawls)
			
 
				-    test_seeds = [
			
 
				-        ('https://example.com', 'auto', 'Example Seed'),
			
 
				-        ('https://github.com/ArchiveBox', 'auto', 'GitHub Seed'),
			
 
				-    ]
			
 
				-
			
 
				-    created_data['seeds'] = []
			
 
				-    for uri, extractor, label in test_seeds:
			
 
				-        seed_id = generate_uuid()
			
 
				-        cursor.execute("""
			
 
				-            INSERT INTO crawls_seed (id, created_at, created_by_id, modified_at, uri,
			
 
				-                                     extractor, tags_str, label, config, output_dir, notes,
			
 
				-                                     num_uses_failed, num_uses_succeeded)
			
 
				-            VALUES (?, datetime('now'), ?, datetime('now'), ?, ?, '', ?, '{}', '', '', 0, 0)
			
 
				-        """, (seed_id, user_id, uri, extractor, label))
			
 
				-        created_data['seeds'].append({'id': seed_id, 'uri': uri, 'label': label})
			
 
				-
			
 
				-    # Create 2 Crawls (linked to Seeds)
			
 
				+    # Create 2 Crawls (0.9.0 schema - no seeds)
			
 
				     test_crawls = [
			
 
				-        ('https://example.com\nhttps://example.org', 0, 'Example Crawl', created_data['seeds'][0]['id']),
			
 
				-        ('https://github.com/ArchiveBox', 1, 'GitHub Crawl', created_data['seeds'][1]['id']),
			
 
				+        ('https://example.com\nhttps://example.org', 0, 'Example Crawl'),
			
 
				+        ('https://github.com/ArchiveBox', 1, 'GitHub Crawl'),
			
 
				     ]
			
 
				 
			
 
				-    for i, (urls, max_depth, label, seed_id) in enumerate(test_crawls):
			
 
				+    for i, (urls, max_depth, label) in enumerate(test_crawls):
			
 
				         crawl_id = generate_uuid()
			
 
				         cursor.execute("""
			
 
				-            INSERT INTO crawls_crawl (id, created_at, created_by_id, modified_at, seed_id, urls,
			
 
				+            INSERT INTO crawls_crawl (id, created_at, created_by_id, modified_at, urls,
			
 
				                                       config, max_depth, tags_str, label, status, retry_at,
			
 
				                                       num_uses_failed, num_uses_succeeded)
			
 
				-            VALUES (?, datetime('now'), ?, datetime('now'), ?, ?, '{}', ?, '', ?, 'queued', datetime('now'), 0, 0)
			
 
				-        """, (crawl_id, user_id, seed_id, urls, max_depth, label))
			
 
				+            VALUES (?, datetime('now'), ?, datetime('now'), ?, '{}', ?, '', ?, 'queued', datetime('now'), 0, 0)
			
 
				+        """, (crawl_id, user_id, urls, max_depth, label))
			
 
				 
			
 
				         created_data['crawls'].append({
			
 
				             'id': crawl_id,
			
 
				             'urls': urls,
			
 
				             'max_depth': max_depth,
			
 
				             'label': label,
			
 
				-            'seed_id': seed_id,
			
 
				         })
			
 
				 
			
 
				     # Create 5 snapshots linked to crawls
			
@@ -1146,3 +1128,64 @@ def verify_crawl_count(db_path: Path, expected: int) -> Tuple[bool, str]:
 
				     if count == expected:
			
 
				         return True, f"Crawl count OK: {count}"
			
 
				     return False, f"Crawl count mismatch: expected {expected}, got {count}"
			
 
				+
			
 
				+
			
 
				+def verify_process_migration(db_path: Path, expected_archiveresult_count: int) -> Tuple[bool, str]:
			
 
				+    """
			
 
				+    Verify that ArchiveResults were properly migrated to Process records.
			
 
				+
			
 
				+    Checks:
			
 
				+    1. All ArchiveResults have process_id set
			
 
				+    2. Process count matches ArchiveResult count
			
 
				+    3. Binary records created for unique cmd_version values
			
 
				+    4. Status mapping is correct
			
 
				+    """
			
 
				+    conn = sqlite3.connect(str(db_path))
			
 
				+    cursor = conn.cursor()
			
 
				+
			
 
				+    # Check all ArchiveResults have process_id
			
 
				+    cursor.execute("SELECT COUNT(*) FROM core_archiveresult WHERE process_id IS NULL")
			
 
				+    null_count = cursor.fetchone()[0]
			
 
				+
			
 
				+    if null_count > 0:
			
 
				+        conn.close()
			
 
				+        return False, f"Found {null_count} ArchiveResults without process_id"
			
 
				+
			
 
				+    # Check Process count
			
 
				+    cursor.execute("SELECT COUNT(*) FROM machine_process")
			
 
				+    process_count = cursor.fetchone()[0]
			
 
				+
			
 
				+    if process_count != expected_archiveresult_count:
			
 
				+        conn.close()
			
 
				+        return False, f"Expected {expected_archiveresult_count} Processes, got {process_count}"
			
 
				+
			
 
				+    # Check status mapping
			
 
				+    cursor.execute("""
			
 
				+        SELECT ar.status, p.status, p.exit_code
			
 
				+        FROM core_archiveresult ar
			
 
				+        JOIN machine_process p ON ar.process_id = p.id
			
 
				+    """)
			
 
				+
			
 
				+    status_errors = []
			
 
				+    for ar_status, p_status, p_exit_code in cursor.fetchall():
			
 
				+        expected_p_status, expected_exit_code = {
			
 
				+            'queued': ('queued', None),
			
 
				+            'started': ('running', None),
			
 
				+            'backoff': ('queued', None),
			
 
				+            'succeeded': ('exited', 0),
			
 
				+            'failed': ('exited', 1),
			
 
				+            'skipped': ('exited', None),
			
 
				+        }.get(ar_status, ('queued', None))
			
 
				+
			
 
				+        if p_status != expected_p_status:
			
 
				+            status_errors.append(f"AR status {ar_status} → Process {p_status}, expected {expected_p_status}")
			
 
				+
			
 
				+        if p_exit_code != expected_exit_code:
			
 
				+            status_errors.append(f"AR status {ar_status} → exit_code {p_exit_code}, expected {expected_exit_code}")
			
 
				+
			
 
				+    if status_errors:
			
 
				+        conn.close()
			
 
				+        return False, f"Status mapping errors: {'; '.join(status_errors[:5])}"
			
 
				+
			
 
				+    conn.close()
			
 
				+    return True, f"Process migration verified: {process_count} Processes created"
			
--- a/archivebox/workers/orchestrator.py
+++ b/archivebox/workers/orchestrator.py
@@ -175,8 +175,50 @@ class Orchestrator:
 
				         """Spawn a new worker process. Returns PID or None if spawn failed."""
			
 
				         try:
			
 
				             pid = WorkerClass.start(daemon=False)
			
 
				-            # Worker spawning is logged by the worker itself in on_startup()
			
 
				-            return pid
			
 
				+
			
 
				+            # CRITICAL: Block until worker registers itself in Process table
			
 
				+            # This prevents race condition where orchestrator spawns multiple workers
			
 
				+            # before any of them finish on_startup() and register
			
 
				+            from archivebox.machine.models import Process
			
 
				+            import time
			
 
				+
			
 
				+            timeout = 5.0  # seconds to wait for worker registration
			
 
				+            poll_interval = 0.1  # check every 100ms
			
 
				+            elapsed = 0.0
			
 
				+            spawn_time = timezone.now()
			
 
				+
			
 
				+            while elapsed < timeout:
			
 
				+                # Check if worker process is registered with strict criteria:
			
 
				+                # 1. Correct PID
			
 
				+                # 2. WORKER process type
			
 
				+                # 3. RUNNING status
			
 
				+                # 4. Parent is this orchestrator
			
 
				+                # 5. Started recently (within last 10 seconds)
			
 
				+                worker_process = Process.objects.filter(
			
 
				+                    pid=pid,
			
 
				+                    process_type=Process.TypeChoices.WORKER,
			
 
				+                    status=Process.StatusChoices.RUNNING,
			
 
				+                    parent_id=self.db_process.id,
			
 
				+                    started_at__gte=spawn_time - timedelta(seconds=10),
			
 
				+                ).first()
			
 
				+
			
 
				+                if worker_process:
			
 
				+                    # Worker successfully registered!
			
 
				+                    return pid
			
 
				+
			
 
				+                time.sleep(poll_interval)
			
 
				+                elapsed += poll_interval
			
 
				+
			
 
				+            # Timeout - worker failed to register
			
 
				+            log_worker_event(
			
 
				+                worker_type='Orchestrator',
			
 
				+                event='Worker failed to register in time',
			
 
				+                indent_level=0,
			
 
				+                pid=self.pid,
			
 
				+                metadata={'worker_type': WorkerClass.name, 'worker_pid': pid, 'timeout': timeout},
			
 
				+            )
			
 
				+            return None
			
 
				+
			
 
				         except Exception as e:
			
 
				             log_worker_event(
			
 
				                 worker_type='Orchestrator',
			
@@ -266,48 +308,75 @@ class Orchestrator:
 
				     def runloop(self) -> None:
			
 
				         """Main orchestrator loop."""
			
 
				         from rich.progress import Progress, BarColumn, TextColumn, TaskProgressColumn
			
 
				-        from archivebox.misc.logging import IS_TTY
			
 
				-        import archivebox.misc.logging as logging_module
			
 
				+        from archivebox.misc.logging import IS_TTY, CONSOLE
			
 
				+        import sys
			
 
				+        import os
			
 
				 
			
 
				         # Enable progress bars only in TTY + foreground mode
			
 
				         show_progress = IS_TTY and self.exit_on_idle
			
 
				 
			
 
				-        # Save original consoles
			
 
				-        original_console = logging_module.CONSOLE
			
 
				-        original_stderr = logging_module.STDERR
			
 
				-
			
 
				-        # Create Progress with the console it will control
			
 
				-        progress = Progress(
			
 
				-            TextColumn("[cyan]{task.description}"),
			
 
				-            BarColumn(bar_width=40),
			
 
				-            TaskProgressColumn(),
			
 
				-            transient=False,
			
 
				-            console=original_console,  # Use the original console
			
 
				-        ) if show_progress else None
			
 
				-
			
 
				-        task_ids = {}  # snapshot_id -> task_id
			
 
				-
			
 
				-        # Wrapper to convert console.print() to console.log() for Rich Progress
			
 
				-        class ConsoleLogWrapper:
			
 
				-            def __init__(self, console):
			
 
				-                self._console = console
			
 
				-            def print(self, *args, **kwargs):
			
 
				-                # Use log() instead of print() to work with Live display
			
 
				-                self._console.log(*args)
			
 
				-            def __getattr__(self, name):
			
 
				-                return getattr(self._console, name)
			
 
				+        # Debug
			
 
				+        print(f"[yellow]DEBUG: IS_TTY={IS_TTY}, exit_on_idle={self.exit_on_idle}, show_progress={show_progress}[/yellow]")
			
 
				 
			
 
				-        try:
			
 
				-            if progress:
			
 
				-                progress.start()
			
 
				-                # Wrap progress.console so print() calls become log() calls
			
 
				-                wrapped_console = ConsoleLogWrapper(progress.console)
			
 
				-                logging_module.CONSOLE = wrapped_console
			
 
				-                logging_module.STDERR = wrapped_console
			
 
				-
			
 
				-            # Call on_startup AFTER redirecting consoles
			
 
				-            self.on_startup()
			
 
				+        self.on_startup()
			
 
				+        task_ids = {}
			
 
				 
			
 
				+        if not show_progress:
			
 
				+            # No progress bars - just run normally
			
 
				+            self._run_orchestrator_loop(None, task_ids, None, None)
			
 
				+        else:
			
 
				+            # Redirect worker subprocess output to /dev/null
			
 
				+            devnull_fd = os.open(os.devnull, os.O_WRONLY)
			
 
				+
			
 
				+            # Save original stdout/stderr (make 2 copies - one for Console, one for restoring)
			
 
				+            original_stdout = sys.stdout.fileno()
			
 
				+            original_stderr = sys.stderr.fileno()
			
 
				+            stdout_for_console = os.dup(original_stdout)
			
 
				+            stdout_for_restore = os.dup(original_stdout)
			
 
				+            stderr_for_restore = os.dup(original_stderr)
			
 
				+
			
 
				+            try:
			
 
				+                # Redirect stdout/stderr to /dev/null (workers will inherit this)
			
 
				+                os.dup2(devnull_fd, original_stdout)
			
 
				+                os.dup2(devnull_fd, original_stderr)
			
 
				+
			
 
				+                # Create Console using saved stdout (not the redirected one)
			
 
				+                from rich.console import Console
			
 
				+                import archivebox.misc.logging as logging_module
			
 
				+                orchestrator_console = Console(file=os.fdopen(stdout_for_console, 'w'), force_terminal=True)
			
 
				+
			
 
				+                # Update global CONSOLE so orchestrator logs appear too
			
 
				+                original_console = logging_module.CONSOLE
			
 
				+                logging_module.CONSOLE = orchestrator_console
			
 
				+
			
 
				+                # Now create Progress and run loop (DON'T restore stdout/stderr - workers need /dev/null)
			
 
				+                with Progress(
			
 
				+                    TextColumn("[cyan]{task.description}"),
			
 
				+                    BarColumn(bar_width=40),
			
 
				+                    TaskProgressColumn(),
			
 
				+                    console=orchestrator_console,
			
 
				+                ) as progress:
			
 
				+                    self._run_orchestrator_loop(progress, task_ids, None, None)
			
 
				+
			
 
				+                # Restore original console
			
 
				+                logging_module.CONSOLE = original_console
			
 
				+            finally:
			
 
				+                # Restore stdout/stderr
			
 
				+                os.dup2(stdout_for_restore, original_stdout)
			
 
				+                os.dup2(stderr_for_restore, original_stderr)
			
 
				+
			
 
				+                # Cleanup
			
 
				+                try:
			
 
				+                    os.close(devnull_fd)
			
 
				+                    os.close(stdout_for_restore)
			
 
				+                    os.close(stderr_for_restore)
			
 
				+                except:
			
 
				+                    pass
			
 
				+                # stdout_for_console is closed by orchestrator_console
			
 
				+
			
 
				+    def _run_orchestrator_loop(self, progress, task_ids, read_fd, console):
			
 
				+        """Run the main orchestrator loop with optional progress display."""
			
 
				+        try:
			
 
				             while True:
			
 
				                 # Check queues and spawn workers
			
 
				                 queue_sizes = self.check_queues_and_spawn_workers()
			
@@ -333,12 +402,33 @@ class Orchestrator:
 
				                             status__in=['succeeded', 'skipped', 'failed']
			
 
				                         ).count()
			
 
				 
			
 
				+                        # Find currently running hook (ordered by hook_name to get lowest step number)
			
 
				+                        current_ar = snapshot.archiveresult_set.filter(status='started').order_by('hook_name').first()
			
 
				+                        if not current_ar:
			
 
				+                            # If nothing running, show next queued item (ordered to get next in sequence)
			
 
				+                            current_ar = snapshot.archiveresult_set.filter(status='queued').order_by('hook_name').first()
			
 
				+
			
 
				+                        current_plugin = ''
			
 
				+                        if current_ar:
			
 
				+                            # Use hook_name if available, otherwise plugin name
			
 
				+                            hook_name = current_ar.hook_name or current_ar.plugin or ''
			
 
				+                            # Extract just the hook name without path (e.g., "on_Snapshot__50_wget.py" -> "wget")
			
 
				+                            if hook_name:
			
 
				+                                # Clean up the name: remove prefix and extension
			
 
				+                                clean_name = hook_name.split('__')[-1] if '__' in hook_name else hook_name
			
 
				+                                clean_name = clean_name.replace('.py', '').replace('.sh', '').replace('.bg', '')
			
 
				+                                current_plugin = f" • {clean_name}"
			
 
				+
			
 
				+                        # Build description with URL + current plugin
			
 
				+                        url = snapshot.url[:50] + '...' if len(snapshot.url) > 50 else snapshot.url
			
 
				+                        description = f"{url}{current_plugin}"
			
 
				+
			
 
				                         # Create or update task
			
 
				                         if snapshot.id not in task_ids:
			
 
				-                            url = snapshot.url[:60] + '...' if len(snapshot.url) > 60 else snapshot.url
			
 
				-                            task_ids[snapshot.id] = progress.add_task(url, total=total, completed=completed)
			
 
				+                            task_ids[snapshot.id] = progress.add_task(description, total=total, completed=completed)
			
 
				                         else:
			
 
				-                            progress.update(task_ids[snapshot.id], completed=completed)
			
 
				+                            # Update both progress and description
			
 
				+                            progress.update(task_ids[snapshot.id], description=description, completed=completed)
			
 
				 
			
 
				                     # Remove tasks for snapshots that are no longer active
			
 
				                     for snapshot_id in list(task_ids.keys()):
			
@@ -373,12 +463,6 @@ class Orchestrator:
 
				             raise
			
 
				         else:
			
 
				             self.on_shutdown()
			
 
				-        finally:
			
 
				-            if progress:
			
 
				-                # Restore original consoles
			
 
				-                logging_module.CONSOLE = original_console
			
 
				-                logging_module.STDERR = original_stderr
			
 
				-                progress.stop()
			
 
				     
			
 
				     def start(self) -> int:
			
 
				         """
			
--- a/bin/test_plugins.sh
+++ b/bin/test_plugins.sh
@@ -1,14 +1,20 @@
 
				 #!/bin/bash
			
 
				-# Run ArchiveBox plugin tests
			
 
				+# Run ArchiveBox plugin tests with coverage
			
 
				 #
			
 
				 # All plugin tests use pytest and are located in pluginname/tests/test_*.py
			
 
				 #
			
 
				-# Usage: ./bin/run_plugin_tests.sh [plugin_name]
			
 
				+# Usage: ./bin/test_plugins.sh [plugin_name] [--no-coverage]
			
 
				 #
			
 
				 # Examples:
			
 
				-#   ./bin/run_plugin_tests.sh                 # Run all plugin tests
			
 
				-#   ./bin/run_plugin_tests.sh chrome          # Run chrome plugin tests
			
 
				-#   ./bin/run_plugin_tests.sh parse_*         # Run all parse_* plugin tests
			
 
				+#   ./bin/test_plugins.sh                     # Run all plugin tests with coverage
			
 
				+#   ./bin/test_plugins.sh chrome              # Run chrome plugin tests with coverage
			
 
				+#   ./bin/test_plugins.sh parse_*             # Run all parse_* plugin tests with coverage
			
 
				+#   ./bin/test_plugins.sh --no-coverage       # Run all tests without coverage
			
 
				+#
			
 
				+# Coverage results are saved to .coverage and can be viewed with:
			
 
				+#   coverage combine
			
 
				+#   coverage report
			
 
				+#   coverage json
			
 
				 
			
 
				 set -e
			
 
				 
			
@@ -18,11 +24,43 @@ RED='\033[0;31m'
 
				 YELLOW='\033[1;33m'
			
 
				 NC='\033[0m' # No Color
			
 
				 
			
 
				+# Save root directory first
			
 
				+ROOT_DIR="$(cd "$(dirname "$0")/.." && pwd)"
			
 
				+
			
 
				 # Parse arguments
			
 
				-PLUGIN_FILTER="${1:-}"
			
 
				+PLUGIN_FILTER=""
			
 
				+ENABLE_COVERAGE=true
			
 
				+
			
 
				+for arg in "$@"; do
			
 
				+    if [ "$arg" = "--no-coverage" ]; then
			
 
				+        ENABLE_COVERAGE=false
			
 
				+    else
			
 
				+        PLUGIN_FILTER="$arg"
			
 
				+    fi
			
 
				+done
			
 
				+
			
 
				+# Reset coverage data if collecting coverage
			
 
				+if [ "$ENABLE_COVERAGE" = true ]; then
			
 
				+    echo "Resetting coverage data..."
			
 
				+    cd "$ROOT_DIR" || exit 1
			
 
				+    coverage erase
			
 
				+    rm -rf "$ROOT_DIR/coverage/js" 2>/dev/null
			
 
				+    mkdir -p "$ROOT_DIR/coverage/js"
			
 
				+
			
 
				+    # Enable Python subprocess coverage
			
 
				+    export COVERAGE_PROCESS_START="$ROOT_DIR/pyproject.toml"
			
 
				+    export PYTHONPATH="$ROOT_DIR:$PYTHONPATH"  # For sitecustomize.py
			
 
				+
			
 
				+    # Enable Node.js V8 coverage (built-in, no packages needed)
			
 
				+    export NODE_V8_COVERAGE="$ROOT_DIR/coverage/js"
			
 
				+
			
 
				+    echo "Python coverage: enabled (subprocess support)"
			
 
				+    echo "JavaScript coverage: enabled (NODE_V8_COVERAGE)"
			
 
				+    echo ""
			
 
				+fi
			
 
				 
			
 
				 # Change to plugins directory
			
 
				-cd "$(dirname "$0")/../archivebox/plugins" || exit 1
			
 
				+cd "$ROOT_DIR/archivebox/plugins" || exit 1
			
 
				 
			
 
				 echo "=========================================="
			
 
				 echo "ArchiveBox Plugin Tests"
			
@@ -34,6 +72,12 @@ if [ -n "$PLUGIN_FILTER" ]; then
 
				 else
			
 
				     echo "Running all plugin tests"
			
 
				 fi
			
 
				+
			
 
				+if [ "$ENABLE_COVERAGE" = true ]; then
			
 
				+    echo "Coverage: enabled"
			
 
				+else
			
 
				+    echo "Coverage: disabled"
			
 
				+fi
			
 
				 echo ""
			
 
				 
			
 
				 # Track results
			
@@ -67,7 +111,13 @@ for test_dir in $TEST_DIRS; do
 
				 
			
 
				     echo -e "${YELLOW}[RUNNING]${NC} $plugin_name"
			
 
				 
			
 
				-    if python -m pytest "$test_dir" -p no:django -v --tb=short 2>&1 | grep -v "^platform\|^cachedir\|^rootdir\|^configfile\|^plugins:" | tail -100; then
			
 
				+    # Build pytest command with optional coverage
			
 
				+    PYTEST_CMD="python -m pytest $test_dir -p no:django -v --tb=short"
			
 
				+    if [ "$ENABLE_COVERAGE" = true ]; then
			
 
				+        PYTEST_CMD="$PYTEST_CMD --cov=$plugin_name --cov-append --cov-branch"
			
 
				+    fi
			
 
				+
			
 
				+    if eval "$PYTEST_CMD" 2>&1 | grep -v "^platform\|^cachedir\|^rootdir\|^configfile\|^plugins:" | tail -100; then
			
 
				         echo -e "${GREEN}[PASSED]${NC} $plugin_name"
			
 
				         PASSED_PLUGINS=$((PASSED_PLUGINS + 1))
			
 
				     else
			
@@ -91,6 +141,41 @@ if [ $TOTAL_PLUGINS -eq 0 ]; then
 
				     exit 0
			
 
				 elif [ $FAILED_PLUGINS -eq 0 ]; then
			
 
				     echo -e "${GREEN}✓ All plugin tests passed!${NC}"
			
 
				+
			
 
				+    # Show coverage summary if enabled
			
 
				+    if [ "$ENABLE_COVERAGE" = true ]; then
			
 
				+        echo ""
			
 
				+        echo "=========================================="
			
 
				+        echo "Python Coverage Summary"
			
 
				+        echo "=========================================="
			
 
				+        # Coverage data is in ROOT_DIR, combine and report from there
			
 
				+        cd "$ROOT_DIR" || exit 1
			
 
				+        # Copy coverage data from plugins dir if it exists
			
 
				+        if [ -f "$ROOT_DIR/archivebox/plugins/.coverage" ]; then
			
 
				+            cp "$ROOT_DIR/archivebox/plugins/.coverage" "$ROOT_DIR/.coverage"
			
 
				+        fi
			
 
				+        coverage combine 2>/dev/null || true
			
 
				+        coverage report --include="archivebox/plugins/*" --omit="*/tests/*" 2>&1 | head -50
			
 
				+        echo ""
			
 
				+
			
 
				+        echo "=========================================="
			
 
				+        echo "JavaScript Coverage Summary"
			
 
				+        echo "=========================================="
			
 
				+        if [ -d "$ROOT_DIR/coverage/js" ] && [ "$(ls -A "$ROOT_DIR/coverage/js" 2>/dev/null)" ]; then
			
 
				+            node "$ROOT_DIR/bin/convert_v8_coverage.js" "$ROOT_DIR/coverage/js"
			
 
				+        else
			
 
				+            echo "No JavaScript coverage data collected"
			
 
				+            echo "(JS hooks may not have been executed during tests)"
			
 
				+        fi
			
 
				+        echo ""
			
 
				+
			
 
				+        echo "For detailed coverage reports (from project root):"
			
 
				+        echo "  Python:     coverage report --show-missing --include='archivebox/plugins/*' --omit='*/tests/*'"
			
 
				+        echo "  Python:     coverage json  # LLM-friendly format"
			
 
				+        echo "  Python:     coverage html  # Interactive HTML report"
			
 
				+        echo "  JavaScript: node bin/convert_v8_coverage.js coverage/js"
			
 
				+    fi
			
 
				+
			
 
				     exit 0
			
 
				 else
			
 
				     echo -e "${RED}✗ Some plugin tests failed${NC}"