Browse Source

tons of fixes with codex

Nick Sweeting 1 month ago
parent
commit
c7b2217cd6
100 changed files with 2048 additions and 1482 deletions
  1. 55 6
      archivebox/cli/archivebox_add.py
  2. 4 9
      archivebox/cli/archivebox_pluginmap.py
  3. 7 0
      archivebox/config/configset.py
  4. 109 62
      archivebox/core/models.py
  5. 4 0
      archivebox/core/settings.py
  6. 9 1
      archivebox/core/views.py
  7. 34 30
      archivebox/crawls/models.py
  8. 56 100
      archivebox/hooks.py
  9. 39 27
      archivebox/machine/detect.py
  10. 150 44
      archivebox/machine/models.py
  11. 28 8
      archivebox/machine/tests/test_machine_models.py
  12. 3 1
      archivebox/misc/checks.py
  13. 400 233
      archivebox/misc/progress_layout.py
  14. 1 0
      archivebox/plugins/accessibility/templates/icon.html
  15. 0 1
      archivebox/plugins/accessibility/tests/__init__.py
  16. 1 1
      archivebox/plugins/apt/on_Binary__13_apt_install.py
  17. 0 1
      archivebox/plugins/apt/tests/__init__.py
  18. 2 2
      archivebox/plugins/apt/tests/test_apt_provider.py
  19. 14 0
      archivebox/plugins/archivedotorg/on_Snapshot__13_archivedotorg.py
  20. 1 1
      archivebox/plugins/archivedotorg/templates/icon.html
  21. 0 1
      archivebox/plugins/chrome/binaries.jsonl
  22. 8 13
      archivebox/plugins/chrome/chrome_utils.js
  23. 2 2
      archivebox/plugins/chrome/config.json
  24. 0 265
      archivebox/plugins/chrome/on_Crawl__01_chrome_install.py
  25. 34 0
      archivebox/plugins/chrome/on_Crawl__70_chrome_install.py
  26. 3 3
      archivebox/plugins/chrome/on_Crawl__90_chrome_launch.bg.js
  27. 55 26
      archivebox/plugins/chrome/on_Snapshot__10_chrome_tab.bg.js
  28. 76 0
      archivebox/plugins/chrome/on_Snapshot__11_chrome_wait.js
  29. 1 1
      archivebox/plugins/chrome/on_Snapshot__30_chrome_navigate.js
  30. 1 0
      archivebox/plugins/chrome/templates/icon.html
  31. 0 0
      archivebox/plugins/chrome/tests/__init__.py
  32. 137 41
      archivebox/plugins/chrome/tests/chrome_test_helpers.py
  33. 11 46
      archivebox/plugins/chrome/tests/test_chrome.py
  34. 49 12
      archivebox/plugins/consolelog/on_Snapshot__21_consolelog.bg.js
  35. 1 0
      archivebox/plugins/consolelog/templates/icon.html
  36. 0 1
      archivebox/plugins/consolelog/tests/__init__.py
  37. 17 9
      archivebox/plugins/consolelog/tests/test_consolelog.py
  38. 10 3
      archivebox/plugins/custom/on_Binary__14_custom_install.py
  39. 0 1
      archivebox/plugins/custom/tests/__init__.py
  40. 2 2
      archivebox/plugins/custom/tests/test_custom_provider.py
  41. 44 19
      archivebox/plugins/dns/on_Snapshot__22_dns.bg.js
  42. 1 0
      archivebox/plugins/dns/templates/icon.html
  43. 15 1
      archivebox/plugins/dom/on_Snapshot__53_dom.js
  44. 1 1
      archivebox/plugins/dom/templates/icon.html
  45. 1 1
      archivebox/plugins/dom/tests/test_dom.py
  46. 2 1
      archivebox/plugins/env/on_Binary__15_env_install.py
  47. 0 1
      archivebox/plugins/env/tests/__init__.py
  48. 2 2
      archivebox/plugins/env/tests/test_env_provider.py
  49. 7 2
      archivebox/plugins/favicon/on_Snapshot__11_favicon.py
  50. 1 1
      archivebox/plugins/favicon/templates/icon.html
  51. 0 1
      archivebox/plugins/forumdl/binaries.jsonl
  52. 0 80
      archivebox/plugins/forumdl/on_Crawl__13_forumdl_install.py
  53. 79 0
      archivebox/plugins/forumdl/on_Crawl__25_forumdl_install.py
  54. 33 4
      archivebox/plugins/forumdl/on_Snapshot__04_forumdl.bg.py
  55. 1 1
      archivebox/plugins/forumdl/templates/icon.html
  56. 0 1
      archivebox/plugins/gallerydl/binaries.jsonl
  57. 0 80
      archivebox/plugins/gallerydl/on_Crawl__10_gallerydl_install.py
  58. 48 0
      archivebox/plugins/gallerydl/on_Crawl__20_gallerydl_install.py
  59. 49 5
      archivebox/plugins/gallerydl/on_Snapshot__03_gallerydl.bg.py
  60. 1 1
      archivebox/plugins/gallerydl/templates/icon.html
  61. 0 1
      archivebox/plugins/git/binaries.jsonl
  62. 48 0
      archivebox/plugins/git/on_Crawl__05_git_install.py
  63. 0 80
      archivebox/plugins/git/on_Crawl__09_git_install.py
  64. 1 1
      archivebox/plugins/git/on_Snapshot__05_git.bg.py
  65. 1 1
      archivebox/plugins/git/templates/icon.html
  66. 1 1
      archivebox/plugins/headers/templates/icon.html
  67. 15 9
      archivebox/plugins/htmltotext/on_Snapshot__58_htmltotext.py
  68. 1 1
      archivebox/plugins/htmltotext/templates/icon.html
  69. 1 0
      archivebox/plugins/infiniscroll/templates/icon.html
  70. 1 1
      archivebox/plugins/istilldontcareaboutcookies/on_Crawl__81_install_istilldontcareaboutcookies_extension.js
  71. 0 1
      archivebox/plugins/mercury/binaries.jsonl
  72. 0 85
      archivebox/plugins/mercury/on_Crawl__12_mercury_install.py
  73. 53 0
      archivebox/plugins/mercury/on_Crawl__40_mercury_install.py
  74. 1 1
      archivebox/plugins/mercury/templates/icon.html
  75. 1 0
      archivebox/plugins/merkletree/templates/icon.html
  76. 0 1
      archivebox/plugins/merkletree/tests/__init__.py
  77. 1 1
      archivebox/plugins/modalcloser/on_Snapshot__15_modalcloser.bg.js
  78. 1 0
      archivebox/plugins/modalcloser/templates/icon.html
  79. 21 17
      archivebox/plugins/npm/on_Binary__10_npm_install.py
  80. 51 0
      archivebox/plugins/npm/on_Crawl__00_npm_install.py
  81. 0 1
      archivebox/plugins/npm/tests/__init__.py
  82. 2 2
      archivebox/plugins/npm/tests/test_npm_provider.py
  83. 0 1
      archivebox/plugins/papersdl/binaries.jsonl
  84. 0 80
      archivebox/plugins/papersdl/on_Crawl__14_papersdl_install.py
  85. 48 0
      archivebox/plugins/papersdl/on_Crawl__30_papersdl_install.py
  86. 33 4
      archivebox/plugins/papersdl/on_Snapshot__66_papersdl.bg.py
  87. 1 1
      archivebox/plugins/papersdl/templates/icon.html
  88. 5 0
      archivebox/plugins/parse_dom_outlinks/on_Snapshot__75_parse_dom_outlinks.js
  89. 1 1
      archivebox/plugins/parse_dom_outlinks/templates/icon.html
  90. 0 1
      archivebox/plugins/parse_dom_outlinks/tests/__init__.py
  91. 1 2
      archivebox/plugins/parse_dom_outlinks/tests/test_parse_dom_outlinks.py
  92. 118 29
      archivebox/plugins/parse_html_urls/on_Snapshot__70_parse_html_urls.py
  93. 1 1
      archivebox/plugins/parse_html_urls/templates/icon.html
  94. 7 0
      archivebox/plugins/parse_jsonl_urls/on_Snapshot__74_parse_jsonl_urls.py
  95. 1 1
      archivebox/plugins/parse_jsonl_urls/templates/icon.html
  96. 7 0
      archivebox/plugins/parse_netscape_urls/on_Snapshot__73_parse_netscape_urls.py
  97. 1 1
      archivebox/plugins/parse_netscape_urls/templates/icon.html
  98. 7 0
      archivebox/plugins/parse_rss_urls/on_Snapshot__72_parse_rss_urls.py
  99. 1 1
      archivebox/plugins/parse_rss_urls/templates/icon.html
  100. 7 0
      archivebox/plugins/parse_txt_urls/on_Snapshot__71_parse_txt_urls.py

+ 55 - 6
archivebox/cli/archivebox_add.py

@@ -4,6 +4,7 @@ __package__ = 'archivebox.cli'
 __command__ = 'archivebox add'
 
 import sys
+from pathlib import Path
 
 from typing import TYPE_CHECKING
 
@@ -14,7 +15,7 @@ from django.db.models import QuerySet
 
 from archivebox.misc.util import enforce_types, docstring
 from archivebox import CONSTANTS
-from archivebox.config.common import ARCHIVING_CONFIG
+from archivebox.config.common import ARCHIVING_CONFIG, SERVER_CONFIG
 from archivebox.config.permissions import USER, HOSTNAME
 
 
@@ -57,8 +58,11 @@ def add(urls: str | list[str],
     from archivebox.crawls.models import Crawl
     from archivebox.base_models.models import get_or_create_system_user_pk
     from archivebox.workers.orchestrator import Orchestrator
+    from archivebox.misc.logging_util import printable_filesize
+    from archivebox.misc.system import get_dir_size
 
     created_by_id = created_by_id or get_or_create_system_user_pk()
+    started_at = timezone.now()
 
     # 1. Save the provided URLs to sources/2024-11-05__23-59-59__cli_add.txt
     sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__cli_add.txt'
@@ -127,11 +131,56 @@ def add(urls: str | list[str],
         # Background mode: just queue work and return (orchestrator via server will pick it up)
         print('[yellow]\\[*] URLs queued. Orchestrator will process them (run `archivebox server` if not already running).[/yellow]')
     else:
-        # Foreground mode: run CrawlWorker inline until all work is done
-        print(f'[green]\\[*] Starting worker to process crawl...[/green]')
-        from archivebox.workers.worker import CrawlWorker
-        worker = CrawlWorker(crawl_id=str(crawl.id), worker_id=0)
-        worker.runloop()  # Block until complete
+        # Foreground mode: run full orchestrator until all work is done
+        print(f'[green]\\[*] Starting orchestrator to process crawl...[/green]')
+        from archivebox.workers.orchestrator import Orchestrator
+        orchestrator = Orchestrator(exit_on_idle=True, crawl_id=str(crawl.id))
+        orchestrator.runloop()  # Block until complete
+
+        # Print summary for foreground runs
+        try:
+            crawl.refresh_from_db()
+            snapshots_count = crawl.snapshot_set.count()
+            try:
+                total_bytes = sum(s.archive_size for s in crawl.snapshot_set.all())
+            except Exception:
+                total_bytes, _, _ = get_dir_size(crawl.output_dir)
+            total_size = printable_filesize(total_bytes)
+            total_time = timezone.now() - started_at
+            total_seconds = int(total_time.total_seconds())
+            mins, secs = divmod(total_seconds, 60)
+            hours, mins = divmod(mins, 60)
+            if hours:
+                duration_str = f"{hours}h {mins}m {secs}s"
+            elif mins:
+                duration_str = f"{mins}m {secs}s"
+            else:
+                duration_str = f"{secs}s"
+
+            # Output dir relative to DATA_DIR
+            try:
+                rel_output = Path(crawl.output_dir).relative_to(CONSTANTS.DATA_DIR)
+                rel_output_str = f'./{rel_output}'
+            except Exception:
+                rel_output_str = str(crawl.output_dir)
+
+            # Build admin URL from SERVER_CONFIG
+            bind_addr = SERVER_CONFIG.BIND_ADDR
+            if bind_addr.startswith('http://') or bind_addr.startswith('https://'):
+                base_url = bind_addr
+            else:
+                base_url = f'http://{bind_addr}'
+            admin_url = f'{base_url}/admin/crawls/crawl/{crawl.id}/change/'
+
+            print('\n[bold]crawl output saved to:[/bold]')
+            print(f'  {rel_output_str}')
+            print(f'  {admin_url}')
+            print(f'\n[bold]total urls snapshotted:[/bold] {snapshots_count}')
+            print(f'[bold]total size:[/bold] {total_size}')
+            print(f'[bold]total time:[/bold] {duration_str}')
+        except Exception:
+            # Summary is best-effort; avoid failing the command if something goes wrong
+            pass
 
     # 6. Return the list of Snapshots in this crawl
     return crawl.snapshot_set.all()

+ 4 - 9
archivebox/cli/archivebox_pluginmap.py

@@ -205,7 +205,6 @@ def pluginmap(
 
     from archivebox.hooks import (
         discover_hooks,
-        extract_step,
         is_background_hook,
         BUILTIN_PLUGINS_DIR,
         USER_PLUGINS_DIR,
@@ -277,16 +276,14 @@ def pluginmap(
         # Build hook info list
         hook_infos = []
         for hook_path in hooks:
-            # Get plugin name from parent directory (e.g., 'wget' from 'plugins/wget/on_Snapshot__61_wget.py')
+            # Get plugin name from parent directory (e.g., 'wget' from 'plugins/wget/on_Snapshot__06_wget.bg.py')
             plugin_name = hook_path.parent.name
-            step = extract_step(hook_path.name)
             is_bg = is_background_hook(hook_path.name)
 
             hook_infos.append({
                 'path': str(hook_path),
                 'name': hook_path.name,
                 'plugin': plugin_name,
-                'step': step,
                 'is_background': is_bg,
                 'extension': hook_path.suffix,
             })
@@ -316,20 +313,18 @@ def pluginmap(
                 show_header=True,
                 header_style='bold magenta',
             )
-            table.add_column('Step', justify='center', width=6)
             table.add_column('Plugin', style='cyan', width=20)
             table.add_column('Hook Name', style='green')
             table.add_column('BG', justify='center', width=4)
             table.add_column('Type', justify='center', width=5)
 
-            # Sort by step then by name
-            sorted_hooks = sorted(hook_infos, key=lambda h: (h['step'], h['name']))
+            # Sort lexicographically by hook name
+            sorted_hooks = sorted(hook_infos, key=lambda h: h['name'])
 
             for hook in sorted_hooks:
                 bg_marker = '[yellow]bg[/yellow]' if hook['is_background'] else ''
                 ext = hook['extension'].lstrip('.')
                 table.add_row(
-                    str(hook['step']),
                     hook['plugin'],
                     hook['name'],
                     bg_marker,
@@ -347,7 +342,7 @@ def pluginmap(
         prnt(f'[bold]Total hooks discovered: {total_hooks}[/bold]')
         prnt()
         prnt('[dim]Hook naming convention: on_{Model}__{XX}_{description}[.bg].{ext}[/dim]')
-        prnt('[dim]  - XX: Two-digit order (first digit = step 0-9)[/dim]')
+        prnt('[dim]  - XX: Two-digit lexicographic order (00-99)[/dim]')
         prnt('[dim]  - .bg: Background hook (non-blocking)[/dim]')
         prnt('[dim]  - ext: py, sh, or js[/dim]')
         prnt()

+ 7 - 0
archivebox/config/configset.py

@@ -258,11 +258,18 @@ def get_config(
     # Add CRAWL_OUTPUT_DIR for snapshot hooks to find shared Chrome session
     if crawl and hasattr(crawl, "output_dir"):
         config['CRAWL_OUTPUT_DIR'] = str(crawl.output_dir)
+        config['CRAWL_ID'] = str(getattr(crawl, "id", "")) if getattr(crawl, "id", None) else config.get('CRAWL_ID')
 
     # Apply snapshot config overrides (highest priority)
     if snapshot and hasattr(snapshot, "config") and snapshot.config:
         config.update(snapshot.config)
 
+    if snapshot:
+        config['SNAPSHOT_ID'] = str(getattr(snapshot, "id", "")) if getattr(snapshot, "id", None) else config.get('SNAPSHOT_ID')
+        config['SNAPSHOT_DEPTH'] = int(getattr(snapshot, "depth", 0) or 0)
+        if getattr(snapshot, "crawl_id", None):
+            config['CRAWL_ID'] = str(snapshot.crawl_id)
+
     # Normalize all aliases to canonical names (after all sources merged)
     # This handles aliases that came from user/crawl/snapshot configs, not just env
     try:

+ 109 - 62
archivebox/core/models.py

@@ -344,6 +344,8 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
     @property
     def process_set(self):
         """Get all Process objects related to this snapshot's ArchiveResults."""
+        import json
+        import json
         from archivebox.machine.models import Process
         return Process.objects.filter(archiveresult__snapshot_id=self.id)
 
@@ -613,7 +615,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
 
         ONLY used by: archivebox update (for orphan detection)
         """
-        import json
+        from archivebox.machine.models import Process
 
         # Try index.jsonl first (new format), then index.json (legacy)
         jsonl_path = snapshot_dir / CONSTANTS.JSONL_INDEX_FILENAME
@@ -622,15 +624,12 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
         data = None
         if jsonl_path.exists():
             try:
-                with open(jsonl_path) as f:
-                    for line in f:
-                        line = line.strip()
-                        if line.startswith('{'):
-                            record = json.loads(line)
-                            if record.get('type') == 'Snapshot':
-                                data = record
-                                break
-            except (json.JSONDecodeError, OSError):
+                records = Process.parse_records_from_text(jsonl_path.read_text())
+                for record in records:
+                    if record.get('type') == 'Snapshot':
+                        data = record
+                        break
+            except OSError:
                 pass
         elif json_path.exists():
             try:
@@ -689,7 +688,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
 
         ONLY used by: archivebox update (for orphan import)
         """
-        import json
+        from archivebox.machine.models import Process
 
         # Try index.jsonl first (new format), then index.json (legacy)
         jsonl_path = snapshot_dir / CONSTANTS.JSONL_INDEX_FILENAME
@@ -698,15 +697,12 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
         data = None
         if jsonl_path.exists():
             try:
-                with open(jsonl_path) as f:
-                    for line in f:
-                        line = line.strip()
-                        if line.startswith('{'):
-                            record = json.loads(line)
-                            if record.get('type') == 'Snapshot':
-                                data = record
-                                break
-            except (json.JSONDecodeError, OSError):
+                records = Process.parse_records_from_text(jsonl_path.read_text())
+                for record in records:
+                    if record.get('type') == 'Snapshot':
+                        data = record
+                        break
+            except OSError:
                 pass
         elif json_path.exists():
             try:
@@ -1040,7 +1036,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
 
         Returns dict with keys: 'snapshot', 'archive_results', 'binaries', 'processes'
         """
-        import json
+        from archivebox.machine.models import Process
         from archivebox.misc.jsonl import (
             TYPE_SNAPSHOT, TYPE_ARCHIVERESULT, TYPE_BINARY, TYPE_PROCESS,
         )
@@ -1056,24 +1052,17 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
         if not index_path.exists():
             return result
 
-        with open(index_path, 'r') as f:
-            for line in f:
-                line = line.strip()
-                if not line or not line.startswith('{'):
-                    continue
-                try:
-                    record = json.loads(line)
-                    record_type = record.get('type')
-                    if record_type == TYPE_SNAPSHOT:
-                        result['snapshot'] = record
-                    elif record_type == TYPE_ARCHIVERESULT:
-                        result['archive_results'].append(record)
-                    elif record_type == TYPE_BINARY:
-                        result['binaries'].append(record)
-                    elif record_type == TYPE_PROCESS:
-                        result['processes'].append(record)
-                except json.JSONDecodeError:
-                    continue
+        records = Process.parse_records_from_text(index_path.read_text())
+        for record in records:
+            record_type = record.get('type')
+            if record_type == TYPE_SNAPSHOT:
+                result['snapshot'] = record
+            elif record_type == TYPE_ARCHIVERESULT:
+                result['archive_results'].append(record)
+            elif record_type == TYPE_BINARY:
+                result['binaries'].append(record)
+            elif record_type == TYPE_PROCESS:
+                result['processes'].append(record)
 
         return result
 
@@ -1317,7 +1306,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
             for plugin in all_plugins:
                 result = archive_results.get(plugin)
                 existing = result and result.status == 'succeeded' and (result.output_files or result.output_str)
-                icon = get_plugin_icon(plugin)
+                icon = mark_safe(get_plugin_icon(plugin))
 
                 # Skip plugins with empty icons that have no output
                 # (e.g., staticfile only shows when there's actual output)
@@ -1373,6 +1362,45 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
 
         return str(current_path)
 
+    def ensure_crawl_symlink(self) -> None:
+        """Ensure snapshot is symlinked under its crawl output directory."""
+        import os
+        from pathlib import Path
+        from django.utils import timezone
+        from archivebox import DATA_DIR
+        from archivebox.crawls.models import Crawl
+
+        if not self.crawl_id:
+            return
+        crawl = Crawl.objects.filter(id=self.crawl_id).select_related('created_by').first()
+        if not crawl:
+            return
+
+        date_base = crawl.created_at or self.created_at or timezone.now()
+        date_str = date_base.strftime('%Y%m%d')
+        domain = self.extract_domain_from_url(self.url)
+        username = crawl.created_by.username if crawl.created_by_id else 'system'
+
+        crawl_dir = DATA_DIR / 'users' / username / 'crawls' / date_str / domain / str(crawl.id)
+        link_path = crawl_dir / 'snapshots' / domain / str(self.id)
+        link_parent = link_path.parent
+        link_parent.mkdir(parents=True, exist_ok=True)
+
+        target = Path(self.output_dir)
+        if link_path.exists() or link_path.is_symlink():
+            if link_path.is_symlink():
+                if link_path.resolve() == target.resolve():
+                    return
+                link_path.unlink(missing_ok=True)
+            else:
+                return
+
+        rel_target = os.path.relpath(target, link_parent)
+        try:
+            link_path.symlink_to(rel_target, target_is_directory=True)
+        except OSError:
+            return
+
     @cached_property
     def archive_path(self):
         return f'{CONSTANTS.ARCHIVE_DIR_NAME}/{self.timestamp}'
@@ -1636,6 +1664,8 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
         if update_fields:
             snapshot.save(update_fields=update_fields + ['modified_at'])
 
+        snapshot.ensure_crawl_symlink()
+
         return snapshot
 
     def create_pending_archiveresults(self) -> list['ArchiveResult']:
@@ -1689,7 +1719,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
         """
         # Check if any ARs are still pending/started
         pending = self.archiveresult_set.exclude(
-            status__in=ArchiveResult.FINAL_OR_ACTIVE_STATES
+            status__in=ArchiveResult.FINAL_STATES
         ).exists()
 
         return not pending
@@ -1754,7 +1784,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
         - Plugins run in order (numeric prefix)
         - Each plugin checks its dependencies at runtime
 
-        Dependency handling (e.g., chrome_session → screenshot):
+        Dependency handling (e.g., chrome → screenshot):
         - Plugins check if required outputs exist before running
         - If dependency output missing → plugin returns 'skipped'
         - On retry, if dependency now succeeds → dependent can run
@@ -2117,6 +2147,18 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
         TITLE_LOADING_MSG = 'Not yet archived...'
 
         canonical = self.canonical_outputs()
+        preview_priority = [
+            'singlefile_path',
+            'screenshot_path',
+            'wget_path',
+            'dom_path',
+            'pdf_path',
+            'readability_path',
+        ]
+        best_preview_path = next(
+            (canonical.get(key) for key in preview_priority if canonical.get(key)),
+            canonical.get('index_path', 'index.html'),
+        )
         context = {
             **self.to_dict(extended=True),
             **{f'{k}_path': v for k, v in canonical.items()},
@@ -2132,6 +2174,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
             'oldest_archive_date': ts_to_date_str(self.oldest_archive_date),
             'SAVE_ARCHIVE_DOT_ORG': SAVE_ARCHIVE_DOT_ORG,
             'PREVIEW_ORIGINALS': SERVER_CONFIG.PREVIEW_ORIGINALS,
+            'best_preview_path': best_preview_path,
         }
         rendered_html = render_to_string('snapshot.html', context)
         atomic_write(str(Path(out_dir) / CONSTANTS.HTML_INDEX_FILENAME), rendered_html)
@@ -2669,12 +2712,12 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
         - end_ts, retry_at, cmd, cmd_version, binary FK
         - Processes side-effect records (Snapshot, Tag, etc.) via process_hook_records()
         """
-        import json
         import mimetypes
         from collections import defaultdict
         from pathlib import Path
         from django.utils import timezone
-        from archivebox.hooks import process_hook_records
+        from archivebox.hooks import process_hook_records, extract_records_from_process
+        from archivebox.machine.models import Process
 
         plugin_dir = Path(self.pwd) if self.pwd else None
         if not plugin_dir or not plugin_dir.exists():
@@ -2687,15 +2730,13 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
 
         # Read and parse JSONL output from stdout.log
         stdout_file = plugin_dir / 'stdout.log'
-        stdout = stdout_file.read_text() if stdout_file.exists() else ''
-
         records = []
-        for line in stdout.splitlines():
-            if line.strip() and line.strip().startswith('{'):
-                try:
-                    records.append(json.loads(line))
-                except json.JSONDecodeError:
-                    continue
+        if self.process_id and self.process:
+            records = extract_records_from_process(self.process)
+
+        if not records:
+            stdout = stdout_file.read_text() if stdout_file.exists() else ''
+            records = Process.parse_records_from_text(stdout)
 
         # Find ArchiveResult record and update status/output from it
         ar_records = [r for r in records if r.get('type') == 'ArchiveResult']
@@ -2722,9 +2763,20 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
                 self._set_binary_from_cmd(hook_data['cmd'])
             # Note: cmd_version is derived from binary.version, not stored on Process
         else:
-            # No ArchiveResult record = failed
-            self.status = self.StatusChoices.FAILED
-            self.output_str = 'Hook did not output ArchiveResult record'
+            # No ArchiveResult record: treat background hooks or clean exits as skipped
+            is_background = False
+            try:
+                from archivebox.hooks import is_background_hook
+                is_background = bool(self.hook_name and is_background_hook(self.hook_name))
+            except Exception:
+                pass
+
+            if is_background or (self.process_id and self.process and self.process.exit_code == 0):
+                self.status = self.StatusChoices.SKIPPED
+                self.output_str = 'Hook did not output ArchiveResult record'
+            else:
+                self.status = self.StatusChoices.FAILED
+                self.output_str = 'Hook did not output ArchiveResult record'
 
         # Walk filesystem and populate output_files, output_size, output_mimetypes
         exclude_names = {'stdout.log', 'stderr.log', 'hook.pid', 'listener.pid'}
@@ -2793,14 +2845,9 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
         }
         process_hook_records(filtered_records, overrides=overrides)
 
-        # Cleanup PID files and empty logs
+        # Cleanup PID files (keep logs even if empty so they can be tailed)
         pid_file = plugin_dir / 'hook.pid'
         pid_file.unlink(missing_ok=True)
-        stderr_file = plugin_dir / 'stderr.log'
-        if stdout_file.exists() and stdout_file.stat().st_size == 0:
-            stdout_file.unlink()
-        if stderr_file.exists() and stderr_file.stat().st_size == 0:
-            stderr_file.unlink()
 
     def _set_binary_from_cmd(self, cmd: list) -> None:
         """
@@ -3186,4 +3233,4 @@ class ArchiveResultMachine(BaseStateMachine, strict_states=True):
 # Manually register state machines with python-statemachine registry
 # (normally auto-discovered from statemachines.py, but we define them here for clarity)
 registry.register(SnapshotMachine)
-registry.register(ArchiveResultMachine)
+registry.register(ArchiveResultMachine)

+ 4 - 0
archivebox/core/settings.py

@@ -436,6 +436,10 @@ SIGNAL_WEBHOOKS = {
     },
 }
 
+# Avoid background threads touching sqlite connections (especially during tests/migrations).
+if DATABASES["default"]["ENGINE"].endswith("sqlite3"):
+    SIGNAL_WEBHOOKS["TASK_HANDLER"] = "signal_webhooks.handlers.sync_task_handler"
+
 ################################################################################
 ### Admin Data View Settings
 ################################################################################

+ 9 - 1
archivebox/core/views.py

@@ -120,7 +120,15 @@ class SnapshotView(View):
         # Get available extractor plugins from hooks (sorted by numeric prefix for ordering)
         # Convert to base names for display ordering
         all_plugins = [get_plugin_name(e) for e in get_enabled_plugins()]
-        preferred_types = tuple(all_plugins)
+        preview_priority = [
+            'singlefile',
+            'screenshot',
+            'wget',
+            'dom',
+            'pdf',
+            'readability',
+        ]
+        preferred_types = tuple(preview_priority + [p for p in all_plugins if p not in preview_priority])
         all_types = preferred_types + tuple(result_type for result_type in archiveresults.keys() if result_type not in preferred_types)
 
         best_result = {'path': 'None', 'result': None}

+ 34 - 30
archivebox/crawls/models.py

@@ -313,6 +313,12 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
                 if tags:
                     snapshot.save_tags(tags.split(','))
 
+            # Ensure crawl -> snapshot symlink exists for both new and existing snapshots
+            try:
+                snapshot.ensure_crawl_symlink()
+            except Exception:
+                pass
+
         return created_snapshots
 
     def run(self) -> 'Snapshot | None':
@@ -325,7 +331,6 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
             The root Snapshot for this crawl, or None for system crawls that don't create snapshots
         """
         import time
-        import json
         from pathlib import Path
         from archivebox.hooks import run_hook, discover_hooks, process_hook_records
         from archivebox.config.configset import get_config
@@ -339,35 +344,6 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
         # Get merged config with crawl context
         config = get_config(crawl=self)
 
-        # Load all binaries.jsonl files from plugins
-        # This replaces individual on_Crawl install hooks with declarative configuration
-        from archivebox.hooks import BUILTIN_PLUGINS_DIR
-        from archivebox.machine.models import Machine
-
-        machine_id = str(Machine.current().id)
-        binaries_records = []
-
-        for binaries_file in BUILTIN_PLUGINS_DIR.glob('*/binaries.jsonl'):
-            try:
-                with open(binaries_file, 'r') as f:
-                    for line in f:
-                        line = line.strip()
-                        if line and not line.startswith('#'):
-                            try:
-                                record = json.loads(line)
-                                if record.get('type') == 'Binary':
-                                    record['machine_id'] = machine_id
-                                    binaries_records.append(record)
-                            except json.JSONDecodeError:
-                                pass
-            except Exception:
-                pass
-
-        # Process binary declarations before running hooks
-        if binaries_records:
-            overrides = {'crawl': self}
-            process_hook_records(binaries_records, overrides=overrides)
-
         # Discover and run on_Crawl hooks
         with open(debug_log, 'a') as f:
             f.write(f'Discovering Crawl hooks...\n')
@@ -418,6 +394,34 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
             if stats:
                 print(f'[green]✓ Created: {stats}[/green]')
 
+        # Ensure any newly declared binaries are installed before creating snapshots
+        from archivebox.machine.models import Binary, Machine
+        from django.utils import timezone
+
+        machine = Machine.current()
+        while True:
+            pending_binaries = Binary.objects.filter(
+                machine=machine,
+                status=Binary.StatusChoices.QUEUED,
+                retry_at__lte=timezone.now(),
+            ).order_by('retry_at')
+            if not pending_binaries.exists():
+                break
+
+            for binary in pending_binaries:
+                try:
+                    binary.sm.tick()
+                except Exception:
+                    continue
+
+            # Exit if nothing else is immediately retryable
+            if not Binary.objects.filter(
+                machine=machine,
+                status=Binary.StatusChoices.QUEUED,
+                retry_at__lte=timezone.now(),
+            ).exists():
+                break
+
         # Create snapshots from all URLs in self.urls
         with open(debug_log, 'a') as f:
             f.write(f'Creating snapshots from URLs...\n')

+ 56 - 100
archivebox/hooks.py

@@ -15,29 +15,29 @@ Hook contract:
     Exit:   0 = success, non-zero = failure
 
 Execution order:
-    - Hooks are numbered 00-99 with first digit determining step (0-9)
-    - All hooks in a step can run in parallel
-    - Steps execute sequentially (step 0 → step 1 → ... → step 9)
-    - Background hooks (.bg suffix) don't block step advancement
+    - Hooks are named with two-digit prefixes (00-99) and sorted lexicographically by filename
+    - Foreground hooks run sequentially in that order
+    - Background hooks (.bg suffix) run concurrently and do not block foreground progress
+    - After all foreground hooks complete, background hooks receive SIGTERM and must finalize
     - Failed extractors don't block subsequent extractors
 
 Hook Naming Convention:
     on_{ModelName}__{run_order}_{description}[.bg].{ext}
 
     Examples:
-        on_Snapshot__00_setup.py         # Step 0, runs first
-        on_Snapshot__20_chrome_tab.bg.js # Step 2, background (doesn't block)
-        on_Snapshot__50_screenshot.js    # Step 5, foreground (blocks step)
-        on_Snapshot__63_media.bg.py      # Step 6, background (long-running)
+        on_Snapshot__00_setup.py         # runs first
+        on_Snapshot__10_chrome_tab.bg.js # background (doesn't block)
+        on_Snapshot__50_screenshot.js    # foreground (blocks)
+        on_Snapshot__63_media.bg.py      # background (long-running)
 
 Dependency handling:
     Extractor plugins that depend on other plugins' output should check at runtime:
 
     ```python
     # Example: screenshot plugin depends on chrome plugin
-    chrome_session_dir = Path(os.environ.get('SNAPSHOT_DIR', '.')) / 'chrome_session'
-    if not (chrome_session_dir / 'session.json').exists():
-        print('{"status": "skipped", "output": "chrome_session not available"}')
+    chrome_dir = Path(os.environ.get('SNAPSHOT_DIR', '.')) / 'chrome'
+    if not (chrome_dir / 'cdp_url.txt').exists():
+        print('{"status": "skipped", "output": "chrome session not available"}')
         sys.exit(1)  # Exit non-zero so it gets retried later
     ```
 
@@ -50,7 +50,7 @@ API (all hook logic lives here):
     discover_hooks(event)     -> List[Path]     Find hook scripts
     run_hook(script, ...)     -> HookResult     Execute a hook script
     run_hooks(event, ...)     -> List[HookResult]  Run all hooks for an event
-    extract_step(hook_name)   -> int            Get step number (0-9) from hook name
+    extract_step(hook_name)   -> int            Deprecated: get two-digit order prefix if present
     is_background_hook(name)  -> bool           Check if hook is background (.bg suffix)
 """
 
@@ -67,6 +67,7 @@ from typing import List, Dict, Any, Optional, TypedDict
 
 from django.conf import settings
 from django.utils import timezone
+from django.utils.safestring import mark_safe
 
 
 # Plugin directories
@@ -80,51 +81,33 @@ USER_PLUGINS_DIR = Path(getattr(settings, 'DATA_DIR', Path.cwd())) / 'plugins'
 
 def extract_step(hook_name: str) -> int:
     """
-    Extract step number (0-9) from hook name.
+    Deprecated: return the two-digit order prefix as an integer (00-99) if present.
 
-    Hooks are numbered 00-99 with the first digit determining the step.
-    Pattern: on_{Model}__{XX}_{description}[.bg].{ext}
-
-    Args:
-        hook_name: Hook filename (e.g., 'on_Snapshot__50_wget.py')
-
-    Returns:
-        Step number 0-9, or 9 (default) for unnumbered hooks.
-
-    Examples:
-        extract_step('on_Snapshot__05_chrome.py') -> 0
-        extract_step('on_Snapshot__50_wget.py') -> 5
-        extract_step('on_Snapshot__63_media.bg.py') -> 6
-        extract_step('on_Snapshot__99_cleanup.sh') -> 9
-        extract_step('on_Snapshot__unnumbered.py') -> 9 (default)
+    Hook execution is based on lexicographic ordering of filenames; callers should
+    not rely on parsed numeric steps for ordering decisions.
     """
-    # Pattern matches __XX_ where XX is two digits
     match = re.search(r'__(\d{2})_', hook_name)
     if match:
-        two_digit = int(match.group(1))
-        step = two_digit // 10  # First digit is the step (0-9)
-        return step
-
-    # Log warning for unnumbered hooks and default to step 9
+        return int(match.group(1))
     import sys
-    print(f"Warning: Hook '{hook_name}' has no step number (expected __XX_), defaulting to step 9", file=sys.stderr)
-    return 9
+    print(f"Warning: Hook '{hook_name}' has no order prefix (expected __XX_), defaulting to 99", file=sys.stderr)
+    return 99
 
 
 def is_background_hook(hook_name: str) -> bool:
     """
-    Check if a hook is a background hook (doesn't block step advancement).
+    Check if a hook is a background hook (doesn't block foreground progression).
 
     Background hooks have '.bg.' in their filename before the extension.
 
     Args:
-        hook_name: Hook filename (e.g., 'on_Snapshot__20_chrome_tab.bg.js')
+        hook_name: Hook filename (e.g., 'on_Snapshot__10_chrome_tab.bg.js')
 
     Returns:
         True if background hook, False if foreground.
 
     Examples:
-        is_background_hook('on_Snapshot__20_chrome_tab.bg.js') -> True
+        is_background_hook('on_Snapshot__10_chrome_tab.bg.js') -> True
         is_background_hook('on_Snapshot__50_wget.py') -> False
         is_background_hook('on_Snapshot__63_media.bg.py') -> True
     """
@@ -273,6 +256,7 @@ def run_hook(
     """
     from archivebox.machine.models import Process, Machine
     import time
+    import sys
     start_time = time.time()
 
     # Auto-detect timeout from plugin config if not explicitly provided
@@ -313,7 +297,7 @@ def run_hook(
     if ext == '.sh':
         cmd = ['bash', str(script)]
     elif ext == '.py':
-        cmd = ['python3', str(script)]
+        cmd = [sys.executable, str(script)]
     elif ext == '.js':
         cmd = ['node', str(script)]
     else:
@@ -393,10 +377,10 @@ def run_hook(
     # Priority: config dict > Machine.config > derive from LIB_DIR
     node_path = config.get('NODE_PATH')
     if not node_path and lib_dir:
-        # Derive from LIB_DIR/npm/node_modules
+        # Derive from LIB_DIR/npm/node_modules (create if needed)
         node_modules_dir = Path(lib_dir) / 'npm' / 'node_modules'
-        if node_modules_dir.exists():
-            node_path = str(node_modules_dir)
+        node_modules_dir.mkdir(parents=True, exist_ok=True)
+        node_path = str(node_modules_dir)
     if not node_path:
         try:
             # Fallback to Machine.config
@@ -462,7 +446,7 @@ def run_hook(
             cmd=cmd,
             timeout=timeout,
             status=Process.StatusChoices.EXITED,
-            exit_code=-1,
+            exit_code=1,
             stderr=f'Failed to run hook: {type(e).__name__}: {e}',
         )
         return process
@@ -472,7 +456,6 @@ def extract_records_from_process(process: 'Process') -> List[Dict[str, Any]]:
     """
     Extract JSONL records from a Process's stdout.
 
-    Uses the same parse_line() logic from misc/jsonl.py.
     Adds plugin metadata to each record.
 
     Args:
@@ -481,32 +464,20 @@ def extract_records_from_process(process: 'Process') -> List[Dict[str, Any]]:
     Returns:
         List of parsed JSONL records with plugin metadata
     """
-    from archivebox.misc.jsonl import parse_line
-
-    records = []
-
-    # Read stdout from process
-    stdout = process.stdout
-    if not stdout and process.stdout_file and process.stdout_file.exists():
-        stdout = process.stdout_file.read_text()
-
-    if not stdout:
-        return records
+    records = process.get_records()
+    if not records:
+        return []
 
     # Extract plugin metadata from process.pwd and process.cmd
     plugin_name = Path(process.pwd).name if process.pwd else 'unknown'
     hook_name = Path(process.cmd[1]).name if len(process.cmd) > 1 else 'unknown'
     plugin_hook = process.cmd[1] if len(process.cmd) > 1 else ''
 
-    # Parse each line as JSONL
-    for line in stdout.splitlines():
-        record = parse_line(line)
-        if record and 'type' in record:
-            # Add plugin metadata to record
-            record.setdefault('plugin', plugin_name)
-            record.setdefault('hook_name', hook_name)
-            record.setdefault('plugin_hook', plugin_hook)
-            records.append(record)
+    for record in records:
+        # Add plugin metadata to record
+        record.setdefault('plugin', plugin_name)
+        record.setdefault('hook_name', hook_name)
+        record.setdefault('plugin_hook', plugin_hook)
 
     return records
 
@@ -538,18 +509,13 @@ def collect_urls_from_plugins(snapshot_dir: Path) -> List[Dict[str, Any]]:
             continue
 
         try:
-            with open(urls_file, 'r') as f:
-                for line in f:
-                    line = line.strip()
-                    if line:
-                        try:
-                            entry = json.loads(line)
-                            if entry.get('url'):
-                                # Track which parser plugin found this URL
-                                entry['plugin'] = subdir.name
-                                urls.append(entry)
-                        except json.JSONDecodeError:
-                            continue
+            from archivebox.machine.models import Process
+            text = urls_file.read_text()
+            for entry in Process.parse_records_from_text(text):
+                if entry.get('url'):
+                    # Track which parser plugin found this URL
+                    entry['plugin'] = subdir.name
+                    urls.append(entry)
         except Exception:
             pass
 
@@ -610,8 +576,8 @@ def get_plugins() -> List[str]:
     The plugin name is the plugin directory name, not the hook script name.
 
     Example:
-    archivebox/plugins/chrome_session/on_Snapshot__20_chrome_tab.bg.js
-    -> plugin = 'chrome_session'
+    archivebox/plugins/chrome/on_Snapshot__10_chrome_tab.bg.js
+    -> plugin = 'chrome'
 
     Sorted alphabetically (plugins control their hook order via numeric prefixes in hook names).
     """
@@ -817,7 +783,7 @@ def discover_plugin_configs() -> Dict[str, Dict[str, Any]]:
 
     Returns:
         Dict mapping plugin names to their parsed JSONSchema configs.
-        e.g., {'wget': {...schema...}, 'chrome_session': {...schema...}}
+        e.g., {'wget': {...schema...}, 'chrome': {...schema...}}
 
     Example config.json:
         {
@@ -928,14 +894,10 @@ def get_plugin_special_config(plugin_name: str, config: Dict[str, Any]) -> Dict[
     if plugins_whitelist:
         # PLUGINS whitelist is specified - only enable plugins in the list
         plugin_names = [p.strip().lower() for p in plugins_whitelist.split(',') if p.strip()]
-        import sys
-        print(f"DEBUG: PLUGINS whitelist='{plugins_whitelist}', checking plugin '{plugin_name}', plugin_names={plugin_names}", file=sys.stderr)
         if plugin_name.lower() not in plugin_names:
             # Plugin not in whitelist - explicitly disabled
-            print(f"DEBUG: Plugin '{plugin_name}' NOT in whitelist, disabling", file=sys.stderr)
             enabled = False
         else:
-            print(f"DEBUG: Plugin '{plugin_name}' IS in whitelist, enabling", file=sys.stderr)
             # Plugin is in whitelist - check if explicitly disabled by PLUGINNAME_ENABLED
             enabled_key = f'{plugin_upper}_ENABLED'
             enabled = config.get(enabled_key)
@@ -945,10 +907,8 @@ def get_plugin_special_config(plugin_name: str, config: Dict[str, Any]) -> Dict[
                 enabled = enabled.lower() not in ('false', '0', 'no', '')
     else:
         # No PLUGINS whitelist - use PLUGINNAME_ENABLED (default True)
-        import sys
         enabled_key = f'{plugin_upper}_ENABLED'
         enabled = config.get(enabled_key)
-        print(f"DEBUG: NO PLUGINS whitelist in config, checking {enabled_key}={enabled}", file=sys.stderr)
         if enabled is None:
             enabled = True
         elif isinstance(enabled, str):
@@ -1064,10 +1024,10 @@ def get_plugin_icon(plugin: str) -> str:
     # Try plugin-provided icon template
     icon_template = get_plugin_template(plugin, 'icon', fallback=False)
     if icon_template:
-        return icon_template.strip()
+        return mark_safe(icon_template.strip())
 
     # Fall back to generic folder icon
-    return '📁'
+    return mark_safe('📁')
 
 
 def get_all_plugin_icons() -> Dict[str, str]:
@@ -1204,18 +1164,14 @@ def create_model_record(record: Dict[str, Any]) -> Any:
         return obj
 
     elif record_type == 'Machine':
-        # Machine config update (special _method handling)
-        method = record.pop('_method', None)
-        if method == 'update':
-            key = record.get('key')
-            value = record.get('value')
-            if key and value:
-                machine = Machine.current()
-                if not machine.config:
-                    machine.config = {}
-                machine.config[key] = value
-                machine.save(update_fields=['config'])
-                return machine
+        config_patch = record.get('config')
+        if isinstance(config_patch, dict) and config_patch:
+            machine = Machine.current()
+            if not machine.config:
+                machine.config = {}
+            machine.config.update(config_patch)
+            machine.save(update_fields=['config'])
+            return machine
         return None
 
     # Add more types as needed (Dependency, Snapshot, etc.)

+ 39 - 27
archivebox/machine/detect.py

@@ -227,33 +227,45 @@ def get_os_info() -> Dict[str, Any]:
     }
 
 def get_host_stats() -> Dict[str, Any]:
-    with tempfile.TemporaryDirectory() as tmp_dir:
-        tmp_usage = psutil.disk_usage(str(tmp_dir))
-    app_usage = psutil.disk_usage(str(PACKAGE_DIR))
-    data_usage = psutil.disk_usage(str(DATA_DIR))
-    mem_usage = psutil.virtual_memory()
-    swap_usage = psutil.swap_memory()
-    return {
-        "cpu_boot_time": datetime.fromtimestamp(psutil.boot_time()).isoformat(),
-        "cpu_count": psutil.cpu_count(logical=False),
-        "cpu_load": psutil.getloadavg(),
-        # "cpu_pct": psutil.cpu_percent(interval=1),
-        "mem_virt_used_pct": mem_usage.percent,
-        "mem_virt_used_gb": round(mem_usage.used / 1024 / 1024 / 1024, 3),
-        "mem_virt_free_gb": round(mem_usage.free / 1024 / 1024 / 1024, 3),
-        "mem_swap_used_pct": swap_usage.percent,
-        "mem_swap_used_gb": round(swap_usage.used / 1024 / 1024 / 1024, 3),
-        "mem_swap_free_gb": round(swap_usage.free / 1024 / 1024 / 1024, 3),
-        "disk_tmp_used_pct": tmp_usage.percent,
-        "disk_tmp_used_gb": round(tmp_usage.used / 1024 / 1024 / 1024, 3),
-        "disk_tmp_free_gb": round(tmp_usage.free / 1024 / 1024 / 1024, 3),  # in GB
-        "disk_app_used_pct": app_usage.percent,
-        "disk_app_used_gb": round(app_usage.used / 1024 / 1024 / 1024, 3),
-        "disk_app_free_gb": round(app_usage.free / 1024 / 1024 / 1024, 3),
-        "disk_data_used_pct": data_usage.percent,
-        "disk_data_used_gb": round(data_usage.used / 1024 / 1024 / 1024, 3),
-        "disk_data_free_gb": round(data_usage.free / 1024 / 1024 / 1024, 3),
-    }
+    try:
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            tmp_usage = psutil.disk_usage(str(tmp_dir))
+        app_usage = psutil.disk_usage(str(PACKAGE_DIR))
+        data_usage = psutil.disk_usage(str(DATA_DIR))
+        mem_usage = psutil.virtual_memory()
+        try:
+            swap_usage = psutil.swap_memory()
+            swap_used_pct = swap_usage.percent
+            swap_used_gb = round(swap_usage.used / 1024 / 1024 / 1024, 3)
+            swap_free_gb = round(swap_usage.free / 1024 / 1024 / 1024, 3)
+        except OSError:
+            # Some sandboxed environments deny access to swap stats
+            swap_used_pct = 0.0
+            swap_used_gb = 0.0
+            swap_free_gb = 0.0
+        return {
+            "cpu_boot_time": datetime.fromtimestamp(psutil.boot_time()).isoformat(),
+            "cpu_count": psutil.cpu_count(logical=False),
+            "cpu_load": psutil.getloadavg(),
+            # "cpu_pct": psutil.cpu_percent(interval=1),
+            "mem_virt_used_pct": mem_usage.percent,
+            "mem_virt_used_gb": round(mem_usage.used / 1024 / 1024 / 1024, 3),
+            "mem_virt_free_gb": round(mem_usage.free / 1024 / 1024 / 1024, 3),
+            "mem_swap_used_pct": swap_used_pct,
+            "mem_swap_used_gb": swap_used_gb,
+            "mem_swap_free_gb": swap_free_gb,
+            "disk_tmp_used_pct": tmp_usage.percent,
+            "disk_tmp_used_gb": round(tmp_usage.used / 1024 / 1024 / 1024, 3),
+            "disk_tmp_free_gb": round(tmp_usage.free / 1024 / 1024 / 1024, 3),  # in GB
+            "disk_app_used_pct": app_usage.percent,
+            "disk_app_used_gb": round(app_usage.used / 1024 / 1024 / 1024, 3),
+            "disk_app_free_gb": round(app_usage.free / 1024 / 1024 / 1024, 3),
+            "disk_data_used_pct": data_usage.percent,
+            "disk_data_used_gb": round(data_usage.used / 1024 / 1024 / 1024, 3),
+            "disk_data_free_gb": round(data_usage.free / 1024 / 1024 / 1024, 3),
+        }
+    except Exception:
+        return {}
 
 def get_host_immutable_info(host_info: Dict[str, Any]) -> Dict[str, Any]:
     return {

+ 150 - 44
archivebox/machine/models.py

@@ -113,23 +113,20 @@ class Machine(ModelWithHealthStats):
         Update Machine config from JSON dict.
 
         Args:
-            record: JSON dict with '_method': 'update', 'key': '...', 'value': '...'
+            record: JSON dict with 'config': {key: value} patch
             overrides: Not used
 
         Returns:
             Machine instance or None
         """
-        method = record.get('_method')
-        if method == 'update':
-            key = record.get('key')
-            value = record.get('value')
-            if key and value:
-                machine = Machine.current()
-                if not machine.config:
-                    machine.config = {}
-                machine.config[key] = value
-                machine.save(update_fields=['config'])
-                return machine
+        config_patch = record.get('config')
+        if isinstance(config_patch, dict) and config_patch:
+            machine = Machine.current()
+            if not machine.config:
+                machine.config = {}
+            machine.config.update(config_patch)
+            machine.save(update_fields=['config'])
+            return machine
         return None
 
 
@@ -458,31 +455,31 @@ class Binary(ModelWithHealthStats, ModelWithStateMachine):
                 continue
 
             # Parse JSONL output to check for successful installation
-            stdout_file = plugin_output_dir / 'stdout.log'
-            if stdout_file.exists():
-                stdout = stdout_file.read_text()
-                for line in stdout.splitlines():
-                    if line.strip() and line.strip().startswith('{'):
-                        try:
-                            record = json.loads(line)
-                            if record.get('type') == 'Binary' and record.get('abspath'):
-                                # Update self from successful installation
-                                self.abspath = record['abspath']
-                                self.version = record.get('version', '')
-                                self.sha256 = record.get('sha256', '')
-                                self.binprovider = record.get('binprovider', 'env')
-                                self.status = self.StatusChoices.INSTALLED
-                                self.save()
-
-                                # Symlink binary into LIB_BIN_DIR if configured
-                                from django.conf import settings
-                                lib_bin_dir = getattr(settings, 'LIB_BIN_DIR', None)
-                                if lib_bin_dir:
-                                    self.symlink_to_lib_bin(lib_bin_dir)
-
-                                return
-                        except json.JSONDecodeError:
-                            continue
+            from archivebox.hooks import extract_records_from_process, process_hook_records
+            records = extract_records_from_process(process)
+            if records:
+                process_hook_records(records, overrides={})
+            binary_records = [
+                record for record in records
+                if record.get('type') == 'Binary' and record.get('abspath')
+            ]
+            if binary_records:
+                record = binary_records[0]
+                # Update self from successful installation
+                self.abspath = record['abspath']
+                self.version = record.get('version', '')
+                self.sha256 = record.get('sha256', '')
+                self.binprovider = record.get('binprovider', 'env')
+                self.status = self.StatusChoices.INSTALLED
+                self.save()
+
+                # Symlink binary into LIB_BIN_DIR if configured
+                from django.conf import settings
+                lib_bin_dir = getattr(settings, 'LIB_BIN_DIR', None)
+                if lib_bin_dir:
+                    self.symlink_to_lib_bin(lib_bin_dir)
+
+                return
 
         # No hook succeeded - leave status as QUEUED (will retry later)
         # Don't set to FAILED since we don't have that status anymore
@@ -861,6 +858,27 @@ class Process(models.Model):
             record['timeout'] = self.timeout
         return record
 
+    @classmethod
+    def parse_records_from_text(cls, text: str) -> list[dict]:
+        """Parse JSONL records from raw text using the shared JSONL parser."""
+        from archivebox.misc.jsonl import parse_line
+
+        records: list[dict] = []
+        if not text:
+            return records
+        for line in text.splitlines():
+            record = parse_line(line)
+            if record and record.get('type'):
+                records.append(record)
+        return records
+
+    def get_records(self) -> list[dict]:
+        """Parse JSONL records from this process's stdout."""
+        stdout = self.stdout
+        if not stdout and self.stdout_file and self.stdout_file.exists():
+            stdout = self.stdout_file.read_text()
+        return self.parse_records_from_text(stdout or '')
+
     @staticmethod
     def from_json(record: dict, overrides: dict = None):
         """
@@ -919,6 +937,7 @@ class Process(models.Model):
             if (_CURRENT_PROCESS.pid == current_pid and
                 _CURRENT_PROCESS.machine_id == machine.id and
                 timezone.now() < _CURRENT_PROCESS.modified_at + timedelta(seconds=PROCESS_RECHECK_INTERVAL)):
+                _CURRENT_PROCESS.ensure_log_files()
                 return _CURRENT_PROCESS
             _CURRENT_PROCESS = None
 
@@ -945,6 +964,7 @@ class Process(models.Model):
                 db_start_time = existing.started_at.timestamp()
                 if abs(db_start_time - os_start_time) < START_TIME_TOLERANCE:
                     _CURRENT_PROCESS = existing
+                    _CURRENT_PROCESS.ensure_log_files()
                     return existing
 
         # No valid existing record - create new one
@@ -977,6 +997,7 @@ class Process(models.Model):
             started_at=started_at,
             status=cls.StatusChoices.RUNNING,
         )
+        _CURRENT_PROCESS.ensure_log_files()
         return _CURRENT_PROCESS
 
     @classmethod
@@ -1089,7 +1110,7 @@ class Process(models.Model):
             if is_stale:
                 proc.status = cls.StatusChoices.EXITED
                 proc.ended_at = proc.ended_at or timezone.now()
-                proc.exit_code = proc.exit_code if proc.exit_code is not None else -1
+                proc.exit_code = proc.exit_code if proc.exit_code is not None else 0
                 proc.save(update_fields=['status', 'ended_at', 'exit_code'])
                 cleaned += 1
 
@@ -1209,7 +1230,15 @@ class Process(models.Model):
         the actual OS process exists and matches our record.
         """
         proc = self.proc
-        return proc is not None and proc.is_running()
+        if proc is None:
+            return False
+        try:
+            # Treat zombies as not running (they should be reaped)
+            if proc.status() == psutil.STATUS_ZOMBIE:
+                return False
+        except Exception:
+            pass
+        return proc.is_running()
 
     def is_alive(self) -> bool:
         """
@@ -1421,6 +1450,22 @@ class Process(models.Model):
             except OSError:
                 pass
 
+    def ensure_log_files(self) -> None:
+        """Ensure stdout/stderr log files exist for this process."""
+        if not self.pwd:
+            return
+        try:
+            Path(self.pwd).mkdir(parents=True, exist_ok=True)
+        except OSError:
+            return
+        try:
+            if self.stdout_file:
+                self.stdout_file.touch(exist_ok=True)
+            if self.stderr_file:
+                self.stderr_file.touch(exist_ok=True)
+        except OSError:
+            return
+
     def _build_env(self) -> dict:
         """Build environment dict for subprocess, merging stored env with system."""
         import json
@@ -1507,9 +1552,11 @@ class Process(models.Model):
                     proc.wait(timeout=self.timeout)
                     self.exit_code = proc.returncode
                 except subprocess.TimeoutExpired:
+                    import signal
+
                     proc.kill()
                     proc.wait()
-                    self.exit_code = -1
+                    self.exit_code = 128 + signal.SIGKILL
 
                 self.ended_at = timezone.now()
                 if stdout_path.exists():
@@ -1579,9 +1626,19 @@ class Process(models.Model):
             exit_code if exited, None if still running
         """
         if self.status == self.StatusChoices.EXITED:
+            if self.exit_code == -1:
+                self.exit_code = 137
+                self.save(update_fields=['exit_code'])
             return self.exit_code
 
         if not self.is_running:
+            # Reap child process if it's a zombie (best-effort)
+            proc = self.proc
+            if proc is not None:
+                try:
+                    proc.wait(timeout=0)
+                except Exception:
+                    pass
             # Process exited - read output and copy to DB
             if self.stdout_file and self.stdout_file.exists():
                 self.stdout = self.stdout_file.read_text()
@@ -1603,7 +1660,9 @@ class Process(models.Model):
             #         cmd_file.unlink(missing_ok=True)
 
             # Try to get exit code from proc or default to unknown
-            self.exit_code = self.exit_code if self.exit_code is not None else -1
+            self.exit_code = self.exit_code if self.exit_code is not None else 0
+            if self.exit_code == -1:
+                self.exit_code = 137
             self.ended_at = timezone.now()
             self.status = self.StatusChoices.EXITED
             self.save()
@@ -1723,6 +1782,7 @@ class Process(models.Model):
         import os
 
         killed_count = 0
+        used_sigkill = False
         proc = self.proc
         if proc is None:
             # Already dead
@@ -1772,11 +1832,15 @@ class Process(models.Model):
                     try:
                         os.kill(pid, signal.SIGKILL)
                         killed_count += 1
+                        used_sigkill = True
                     except (OSError, ProcessLookupError):
                         pass
 
             # Update self status
-            self.exit_code = 128 + signal.SIGTERM if killed_count > 0 else 0
+            if used_sigkill:
+                self.exit_code = 128 + signal.SIGKILL
+            else:
+                self.exit_code = 128 + signal.SIGTERM if killed_count > 0 else 0
             self.status = self.StatusChoices.EXITED
             self.ended_at = timezone.now()
             self.save()
@@ -1925,6 +1989,50 @@ class Process(models.Model):
 
         return 0
 
+    @classmethod
+    def cleanup_orphaned_workers(cls) -> int:
+        """
+        Kill orphaned worker/hook processes whose root process is no longer running.
+
+        Orphaned if:
+        - Root (orchestrator/cli) is not running, or
+        - No orchestrator/cli ancestor exists.
+
+        Standalone worker runs (archivebox run --snapshot-id) are allowed.
+        """
+        killed = 0
+
+        running_children = cls.objects.filter(
+            process_type__in=[cls.TypeChoices.WORKER, cls.TypeChoices.HOOK],
+            status=cls.StatusChoices.RUNNING,
+        )
+
+        for proc in running_children:
+            if not proc.is_running:
+                continue
+
+            root = proc.root
+            # Standalone worker/hook process (run directly)
+            if root.id == proc.id and root.process_type in (cls.TypeChoices.WORKER, cls.TypeChoices.HOOK):
+                continue
+
+            # If root is an active orchestrator/cli, keep it
+            if root.process_type in (cls.TypeChoices.ORCHESTRATOR, cls.TypeChoices.CLI) and root.is_running:
+                continue
+
+            try:
+                if proc.process_type == cls.TypeChoices.HOOK:
+                    proc.kill_tree(graceful_timeout=1.0)
+                else:
+                    proc.terminate(graceful_timeout=1.0)
+                killed += 1
+            except Exception:
+                continue
+
+        if killed:
+            print(f'[yellow]🧹 Cleaned up {killed} orphaned worker/hook process(es)[/yellow]')
+        return killed
+
 
 # =============================================================================
 # Binary State Machine
@@ -2126,5 +2234,3 @@ class ProcessMachine(BaseStateMachine, strict_states=True):
 # Manually register state machines with python-statemachine registry
 registry.register(BinaryMachine)
 registry.register(ProcessMachine)
-
-

+ 28 - 8
archivebox/machine/tests/test_machine_models.py

@@ -79,9 +79,9 @@ class TestMachineModel(TestCase):
         """Machine.from_json() should update machine config."""
         Machine.current()  # Ensure machine exists
         record = {
-            '_method': 'update',
-            'key': 'WGET_BINARY',
-            'value': '/usr/bin/wget',
+            'config': {
+                'WGET_BINARY': '/usr/bin/wget',
+            },
         }
 
         result = Machine.from_json(record)
@@ -190,12 +190,12 @@ class TestBinaryModel(TestCase):
         old_modified = binary.modified_at
 
         binary.update_and_requeue(
-            status=Binary.StatusChoices.STARTED,
+            status=Binary.StatusChoices.QUEUED,
             retry_at=timezone.now() + timedelta(seconds=60),
         )
 
         binary.refresh_from_db()
-        self.assertEqual(binary.status, Binary.StatusChoices.STARTED)
+        self.assertEqual(binary.status, Binary.StatusChoices.QUEUED)
         self.assertGreater(binary.modified_at, old_modified)
 
 
@@ -221,12 +221,12 @@ class TestBinaryStateMachine(TestCase):
     def test_binary_state_machine_can_start(self):
         """BinaryMachine.can_start() should check name and binproviders."""
         sm = BinaryMachine(self.binary)
-        self.assertTrue(sm.can_start())
+        self.assertTrue(sm.can_install())
 
         self.binary.binproviders = ''
         self.binary.save()
         sm = BinaryMachine(self.binary)
-        self.assertFalse(sm.can_start())
+        self.assertFalse(sm.can_install())
 
 
 class TestProcessModel(TestCase):
@@ -415,11 +415,15 @@ class TestProcessLifecycle(TestCase):
 
     def test_process_is_running_current_pid(self):
         """is_running should be True for current PID."""
+        import psutil
+        from datetime import datetime
+
+        proc_start = datetime.fromtimestamp(psutil.Process(os.getpid()).create_time(), tz=timezone.get_current_timezone())
         proc = Process.objects.create(
             machine=self.machine,
             status=Process.StatusChoices.RUNNING,
             pid=os.getpid(),
-            started_at=timezone.now(),
+            started_at=proc_start,
         )
 
         self.assertTrue(proc.is_running)
@@ -450,6 +454,22 @@ class TestProcessLifecycle(TestCase):
         proc.refresh_from_db()
         self.assertEqual(proc.status, Process.StatusChoices.EXITED)
 
+    def test_process_poll_normalizes_negative_exit_code(self):
+        """poll() should normalize -1 exit codes to 137."""
+        proc = Process.objects.create(
+            machine=self.machine,
+            status=Process.StatusChoices.EXITED,
+            pid=999999,
+            exit_code=-1,
+            started_at=timezone.now(),
+        )
+
+        exit_code = proc.poll()
+
+        self.assertEqual(exit_code, 137)
+        proc.refresh_from_db()
+        self.assertEqual(proc.exit_code, 137)
+
     def test_process_terminate_dead_process(self):
         """terminate() should handle already-dead process."""
         proc = Process.objects.create(

+ 3 - 1
archivebox/misc/checks.py

@@ -180,9 +180,11 @@ def check_tmp_dir(tmp_dir=None, throw=False, quiet=False, must_exist=True):
         return len(f'file://{socket_file}') <= 96
 
     tmp_is_valid = False
+    allow_no_unix_sockets = os.environ.get('ARCHIVEBOX_ALLOW_NO_UNIX_SOCKETS', '').lower() in ('1', 'true', 'yes')
     try:
         tmp_is_valid = dir_is_writable(tmp_dir)
-        tmp_is_valid = tmp_is_valid and assert_dir_can_contain_unix_sockets(tmp_dir)
+        if not allow_no_unix_sockets:
+            tmp_is_valid = tmp_is_valid and assert_dir_can_contain_unix_sockets(tmp_dir)
         assert tmp_is_valid, f'ArchiveBox user PUID={ARCHIVEBOX_USER} PGID={ARCHIVEBOX_GROUP} is unable to write to TMP_DIR={tmp_dir}'            
         assert len(f'file://{socket_file}') <= 96, f'ArchiveBox TMP_DIR={tmp_dir} is too long, dir containing unix socket files must be <90 chars.'
         return True

+ 400 - 233
archivebox/misc/progress_layout.py

@@ -3,30 +3,29 @@ Rich Layout-based live progress display for ArchiveBox orchestrator.
 
 Shows a comprehensive dashboard with:
 - Top: Crawl queue status (full width)
-- Middle: 4-column grid of SnapshotWorker progress panels
+- Middle: Running process logs (dynamic panels)
 - Bottom: Orchestrator/Daphne logs
 """
 
 __package__ = 'archivebox.misc'
 
 from datetime import datetime, timezone
-from typing import Dict, List, Optional, Any
+from typing import List, Optional, Any
 from collections import deque
+from pathlib import Path
 
 from rich import box
 from rich.align import Align
-from rich.console import Console, Group, RenderableType
+from rich.console import Group
 from rich.layout import Layout
+from rich.columns import Columns
 from rich.panel import Panel
-from rich.progress import Progress, BarColumn, TextColumn, TaskProgressColumn, SpinnerColumn
-from rich.table import Table
 from rich.text import Text
+from rich.table import Table
+from rich.tree import Tree
 
 from archivebox.config import VERSION
 
-# Maximum number of SnapshotWorker columns to display
-MAX_WORKER_COLUMNS = 4
-
 
 class CrawlQueuePanel:
     """Display crawl queue status across full width."""
@@ -35,6 +34,8 @@ class CrawlQueuePanel:
         self.orchestrator_status = "Idle"
         self.crawl_queue_count = 0
         self.crawl_workers_count = 0
+        self.binary_queue_count = 0
+        self.binary_workers_count = 0
         self.max_crawl_workers = 8
         self.crawl_id: Optional[str] = None
 
@@ -51,19 +52,27 @@ class CrawlQueuePanel:
         left_text.append(f"v{VERSION}", style="bold yellow")
         left_text.append(f" • {datetime.now(timezone.utc).strftime('%H:%M:%S')}", style="grey53")
 
-        # Center-left: Crawl queue status
+        # Center-left: Crawl + Binary queue status
         queue_style = "yellow" if self.crawl_queue_count > 0 else "grey53"
         center_left_text = Text()
         center_left_text.append("Crawls: ", style="white")
         center_left_text.append(str(self.crawl_queue_count), style=f"bold {queue_style}")
         center_left_text.append(" queued", style="grey53")
+        center_left_text.append(" • Binaries: ", style="white")
+        binary_queue_style = "yellow" if self.binary_queue_count > 0 else "grey53"
+        center_left_text.append(str(self.binary_queue_count), style=f"bold {binary_queue_style}")
+        center_left_text.append(" queued", style="grey53")
 
-        # Center-right: CrawlWorker status
+        # Center-right: Worker status
         worker_style = "green" if self.crawl_workers_count > 0 else "grey53"
         center_right_text = Text()
         center_right_text.append("Workers: ", style="white")
         center_right_text.append(f"{self.crawl_workers_count}/{self.max_crawl_workers}", style=f"bold {worker_style}")
-        center_right_text.append(" active", style="grey53")
+        center_right_text.append(" crawl", style="grey53")
+        binary_worker_style = "green" if self.binary_workers_count > 0 else "grey53"
+        center_right_text.append(" • ", style="grey53")
+        center_right_text.append(str(self.binary_workers_count), style=f"bold {binary_worker_style}")
+        center_right_text.append(" binary", style="grey53")
 
         # Right: Orchestrator status
         status_color = "green" if self.crawl_workers_count > 0 else "grey53"
@@ -74,151 +83,302 @@ class CrawlQueuePanel:
             right_text.append(f" [{self.crawl_id[:8]}]", style="grey53")
 
         grid.add_row(left_text, center_left_text, center_right_text, right_text)
-        return Panel(grid, style="white on blue", box=box.ROUNDED)
+        return Panel(grid, style="white on blue", box=box.HORIZONTALS)
 
 
-class SnapshotWorkerPanel:
-    """Display progress for a single SnapshotWorker."""
+class ProcessLogPanel:
+    """Display logs for a running Process."""
 
-    def __init__(self, worker_num: int):
-        self.worker_num = worker_num
-        self.snapshot_id: Optional[str] = None
-        self.snapshot_url: Optional[str] = None
-        self.total_hooks: int = 0
-        self.completed_hooks: int = 0
-        self.current_plugin: Optional[str] = None
-        self.status: str = "idle"  # idle, working, completed
-        self.recent_logs: deque = deque(maxlen=5)
+    def __init__(self, process: Any, max_lines: int = 8, compact: bool | None = None):
+        self.process = process
+        self.max_lines = max_lines
+        self.compact = compact
 
     def __rich__(self) -> Panel:
-        if self.status == "idle":
-            content = Align.center(
-                Text("Idle", style="grey53"),
-                vertical="middle",
-            )
-            border_style = "grey53"
-            title_style = "grey53"
-        else:
-            # Build progress display
-            lines = []
-
-            # URL (truncated)
-            if self.snapshot_url:
-                url_display = self.snapshot_url[:35] + "..." if len(self.snapshot_url) > 35 else self.snapshot_url
-                lines.append(Text(url_display, style="cyan"))
-                lines.append(Text())  # Spacing
-
-            # Progress bar
-            if self.total_hooks > 0:
-                pct = (self.completed_hooks / self.total_hooks) * 100
-                bar_width = 30
-                filled = int((pct / 100) * bar_width)
-                bar = "█" * filled + "░" * (bar_width - filled)
-
-                # Color based on progress
-                if pct < 30:
-                    bar_style = "yellow"
-                elif pct < 100:
-                    bar_style = "green"
-                else:
-                    bar_style = "blue"
-
-                progress_text = Text()
-                progress_text.append(bar, style=bar_style)
-                progress_text.append(f" {pct:.0f}%", style="white")
-                lines.append(progress_text)
-                lines.append(Text())  # Spacing
-
-            # Stats
-            stats = Table.grid(padding=(0, 1))
-            stats.add_column(style="grey53", no_wrap=True)
-            stats.add_column(style="white")
-            stats.add_row("Hooks:", f"{self.completed_hooks}/{self.total_hooks}")
-            if self.current_plugin:
-                stats.add_row("Current:", Text(self.current_plugin, style="yellow"))
-            lines.append(stats)
-            lines.append(Text())  # Spacing
-
-            # Recent logs
-            if self.recent_logs:
-                lines.append(Text("Recent:", style="grey53"))
-                for log_msg, log_style in self.recent_logs:
-                    log_text = Text(f"• {log_msg[:30]}", style=log_style)
-                    lines.append(log_text)
-
-            content = Group(*lines)
-            border_style = "green" if self.status == "working" else "blue"
-            title_style = "green" if self.status == "working" else "blue"
-
+        is_pending = self._is_pending()
+        output_line = '' if is_pending else self._output_line()
+        stdout_lines = []
+        stderr_lines = []
+        try:
+            stdout_lines = list(self.process.tail_stdout(lines=self.max_lines, follow=False))
+            stderr_lines = list(self.process.tail_stderr(lines=self.max_lines, follow=False))
+        except Exception:
+            stdout_lines = []
+            stderr_lines = []
+
+        header_lines = []
+        chrome_launch_line = self._chrome_launch_line(stderr_lines, stdout_lines)
+        if chrome_launch_line:
+            header_lines.append(Text(chrome_launch_line, style="grey53"))
+        if output_line:
+            header_lines.append(Text(output_line, style="grey53"))
+        log_lines = []
+        for line in stdout_lines:
+            if line:
+                log_lines.append(Text(line, style="white"))
+        for line in stderr_lines:
+            if line:
+                log_lines.append(Text(line, style="cyan"))
+
+        compact = self.compact if self.compact is not None else self._is_background_hook()
+        max_body = max(1, self.max_lines - len(header_lines))
+        if not log_lines:
+            log_lines = []
+
+        lines = header_lines + log_lines[-max_body:]
+
+        content = Group(*lines) if lines else Text("")
+
+        title = self._title()
+        border_style = "grey53" if is_pending else "cyan"
+        height = 2 if is_pending else None
         return Panel(
             content,
-            title=f"[{title_style}]Worker {self.worker_num}",
+            title=title,
             border_style=border_style,
-            box=box.ROUNDED,
-            height=20,
+            box=box.HORIZONTALS,
+            padding=(0, 1),
+            height=height,
         )
 
-    def add_log(self, message: str, style: str = "white"):
-        """Add a log message to this worker's recent logs."""
-        self.recent_logs.append((message, style))
+    def _title(self) -> str:
+        process_type = getattr(self.process, 'process_type', 'process')
+        worker_type = getattr(self.process, 'worker_type', '')
+        pid = getattr(self.process, 'pid', None)
+        label = process_type
+        if process_type == 'worker' and worker_type:
+            label, worker_suffix = self._worker_label(worker_type)
+        elif process_type == 'hook':
+            try:
+                cmd = getattr(self.process, 'cmd', [])
+                hook_path = Path(cmd[1]) if len(cmd) > 1 else None
+                hook_name = hook_path.name if hook_path else 'hook'
+                plugin_name = hook_path.parent.name if hook_path and hook_path.parent.name else 'hook'
+            except Exception:
+                hook_name = 'hook'
+                plugin_name = 'hook'
+            label = f"{plugin_name}/{hook_name}"
+            worker_suffix = ''
+        else:
+            worker_suffix = ''
+
+        url = self._extract_url()
+        url_suffix = f" url={self._abbrev_url(url)}" if url else ""
+        time_suffix = self._elapsed_suffix()
+        title_style = "grey53" if self._is_pending() else "bold white"
+        if pid:
+            return f"[{title_style}]{label}[/{title_style}] [grey53]pid={pid}{worker_suffix}{url_suffix}{time_suffix}[/grey53]"
+        return f"[{title_style}]{label}[/{title_style}]{f' [grey53]{worker_suffix.strip()} {url_suffix.strip()}{time_suffix}[/grey53]' if (worker_suffix or url_suffix or time_suffix) else ''}".rstrip()
+
+    def _is_background_hook(self) -> bool:
+        if getattr(self.process, 'process_type', '') != 'hook':
+            return False
+        try:
+            cmd = getattr(self.process, 'cmd', [])
+            hook_path = Path(cmd[1]) if len(cmd) > 1 else None
+            hook_name = hook_path.name if hook_path else ''
+            return '.bg.' in hook_name
+        except Exception:
+            return False
+
+    def _is_pending(self) -> bool:
+        status = getattr(self.process, 'status', '')
+        if status in ('queued', 'pending', 'backoff'):
+            return True
+        if getattr(self.process, 'process_type', '') == 'hook' and not getattr(self.process, 'pid', None):
+            return True
+        return False
+
+    def _worker_label(self, worker_type: str) -> tuple[str, str]:
+        cmd = getattr(self.process, 'cmd', []) or []
+        if worker_type == 'crawl':
+            crawl_id = self._extract_arg(cmd, '--crawl-id')
+            suffix = ''
+            if crawl_id:
+                suffix = f" id={str(crawl_id)[-8:]}"
+                try:
+                    from archivebox.crawls.models import Crawl
+                    crawl = Crawl.objects.filter(id=crawl_id).first()
+                    if crawl:
+                        urls = crawl.get_urls_list()
+                        if urls:
+                            url_list = self._abbrev_urls(urls)
+                            suffix += f" urls={url_list}"
+                except Exception:
+                    pass
+            return 'crawl', suffix
+        if worker_type == 'snapshot':
+            snapshot_id = self._extract_arg(cmd, '--snapshot-id')
+            suffix = ''
+            if snapshot_id:
+                suffix = f" id={str(snapshot_id)[-8:]}"
+                try:
+                    from archivebox.core.models import Snapshot
+                    snap = Snapshot.objects.filter(id=snapshot_id).first()
+                    if snap and snap.url:
+                        suffix += f" url={self._abbrev_url(snap.url, max_len=48)}"
+                except Exception:
+                    pass
+            return 'snapshot', suffix
+        return f"worker:{worker_type}", ''
+
+    @staticmethod
+    def _extract_arg(cmd: list[str], key: str) -> str | None:
+        for i, part in enumerate(cmd):
+            if part.startswith(f'{key}='):
+                return part.split('=', 1)[1]
+            if part == key and i + 1 < len(cmd):
+                return cmd[i + 1]
+        return None
+
+    def _abbrev_urls(self, urls: list[str], max_len: int = 48) -> str:
+        if not urls:
+            return ''
+        if len(urls) == 1:
+            return self._abbrev_url(urls[0], max_len=max_len)
+        first = self._abbrev_url(urls[0], max_len=max_len)
+        return f"{first},+{len(urls) - 1}"
+
+    def _extract_url(self) -> str:
+        url = getattr(self.process, 'url', None)
+        if url:
+            return str(url)
+        cmd = getattr(self.process, 'cmd', []) or []
+        for i, part in enumerate(cmd):
+            if part.startswith('--url='):
+                return part.split('=', 1)[1].strip()
+            if part == '--url' and i + 1 < len(cmd):
+                return str(cmd[i + 1]).strip()
+        return ''
+
+    def _abbrev_url(self, url: str, max_len: int = 48) -> str:
+        if not url:
+            return ''
+        if len(url) <= max_len:
+            return url
+        return f"{url[:max_len - 3]}..."
+
+    def _chrome_launch_line(self, stderr_lines: list[str], stdout_lines: list[str]) -> str:
+        try:
+            cmd = getattr(self.process, 'cmd', [])
+            hook_path = Path(cmd[1]) if len(cmd) > 1 else None
+            hook_name = hook_path.name if hook_path else ''
+            if 'chrome_launch' not in hook_name:
+                return ''
+
+            pid = ''
+            ws = ''
+            for line in stderr_lines + stdout_lines:
+                if not ws and 'CDP URL:' in line:
+                    ws = line.split('CDP URL:', 1)[1].strip()
+                if not pid and 'PID:' in line:
+                    pid = line.split('PID:', 1)[1].strip()
+
+            if pid and ws:
+                return f"Chrome pid={pid} {ws}"
+            if ws:
+                return f"Chrome {ws}"
+            if pid:
+                return f"Chrome pid={pid}"
+            try:
+                from archivebox import DATA_DIR
+                base = Path(DATA_DIR)
+                pwd = getattr(self.process, 'pwd', None)
+                if pwd:
+                    chrome_dir = Path(pwd)
+                    if not chrome_dir.is_absolute():
+                        chrome_dir = (base / chrome_dir).resolve()
+                    cdp_file = chrome_dir / 'cdp_url.txt'
+                    pid_file = chrome_dir / 'chrome.pid'
+                    if cdp_file.exists():
+                        ws = cdp_file.read_text().strip()
+                    if pid_file.exists():
+                        pid = pid_file.read_text().strip()
+                    if pid and ws:
+                        return f"Chrome pid={pid} {ws}"
+                    if ws:
+                        return f"Chrome {ws}"
+                    if pid:
+                        return f"Chrome pid={pid}"
+            except Exception:
+                pass
+        except Exception:
+            return ''
+        return ''
+
+    def _elapsed_suffix(self) -> str:
+        started_at = getattr(self.process, 'started_at', None)
+        timeout = getattr(self.process, 'timeout', None)
+        if not started_at or not timeout:
+            return ''
+        try:
+            now = datetime.now(timezone.utc) if started_at.tzinfo else datetime.now()
+            elapsed = int((now - started_at).total_seconds())
+            elapsed = max(elapsed, 0)
+            return f" [{elapsed}/{int(timeout)}s]"
+        except Exception:
+            return ''
+
+    def _output_line(self) -> str:
+        pwd = getattr(self.process, 'pwd', None)
+        if not pwd:
+            return ''
+        try:
+            from archivebox import DATA_DIR
+            rel = Path(pwd)
+            base = Path(DATA_DIR)
+            if rel.is_absolute():
+                try:
+                    rel = rel.relative_to(base)
+                except Exception:
+                    pass
+            rel_str = f"./{rel}" if not str(rel).startswith("./") else str(rel)
+            return f"{rel_str}"
+        except Exception:
+            return f"{pwd}"
 
 
-class CrawlWorkerLogPanel:
-    """Display CrawlWorker logs by tailing stdout/stderr from Process."""
+class WorkerLogPanel:
+    """Display worker logs by tailing stdout/stderr from Process."""
 
-    def __init__(self, max_lines: int = 8):
+    def __init__(self, title: str, empty_message: str, running_message: str, max_lines: int = 8):
+        self.title = title
+        self.empty_message = empty_message
+        self.running_message = running_message
         self.log_lines: deque = deque(maxlen=max_lines * 2)  # Allow more buffer
         self.max_lines = max_lines
         self.last_stdout_pos = 0  # Track file position for efficient tailing
         self.last_stderr_pos = 0
+        self.last_process_running = False
 
     def update_from_process(self, process: Any):
         """Update logs by tailing the Process stdout/stderr files."""
-        from pathlib import Path
-
         if not process:
+            self.last_process_running = False
             return
 
-        # Read new stdout lines since last read
+        # Use Process tail helpers for consistency
         try:
-            stdout_path = Path(process.stdout)
-            if stdout_path.exists():
-                with open(stdout_path, 'r') as f:
-                    # Seek to last read position
-                    f.seek(self.last_stdout_pos)
-                    new_lines = f.readlines()
-
-                    # Update position
-                    self.last_stdout_pos = f.tell()
-
-                    # Add new lines (up to max_lines to avoid overflow)
-                    for line in new_lines[-self.max_lines:]:
-                        line = line.rstrip('\n')
-                        if line and not line.startswith('['):  # Skip Rich markup lines
-                            self.log_lines.append(('stdout', line))
+            self.last_process_running = bool(getattr(process, 'is_running', False))
+            stdout_lines = list(process.tail_stdout(lines=self.max_lines, follow=False))
+            stderr_lines = list(process.tail_stderr(lines=self.max_lines, follow=False))
         except Exception:
-            pass
+            return
 
-        # Read new stderr lines since last read
-        try:
-            stderr_path = Path(process.stderr)
-            if stderr_path.exists():
-                with open(stderr_path, 'r') as f:
-                    f.seek(self.last_stderr_pos)
-                    new_lines = f.readlines()
-
-                    self.last_stderr_pos = f.tell()
-
-                    for line in new_lines[-self.max_lines:]:
-                        line = line.rstrip('\n')
-                        if line and not line.startswith('['):  # Skip Rich markup lines
-                            self.log_lines.append(('stderr', line))
-        except Exception:
-            pass
+        self.log_lines.clear()
+
+        # NOTE: not true interleaved ordering — stdout lines are shown first, then stderr
+        for line in stdout_lines:
+            if line:
+                self.log_lines.append(('stdout', line))
+        for line in stderr_lines:
+            if line:
+                self.log_lines.append(('stderr', line))
 
     def __rich__(self) -> Panel:
         if not self.log_lines:
-            content = Text("No CrawlWorker logs yet", style="grey53", justify="center")
+            message = self.running_message if self.last_process_running else self.empty_message
+            content = Text(message, style="grey53", justify="center")
         else:
             # Get the last max_lines for display
             display_lines = list(self.log_lines)[-self.max_lines:]
@@ -236,9 +396,9 @@ class CrawlWorkerLogPanel:
 
         return Panel(
             content,
-            title="[bold cyan]CrawlWorker Logs (stdout/stderr)",
+            title=f"[bold cyan]{self.title}",
             border_style="cyan",
-            box=box.ROUNDED,
+            box=box.HORIZONTALS,
         )
 
 
@@ -270,10 +430,71 @@ class OrchestratorLogPanel:
             content,
             title="[bold white]Orchestrator / Daphne Logs",
             border_style="white",
-            box=box.ROUNDED,
+            box=box.HORIZONTALS,
         )
 
 
+class CrawlQueueTreePanel:
+    """Display crawl queue with snapshots + hook summary in a tree view."""
+
+    def __init__(self, max_crawls: int = 8, max_snapshots: int = 16):
+        self.crawls: list[dict[str, Any]] = []
+        self.max_crawls = max_crawls
+        self.max_snapshots = max_snapshots
+
+    def update_crawls(self, crawls: list[dict[str, Any]]) -> None:
+        """Update crawl tree data."""
+        self.crawls = crawls[:self.max_crawls]
+
+    def __rich__(self) -> Panel:
+        if not self.crawls:
+            content = Text("No active crawls", style="grey53", justify="center")
+        else:
+            trees = []
+            for crawl in self.crawls:
+                crawl_status = crawl.get('status', '')
+                crawl_label = crawl.get('label', '')
+                crawl_id = crawl.get('id', '')[:8]
+                crawl_text = Text(f"{self._status_icon(crawl_status)} {crawl_id} {crawl_label}", style="white")
+                crawl_tree = Tree(crawl_text, guide_style="grey53")
+
+                snapshots = crawl.get('snapshots', [])[:self.max_snapshots]
+                for snap in snapshots:
+                    snap_status = snap.get('status', '')
+                    snap_label = snap.get('label', '')
+                    snap_text = Text(f"{self._status_icon(snap_status)} {snap_label}", style="white")
+                    snap_node = crawl_tree.add(snap_text)
+
+                    hooks = snap.get('hooks', {})
+                    if hooks:
+                        completed = hooks.get('completed', 0)
+                        running = hooks.get('running', 0)
+                        pending = hooks.get('pending', 0)
+                        summary = f"✅ {completed} | ▶️  {running} | ⌛️ {pending}"
+                        snap_node.add(Text(summary, style="grey53"))
+                trees.append(crawl_tree)
+            content = Group(*trees)
+
+        return Panel(
+            content,
+            title="[bold white]Crawl Queue",
+            border_style="white",
+            box=box.HORIZONTALS,
+        )
+
+    @staticmethod
+    def _status_icon(status: str) -> str:
+        if status in ('queued', 'pending'):
+            return '⏳'
+        if status in ('started', 'running'):
+            return '▶'
+        if status in ('sealed', 'done', 'completed'):
+            return '✅'
+        if status in ('failed', 'error'):
+            return '✖'
+        return '•'
+
+
 class ArchiveBoxProgressLayout:
     """
     Main layout manager for ArchiveBox orchestrator progress display.
@@ -281,15 +502,8 @@ class ArchiveBoxProgressLayout:
     Layout structure:
         ┌─────────────────────────────────────────────────────────────┐
         │              Crawl Queue (full width)                       │
-        ├───────────────┬───────────────┬───────────────┬─────────────┤
-        │  Snapshot     │  Snapshot     │  Snapshot     │  Snapshot   │
-        │  Worker 1     │  Worker 2     │  Worker 3     │  Worker 4   │
-        │               │               │               │             │
-        │  Progress +   │  Progress +   │  Progress +   │  Progress + │
-        │  Stats +      │  Stats +      │  Stats +      │  Stats +    │
-        │  Logs         │  Logs         │  Logs         │  Logs       │
-        ├───────────────┴───────────────┴───────────────┴─────────────┤
-        │              CrawlWorker Logs (stdout/stderr)               │
+        ├─────────────────────────────────────────────────────────────┤
+        │           Running Process Logs (dynamic panels)             │
         ├─────────────────────────────────────────────────────────────┤
         │           Orchestrator / Daphne Logs                        │
         └─────────────────────────────────────────────────────────────┘
@@ -303,51 +517,33 @@ class ArchiveBoxProgressLayout:
         self.crawl_queue = CrawlQueuePanel()
         self.crawl_queue.crawl_id = crawl_id
 
-        # Create 4 worker panels
-        self.worker_panels = [SnapshotWorkerPanel(i + 1) for i in range(MAX_WORKER_COLUMNS)]
-
-        self.crawl_worker_log = CrawlWorkerLogPanel(max_lines=8)
+        self.process_panels: List[ProcessLogPanel] = []
         self.orchestrator_log = OrchestratorLogPanel(max_events=8)
+        self.crawl_queue_tree = CrawlQueueTreePanel(max_crawls=8, max_snapshots=16)
 
         # Create layout
         self.layout = self._make_layout()
 
-        # Track snapshot ID to worker panel mapping
-        self.snapshot_to_worker: Dict[str, int] = {}  # snapshot_id -> worker_panel_index
-
     def _make_layout(self) -> Layout:
         """Define the layout structure."""
         layout = Layout(name="root")
 
-        # Top-level split: crawl_queue, workers, logs
+        # Top-level split: crawl_queue, workers, bottom
         layout.split(
             Layout(name="crawl_queue", size=3),
-            Layout(name="workers", ratio=1),
-            Layout(name="logs", size=20),
-        )
-
-        # Split workers into 4 columns
-        layout["workers"].split_row(
-            Layout(name="worker1"),
-            Layout(name="worker2"),
-            Layout(name="worker3"),
-            Layout(name="worker4"),
-        )
-
-        # Split logs into crawl_worker_logs and orchestrator_logs
-        layout["logs"].split(
-            Layout(name="crawl_worker_logs", size=10),
-            Layout(name="orchestrator_logs", size=10),
+            Layout(name="processes", ratio=1),
+            Layout(name="bottom", size=12),
         )
 
         # Assign components to layout sections
         layout["crawl_queue"].update(self.crawl_queue)
-        layout["worker1"].update(self.worker_panels[0])
-        layout["worker2"].update(self.worker_panels[1])
-        layout["worker3"].update(self.worker_panels[2])
-        layout["worker4"].update(self.worker_panels[3])
-        layout["crawl_worker_logs"].update(self.crawl_worker_log)
+        layout["processes"].update(Columns([]))
+        layout["bottom"].split_row(
+            Layout(name="orchestrator_logs", ratio=2),
+            Layout(name="crawl_tree", ratio=1),
+        )
         layout["orchestrator_logs"].update(self.orchestrator_log)
+        layout["crawl_tree"].update(self.crawl_queue_tree)
 
         return layout
 
@@ -356,82 +552,53 @@ class ArchiveBoxProgressLayout:
         status: str,
         crawl_queue_count: int = 0,
         crawl_workers_count: int = 0,
+        binary_queue_count: int = 0,
+        binary_workers_count: int = 0,
         max_crawl_workers: int = 8,
     ):
         """Update orchestrator status in the crawl queue panel."""
         self.crawl_queue.orchestrator_status = status
         self.crawl_queue.crawl_queue_count = crawl_queue_count
         self.crawl_queue.crawl_workers_count = crawl_workers_count
+        self.crawl_queue.binary_queue_count = binary_queue_count
+        self.crawl_queue.binary_workers_count = binary_workers_count
         self.crawl_queue.max_crawl_workers = max_crawl_workers
 
-    def update_snapshot_worker(
-        self,
-        snapshot_id: str,
-        url: str,
-        total: int,
-        completed: int,
-        current_plugin: str = "",
-    ):
-        """Update or assign a snapshot to a worker panel."""
-        # Find or assign worker panel for this snapshot
-        if snapshot_id not in self.snapshot_to_worker:
-            # Find first idle worker panel
-            worker_idx = None
-            for idx, panel in enumerate(self.worker_panels):
-                if panel.status == "idle":
-                    worker_idx = idx
-                    break
-
-            # If no idle worker, use round-robin (shouldn't happen often)
-            if worker_idx is None:
-                worker_idx = len(self.snapshot_to_worker) % MAX_WORKER_COLUMNS
-
-            self.snapshot_to_worker[snapshot_id] = worker_idx
-
-        # Get assigned worker panel
-        worker_idx = self.snapshot_to_worker[snapshot_id]
-        panel = self.worker_panels[worker_idx]
-
-        # Update panel
-        panel.snapshot_id = snapshot_id
-        panel.snapshot_url = url
-        panel.total_hooks = total
-        panel.completed_hooks = completed
-        panel.current_plugin = current_plugin
-        panel.status = "working" if completed < total else "completed"
-
-    def remove_snapshot_worker(self, snapshot_id: str):
-        """Mark a snapshot worker as idle after completion."""
-        if snapshot_id in self.snapshot_to_worker:
-            worker_idx = self.snapshot_to_worker[snapshot_id]
-            panel = self.worker_panels[worker_idx]
-
-            # Mark as idle
-            panel.status = "idle"
-            panel.snapshot_id = None
-            panel.snapshot_url = None
-            panel.total_hooks = 0
-            panel.completed_hooks = 0
-            panel.current_plugin = None
-            panel.recent_logs.clear()
-
-            # Remove mapping
-            del self.snapshot_to_worker[snapshot_id]
-
-    def log_to_worker(self, snapshot_id: str, message: str, style: str = "white"):
-        """Add a log message to a specific worker's panel."""
-        if snapshot_id in self.snapshot_to_worker:
-            worker_idx = self.snapshot_to_worker[snapshot_id]
-            self.worker_panels[worker_idx].add_log(message, style)
-
-    def log_event(self, message: str, style: str = "white"):
+    def update_process_panels(self, processes: List[Any], pending: Optional[List[Any]] = None) -> None:
+        """Update process panels to show all running and pending processes."""
+        panels = []
+        all_processes = list(processes) + list(pending or [])
+        for process in all_processes:
+            is_hook = getattr(process, 'process_type', '') == 'hook'
+            is_bg = False
+            if is_hook:
+                try:
+                    cmd = getattr(process, 'cmd', [])
+                    hook_path = Path(cmd[1]) if len(cmd) > 1 else None
+                    hook_name = hook_path.name if hook_path else ''
+                    is_bg = '.bg.' in hook_name
+                except Exception:
+                    is_bg = False
+            is_pending = getattr(process, 'status', '') in ('queued', 'pending', 'backoff') or (is_hook and not getattr(process, 'pid', None))
+            max_lines = 2 if is_pending else (4 if is_bg else 7)
+            panels.append(ProcessLogPanel(process, max_lines=max_lines, compact=is_bg))
+        if not panels:
+            self.layout["processes"].size = 0
+            self.layout["processes"].update(Text(""))
+            return
+
+        self.layout["processes"].size = None
+        self.layout["processes"].ratio = 1
+        self.layout["processes"].update(Columns(panels, equal=True, expand=True))
+
+    def update_crawl_tree(self, crawls: list[dict[str, Any]]) -> None:
+        """Update the crawl queue tree panel."""
+        self.crawl_queue_tree.update_crawls(crawls)
+
+    def log_event(self, message: str, style: str = "white") -> None:
         """Add an event to the orchestrator log."""
         self.orchestrator_log.add_event(message, style)
 
-    def update_crawl_worker_logs(self, process: Any):
-        """Update CrawlWorker logs by tailing the Process stdout/stderr files."""
-        self.crawl_worker_log.update_from_process(process)
-
     def get_layout(self) -> Layout:
         """Get the Rich Layout object for rendering."""
         return self.layout

+ 1 - 0
archivebox/plugins/accessibility/templates/icon.html

@@ -0,0 +1 @@
+<span class="abx-output-icon abx-output-icon--accessibility" title="Accessibility"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="4.5" r="2" fill="currentColor" stroke="none"/><path d="M4 7.5h16"/><path d="M12 7.5v12"/><path d="M7 20l5-6 5 6"/></svg></span>

+ 0 - 1
archivebox/plugins/accessibility/tests/__init__.py

@@ -1 +0,0 @@
-"""Tests for the accessibility plugin."""

+ 1 - 1
archivebox/plugins/apt/on_Binary__13_apt_install.py

@@ -10,7 +10,7 @@ import json
 import sys
 
 import rich_click as click
-from abx_pkg import Binary, AptProvider
+from abx_pkg import Binary, AptProvider, BinProviderOverrides
 
 # Fix pydantic forward reference issue
 AptProvider.model_rebuild()

+ 0 - 1
archivebox/plugins/apt/tests/__init__.py

@@ -1 +0,0 @@
-"""Tests for the apt binary provider plugin."""

+ 2 - 2
archivebox/plugins/apt/tests/test_apt_provider.py

@@ -21,7 +21,7 @@ from django.test import TestCase
 
 # Get the path to the apt provider hook
 PLUGIN_DIR = Path(__file__).parent.parent
-INSTALL_HOOK = PLUGIN_DIR / 'on_Binary__install_using_apt_provider.py'
+INSTALL_HOOK = next(PLUGIN_DIR.glob('on_Binary__*_apt_install.py'), None)
 
 
 def apt_available() -> bool:
@@ -48,7 +48,7 @@ class TestAptProviderHook(TestCase):
 
     def test_hook_script_exists(self):
         """Hook script should exist."""
-        self.assertTrue(INSTALL_HOOK.exists(), f"Hook not found: {INSTALL_HOOK}")
+        self.assertTrue(INSTALL_HOOK and INSTALL_HOOK.exists(), f"Hook not found: {INSTALL_HOOK}")
 
     def test_hook_skips_when_apt_not_allowed(self):
         """Hook should skip when apt not in allowed binproviders."""

+ 14 - 0
archivebox/plugins/archivedotorg/on_Snapshot__13_archivedotorg.py

@@ -47,6 +47,9 @@ def submit_to_archivedotorg(url: str) -> tuple[bool, str | None, str]:
 
     Returns: (success, output_path, error_message)
     """
+    def log(message: str) -> None:
+        print(f'[archivedotorg] {message}', file=sys.stderr)
+
     try:
         import requests
     except ImportError:
@@ -56,6 +59,8 @@ def submit_to_archivedotorg(url: str) -> tuple[bool, str | None, str]:
     user_agent = get_env('USER_AGENT', 'Mozilla/5.0 (compatible; ArchiveBox/1.0)')
 
     submit_url = f'https://web.archive.org/save/{url}'
+    log(f'Submitting to Wayback Machine (timeout={timeout}s)')
+    log(f'GET {submit_url}')
 
     try:
         response = requests.get(
@@ -64,31 +69,40 @@ def submit_to_archivedotorg(url: str) -> tuple[bool, str | None, str]:
             headers={'User-Agent': user_agent},
             allow_redirects=True,
         )
+        log(f'HTTP {response.status_code} final_url={response.url}')
 
         # Check for successful archive
         content_location = response.headers.get('Content-Location', '')
         x_archive_orig_url = response.headers.get('X-Archive-Orig-Url', '')
+        if content_location:
+            log(f'Content-Location: {content_location}')
+        if x_archive_orig_url:
+            log(f'X-Archive-Orig-Url: {x_archive_orig_url}')
 
         # Build archive URL
         if content_location:
             archive_url = f'https://web.archive.org{content_location}'
             Path(OUTPUT_FILE).write_text(archive_url, encoding='utf-8')
+            log(f'Saved archive URL -> {archive_url}')
             return True, OUTPUT_FILE, ''
         elif 'web.archive.org' in response.url:
             # We were redirected to an archive page
             Path(OUTPUT_FILE).write_text(response.url, encoding='utf-8')
+            log(f'Redirected to archive page -> {response.url}')
             return True, OUTPUT_FILE, ''
         else:
             # Check for errors in response
             if 'RobotAccessControlException' in response.text:
                 # Blocked by robots.txt - save submit URL for manual retry
                 Path(OUTPUT_FILE).write_text(submit_url, encoding='utf-8')
+                log('Blocked by robots.txt, saved submit URL for manual retry')
                 return True, OUTPUT_FILE, ''  # Consider this a soft success
             elif response.status_code >= 400:
                 return False, None, f'HTTP {response.status_code}'
             else:
                 # Save submit URL anyway
                 Path(OUTPUT_FILE).write_text(submit_url, encoding='utf-8')
+                log('No archive URL returned, saved submit URL for manual retry')
                 return True, OUTPUT_FILE, ''
 
     except requests.Timeout:

+ 1 - 1
archivebox/plugins/archivedotorg/templates/icon.html

@@ -1 +1 @@
-🏛️
+<span class="abx-output-icon abx-output-icon--archivedotorg" title="Archive.org"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><path d="M3 7h18"/><rect x="3" y="7" width="18" height="13" rx="2"/><path d="M9 12h6"/></svg></span>

+ 0 - 1
archivebox/plugins/chrome/binaries.jsonl

@@ -1 +0,0 @@
-{"type": "Binary", "name": "chrome", "binproviders": "npm,env,brew,apt", "overrides": {"npm": {"packages": ["@puppeteer/browsers"]}}}

+ 8 - 13
archivebox/plugins/chrome/chrome_utils.js

@@ -1253,7 +1253,7 @@ function getExtensionTargets(browser) {
 }
 
 /**
- * Find Chromium/Chrome binary path.
+ * Find Chromium binary path.
  * Checks CHROME_BINARY env var first, then falls back to system locations.
  *
  * @returns {string|null} - Absolute path to browser binary or null if not found
@@ -1276,7 +1276,9 @@ function findChromium() {
     const chromeBinary = getEnv('CHROME_BINARY');
     if (chromeBinary) {
         const absPath = path.resolve(chromeBinary);
-        if (validateBinary(absPath)) {
+        if (absPath.includes('Google Chrome') || absPath.includes('google-chrome')) {
+            console.error('[!] Warning: CHROME_BINARY points to Chrome. Chromium is required for extension support.');
+        } else if (validateBinary(absPath)) {
             return absPath;
         }
         console.error(`[!] Warning: CHROME_BINARY="${chromeBinary}" is not valid`);
@@ -1309,7 +1311,7 @@ function findChromium() {
         return null;
     };
 
-    // 3. Search fallback locations (Chromium first, then Chrome)
+    // 3. Search fallback locations (Chromium only)
     const fallbackLocations = [
         // System Chromium
         '/Applications/Chromium.app/Contents/MacOS/Chromium',
@@ -1318,10 +1320,6 @@ function findChromium() {
         // Puppeteer cache
         path.join(process.env.HOME || '', '.cache/puppeteer/chromium'),
         path.join(process.env.HOME || '', '.cache/puppeteer'),
-        // Chrome (fallback - extensions may not work in 137+)
-        '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
-        '/usr/bin/google-chrome',
-        '/usr/bin/google-chrome-stable',
     ];
 
     for (const loc of fallbackLocations) {
@@ -1332,9 +1330,6 @@ function findChromium() {
                 return binary;
             }
         } else if (validateBinary(loc)) {
-            if (loc.includes('Google Chrome') || loc.includes('google-chrome')) {
-                console.error('[!] Warning: Using Chrome instead of Chromium. Extension loading may not work in Chrome 137+');
-            }
             return loc;
         }
     }
@@ -1699,10 +1694,10 @@ module.exports = {
     // Chrome launching
     launchChromium,
     killChrome,
-    // Chrome/Chromium install
+    // Chromium install
     installChromium,
     installPuppeteerCore,
-    // Chrome/Chromium binary finding
+    // Chromium binary finding
     findChromium,
     // Extension utilities
     getExtensionId,
@@ -1744,7 +1739,7 @@ if (require.main === module) {
         console.log('Usage: chrome_utils.js <command> [args...]');
         console.log('');
         console.log('Commands:');
-        console.log('  findChromium              Find Chrome/Chromium binary');
+        console.log('  findChromium              Find Chromium binary');
         console.log('  installChromium           Install Chromium via @puppeteer/browsers');
         console.log('  installPuppeteerCore      Install puppeteer-core npm package');
         console.log('  launchChromium            Launch Chrome with CDP debugging');

+ 2 - 2
archivebox/plugins/chrome/config.json

@@ -7,13 +7,13 @@
       "type": "boolean",
       "default": true,
       "x-aliases": ["USE_CHROME"],
-      "description": "Enable Chrome/Chromium browser integration for archiving"
+      "description": "Enable Chromium browser integration for archiving"
     },
     "CHROME_BINARY": {
       "type": "string",
       "default": "chromium",
       "x-aliases": ["CHROMIUM_BINARY", "GOOGLE_CHROME_BINARY"],
-      "description": "Path to Chrome/Chromium binary"
+      "description": "Path to Chromium binary"
     },
     "CHROME_NODE_BINARY": {
       "type": "string",

+ 0 - 265
archivebox/plugins/chrome/on_Crawl__01_chrome_install.py

@@ -1,265 +0,0 @@
-#!/usr/bin/env python3
-"""
-Install hook for Chrome/Chromium and puppeteer-core.
-
-Runs at crawl start to install/find Chromium and puppeteer-core.
-Also validates config and computes derived values.
-
-Outputs:
-    - JSONL for Binary and Machine config updates
-    - COMPUTED:KEY=VALUE lines that hooks.py parses and adds to env
-
-Respects CHROME_BINARY env var for custom binary paths.
-Uses `npx @puppeteer/browsers install chromium@latest` and parses output.
-
-NOTE: We use Chromium instead of Chrome because Chrome 137+ removed support for
---load-extension and --disable-extensions-except flags, which are needed for
-loading unpacked extensions in headless mode.
-"""
-
-import os
-import sys
-import json
-import subprocess
-from pathlib import Path
-
-
-def get_env(name: str, default: str = '') -> str:
-    return os.environ.get(name, default).strip()
-
-
-def get_env_bool(name: str, default: bool = False) -> bool:
-    val = get_env(name, '').lower()
-    if val in ('true', '1', 'yes', 'on'):
-        return True
-    if val in ('false', '0', 'no', 'off'):
-        return False
-    return default
-
-
-def detect_docker() -> bool:
-    """Detect if running inside Docker container."""
-    return (
-        os.path.exists('/.dockerenv') or
-        os.environ.get('IN_DOCKER', '').lower() in ('true', '1', 'yes') or
-        os.path.exists('/run/.containerenv')
-    )
-
-
-def get_chrome_version(binary_path: str) -> str | None:
-    """Get Chrome/Chromium version string."""
-    try:
-        result = subprocess.run(
-            [binary_path, '--version'],
-            capture_output=True,
-            text=True,
-            timeout=5
-        )
-        if result.returncode == 0:
-            return result.stdout.strip()
-    except Exception:
-        pass
-    return None
-
-
-def install_puppeteer_core() -> bool:
-    """Install puppeteer-core to NODE_MODULES_DIR if not present."""
-    node_modules_dir = os.environ.get('NODE_MODULES_DIR', '').strip()
-    if not node_modules_dir:
-        # No isolated node_modules, skip (will use global)
-        return True
-
-    node_modules_path = Path(node_modules_dir)
-    if (node_modules_path / 'puppeteer-core').exists():
-        return True
-
-    # Get npm prefix from NODE_MODULES_DIR (parent of node_modules)
-    npm_prefix = node_modules_path.parent
-
-    try:
-        print(f"[*] Installing puppeteer-core to {npm_prefix}...", file=sys.stderr)
-        result = subprocess.run(
-            ['npm', 'install', '--prefix', str(npm_prefix), 'puppeteer-core', '@puppeteer/browsers'],
-            capture_output=True,
-            text=True,
-            timeout=60
-        )
-        if result.returncode == 0:
-            print(f"[+] puppeteer-core installed", file=sys.stderr)
-            return True
-        else:
-            print(f"[!] Failed to install puppeteer-core: {result.stderr}", file=sys.stderr)
-            return False
-    except Exception as e:
-        print(f"[!] Failed to install puppeteer-core: {e}", file=sys.stderr)
-        return False
-
-
-def install_chromium() -> dict | None:
-    """Install Chromium using @puppeteer/browsers and parse output for binary path.
-
-    Output format: "chromium@<version> <path_to_binary>"
-    e.g.: "chromium@1563294 /Users/x/.cache/puppeteer/chromium/.../Chromium"
-
-    Note: npx is fast when chromium is already cached - it returns the path without re-downloading.
-    """
-    try:
-        print("[*] Installing Chromium via @puppeteer/browsers...", file=sys.stderr)
-
-        # Use --path to install to puppeteer's standard cache location
-        cache_path = os.path.expanduser('~/.cache/puppeteer')
-
-        result = subprocess.run(
-            ['npx', '@puppeteer/browsers', 'install', 'chromium@1563297', f'--path={cache_path}'],
-            capture_output=True,
-            text=True,
-            stdin=subprocess.DEVNULL,
-            timeout=300
-        )
-
-        if result.returncode != 0:
-            print(f"[!] Failed to install Chromium: {result.stderr}", file=sys.stderr)
-            return None
-
-        # Parse output: "chromium@1563294 /path/to/Chromium"
-        output = result.stdout.strip()
-        parts = output.split(' ', 1)
-        if len(parts) != 2:
-            print(f"[!] Failed to parse install output: {output}", file=sys.stderr)
-            return None
-
-        version_str = parts[0]  # "chromium@1563294"
-        binary_path = parts[1].strip()
-
-        if not binary_path or not os.path.exists(binary_path):
-            print(f"[!] Binary not found at: {binary_path}", file=sys.stderr)
-            return None
-
-        # Extract version number
-        version = version_str.split('@')[1] if '@' in version_str else None
-
-        print(f"[+] Chromium installed: {binary_path}", file=sys.stderr)
-
-        return {
-            'name': 'chromium',
-            'abspath': binary_path,
-            'version': version,
-            'binprovider': 'puppeteer',
-        }
-
-    except subprocess.TimeoutExpired:
-        print("[!] Chromium install timed out", file=sys.stderr)
-    except FileNotFoundError:
-        print("[!] npx not found - is Node.js installed?", file=sys.stderr)
-    except Exception as e:
-        print(f"[!] Failed to install Chromium: {e}", file=sys.stderr)
-
-    return None
-
-
-def main():
-    warnings = []
-    errors = []
-    computed = {}
-
-    # Install puppeteer-core if NODE_MODULES_DIR is set
-    install_puppeteer_core()
-
-    # Check if Chrome is enabled
-    chrome_enabled = get_env_bool('CHROME_ENABLED', True)
-
-    # Detect Docker and adjust sandbox
-    in_docker = detect_docker()
-    computed['IN_DOCKER'] = str(in_docker).lower()
-
-    chrome_sandbox = get_env_bool('CHROME_SANDBOX', True)
-    if in_docker and chrome_sandbox:
-        warnings.append(
-            "Running in Docker with CHROME_SANDBOX=true. "
-            "Chrome may fail to start. Consider setting CHROME_SANDBOX=false."
-        )
-        # Auto-disable sandbox in Docker unless explicitly set
-        if not get_env('CHROME_SANDBOX'):
-            computed['CHROME_SANDBOX'] = 'false'
-
-    # Check Node.js availability
-    node_binary = get_env('NODE_BINARY', 'node')
-    computed['NODE_BINARY'] = node_binary
-
-    # Check if CHROME_BINARY is already set and valid
-    configured_binary = get_env('CHROME_BINARY', '')
-    if configured_binary and os.path.isfile(configured_binary) and os.access(configured_binary, os.X_OK):
-        version = get_chrome_version(configured_binary)
-        computed['CHROME_BINARY'] = configured_binary
-        computed['CHROME_VERSION'] = version or 'unknown'
-
-        print(json.dumps({
-            'type': 'Binary',
-            'name': 'chromium',
-            'abspath': configured_binary,
-            'version': version,
-            'binprovider': 'env',
-        }))
-
-        # Output computed values
-        for key, value in computed.items():
-            print(f"COMPUTED:{key}={value}")
-        for warning in warnings:
-            print(f"WARNING:{warning}", file=sys.stderr)
-
-        sys.exit(0)
-
-    # Install/find Chromium via puppeteer
-    result = install_chromium()
-
-    if result and result.get('abspath'):
-        computed['CHROME_BINARY'] = result['abspath']
-        computed['CHROME_VERSION'] = result['version'] or 'unknown'
-
-        print(json.dumps({
-            'type': 'Binary',
-            'name': result['name'],
-            'abspath': result['abspath'],
-            'version': result['version'],
-            'binprovider': result['binprovider'],
-        }))
-
-        print(json.dumps({
-            'type': 'Machine',
-            '_method': 'update',
-            'key': 'config/CHROME_BINARY',
-            'value': result['abspath'],
-        }))
-
-        if result['version']:
-            print(json.dumps({
-                'type': 'Machine',
-                '_method': 'update',
-                'key': 'config/CHROMIUM_VERSION',
-                'value': result['version'],
-            }))
-
-        # Output computed values
-        for key, value in computed.items():
-            print(f"COMPUTED:{key}={value}")
-        for warning in warnings:
-            print(f"WARNING:{warning}", file=sys.stderr)
-
-        sys.exit(0)
-    else:
-        errors.append("Chromium binary not found")
-        computed['CHROME_BINARY'] = ''
-
-        # Output computed values and errors
-        for key, value in computed.items():
-            print(f"COMPUTED:{key}={value}")
-        for warning in warnings:
-            print(f"WARNING:{warning}", file=sys.stderr)
-        for error in errors:
-            print(f"ERROR:{error}", file=sys.stderr)
-
-        sys.exit(1)
-
-
-if __name__ == '__main__':
-    main()

+ 34 - 0
archivebox/plugins/chrome/on_Crawl__70_chrome_install.py

@@ -0,0 +1,34 @@
+#!/usr/bin/env python3
+"""
+Emit Chromium Binary dependency for the crawl.
+
+NOTE: We use Chromium instead of Chrome because Chrome 137+ removed support for
+--load-extension and --disable-extensions-except flags, which are needed for
+loading unpacked extensions in headless mode.
+"""
+
+import json
+import os
+import sys
+
+
+def main():
+    # Check if Chrome is enabled
+    chrome_enabled = os.environ.get('CHROME_ENABLED', 'true').lower() not in ('false', '0', 'no', 'off')
+    if not chrome_enabled:
+        sys.exit(0)
+
+    record = {
+        'type': 'Binary',
+        'name': 'chromium',
+        'binproviders': 'puppeteer,env',
+        'overrides': {
+            'puppeteer': ['chromium@latest', '--install-deps'],
+        },
+    }
+    print(json.dumps(record))
+    sys.exit(0)
+
+
+if __name__ == '__main__':
+    main()

+ 3 - 3
archivebox/plugins/chrome/on_Crawl__20_chrome_launch.bg.js → archivebox/plugins/chrome/on_Crawl__90_chrome_launch.bg.js

@@ -3,12 +3,12 @@
  * Launch a shared Chromium browser session for the entire crawl.
  *
  * This runs once per crawl and keeps Chromium alive for all snapshots to share.
- * Each snapshot creates its own tab via on_Snapshot__20_chrome_tab.bg.js.
+ * Each snapshot creates its own tab via on_Snapshot__10_chrome_tab.bg.js.
  *
  * NOTE: We use Chromium instead of Chrome because Chrome 137+ removed support for
  * --load-extension and --disable-extensions-except flags.
  *
- * Usage: on_Crawl__20_chrome_launch.bg.js --crawl-id=<uuid> --source-url=<url>
+ * Usage: on_Crawl__90_chrome_launch.bg.js --crawl-id=<uuid> --source-url=<url>
  * Output: Writes to current directory (executor creates chrome/ dir):
  *   - cdp_url.txt: WebSocket URL for CDP connection
  *   - chrome.pid: Chromium process ID (for cleanup)
@@ -31,7 +31,7 @@ if (process.env.NODE_MODULES_DIR) {
 
 const fs = require('fs');
 const path = require('path');
-const puppeteer = require('puppeteer-core');
+const puppeteer = require('puppeteer');
 const {
     findChromium,
     launchChromium,

+ 55 - 26
archivebox/plugins/chrome/on_Snapshot__20_chrome_tab.bg.js → archivebox/plugins/chrome/on_Snapshot__10_chrome_tab.bg.js

@@ -2,11 +2,11 @@
 /**
  * Create a Chrome tab for this snapshot in the shared crawl Chrome session.
  *
- * If a crawl-level Chrome session exists (from on_Crawl__20_chrome_launch.bg.js),
+ * If a crawl-level Chrome session exists (from on_Crawl__90_chrome_launch.bg.js),
  * this connects to it and creates a new tab. Otherwise, falls back to launching
  * its own Chrome instance.
  *
- * Usage: on_Snapshot__20_chrome_tab.bg.js --url=<url> --snapshot-id=<uuid> --crawl-id=<uuid>
+ * Usage: on_Snapshot__10_chrome_tab.bg.js --url=<url> --snapshot-id=<uuid> --crawl-id=<uuid>
  * Output: Creates chrome/ directory under snapshot output dir with:
  *   - cdp_url.txt: WebSocket URL for CDP connection
  *   - chrome.pid: Chrome process ID (from crawl)
@@ -15,11 +15,14 @@
  *
  * Environment variables:
  *     CRAWL_OUTPUT_DIR: Crawl output directory (to find crawl's Chrome session)
- *     CHROME_BINARY: Path to Chrome/Chromium binary (for fallback)
+ *     CHROME_BINARY: Path to Chromium binary (for fallback)
  *     CHROME_RESOLUTION: Page resolution (default: 1440,2000)
  *     CHROME_USER_AGENT: User agent string (optional)
  *     CHROME_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: true)
  *     CHROME_HEADLESS: Run in headless mode (default: true)
+ *
+ * This is a background hook that stays alive until SIGTERM so the tab
+ * can be closed cleanly at the end of the snapshot run.
  */
 
 const fs = require('fs');
@@ -28,7 +31,7 @@ const { spawn } = require('child_process');
 // Add NODE_MODULES_DIR to module resolution paths if set
 if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
 
-const puppeteer = require('puppeteer-core');
+const puppeteer = require('puppeteer');
 const {
     findChromium,
     getEnv,
@@ -43,6 +46,11 @@ const PLUGIN_NAME = 'chrome_tab';
 const OUTPUT_DIR = '.';  // Hook already runs in chrome/ output directory
 const CHROME_SESSION_DIR = '.';
 
+let finalStatus = 'failed';
+let finalOutput = '';
+let finalError = '';
+let cmdVersion = '';
+let finalized = false;
 
 // Parse command line arguments
 function parseArgs() {
@@ -56,8 +64,31 @@ function parseArgs() {
     return args;
 }
 
+function emitResult(statusOverride) {
+    if (finalized) return;
+    finalized = true;
+
+    const status = statusOverride || finalStatus;
+    const outputStr = status === 'succeeded'
+        ? finalOutput
+        : (finalError || finalOutput || '');
+
+    const result = {
+        type: 'ArchiveResult',
+        status,
+        output_str: outputStr,
+    };
+    if (cmdVersion) {
+        result.cmd_version = cmdVersion;
+    }
+    console.log(JSON.stringify(result));
+}
+
 // Cleanup handler for SIGTERM - close this snapshot's tab
-async function cleanup() {
+async function cleanup(signal) {
+    if (signal) {
+        console.error(`\nReceived ${signal}, closing chrome tab...`);
+    }
     try {
         const cdpFile = path.join(OUTPUT_DIR, 'cdp_url.txt');
         const targetIdFile = path.join(OUTPUT_DIR, 'target_id.txt');
@@ -78,12 +109,13 @@ async function cleanup() {
     } catch (e) {
         // Best effort
     }
-    process.exit(0);
+    emitResult();
+    process.exit(finalStatus === 'succeeded' ? 0 : 1);
 }
 
 // Register signal handlers
-process.on('SIGTERM', cleanup);
-process.on('SIGINT', cleanup);
+process.on('SIGTERM', () => cleanup('SIGTERM'));
+process.on('SIGINT', () => cleanup('SIGINT'));
 
 // Try to find the crawl's Chrome session
 function findCrawlChromeSession(crawlId) {
@@ -272,23 +304,22 @@ async function main() {
     const crawlId = args.crawl_id;
 
     if (!url || !snapshotId) {
-        console.error('Usage: on_Snapshot__20_chrome_tab.bg.js --url=<url> --snapshot-id=<uuid> [--crawl-id=<uuid>]');
+        console.error('Usage: on_Snapshot__10_chrome_tab.bg.js --url=<url> --snapshot-id=<uuid> [--crawl-id=<uuid>]');
         process.exit(1);
     }
 
-    const startTs = new Date();
     let status = 'failed';
-    let output = null;
+    let output = '';
     let error = '';
     let version = '';
 
     try {
         const binary = findChromium();
         if (!binary) {
-            console.error('ERROR: Chrome/Chromium binary not found');
-            console.error('DEPENDENCY_NEEDED=chrome');
+            console.error('ERROR: Chromium binary not found');
+            console.error('DEPENDENCY_NEEDED=chromium');
             console.error('BIN_PROVIDERS=puppeteer,env,playwright,apt,brew');
-            console.error('INSTALL_HINT=npx @puppeteer/browsers install chrome@stable');
+            console.error('INSTALL_HINT=npx @puppeteer/browsers install chromium@latest');
             process.exit(1);
         }
 
@@ -327,24 +358,22 @@ async function main() {
         status = 'failed';
     }
 
-    const endTs = new Date();
-
     if (error) {
         console.error(`ERROR: ${error}`);
     }
 
-    // Output clean JSONL (no RESULT_JSON= prefix)
-    const result = {
-        type: 'ArchiveResult',
-        status,
-        output_str: output || error || '',
-    };
-    if (version) {
-        result.cmd_version = version;
+    finalStatus = status;
+    finalOutput = output || '';
+    finalError = error || '';
+    cmdVersion = version || '';
+
+    if (status !== 'succeeded') {
+        emitResult(status);
+        process.exit(1);
     }
-    console.log(JSON.stringify(result));
 
-    process.exit(status === 'succeeded' ? 0 : 1);
+    console.log('[*] Chrome tab created, waiting for cleanup signal...');
+    await new Promise(() => {}); // Keep alive until SIGTERM
 }
 
 main().catch(e => {

+ 76 - 0
archivebox/plugins/chrome/on_Snapshot__11_chrome_wait.js

@@ -0,0 +1,76 @@
+#!/usr/bin/env node
+/**
+ * Wait for Chrome session files to exist (cdp_url.txt + target_id.txt).
+ *
+ * This is a foreground hook that blocks until the Chrome tab is ready,
+ * so downstream hooks can safely connect to CDP.
+ *
+ * Usage: on_Snapshot__11_chrome_wait.js --url=<url> --snapshot-id=<uuid>
+ */
+
+const fs = require('fs');
+const path = require('path');
+// Add NODE_MODULES_DIR to module resolution paths if set
+if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
+
+const {
+    getEnvInt,
+    waitForChromeSession,
+    readCdpUrl,
+    readTargetId,
+} = require('./chrome_utils.js');
+
+const CHROME_SESSION_DIR = '.';
+
+/**
+ * Parse --key=value CLI arguments into a plain object.
+ * Dashes in key names become underscores; a bare --flag (no '=') maps to true.
+ */
+function parseArgs() {
+    const args = {};
+    process.argv.slice(2).forEach(arg => {
+        if (arg.startsWith('--')) {
+            const [key, ...valueParts] = arg.slice(2).split('=');
+            // Re-join with '=' so values that themselves contain '=' (e.g. URLs
+            // with query strings) are preserved intact.
+            args[key.replace(/-/g, '_')] = valueParts.join('=') || true;
+        }
+    });
+    return args;
+}
+
+/**
+ * Block until the chrome tab hook has written its session files
+ * (cdp_url.txt + target_id.txt), then emit an ArchiveResult JSONL
+ * record on stdout. Human-readable progress goes to stderr so stdout
+ * stays clean JSONL for the hook runner.
+ */
+async function main() {
+    const args = parseArgs();
+    const url = args.url;
+    const snapshotId = args.snapshot_id;
+
+    if (!url || !snapshotId) {
+        console.error('Usage: on_Snapshot__11_chrome_wait.js --url=<url> --snapshot-id=<uuid>');
+        process.exit(1);
+    }
+
+    // Timeout precedence: CHROME_TAB_TIMEOUT > CHROME_TIMEOUT > TIMEOUT > 60s default.
+    const timeoutSeconds = getEnvInt('CHROME_TAB_TIMEOUT', getEnvInt('CHROME_TIMEOUT', getEnvInt('TIMEOUT', 60)));
+    const timeoutMs = timeoutSeconds * 1000;
+
+    console.error(`[chrome_wait] Waiting for Chrome session (timeout=${timeoutSeconds}s)...`);
+
+    // Wait for both session files to appear (helper from chrome_utils.js).
+    const ready = await waitForChromeSession(CHROME_SESSION_DIR, timeoutMs);
+    if (!ready) {
+        const error = `Chrome session not ready after ${timeoutSeconds}s (cdp_url.txt/target_id.txt missing)`;
+        console.error(`[chrome_wait] ERROR: ${error}`);
+        console.log(JSON.stringify({ type: 'ArchiveResult', status: 'failed', output_str: error }));
+        process.exit(1);
+    }
+
+    // Files exist — verify their contents actually read back as non-empty
+    // before downstream hooks try to connect over CDP.
+    const cdpUrl = readCdpUrl(CHROME_SESSION_DIR);
+    const targetId = readTargetId(CHROME_SESSION_DIR);
+    if (!cdpUrl || !targetId) {
+        const error = 'Chrome session files incomplete (cdp_url.txt/target_id.txt missing)';
+        console.error(`[chrome_wait] ERROR: ${error}`);
+        console.log(JSON.stringify({ type: 'ArchiveResult', status: 'failed', output_str: error }));
+        process.exit(1);
+    }
+
+    // Truncate the CDP URL in logs to keep stderr output short.
+    console.error(`[chrome_wait] Chrome session ready (cdp_url=${cdpUrl.slice(0, 32)}..., target_id=${targetId}).`);
+    console.log(JSON.stringify({ type: 'ArchiveResult', status: 'succeeded', output_str: 'chrome session ready' }));
+    process.exit(0);
+}
+
+// Entry point: any unhandled rejection is fatal and exits non-zero.
+main().catch(e => {
+    console.error(`Fatal error: ${e.message}`);
+    process.exit(1);
+});

+ 1 - 1
archivebox/plugins/chrome/on_Snapshot__30_chrome_navigate.js

@@ -19,7 +19,7 @@ const fs = require('fs');
 const path = require('path');
 // Add NODE_MODULES_DIR to module resolution paths if set
 if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
-const puppeteer = require('puppeteer-core');
+const puppeteer = require('puppeteer');
 
 const PLUGIN_NAME = 'chrome_navigate';
 const CHROME_SESSION_DIR = '.';

+ 1 - 0
archivebox/plugins/chrome/templates/icon.html

@@ -0,0 +1 @@
+<span class="abx-output-icon abx-output-icon--chrome" title="Chrome"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><rect x="3" y="4.5" width="18" height="15" rx="2"/><path d="M3 9h18"/><circle cx="7" cy="7" r="1" fill="currentColor" stroke="none"/><circle cx="11" cy="7" r="1" fill="currentColor" stroke="none"/></svg></span>

+ 0 - 0
archivebox/plugins/chrome/tests/__init__.py


+ 137 - 41
archivebox/plugins/chrome/tests/chrome_test_helpers.py

@@ -60,6 +60,7 @@ import os
 import platform
 import signal
 import subprocess
+import sys
 import time
 from datetime import datetime
 from pathlib import Path
@@ -72,11 +73,14 @@ CHROME_PLUGIN_DIR = Path(__file__).parent.parent
 PLUGINS_ROOT = CHROME_PLUGIN_DIR.parent
 
 # Hook script locations
-CHROME_INSTALL_HOOK = CHROME_PLUGIN_DIR / 'on_Crawl__01_chrome_install.py'
-CHROME_LAUNCH_HOOK = CHROME_PLUGIN_DIR / 'on_Crawl__20_chrome_launch.bg.js'
-CHROME_TAB_HOOK = CHROME_PLUGIN_DIR / 'on_Snapshot__20_chrome_tab.bg.js'
+CHROME_INSTALL_HOOK = CHROME_PLUGIN_DIR / 'on_Crawl__70_chrome_install.py'
+CHROME_LAUNCH_HOOK = CHROME_PLUGIN_DIR / 'on_Crawl__90_chrome_launch.bg.js'
+CHROME_TAB_HOOK = CHROME_PLUGIN_DIR / 'on_Snapshot__10_chrome_tab.bg.js'
 CHROME_NAVIGATE_HOOK = next(CHROME_PLUGIN_DIR.glob('on_Snapshot__*_chrome_navigate.*'), None)
 CHROME_UTILS = CHROME_PLUGIN_DIR / 'chrome_utils.js'
+PUPPETEER_BINARY_HOOK = PLUGINS_ROOT / 'puppeteer' / 'on_Binary__12_puppeteer_install.py'
+PUPPETEER_CRAWL_HOOK = PLUGINS_ROOT / 'puppeteer' / 'on_Crawl__60_puppeteer_install.py'
+NPM_BINARY_HOOK = PLUGINS_ROOT / 'npm' / 'on_Binary__10_npm_install.py'
 
 
 # =============================================================================
@@ -402,7 +406,7 @@ def run_hook(
 
     # Determine interpreter based on file extension
     if hook_script.suffix == '.py':
-        cmd = ['python', str(hook_script)]
+        cmd = [sys.executable, str(hook_script)]
     elif hook_script.suffix == '.js':
         cmd = ['node', str(hook_script)]
     else:
@@ -451,6 +455,128 @@ def parse_jsonl_output(stdout: str, record_type: str = 'ArchiveResult') -> Optio
     return None
 
 
+def parse_jsonl_records(stdout: str) -> List[Dict[str, Any]]:
+    """Parse all JSONL records from stdout.
+
+    Unlike ``parse_jsonl_output`` (which returns a single record filtered by
+    type), this collects every parseable JSON object, one per line.
+    Interleaved non-JSON log lines and blank lines are silently skipped.
+    """
+    records: List[Dict[str, Any]] = []
+    for line in stdout.strip().split('\n'):
+        line = line.strip()
+        # Hooks emit JSONL records as single-line dicts, so only lines that
+        # look like JSON objects are considered.
+        if not line.startswith('{'):
+            continue
+        try:
+            records.append(json.loads(line))
+        except json.JSONDecodeError:
+            # Tolerate malformed/partial lines mixed into stdout.
+            continue
+    return records
+
+
+def apply_machine_updates(records: List[Dict[str, Any]], env: dict) -> None:
+    """Apply Machine update records to env dict in-place.
+
+    Hooks can emit ``{"type": "Machine", "config": {...}}`` records to
+    propagate environment changes (e.g. PATH / NODE_MODULES_DIR); merging
+    each record's config into ``env`` lets later hook invocations see them.
+    """
+    for record in records:
+        if record.get('type') != 'Machine':
+            continue
+        config = record.get('config')
+        # Skip malformed records whose config is not a dict.
+        if not isinstance(config, dict):
+            continue
+        env.update(config)
+
+
+def install_chromium_with_hooks(env: dict, timeout: int = 300) -> str:
+    """Install Chromium via chrome crawl hook + puppeteer/npm hooks.
+
+    Returns absolute path to Chromium binary.
+    """
+    puppeteer_result = subprocess.run(
+        [sys.executable, str(PUPPETEER_CRAWL_HOOK)],
+        capture_output=True,
+        text=True,
+        timeout=timeout,
+        env=env,
+    )
+    if puppeteer_result.returncode != 0:
+        raise RuntimeError(f"Puppeteer crawl hook failed: {puppeteer_result.stderr}")
+
+    puppeteer_record = parse_jsonl_output(puppeteer_result.stdout, record_type='Binary') or {}
+    if not puppeteer_record or puppeteer_record.get('name') != 'puppeteer':
+        raise RuntimeError("Puppeteer Binary record not emitted by crawl hook")
+
+    npm_cmd = [
+        sys.executable,
+        str(NPM_BINARY_HOOK),
+        '--machine-id=test-machine',
+        '--binary-id=test-puppeteer',
+        '--name=puppeteer',
+        f"--binproviders={puppeteer_record.get('binproviders', '*')}",
+    ]
+    puppeteer_overrides = puppeteer_record.get('overrides')
+    if puppeteer_overrides:
+        npm_cmd.append(f'--overrides={json.dumps(puppeteer_overrides)}')
+
+    npm_result = subprocess.run(
+        npm_cmd,
+        capture_output=True,
+        text=True,
+        timeout=timeout,
+        env=env,
+    )
+    if npm_result.returncode != 0:
+        raise RuntimeError(f"Npm install failed: {npm_result.stderr}")
+
+    apply_machine_updates(parse_jsonl_records(npm_result.stdout), env)
+
+    chrome_result = subprocess.run(
+        [sys.executable, str(CHROME_INSTALL_HOOK)],
+        capture_output=True,
+        text=True,
+        timeout=timeout,
+        env=env,
+    )
+    if chrome_result.returncode != 0:
+        raise RuntimeError(f"Chrome install hook failed: {chrome_result.stderr}")
+
+    chrome_record = parse_jsonl_output(chrome_result.stdout, record_type='Binary') or {}
+    if not chrome_record or chrome_record.get('name') not in ('chromium', 'chrome'):
+        raise RuntimeError("Chrome Binary record not emitted by crawl hook")
+
+    chromium_cmd = [
+        sys.executable,
+        str(PUPPETEER_BINARY_HOOK),
+        '--machine-id=test-machine',
+        '--binary-id=test-chromium',
+        f"--name={chrome_record.get('name', 'chromium')}",
+        f"--binproviders={chrome_record.get('binproviders', '*')}",
+    ]
+    chrome_overrides = chrome_record.get('overrides')
+    if chrome_overrides:
+        chromium_cmd.append(f'--overrides={json.dumps(chrome_overrides)}')
+
+    result = subprocess.run(
+        chromium_cmd,
+        capture_output=True,
+        text=True,
+        timeout=timeout,
+        env=env,
+    )
+    if result.returncode != 0:
+        raise RuntimeError(f"Puppeteer chromium install failed: {result.stderr}")
+
+    records = parse_jsonl_records(result.stdout)
+    chromium_record = None
+    for record in records:
+        if record.get('type') == 'Binary' and record.get('name') in ('chromium', 'chrome'):
+            chromium_record = record
+            break
+    if not chromium_record:
+        chromium_record = parse_jsonl_output(result.stdout, record_type='Binary')
+
+    chromium_path = chromium_record.get('abspath')
+    if not chromium_path or not Path(chromium_path).exists():
+        raise RuntimeError(f"Chromium binary not found after install: {chromium_path}")
+
+    env['CHROME_BINARY'] = chromium_path
+    apply_machine_updates(records, env)
+    return chromium_path
+
+
 def run_hook_and_parse(
     hook_script: Path,
     url: str,
@@ -499,7 +625,7 @@ def setup_test_env(tmpdir: Path) -> dict:
                     crawls/
                     snapshots/
 
-    Calls chrome install hook which handles puppeteer-core and chromium installation.
+    Calls chrome install hook + puppeteer/npm hooks for Chromium installation.
     Returns env dict with DATA_DIR, LIB_DIR, NPM_BIN_DIR, NODE_MODULES_DIR, CHROME_BINARY, etc.
 
     Args:
@@ -559,31 +685,10 @@ def setup_test_env(tmpdir: Path) -> dict:
     if 'CHROME_HEADLESS' not in os.environ:
         env['CHROME_HEADLESS'] = 'true'
 
-    # Call chrome install hook (installs puppeteer-core and chromium, outputs JSONL)
-    result = subprocess.run(
-        ['python', str(CHROME_INSTALL_HOOK)],
-        capture_output=True, text=True, timeout=120, env=env
-    )
-    if result.returncode != 0:
-        pytest.skip(f"Chrome install hook failed: {result.stderr}")
-
-    # Parse JSONL output to get CHROME_BINARY
-    chrome_binary = None
-    for line in result.stdout.strip().split('\n'):
-        if not line.strip():
-            continue
-        try:
-            data = json.loads(line)
-            if data.get('type') == 'Binary' and data.get('abspath'):
-                chrome_binary = data['abspath']
-                break
-        except json.JSONDecodeError:
-            continue
-
-    if not chrome_binary or not Path(chrome_binary).exists():
-        pytest.skip(f"Chromium binary not found: {chrome_binary}")
-
-    env['CHROME_BINARY'] = chrome_binary
+    try:
+        install_chromium_with_hooks(env)
+    except RuntimeError as e:
+        pytest.skip(str(e))
     return env
 
 
@@ -790,17 +895,8 @@ def chrome_session(
             'CHROME_HEADLESS': 'true',
         })
 
-        # CRITICAL: Run chrome install hook first (installs puppeteer-core and chromium)
-        # chrome_launch assumes chrome_install has already run
-        install_result = subprocess.run(
-            ['python', str(CHROME_INSTALL_HOOK)],
-            capture_output=True,
-            text=True,
-            timeout=120,
-            env=env
-        )
-        if install_result.returncode != 0:
-            raise RuntimeError(f"Chrome install failed: {install_result.stderr}")
+        # Install Chromium via npm + puppeteer hooks using normal Binary flow
+        install_chromium_with_hooks(env)
 
         # Launch Chrome at crawl level
         chrome_launch_process = subprocess.Popen(

+ 11 - 46
archivebox/plugins/chrome/tests/test_chrome.py

@@ -30,9 +30,8 @@ import platform
 
 from archivebox.plugins.chrome.tests.chrome_test_helpers import (
     get_test_env,
-    get_lib_dir,
-    get_node_modules_dir,
     find_chromium_binary,
+    install_chromium_with_hooks,
     CHROME_PLUGIN_DIR as PLUGIN_DIR,
     CHROME_LAUNCH_HOOK,
     CHROME_TAB_HOOK,
@@ -41,58 +40,24 @@ from archivebox.plugins.chrome.tests.chrome_test_helpers import (
 
 @pytest.fixture(scope="session", autouse=True)
 def ensure_chromium_and_puppeteer_installed(tmp_path_factory):
-    """Ensure Chromium and puppeteer are installed before running tests.
-
-    Puppeteer handles Chromium installation automatically in its own cache.
-    We only need to install puppeteer itself to LIB_DIR/npm.
-    """
-    from abx_pkg import Binary, NpmProvider, BinProviderOverrides
-
-    # Set DATA_DIR if not already set (required by abx_pkg)
+    """Ensure Chromium and puppeteer are installed before running tests."""
     if not os.environ.get('DATA_DIR'):
-        # Use isolated temp dir for direct pytest runs
         test_data_dir = tmp_path_factory.mktemp('chrome_test_data')
         os.environ['DATA_DIR'] = str(test_data_dir)
+    env = get_test_env()
 
-    # Compute paths AFTER setting DATA_DIR
-    lib_dir = get_lib_dir()
-    node_modules_dir = get_node_modules_dir()
-    npm_prefix = lib_dir / 'npm'
-
-    # Rebuild pydantic models
-    NpmProvider.model_rebuild()
+    try:
+        chromium_binary = install_chromium_with_hooks(env)
+    except RuntimeError as e:
+        pytest.skip(str(e))
 
-    # Install puppeteer if not available (it will handle Chromium in its own cache)
-    puppeteer_core_path = node_modules_dir / 'puppeteer-core'
-    if not puppeteer_core_path.exists():
-        print(f"\n[*] Installing puppeteer to {npm_prefix}...")
-        npm_prefix.mkdir(parents=True, exist_ok=True)
-
-        provider = NpmProvider(npm_prefix=npm_prefix)
-        try:
-            binary = Binary(
-                name='puppeteer',
-                binproviders=[provider],
-                overrides={'npm': {'packages': ['puppeteer@^23.5.0']}}
-            )
-            binary.install()
-            print(f"[*] Puppeteer installed successfully to {npm_prefix}")
-        except Exception as e:
-            pytest.skip(f"Failed to install puppeteer: {e}")
-
-    # Find Chromium binary (puppeteer installs it automatically in its cache)
-    chromium_binary = find_chromium_binary()
     if not chromium_binary:
-        pytest.skip("Chromium not found - puppeteer should install it automatically")
+        pytest.skip("Chromium not found after install")
 
-    # Set CHROME_BINARY env var for tests
     os.environ['CHROME_BINARY'] = chromium_binary
-
-
-# Get paths from helpers (will use DATA_DIR if set, or compute based on __file__)
-LIB_DIR = get_lib_dir()
-NODE_MODULES_DIR = get_node_modules_dir()
-NPM_PREFIX = LIB_DIR / 'npm'
+    for key in ('NODE_MODULES_DIR', 'NODE_PATH', 'PATH'):
+        if env.get(key):
+            os.environ[key] = env[key]
 
 
 def test_hook_scripts_exist():

+ 49 - 12
archivebox/plugins/consolelog/on_Snapshot__21_consolelog.bg.js

@@ -32,6 +32,13 @@ const OUTPUT_DIR = '.';
 const OUTPUT_FILE = 'console.jsonl';
 const CHROME_SESSION_DIR = '../chrome';
 
+let browser = null;
+let page = null;
+let logCount = 0;
+let errorCount = 0;
+let requestFailCount = 0;
+let shuttingDown = false;
+
 async function serializeArgs(args) {
     const serialized = [];
     for (const arg of args) {
@@ -73,6 +80,7 @@ async function setupListeners() {
                 location: msg.location(),
             };
             fs.appendFileSync(outputPath, JSON.stringify(logEntry) + '\n');
+            logCount += 1;
         } catch (e) {
             // Ignore errors
         }
@@ -87,6 +95,7 @@ async function setupListeners() {
                 stack: error.stack || '',
             };
             fs.appendFileSync(outputPath, JSON.stringify(logEntry) + '\n');
+            errorCount += 1;
         } catch (e) {
             // Ignore
         }
@@ -103,6 +112,7 @@ async function setupListeners() {
                 url: request.url(),
             };
             fs.appendFileSync(outputPath, JSON.stringify(logEntry) + '\n');
+            requestFailCount += 1;
         } catch (e) {
             // Ignore
         }
@@ -111,6 +121,29 @@ async function setupListeners() {
     return { browser, page };
 }
 
+// Emit the final ArchiveResult JSONL record exactly once.
+// `shuttingDown` guards against double-emission if multiple signals
+// (SIGTERM then SIGINT) arrive before the process exits.
+function emitResult(status = 'succeeded') {
+    if (shuttingDown) return;
+    shuttingDown = true;
+
+    const counts = `${logCount} console, ${errorCount} errors, ${requestFailCount} failed requests`;
+    console.log(JSON.stringify({
+        type: 'ArchiveResult',
+        status,
+        output_str: `${OUTPUT_FILE} (${counts})`,
+    }));
+}
+
+// Signal handler: flush the result record, then detach from the shared
+// browser session (disconnect rather than close — the Chrome session is
+// owned by the crawl-level launch hook, not this hook) and exit 0.
+async function handleShutdown(signal) {
+    console.error(`\nReceived ${signal}, emitting final results...`);
+    emitResult('succeeded');
+    if (browser) {
+        try {
+            browser.disconnect();
+        } catch (e) {}  // best-effort: browser may already be gone
+    }
+    process.exit(0);
+}
+
 async function main() {
     const args = parseArgs();
     const url = args.url;
@@ -127,23 +160,27 @@ async function main() {
         process.exit(0);
     }
 
-    const timeout = getEnvInt('CONSOLELOG_TIMEOUT', 30) * 1000;
-
     try {
         // Set up listeners BEFORE navigation
-        await setupListeners();
+        const connection = await setupListeners();
+        browser = connection.browser;
+        page = connection.page;
 
-        // Wait for chrome_navigate to complete (BLOCKING)
-        await waitForPageLoaded(CHROME_SESSION_DIR, timeout * 4, 500);
+        // Register signal handlers for graceful shutdown
+        process.on('SIGTERM', () => handleShutdown('SIGTERM'));
+        process.on('SIGINT', () => handleShutdown('SIGINT'));
 
-        // Output clean JSONL
-        console.log(JSON.stringify({
-            type: 'ArchiveResult',
-            status: 'succeeded',
-            output_str: OUTPUT_FILE,
-        }));
+        // Wait for chrome_navigate to complete (non-fatal)
+        try {
+            const timeout = getEnvInt('CONSOLELOG_TIMEOUT', 30) * 1000;
+            await waitForPageLoaded(CHROME_SESSION_DIR, timeout * 4, 500);
+        } catch (e) {
+            console.error(`WARN: ${e.message}`);
+        }
 
-        process.exit(0);
+        // console.error('Consolelog active, waiting for cleanup signal...');
+        await new Promise(() => {}); // Keep alive until SIGTERM
+        return;
 
     } catch (e) {
         const error = `${e.name}: ${e.message}`;

+ 1 - 0
archivebox/plugins/consolelog/templates/icon.html

@@ -0,0 +1 @@
+<span class="abx-output-icon abx-output-icon--consolelog" title="Console Log"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><rect x="3" y="4.5" width="18" height="15" rx="2"/><path d="M7 12l2 2-2 2"/><path d="M11 16h6"/></svg></span>

+ 0 - 1
archivebox/plugins/consolelog/tests/__init__.py

@@ -1 +0,0 @@
-"""Tests for the consolelog plugin."""

+ 17 - 9
archivebox/plugins/consolelog/tests/test_consolelog.py

@@ -10,6 +10,7 @@ import shutil
 import subprocess
 import sys
 import tempfile
+import time
 from pathlib import Path
 
 import pytest
@@ -76,26 +77,33 @@ class TestConsolelogWithChrome(TestCase):
                 # Use the environment from chrome_session (already has CHROME_HEADLESS=true)
 
 
-                # Run consolelog hook with the active Chrome session
-                result = subprocess.run(
+                # Run consolelog hook with the active Chrome session (background hook)
+                result = subprocess.Popen(
                     ['node', str(CONSOLELOG_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
                     cwd=str(snapshot_chrome_dir),
-                    capture_output=True,
+                    stdout=subprocess.PIPE,
+                    stderr=subprocess.PIPE,
                     text=True,
-                    timeout=120,  # Longer timeout as it waits for navigation
                     env=env
                 )
 
                 # Check for output file
                 console_output = snapshot_chrome_dir / 'console.jsonl'
 
-                # Verify hook ran (may succeed or timeout waiting for navigation)
-                # The hook is designed to wait for page_loaded.txt from chrome_navigate
-                # In test mode, that file may not exist, so hook may timeout
-                # But it should still create the console.jsonl file
+                # Allow it to run briefly, then terminate (background hook)
+                time.sleep(3)
+                if result.poll() is None:
+                    result.terminate()
+                    try:
+                        stdout, stderr = result.communicate(timeout=5)
+                    except subprocess.TimeoutExpired:
+                        result.kill()
+                        stdout, stderr = result.communicate()
+                else:
+                    stdout, stderr = result.communicate()
 
                 # At minimum, verify no crash
-                self.assertNotIn('Traceback', result.stderr)
+                self.assertNotIn('Traceback', stderr)
 
                 # If output file exists, verify it's valid JSONL
                 if console_output.exists():

+ 10 - 3
archivebox/plugins/custom/on_Binary__14_custom_install.py

@@ -59,9 +59,16 @@ def main(binary_id: str, machine_id: str, name: str, binproviders: str, custom_c
     provider = EnvProvider()
     try:
         binary = Binary(name=name, binproviders=[provider]).load()
-    except Exception as e:
-        click.echo(f"{name} not found after custom install: {e}", err=True)
-        sys.exit(1)
+    except Exception:
+        try:
+            binary = Binary(
+                name=name,
+                binproviders=[provider],
+                overrides={'env': {'version': '0.0.1'}},
+            ).load()
+        except Exception as e:
+            click.echo(f"{name} not found after custom install: {e}", err=True)
+            sys.exit(1)
 
     if not binary.abspath:
         click.echo(f"{name} not found after custom install", err=True)

+ 0 - 1
archivebox/plugins/custom/tests/__init__.py

@@ -1 +0,0 @@
-"""Tests for the custom binary provider plugin."""

+ 2 - 2
archivebox/plugins/custom/tests/test_custom_provider.py

@@ -17,7 +17,7 @@ from django.test import TestCase
 
 # Get the path to the custom provider hook
 PLUGIN_DIR = Path(__file__).parent.parent
-INSTALL_HOOK = PLUGIN_DIR / 'on_Binary__install_using_custom_bash.py'
+INSTALL_HOOK = next(PLUGIN_DIR.glob('on_Binary__*_custom_install.py'), None)
 
 
 class TestCustomProviderHook(TestCase):
@@ -34,7 +34,7 @@ class TestCustomProviderHook(TestCase):
 
     def test_hook_script_exists(self):
         """Hook script should exist."""
-        self.assertTrue(INSTALL_HOOK.exists(), f"Hook not found: {INSTALL_HOOK}")
+        self.assertTrue(INSTALL_HOOK and INSTALL_HOOK.exists(), f"Hook not found: {INSTALL_HOOK}")
 
     def test_hook_skips_when_custom_not_allowed(self):
         """Hook should skip when custom not in allowed binproviders."""

+ 44 - 19
archivebox/plugins/dns/on_Snapshot__22_dns.bg.js

@@ -32,6 +32,11 @@ const OUTPUT_DIR = '.';
 const OUTPUT_FILE = 'dns.jsonl';
 const CHROME_SESSION_DIR = '../chrome';
 
+let browser = null;
+let page = null;
+let recordCount = 0;
+let shuttingDown = false;
+
 function extractHostname(url) {
     try {
         const urlObj = new URL(url);
@@ -121,6 +126,7 @@ async function setupListener(targetUrl) {
 
             // Append to output file
             fs.appendFileSync(outputPath, JSON.stringify(dnsRecord) + '\n');
+            recordCount += 1;
 
         } catch (e) {
             // Ignore errors
@@ -170,6 +176,7 @@ async function setupListener(targetUrl) {
                 };
 
                 fs.appendFileSync(outputPath, JSON.stringify(dnsRecord) + '\n');
+                recordCount += 1;
             }
         } catch (e) {
             // Ignore errors
@@ -179,6 +186,28 @@ async function setupListener(targetUrl) {
     return { browser, page, client };
 }
 
+// Emit the final ArchiveResult JSONL record exactly once; `shuttingDown`
+// prevents double-emission when multiple shutdown signals race.
+function emitResult(status = 'succeeded') {
+    if (shuttingDown) return;
+    shuttingDown = true;
+
+    console.log(JSON.stringify({
+        type: 'ArchiveResult',
+        status,
+        output_str: `${OUTPUT_FILE} (${recordCount} DNS records)`,
+    }));
+}
+
+// Signal handler: flush the result record, then detach from the shared
+// browser session (disconnect, not close — the session belongs to the
+// crawl-level Chrome launch hook) and exit 0.
+async function handleShutdown(signal) {
+    console.error(`\nReceived ${signal}, emitting final results...`);
+    emitResult('succeeded');
+    if (browser) {
+        try {
+            browser.disconnect();
+        } catch (e) {}  // best-effort: browser may already be disconnected
+    }
+    process.exit(0);
+}
+
 async function main() {
     const args = parseArgs();
     const url = args.url;
@@ -195,31 +224,27 @@ async function main() {
         process.exit(0);
     }
 
-    const timeout = getEnvInt('DNS_TIMEOUT', 30) * 1000;
-
     try {
         // Set up listener BEFORE navigation
-        await setupListener(url);
+        const connection = await setupListener(url);
+        browser = connection.browser;
+        page = connection.page;
 
-        // Wait for chrome_navigate to complete (BLOCKING)
-        await waitForPageLoaded(CHROME_SESSION_DIR, timeout * 4, 500);
+        // Register signal handlers for graceful shutdown
+        process.on('SIGTERM', () => handleShutdown('SIGTERM'));
+        process.on('SIGINT', () => handleShutdown('SIGINT'));
 
-        // Count DNS records
-        const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
-        let recordCount = 0;
-        if (fs.existsSync(outputPath)) {
-            const content = fs.readFileSync(outputPath, 'utf8');
-            recordCount = content.split('\n').filter(line => line.trim()).length;
+        // Wait for chrome_navigate to complete (non-fatal)
+        try {
+            const timeout = getEnvInt('DNS_TIMEOUT', 30) * 1000;
+            await waitForPageLoaded(CHROME_SESSION_DIR, timeout * 4, 500);
+        } catch (e) {
+            console.error(`WARN: ${e.message}`);
         }
 
-        // Output clean JSONL
-        console.log(JSON.stringify({
-            type: 'ArchiveResult',
-            status: 'succeeded',
-            output_str: `${OUTPUT_FILE} (${recordCount} DNS records)`,
-        }));
-
-        process.exit(0);
+        // console.error('DNS listener active, waiting for cleanup signal...');
+        await new Promise(() => {}); // Keep alive until SIGTERM
+        return;
 
     } catch (e) {
         const error = `${e.name}: ${e.message}`;

+ 1 - 0
archivebox/plugins/dns/templates/icon.html

@@ -0,0 +1 @@
+<span class="abx-output-icon abx-output-icon--dns" title="DNS"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><circle cx="6" cy="12" r="2"/><circle cx="18" cy="6" r="2"/><circle cx="18" cy="18" r="2"/><path d="M8 12h6"/><path d="M16 8l-2 2"/><path d="M16 16l-2-2"/></svg></span>

+ 15 - 1
archivebox/plugins/dom/on_Snapshot__53_dom.js

@@ -52,7 +52,21 @@ const CHROME_SESSION_DIR = '../chrome';
 // Check if staticfile extractor already downloaded this URL
 const STATICFILE_DIR = '../staticfile';
 function hasStaticFileOutput() {
-    return fs.existsSync(STATICFILE_DIR) && fs.readdirSync(STATICFILE_DIR).length > 0;
+    // Only treat staticfile output as present when its hook actually logged a
+    // succeeded ArchiveResult — a non-empty directory alone is not proof of
+    // success (logs/partial files may remain after a failed run).
+    if (!fs.existsSync(STATICFILE_DIR)) return false;
+    const stdoutPath = path.join(STATICFILE_DIR, 'stdout.log');
+    if (!fs.existsSync(stdoutPath)) return false;
+    const stdout = fs.readFileSync(stdoutPath, 'utf8');
+    // Scan the captured stdout for a JSONL ArchiveResult with status=succeeded.
+    for (const line of stdout.split('\n')) {
+        const trimmed = line.trim();
+        if (!trimmed.startsWith('{')) continue;
+        try {
+            const record = JSON.parse(trimmed);
+            if (record.type === 'ArchiveResult' && record.status === 'succeeded') {
+                return true;
+            }
+        } catch (e) {}  // ignore non-JSON log lines interleaved in stdout
+    }
+    return false;
+}
 
 // Wait for chrome tab to be fully loaded

+ 1 - 1
archivebox/plugins/dom/templates/icon.html

@@ -1 +1 @@
-🌐
+<span class="abx-output-icon abx-output-icon--dom" title="DOM"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><path d="M8 9l-3 3 3 3"/><path d="M16 9l3 3-3 3"/><path d="M10 20l4-16"/></svg></span>

+ 1 - 1
archivebox/plugins/dom/tests/test_dom.py

@@ -142,7 +142,7 @@ def test_staticfile_present_skips():
         #   dom/         <- dom extractor runs here, looks for ../staticfile
         staticfile_dir = tmpdir / 'staticfile'
         staticfile_dir.mkdir()
-        (staticfile_dir / 'index.html').write_text('<html>test</html>')
+        (staticfile_dir / 'stdout.log').write_text('{"type":"ArchiveResult","status":"succeeded","output_str":"index.html"}\n')
 
         dom_dir = tmpdir / 'dom'
         dom_dir.mkdir()

+ 2 - 1
archivebox/plugins/env/on_Binary__15_env_install.py

@@ -25,7 +25,8 @@ from abx_pkg import Binary, EnvProvider
 @click.option('--binary-id', required=True, help="Dependency UUID")
 @click.option('--name', required=True, help="Binary name to find")
 @click.option('--binproviders', default='*', help="Allowed providers (comma-separated)")
-def main(binary_id: str, machine_id: str, name: str, binproviders: str):
[email protected]('--overrides', default=None, help="JSON-encoded overrides dict (unused)")
+def main(binary_id: str, machine_id: str, name: str, binproviders: str, overrides: str | None):
     """Check if binary is available in PATH and record it."""
 
     # Check if env provider is allowed

+ 0 - 1
archivebox/plugins/env/tests/__init__.py

@@ -1 +0,0 @@
-"""Tests for the env binary provider plugin."""

+ 2 - 2
archivebox/plugins/env/tests/test_env_provider.py

@@ -17,7 +17,7 @@ from django.test import TestCase
 
 # Get the path to the env provider hook
 PLUGIN_DIR = Path(__file__).parent.parent
-INSTALL_HOOK = PLUGIN_DIR / 'on_Binary__install_using_env_provider.py'
+INSTALL_HOOK = next(PLUGIN_DIR.glob('on_Binary__*_env_install.py'), None)
 
 
 class TestEnvProviderHook(TestCase):
@@ -34,7 +34,7 @@ class TestEnvProviderHook(TestCase):
 
     def test_hook_script_exists(self):
         """Hook script should exist."""
-        self.assertTrue(INSTALL_HOOK.exists(), f"Hook not found: {INSTALL_HOOK}")
+        self.assertTrue(INSTALL_HOOK and INSTALL_HOOK.exists(), f"Hook not found: {INSTALL_HOOK}")
 
     def test_hook_finds_python(self):
         """Hook should find python3 binary in PATH."""

+ 7 - 2
archivebox/plugins/favicon/on_Snapshot__11_favicon.py

@@ -126,7 +126,12 @@ def main(url: str, snapshot_id: str):
     try:
         # Run extraction
         success, output, error = get_favicon(url)
-        status = 'succeeded' if success else 'failed'
+        if success:
+            status = 'succeeded'
+        elif error == 'No favicon found':
+            status = 'skipped'
+        else:
+            status = 'failed'
 
     except Exception as e:
         error = f'{type(e).__name__}: {e}'
@@ -143,7 +148,7 @@ def main(url: str, snapshot_id: str):
     }
     print(json.dumps(result))
 
-    sys.exit(0 if status == 'succeeded' else 1)
+    sys.exit(0 if status in ('succeeded', 'skipped') else 1)
 
 
 if __name__ == '__main__':

+ 1 - 1
archivebox/plugins/favicon/templates/icon.html

@@ -1 +1 @@
-
+<span class="abx-output-icon abx-output-icon--favicon" title="Favicon"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><path d="M12 3l2.5 5.5 6 .5-4.5 3.8 1.5 5.7L12 15.5 6.5 18.5 8 12.8 3.5 9l6-.5z"/></svg></span>

+ 0 - 1
archivebox/plugins/forumdl/binaries.jsonl

@@ -1 +0,0 @@
-{"type": "Binary", "name": "forum-dl", "binproviders": "pip,env"}

+ 0 - 80
archivebox/plugins/forumdl/on_Crawl__13_forumdl_install.py

@@ -1,80 +0,0 @@
-#!/usr/bin/env python3
-"""
-Detect forum-dl binary and emit Binary JSONL record.
-
-Output: Binary JSONL record to stdout if forum-dl is found
-"""
-
-import json
-import os
-import sys
-
-from abx_pkg import Binary, EnvProvider
-
-
-def get_env(name: str, default: str = '') -> str:
-    return os.environ.get(name, default).strip()
-
-def get_env_bool(name: str, default: bool = False) -> bool:
-    val = get_env(name, '').lower()
-    if val in ('true', '1', 'yes', 'on'):
-        return True
-    if val in ('false', '0', 'no', 'off'):
-        return False
-    return default
-
-
-def output_binary_found(binary: Binary, name: str):
-    """Output Binary JSONL record for an installed binary."""
-    machine_id = os.environ.get('MACHINE_ID', '')
-
-    record = {
-        'type': 'Binary',
-        'name': name,
-        'abspath': str(binary.abspath),
-        'version': str(binary.version) if binary.version else '',
-        'sha256': binary.sha256 or '',
-        'binprovider': 'env',  # Already installed
-        'machine_id': machine_id,
-    }
-    print(json.dumps(record))
-
-
-def output_binary_missing(name: str, binproviders: str):
-    """Output Binary JSONL record for a missing binary that needs installation."""
-    machine_id = os.environ.get('MACHINE_ID', '')
-
-    record = {
-        'type': 'Binary',
-        'name': name,
-        'binproviders': binproviders,  # Providers that can install it
-        'machine_id': machine_id,
-    }
-    print(json.dumps(record))
-
-
-def main():
-    forumdl_enabled = get_env_bool('FORUMDL_ENABLED', True)
-    forumdl_binary = get_env('FORUMDL_BINARY', 'forum-dl')
-
-    if not forumdl_enabled:
-        sys.exit(0)
-
-    provider = EnvProvider()
-    try:
-        binary = Binary(name=forumdl_binary, binproviders=[provider]).load()
-        if binary.abspath:
-            # Binary found
-            output_binary_found(binary, name='forum-dl')
-        else:
-            # Binary not found
-            output_binary_missing(name='forum-dl', binproviders='pip')
-    except Exception:
-        # Binary not found
-        output_binary_missing(name='forum-dl', binproviders='pip')
-
-    sys.exit(0)
-
-
-if __name__ == '__main__':
-    main()

+ 79 - 0
archivebox/plugins/forumdl/on_Crawl__25_forumdl_install.py

@@ -0,0 +1,79 @@
+#!/usr/bin/env python3
+"""
+Emit forum-dl Binary dependency for the crawl.
+"""
+
+import json
+import os
+import sys
+
+
+def get_env(name: str, default: str = '') -> str:
+    return os.environ.get(name, default).strip()
+
+def get_env_bool(name: str, default: bool = False) -> bool:
+    val = get_env(name, '').lower()
+    if val in ('true', '1', 'yes', 'on'):
+        return True
+    if val in ('false', '0', 'no', 'off'):
+        return False
+    return default
+
+
+def output_binary(name: str, binproviders: str, overrides: dict | None = None):
+    """Output Binary JSONL record for a dependency."""
+    machine_id = os.environ.get('MACHINE_ID', '')
+
+    record = {
+        'type': 'Binary',
+        'name': name,
+        'binproviders': binproviders,
+        'machine_id': machine_id,
+    }
+    if overrides:
+        record['overrides'] = overrides
+    print(json.dumps(record))
+
+
+def main():
+    forumdl_enabled = get_env_bool('FORUMDL_ENABLED', True)
+
+    if not forumdl_enabled:
+        sys.exit(0)
+
+    output_binary(
+        name='forum-dl',
+        binproviders='pip,env',
+        overrides={
+            'pip': {
+                'packages': [
+                    '--no-deps',
+                    'forum-dl',
+                    'pydantic',
+                    'pydantic-core',
+                    'typing-extensions',
+                    'annotated-types',
+                    'typing-inspection',
+                    'beautifulsoup4',
+                    'soupsieve',
+                    'lxml',
+                    'requests',
+                    'urllib3',
+                    'certifi',
+                    'idna',
+                    'charset-normalizer',
+                    'tenacity',
+                    'python-dateutil',
+                    'six',
+                    'html2text',
+                    'warcio',
+                ]
+            }
+        },
+    )
+
+    sys.exit(0)
+
+
+if __name__ == '__main__':
+    main()

+ 33 - 4
archivebox/plugins/forumdl/on_Snapshot__65_forumdl.bg.py → archivebox/plugins/forumdl/on_Snapshot__04_forumdl.bg.py

@@ -2,7 +2,7 @@
 """
 Download forum content from a URL using forum-dl.
 
-Usage: on_Snapshot__forumdl.py --url=<url> --snapshot-id=<uuid>
+Usage: on_Snapshot__04_forumdl.bg.py --url=<url> --snapshot-id=<uuid>
 Output: Downloads forum content to $PWD/
 
 Environment variables:
@@ -19,6 +19,7 @@ import json
 import os
 import subprocess
 import sys
+import threading
 from pathlib import Path
 
 import rich_click as click
@@ -131,13 +132,41 @@ def save_forum(url: str, binary: str) -> tuple[bool, str | None, str]:
     cmd.append(url)
 
     try:
-        result = subprocess.run(cmd, capture_output=True, timeout=timeout, text=True)
+        print(f'[forumdl] Starting download (timeout={timeout}s)', file=sys.stderr)
+        output_lines: list[str] = []
+        process = subprocess.Popen(
+            cmd,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.STDOUT,
+            text=True,
+            bufsize=1,
+        )
+
+        def _read_output() -> None:
+            if not process.stdout:
+                return
+            for line in process.stdout:
+                output_lines.append(line)
+                sys.stderr.write(line)
+
+        reader = threading.Thread(target=_read_output, daemon=True)
+        reader.start()
+
+        try:
+            process.wait(timeout=timeout)
+        except subprocess.TimeoutExpired:
+            process.kill()
+            reader.join(timeout=1)
+            return False, None, f'Timed out after {timeout} seconds'
+
+        reader.join(timeout=1)
+        combined_output = ''.join(output_lines)
 
         # Check if output file was created
         if output_file.exists() and output_file.stat().st_size > 0:
             return True, str(output_file), ''
         else:
-            stderr = result.stderr
+            stderr = combined_output
 
             # These are NOT errors - page simply has no downloadable forum content
             stderr_lower = stderr.lower()
@@ -147,7 +176,7 @@ def save_forum(url: str, binary: str) -> tuple[bool, str | None, str]:
                 return True, None, ''  # No forum found - success, no output
             if 'extractornotfounderror' in stderr_lower:
                 return True, None, ''  # No forum extractor for this URL - success, no output
-            if result.returncode == 0:
+            if process.returncode == 0:
                 return True, None, ''  # forum-dl exited cleanly, just no forum - success
 
             # These ARE errors - something went wrong

+ 1 - 1
archivebox/plugins/forumdl/templates/icon.html

@@ -1 +1 @@
-💬
+<span class="abx-output-icon abx-output-icon--forumdl" title="Forum"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><path d="M4 5h16v10H7l-3 3V5z"/></svg></span>

+ 0 - 1
archivebox/plugins/gallerydl/binaries.jsonl

@@ -1 +0,0 @@
-{"type": "Binary", "name": "gallery-dl", "binproviders": "pip,brew,apt,env"}

+ 0 - 80
archivebox/plugins/gallerydl/on_Crawl__10_gallerydl_install.py

@@ -1,80 +0,0 @@
-#!/usr/bin/env python3
-"""
-Detect gallery-dl binary and emit Binary JSONL record.
-
-Output: Binary JSONL record to stdout if gallery-dl is found
-"""
-
-import json
-import os
-import sys
-
-from abx_pkg import Binary, EnvProvider
-
-
-def get_env(name: str, default: str = '') -> str:
-    return os.environ.get(name, default).strip()
-
-def get_env_bool(name: str, default: bool = False) -> bool:
-    val = get_env(name, '').lower()
-    if val in ('true', '1', 'yes', 'on'):
-        return True
-    if val in ('false', '0', 'no', 'off'):
-        return False
-    return default
-
-
-def output_binary_found(binary: Binary, name: str):
-    """Output Binary JSONL record for an installed binary."""
-    machine_id = os.environ.get('MACHINE_ID', '')
-
-    record = {
-        'type': 'Binary',
-        'name': name,
-        'abspath': str(binary.abspath),
-        'version': str(binary.version) if binary.version else '',
-        'sha256': binary.sha256 or '',
-        'binprovider': 'env',  # Already installed
-        'machine_id': machine_id,
-    }
-    print(json.dumps(record))
-
-
-def output_binary_missing(name: str, binproviders: str):
-    """Output Binary JSONL record for a missing binary that needs installation."""
-    machine_id = os.environ.get('MACHINE_ID', '')
-
-    record = {
-        'type': 'Binary',
-        'name': name,
-        'binproviders': binproviders,  # Providers that can install it
-        'machine_id': machine_id,
-    }
-    print(json.dumps(record))
-
-
-def main():
-    gallerydl_enabled = get_env_bool('GALLERYDL_ENABLED', True)
-    gallerydl_binary = get_env('GALLERYDL_BINARY', 'gallery-dl')
-
-    if not gallerydl_enabled:
-        sys.exit(0)
-
-    provider = EnvProvider()
-    try:
-        binary = Binary(name=gallerydl_binary, binproviders=[provider]).load()
-        if binary.abspath:
-            # Binary found
-            output_binary_found(binary, name='gallery-dl')
-        else:
-            # Binary not found
-            output_binary_missing(name='gallery-dl', binproviders='pip')
-    except Exception:
-        # Binary not found
-        output_binary_missing(name='gallery-dl', binproviders='pip')
-
-    sys.exit(0)
-
-
-if __name__ == '__main__':
-    main()

+ 48 - 0
archivebox/plugins/gallerydl/on_Crawl__20_gallerydl_install.py

@@ -0,0 +1,48 @@
+#!/usr/bin/env python3
+"""
+Emit gallery-dl Binary dependency for the crawl.
+"""
+
+import json
+import os
+import sys
+
+
+def get_env(name: str, default: str = '') -> str:
+    return os.environ.get(name, default).strip()
+
+def get_env_bool(name: str, default: bool = False) -> bool:
+    val = get_env(name, '').lower()
+    if val in ('true', '1', 'yes', 'on'):
+        return True
+    if val in ('false', '0', 'no', 'off'):
+        return False
+    return default
+
+
+def output_binary(name: str, binproviders: str):
+    """Output Binary JSONL record for a dependency."""
+    machine_id = os.environ.get('MACHINE_ID', '')
+
+    record = {
+        'type': 'Binary',
+        'name': name,
+        'binproviders': binproviders,
+        'machine_id': machine_id,
+    }
+    print(json.dumps(record))
+
+
+def main():
+    gallerydl_enabled = get_env_bool('GALLERYDL_ENABLED', True)
+
+    if not gallerydl_enabled:
+        sys.exit(0)
+
+    output_binary(name='gallery-dl', binproviders='pip,brew,apt,env')
+
+    sys.exit(0)
+
+
+if __name__ == '__main__':
+    main()

+ 49 - 5
archivebox/plugins/gallerydl/on_Snapshot__64_gallerydl.bg.py → archivebox/plugins/gallerydl/on_Snapshot__03_gallerydl.bg.py

@@ -2,7 +2,7 @@
 """
 Download image galleries from a URL using gallery-dl.
 
-Usage: on_Snapshot__gallerydl.py --url=<url> --snapshot-id=<uuid>
+Usage: on_Snapshot__03_gallerydl.bg.py --url=<url> --snapshot-id=<uuid>
 Output: Downloads gallery images to $PWD/gallerydl/
 
 Environment variables:
@@ -19,6 +19,7 @@ import json
 import os
 import subprocess
 import sys
+import threading
 from pathlib import Path
 
 import rich_click as click
@@ -70,7 +71,22 @@ STATICFILE_DIR = '../staticfile'
 def has_staticfile_output() -> bool:
     """Check if staticfile extractor already downloaded this URL."""
     staticfile_dir = Path(STATICFILE_DIR)
-    return staticfile_dir.exists() and any(staticfile_dir.iterdir())
+    if not staticfile_dir.exists():
+        return False
+    stdout_log = staticfile_dir / 'stdout.log'
+    if not stdout_log.exists():
+        return False
+    for line in stdout_log.read_text(errors='ignore').splitlines():
+        line = line.strip()
+        if not line.startswith('{'):
+            continue
+        try:
+            record = json.loads(line)
+        except json.JSONDecodeError:
+            continue
+        if record.get('type') == 'ArchiveResult' and record.get('status') == 'succeeded':
+            return True
+    return False
 
 
 def save_gallery(url: str, binary: str) -> tuple[bool, str | None, str]:
@@ -109,7 +125,35 @@ def save_gallery(url: str, binary: str) -> tuple[bool, str | None, str]:
     cmd.append(url)
 
     try:
-        result = subprocess.run(cmd, capture_output=True, timeout=timeout, text=True)
+        print(f'[gallerydl] Starting download (timeout={timeout}s)', file=sys.stderr)
+        output_lines: list[str] = []
+        process = subprocess.Popen(
+            cmd,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.STDOUT,
+            text=True,
+            bufsize=1,
+        )
+
+        def _read_output() -> None:
+            if not process.stdout:
+                return
+            for line in process.stdout:
+                output_lines.append(line)
+                sys.stderr.write(line)
+
+        reader = threading.Thread(target=_read_output, daemon=True)
+        reader.start()
+
+        try:
+            process.wait(timeout=timeout)
+        except subprocess.TimeoutExpired:
+            process.kill()
+            reader.join(timeout=1)
+            return False, None, f'Timed out after {timeout} seconds'
+
+        reader.join(timeout=1)
+        combined_output = ''.join(output_lines)
 
         # Check if any gallery files were downloaded (search recursively)
         gallery_extensions = (
@@ -132,7 +176,7 @@ def save_gallery(url: str, binary: str) -> tuple[bool, str | None, str]:
             output = str(image_files[0]) if image_files else str(downloaded_files[0])
             return True, output, ''
         else:
-            stderr = result.stderr
+            stderr = combined_output
 
             # These are NOT errors - page simply has no downloadable gallery
             # Return success with no output (legitimate "nothing to download")
@@ -141,7 +185,7 @@ def save_gallery(url: str, binary: str) -> tuple[bool, str | None, str]:
                 return True, None, ''  # Not a gallery site - success, no output
             if 'no results' in stderr_lower:
                 return True, None, ''  # No gallery found - success, no output
-            if result.returncode == 0:
+            if process.returncode == 0:
                 return True, None, ''  # gallery-dl exited cleanly, just no gallery - success
 
             # These ARE errors - something went wrong

+ 1 - 1
archivebox/plugins/gallerydl/templates/icon.html

@@ -1 +1 @@
-🖼️
+<span class="abx-output-icon abx-output-icon--gallerydl" title="Gallery"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><rect x="3" y="5" width="18" height="14" rx="2"/><circle cx="8" cy="10" r="1.5" fill="currentColor" stroke="none"/><path d="M21 17l-5-5-5 5"/></svg></span>

+ 0 - 1
archivebox/plugins/git/binaries.jsonl

@@ -1 +0,0 @@
-{"type": "Binary", "name": "git", "binproviders": "apt,brew,env"}

+ 48 - 0
archivebox/plugins/git/on_Crawl__05_git_install.py

@@ -0,0 +1,48 @@
+#!/usr/bin/env python3
+"""
+Emit git Binary dependency for the crawl.
+"""
+
+import json
+import os
+import sys
+
+
+def get_env(name: str, default: str = '') -> str:
+    return os.environ.get(name, default).strip()
+
+def get_env_bool(name: str, default: bool = False) -> bool:
+    val = get_env(name, '').lower()
+    if val in ('true', '1', 'yes', 'on'):
+        return True
+    if val in ('false', '0', 'no', 'off'):
+        return False
+    return default
+
+
+def output_binary(name: str, binproviders: str):
+    """Output Binary JSONL record for a dependency."""
+    machine_id = os.environ.get('MACHINE_ID', '')
+
+    record = {
+        'type': 'Binary',
+        'name': name,
+        'binproviders': binproviders,
+        'machine_id': machine_id,
+    }
+    print(json.dumps(record))
+
+
+def main():
+    git_enabled = get_env_bool('GIT_ENABLED', True)
+
+    if not git_enabled:
+        sys.exit(0)
+
+    output_binary(name='git', binproviders='apt,brew,env')
+
+    sys.exit(0)
+
+
+if __name__ == '__main__':
+    main()

+ 0 - 80
archivebox/plugins/git/on_Crawl__09_git_install.py

@@ -1,80 +0,0 @@
-#!/usr/bin/env python3
-"""
-Detect git binary and emit Binary JSONL record.
-
-Output: Binary JSONL record to stdout if git is found
-"""
-
-import json
-import os
-import sys
-
-from abx_pkg import Binary, EnvProvider
-
-
-def get_env(name: str, default: str = '') -> str:
-    return os.environ.get(name, default).strip()
-
-def get_env_bool(name: str, default: bool = False) -> bool:
-    val = get_env(name, '').lower()
-    if val in ('true', '1', 'yes', 'on'):
-        return True
-    if val in ('false', '0', 'no', 'off'):
-        return False
-    return default
-
-
-def output_binary_found(binary: Binary, name: str):
-    """Output Binary JSONL record for an installed binary."""
-    machine_id = os.environ.get('MACHINE_ID', '')
-
-    record = {
-        'type': 'Binary',
-        'name': name,
-        'abspath': str(binary.abspath),
-        'version': str(binary.version) if binary.version else '',
-        'sha256': binary.sha256 or '',
-        'binprovider': 'env',  # Already installed
-        'machine_id': machine_id,
-    }
-    print(json.dumps(record))
-
-
-def output_binary_missing(name: str, binproviders: str):
-    """Output Binary JSONL record for a missing binary that needs installation."""
-    machine_id = os.environ.get('MACHINE_ID', '')
-
-    record = {
-        'type': 'Binary',
-        'name': name,
-        'binproviders': binproviders,  # Providers that can install it
-        'machine_id': machine_id,
-    }
-    print(json.dumps(record))
-
-
-def main():
-    git_enabled = get_env_bool('GIT_ENABLED', True)
-    git_binary = get_env('GIT_BINARY', 'git')
-
-    if not git_enabled:
-        sys.exit(0)
-
-    provider = EnvProvider()
-    try:
-        binary = Binary(name=git_binary, binproviders=[provider]).load()
-        if binary.abspath:
-            # Binary found
-            output_binary_found(binary, name='git')
-        else:
-            # Binary not found
-            output_binary_missing(name='git', binproviders='apt,brew')
-    except Exception:
-        # Binary not found
-        output_binary_missing(name='git', binproviders='apt,brew')
-
-    sys.exit(0)
-
-
-if __name__ == '__main__':
-    main()

+ 1 - 1
archivebox/plugins/git/on_Snapshot__62_git.py → archivebox/plugins/git/on_Snapshot__05_git.bg.py

@@ -2,7 +2,7 @@
 """
 Clone a git repository from a URL.
 
-Usage: on_Snapshot__git.py --url=<url> --snapshot-id=<uuid>
+Usage: on_Snapshot__05_git.bg.py --url=<url> --snapshot-id=<uuid>
 Output: Clones repository to $PWD/repo
 
 Environment variables:

+ 1 - 1
archivebox/plugins/git/templates/icon.html

@@ -1 +1 @@
-📂
+<span class="abx-output-icon abx-output-icon--git" title="Git"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><circle cx="6" cy="6" r="2"/><circle cx="6" cy="18" r="2"/><circle cx="18" cy="12" r="2"/><path d="M8 6h5a3 3 0 0 1 3 3v1"/><path d="M8 18h5a3 3 0 0 0 3-3v-1"/></svg></span>

+ 1 - 1
archivebox/plugins/headers/templates/icon.html

@@ -1 +1 @@
-📋
+<span class="abx-output-icon abx-output-icon--headers" title="Headers"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><circle cx="4" cy="7" r="1" fill="currentColor" stroke="none"/><circle cx="4" cy="12" r="1" fill="currentColor" stroke="none"/><circle cx="4" cy="17" r="1" fill="currentColor" stroke="none"/><path d="M7 7h13"/><path d="M7 12h13"/><path d="M7 17h13"/></svg></span>

+ 15 - 9
archivebox/plugins/htmltotext/on_Snapshot__58_htmltotext.py

@@ -76,22 +76,28 @@ def find_html_source() -> str | None:
     # Hooks run in snapshot_dir, sibling extractor outputs are in subdirectories
     search_patterns = [
         'singlefile/singlefile.html',
+        '*_singlefile/singlefile.html',
         'singlefile/*.html',
+        '*_singlefile/*.html',
         'dom/output.html',
+        '*_dom/output.html',
         'dom/*.html',
+        '*_dom/*.html',
         'wget/**/*.html',
+        '*_wget/**/*.html',
         'wget/**/*.htm',
+        '*_wget/**/*.htm',
     ]
 
-    cwd = Path.cwd()
-    for pattern in search_patterns:
-        matches = list(cwd.glob(pattern))
-        for match in matches:
-            if match.is_file() and match.stat().st_size > 0:
-                try:
-                    return match.read_text(errors='ignore')
-                except Exception:
-                    continue
+    for base in (Path.cwd(), Path.cwd().parent):
+        for pattern in search_patterns:
+            matches = list(base.glob(pattern))
+            for match in matches:
+                if match.is_file() and match.stat().st_size > 0:
+                    try:
+                        return match.read_text(errors='ignore')
+                    except Exception:
+                        continue
 
     return None
 

+ 1 - 1
archivebox/plugins/htmltotext/templates/icon.html

@@ -1 +1 @@
-📃
+<span class="abx-output-icon abx-output-icon--htmltotext" title="HTML to Text"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><path d="M4 7h16"/><path d="M4 12h12"/><path d="M4 17h14"/></svg></span>

+ 1 - 0
archivebox/plugins/infiniscroll/templates/icon.html

@@ -0,0 +1 @@
+<span class="abx-output-icon abx-output-icon--infiniscroll" title="Infinite Scroll"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><path d="M12 5v9"/><path d="M8 10l4 4 4-4"/><circle cx="6" cy="19" r="1" fill="currentColor" stroke="none"/><circle cx="12" cy="19" r="1" fill="currentColor" stroke="none"/><circle cx="18" cy="19" r="1" fill="currentColor" stroke="none"/></svg></span>

+ 1 - 1
archivebox/plugins/istilldontcareaboutcookies/on_Crawl__02_istilldontcareaboutcookies_install.js → archivebox/plugins/istilldontcareaboutcookies/on_Crawl__81_install_istilldontcareaboutcookies_extension.js

@@ -7,7 +7,7 @@
  *
  * Extension: https://chromewebstore.google.com/detail/edibdbjcniadpccecjdfdjjppcpchdlm
  *
- * Priority: 02 (early) - Must install before Chrome session starts at Crawl level
+ * Priority: 81 - Must install before Chrome session starts at Crawl level
  * Hook: on_Crawl (runs once per crawl, not per snapshot)
  *
  * This extension automatically:

+ 0 - 1
archivebox/plugins/mercury/binaries.jsonl

@@ -1 +0,0 @@
-{"type": "Binary", "name": "postlight-parser", "binproviders": "npm,env", "overrides": {"npm": {"packages": ["@postlight/parser"]}}}

+ 0 - 85
archivebox/plugins/mercury/on_Crawl__12_mercury_install.py

@@ -1,85 +0,0 @@
-#!/usr/bin/env python3
-"""
-Detect postlight-parser binary and emit Binary JSONL record.
-
-Output: Binary JSONL record to stdout if postlight-parser is found
-"""
-
-import json
-import os
-import sys
-
-from abx_pkg import Binary, EnvProvider
-
-
-def get_env(name: str, default: str = '') -> str:
-    return os.environ.get(name, default).strip()
-
-def get_env_bool(name: str, default: bool = False) -> bool:
-    val = get_env(name, '').lower()
-    if val in ('true', '1', 'yes', 'on'):
-        return True
-    if val in ('false', '0', 'no', 'off'):
-        return False
-    return default
-
-
-def output_binary_found(binary: Binary, name: str):
-    """Output Binary JSONL record for an installed binary."""
-    machine_id = os.environ.get('MACHINE_ID', '')
-
-    record = {
-        'type': 'Binary',
-        'name': name,
-        'abspath': str(binary.abspath),
-        'version': str(binary.version) if binary.version else '',
-        'sha256': binary.sha256 or '',
-        'binprovider': 'env',  # Already installed
-        'machine_id': machine_id,
-    }
-    print(json.dumps(record))
-
-
-def output_binary_missing(name: str, binproviders: str):
-    """Output Binary JSONL record for a missing binary that needs installation."""
-    machine_id = os.environ.get('MACHINE_ID', '')
-
-    record = {
-        'type': 'Binary',
-        'name': name,
-        'binproviders': binproviders,  # Providers that can install it
-        'overrides': {
-            'npm': {
-                'packages': ['@postlight/parser'],
-            }
-        },
-        'machine_id': machine_id,
-    }
-    print(json.dumps(record))
-
-
-def main():
-    mercury_enabled = get_env_bool('MERCURY_ENABLED', True)
-    mercury_binary = get_env('MERCURY_BINARY', 'postlight-parser')
-
-    if not mercury_enabled:
-        sys.exit(0)
-
-    provider = EnvProvider()
-    try:
-        binary = Binary(name=mercury_binary, binproviders=[provider]).load()
-        if binary.abspath:
-            # Binary found
-            output_binary_found(binary, name='postlight-parser')
-        else:
-            # Binary not found
-            output_binary_missing(name='postlight-parser', binproviders='npm')
-    except Exception:
-        # Binary not found
-        output_binary_missing(name='postlight-parser', binproviders='npm')
-
-    sys.exit(0)
-
-
-if __name__ == '__main__':
-    main()

+ 53 - 0
archivebox/plugins/mercury/on_Crawl__40_mercury_install.py

@@ -0,0 +1,53 @@
+#!/usr/bin/env python3
+"""
+Emit postlight-parser Binary dependency for the crawl.
+"""
+
+import json
+import os
+import sys
+
+
+def get_env(name: str, default: str = '') -> str:
+    return os.environ.get(name, default).strip()
+
+def get_env_bool(name: str, default: bool = False) -> bool:
+    val = get_env(name, '').lower()
+    if val in ('true', '1', 'yes', 'on'):
+        return True
+    if val in ('false', '0', 'no', 'off'):
+        return False
+    return default
+
+
+def output_binary(name: str, binproviders: str):
+    """Output Binary JSONL record for a dependency."""
+    machine_id = os.environ.get('MACHINE_ID', '')
+
+    record = {
+        'type': 'Binary',
+        'name': name,
+        'binproviders': binproviders,
+        'overrides': {
+            'npm': {
+                'packages': ['@postlight/parser'],
+            }
+        },
+        'machine_id': machine_id,
+    }
+    print(json.dumps(record))
+
+
+def main():
+    mercury_enabled = get_env_bool('MERCURY_ENABLED', True)
+
+    if not mercury_enabled:
+        sys.exit(0)
+
+    output_binary(name='postlight-parser', binproviders='npm,env')
+
+    sys.exit(0)
+
+
+if __name__ == '__main__':
+    main()

+ 1 - 1
archivebox/plugins/mercury/templates/icon.html

@@ -1 +1 @@
-☿️
+<span class="abx-output-icon abx-output-icon--mercury" title="Mercury"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><rect x="3" y="5" width="18" height="14" rx="2"/><path d="M7 9h6"/><path d="M7 13h10"/><path d="M15 9h3"/></svg></span>

+ 1 - 0
archivebox/plugins/merkletree/templates/icon.html

@@ -0,0 +1 @@
+<span class="abx-output-icon abx-output-icon--merkletree" title="Merkle Tree"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="5" r="2"/><circle cx="6" cy="18" r="2"/><circle cx="18" cy="18" r="2"/><path d="M12 7v6"/><path d="M12 13l-4 3"/><path d="M12 13l4 3"/></svg></span>

+ 0 - 1
archivebox/plugins/merkletree/tests/__init__.py

@@ -1 +0,0 @@
-"""Tests for the merkletree plugin."""

+ 1 - 1
archivebox/plugins/modalcloser/on_Snapshot__15_modalcloser.bg.js

@@ -287,7 +287,7 @@ async function main() {
             page = pages[pages.length - 1];
         }
 
-        console.error(`Modalcloser listening on ${url}`);
+        // console.error(`Modalcloser listening on ${url}`);
 
         // Set up dialog handler (for JS alert/confirm/prompt/beforeunload)
         page.on('dialog', async (dialog) => {

+ 1 - 0
archivebox/plugins/modalcloser/templates/icon.html

@@ -0,0 +1 @@
+<span class="abx-output-icon abx-output-icon--modalcloser" title="Modal Closer"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><rect x="4" y="4" width="16" height="16" rx="3"/><path d="M9 9l6 6"/><path d="M15 9l-6 6"/></svg></span>

+ 21 - 17
archivebox/plugins/npm/on_Binary__10_npm_install.py

@@ -90,30 +90,34 @@ def main(binary_id: str, machine_id: str, name: str, binproviders: str, custom_c
     }
     print(json.dumps(record))
 
-    # Emit PATH update if npm bin dir not already in PATH
-    npm_bin_dir = str(npm_prefix / 'bin')
+    # Emit PATH update for npm bin dirs (node_modules/.bin preferred)
+    npm_bin_dirs = [
+        str(npm_prefix / 'node_modules' / '.bin'),
+        str(npm_prefix / 'bin'),
+    ]
     current_path = os.environ.get('PATH', '')
+    path_dirs = current_path.split(':') if current_path else []
+    new_path = current_path
 
-    # Check if npm_bin_dir is already in PATH
-    path_dirs = current_path.split(':')
-    if npm_bin_dir not in path_dirs:
-        # Prepend npm_bin_dir to PATH
-        new_path = f"{npm_bin_dir}:{current_path}" if current_path else npm_bin_dir
-        print(json.dumps({
-            'type': 'Machine',
-            '_method': 'update',
-            'key': 'config/PATH',
-            'value': new_path,
-        }))
-        click.echo(f"  Added {npm_bin_dir} to PATH", err=True)
+    for npm_bin_dir in npm_bin_dirs:
+        if npm_bin_dir and npm_bin_dir not in path_dirs:
+            new_path = f"{npm_bin_dir}:{new_path}" if new_path else npm_bin_dir
+            path_dirs.insert(0, npm_bin_dir)
+
+    print(json.dumps({
+        'type': 'Machine',
+        'config': {
+            'PATH': new_path,
+        },
+    }))
 
     # Also emit NODE_MODULES_DIR for JS module resolution
     node_modules_dir = str(npm_prefix / 'node_modules')
     print(json.dumps({
         'type': 'Machine',
-        '_method': 'update',
-        'key': 'config/NODE_MODULES_DIR',
-        'value': node_modules_dir,
+        'config': {
+            'NODE_MODULES_DIR': node_modules_dir,
+        },
     }))
 
     # Log human-readable info to stderr

+ 51 - 0
archivebox/plugins/npm/on_Crawl__00_npm_install.py

@@ -0,0 +1,51 @@
+#!/usr/bin/env python3
+"""
+Emit node/npm Binary dependencies for the crawl.
+
+This hook runs early in the Crawl lifecycle so node/npm are installed
+before any npm-based extractors (e.g., puppeteer) run.
+"""
+
+import json
+import os
+import sys
+
+
+def get_env(name: str, default: str = '') -> str:
+    return os.environ.get(name, default).strip()
+
+
+def output_binary(name: str, binproviders: str, overrides: dict | None = None) -> None:
+    machine_id = os.environ.get('MACHINE_ID', '')
+    record = {
+        'type': 'Binary',
+        'name': name,
+        'binproviders': binproviders,
+        'machine_id': machine_id,
+    }
+    if overrides:
+        record['overrides'] = overrides
+    print(json.dumps(record))
+
+
+def main() -> None:
+    output_binary(
+        name='node',
+        binproviders='apt,brew,env',
+        overrides={'apt': {'packages': ['nodejs']}},
+    )
+
+    output_binary(
+        name='npm',
+        binproviders='apt,brew,env',
+        overrides={
+            'apt': {'packages': ['nodejs', 'npm']},
+            'brew': {'packages': ['node']},
+        },
+    )
+
+    sys.exit(0)
+
+
+if __name__ == '__main__':
+    main()

+ 0 - 1
archivebox/plugins/npm/tests/__init__.py

@@ -1 +0,0 @@
-"""Tests for the npm binary provider plugin."""

+ 2 - 2
archivebox/plugins/npm/tests/test_npm_provider.py

@@ -22,7 +22,7 @@ from django.test import TestCase
 
 # Get the path to the npm provider hook
 PLUGIN_DIR = Path(__file__).parent.parent
-INSTALL_HOOK = PLUGIN_DIR / 'on_Binary__install_using_npm_provider.py'
+INSTALL_HOOK = next(PLUGIN_DIR.glob('on_Binary__*_npm_install.py'), None)
 
 
 def npm_available() -> bool:
@@ -45,7 +45,7 @@ class TestNpmProviderHook(TestCase):
 
     def test_hook_script_exists(self):
         """Hook script should exist."""
-        self.assertTrue(INSTALL_HOOK.exists(), f"Hook not found: {INSTALL_HOOK}")
+        self.assertTrue(INSTALL_HOOK and INSTALL_HOOK.exists(), f"Hook not found: {INSTALL_HOOK}")
 
     def test_hook_requires_lib_dir(self):
         """Hook should fail when LIB_DIR is not set."""

+ 0 - 1
archivebox/plugins/papersdl/binaries.jsonl

@@ -1 +0,0 @@
-{"type": "Binary", "name": "papers-dl", "binproviders": "pip,env"}

+ 0 - 80
archivebox/plugins/papersdl/on_Crawl__14_papersdl_install.py

@@ -1,80 +0,0 @@
-#!/usr/bin/env python3
-"""
-Detect papers-dl binary and emit Binary JSONL record.
-
-Output: Binary JSONL record to stdout if papers-dl is found
-"""
-
-import json
-import os
-import sys
-
-from abx_pkg import Binary, EnvProvider
-
-
-def get_env(name: str, default: str = '') -> str:
-    return os.environ.get(name, default).strip()
-
-def get_env_bool(name: str, default: bool = False) -> bool:
-    val = get_env(name, '').lower()
-    if val in ('true', '1', 'yes', 'on'):
-        return True
-    if val in ('false', '0', 'no', 'off'):
-        return False
-    return default
-
-
-def output_binary_found(binary: Binary, name: str):
-    """Output Binary JSONL record for an installed binary."""
-    machine_id = os.environ.get('MACHINE_ID', '')
-
-    record = {
-        'type': 'Binary',
-        'name': name,
-        'abspath': str(binary.abspath),
-        'version': str(binary.version) if binary.version else '',
-        'sha256': binary.sha256 or '',
-        'binprovider': 'env',  # Already installed
-        'machine_id': machine_id,
-    }
-    print(json.dumps(record))
-
-
-def output_binary_missing(name: str, binproviders: str):
-    """Output Binary JSONL record for a missing binary that needs installation."""
-    machine_id = os.environ.get('MACHINE_ID', '')
-
-    record = {
-        'type': 'Binary',
-        'name': name,
-        'binproviders': binproviders,  # Providers that can install it
-        'machine_id': machine_id,
-    }
-    print(json.dumps(record))
-
-
-def main():
-    papersdl_enabled = get_env_bool('PAPERSDL_ENABLED', True)
-    papersdl_binary = get_env('PAPERSDL_BINARY', 'papers-dl')
-
-    if not papersdl_enabled:
-        sys.exit(0)
-
-    provider = EnvProvider()
-    try:
-        binary = Binary(name=papersdl_binary, binproviders=[provider]).load()
-        if binary.abspath:
-            # Binary found
-            output_binary_found(binary, name='papers-dl')
-        else:
-            # Binary not found
-            output_binary_missing(name='papers-dl', binproviders='pip')
-    except Exception:
-        # Binary not found
-        output_binary_missing(name='papers-dl', binproviders='pip')
-
-    sys.exit(0)
-
-
-if __name__ == '__main__':
-    main()

+ 48 - 0
archivebox/plugins/papersdl/on_Crawl__30_papersdl_install.py

@@ -0,0 +1,48 @@
+#!/usr/bin/env python3
+"""
+Emit papers-dl Binary dependency for the crawl.
+"""
+
+import json
+import os
+import sys
+
+
+def get_env(name: str, default: str = '') -> str:
+    return os.environ.get(name, default).strip()
+
+def get_env_bool(name: str, default: bool = False) -> bool:
+    val = get_env(name, '').lower()
+    if val in ('true', '1', 'yes', 'on'):
+        return True
+    if val in ('false', '0', 'no', 'off'):
+        return False
+    return default
+
+
+def output_binary(name: str, binproviders: str):
+    """Output Binary JSONL record for a dependency."""
+    machine_id = os.environ.get('MACHINE_ID', '')
+
+    record = {
+        'type': 'Binary',
+        'name': name,
+        'binproviders': binproviders,
+        'machine_id': machine_id,
+    }
+    print(json.dumps(record))
+
+
+def main():
+    papersdl_enabled = get_env_bool('PAPERSDL_ENABLED', True)
+
+    if not papersdl_enabled:
+        sys.exit(0)
+
+    output_binary(name='papers-dl', binproviders='pip,env')
+
+    sys.exit(0)
+
+
+if __name__ == '__main__':
+    main()

+ 33 - 4
archivebox/plugins/papersdl/on_Snapshot__66_papersdl.bg.py

@@ -23,6 +23,7 @@ import os
 import re
 import subprocess
 import sys
+import threading
 from pathlib import Path
 
 import rich_click as click
@@ -108,7 +109,35 @@ def save_paper(url: str, binary: str) -> tuple[bool, str | None, str]:
         cmd.extend(papersdl_args_extra)
 
     try:
-        result = subprocess.run(cmd, capture_output=True, timeout=timeout, text=True)
+        print(f'[papersdl] Starting download (timeout={timeout}s)', file=sys.stderr)
+        output_lines: list[str] = []
+        process = subprocess.Popen(
+            cmd,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.STDOUT,
+            text=True,
+            bufsize=1,
+        )
+
+        def _read_output() -> None:
+            if not process.stdout:
+                return
+            for line in process.stdout:
+                output_lines.append(line)
+                sys.stderr.write(line)
+
+        reader = threading.Thread(target=_read_output, daemon=True)
+        reader.start()
+
+        try:
+            process.wait(timeout=timeout)
+        except subprocess.TimeoutExpired:
+            process.kill()
+            reader.join(timeout=1)
+            return False, None, f'Timed out after {timeout} seconds'
+
+        reader.join(timeout=1)
+        combined_output = ''.join(output_lines)
 
         # Check if any PDF files were downloaded
         pdf_files = list(output_dir.glob('*.pdf'))
@@ -117,8 +146,8 @@ def save_paper(url: str, binary: str) -> tuple[bool, str | None, str]:
             # Return first PDF file
             return True, str(pdf_files[0]), ''
         else:
-            stderr = result.stderr
-            stdout = result.stdout
+            stderr = combined_output
+            stdout = combined_output
 
             # These are NOT errors - page simply has no downloadable paper
             stderr_lower = stderr.lower()
@@ -127,7 +156,7 @@ def save_paper(url: str, binary: str) -> tuple[bool, str | None, str]:
                 return True, None, ''  # Paper not available - success, no output
             if 'no results' in stderr_lower or 'no results' in stdout_lower:
                 return True, None, ''  # No paper found - success, no output
-            if result.returncode == 0:
+            if process.returncode == 0:
                 return True, None, ''  # papers-dl exited cleanly, just no paper - success
 
             # These ARE errors - something went wrong

+ 1 - 1
archivebox/plugins/papersdl/templates/icon.html

@@ -1 +1 @@
-📄
+<span class="abx-output-icon abx-output-icon--papersdl" title="Papers"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><path d="M14 3H6a2 2 0 0 0-2 2v14a2 2 0 0 0 2 2h12a2 2 0 0 0 2-2V9z"/><path d="M14 3v6h6"/><path d="M12 12v5"/><path d="M9.5 14.5L12 17l2.5-2.5"/></svg></span>

+ 5 - 0
archivebox/plugins/parse_dom_outlinks/on_Snapshot__75_parse_dom_outlinks.js

@@ -193,6 +193,9 @@ async function extractOutlinks(url) {
             type: 'Snapshot',
             url: href,
             plugin: PLUGIN_NAME,
+            depth: depth + 1,
+            parent_snapshot_id: snapshotId || undefined,
+            crawl_id: crawlId || undefined,
         })).join('\n');
 
         if (urlsJsonl) {
@@ -214,6 +217,8 @@ async function main() {
     const args = parseArgs();
     const url = args.url;
     const snapshotId = args.snapshot_id;
+    const crawlId = args.crawl_id || process.env.CRAWL_ID;
+    const depth = parseInt(args.depth || process.env.SNAPSHOT_DEPTH || '0', 10) || 0;
 
     if (!url || !snapshotId) {
         console.error('Usage: on_Snapshot__75_parse_dom_outlinks.js --url=<url> --snapshot-id=<uuid>');

+ 1 - 1
archivebox/plugins/parse_dom_outlinks/templates/icon.html

@@ -1 +1 @@
-🔗
+<span class="abx-output-icon abx-output-icon--parse_dom_outlinks" title="Outlinks"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><path d="M10 13a4 4 0 0 1 0-6l2-2a4 4 0 0 1 6 6l-1 1"/><path d="M14 11a4 4 0 0 1 0 6l-2 2a4 4 0 0 1-6-6l1-1"/></svg></span>

+ 0 - 1
archivebox/plugins/parse_dom_outlinks/tests/__init__.py

@@ -1 +0,0 @@
-"""Tests for the parse_dom_outlinks plugin."""

+ 1 - 2
archivebox/plugins/parse_dom_outlinks/tests/test_parse_dom_outlinks.py

@@ -79,8 +79,7 @@ class TestParseDomOutlinksWithChrome(TestCase):
                 # Run outlinks hook with the active Chrome session
                 result = subprocess.run(
                     ['node', str(OUTLINKS_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
-                    cwd=str(snapshot_chrome_dir,
-            env=get_test_env()),
+                    cwd=str(snapshot_chrome_dir),
                     capture_output=True,
                     text=True,
                     timeout=60,

+ 118 - 29
archivebox/plugins/parse_html_urls/on_Snapshot__70_parse_html_urls.py

@@ -24,14 +24,15 @@ from datetime import datetime, timezone
 from html import unescape
 from html.parser import HTMLParser
 from pathlib import Path
-from urllib.parse import urljoin, urlparse
+from urllib.parse import urljoin, urlparse, urlunparse
 
 import rich_click as click
 
 PLUGIN_NAME = 'parse_html_urls'
 
-# Check if parse_dom_outlinks extractor already ran
-DOM_OUTLINKS_URLS_FILE = Path('parse_dom_outlinks/urls.jsonl')
+# Check if parse_dom_outlinks extractor already ran (sibling plugin output dir)
+DOM_OUTLINKS_URLS_FILE = Path('..') / 'parse_dom_outlinks' / 'urls.jsonl'
+URLS_FILE = Path('urls.jsonl')
 
 
 # URL regex from archivebox/misc/util.py
@@ -95,8 +96,9 @@ def fix_urljoin_bug(url: str, nesting_limit=5) -> str:
 
 def normalize_url(url: str, root_url: str = None) -> str:
     """Normalize a URL, resolving relative paths if root_url provided."""
+    url = clean_url_candidate(url)
     if not root_url:
-        return url
+        return _normalize_trailing_slash(url)
 
     url_is_absolute = url.lower().startswith('http://') or url.lower().startswith('https://')
 
@@ -110,7 +112,40 @@ def normalize_url(url: str, root_url: str = None) -> str:
     if did_urljoin_misbehave(root_url, url, resolved):
         resolved = fix_urljoin_bug(resolved)
 
-    return resolved
+    return _normalize_trailing_slash(resolved)
+
+
+def _normalize_trailing_slash(url: str) -> str:
+    """Drop trailing slash for non-root paths when no query/fragment."""
+    try:
+        parsed = urlparse(url)
+        path = parsed.path or ''
+        if path != '/' and path.endswith('/') and not parsed.query and not parsed.fragment:
+            path = path.rstrip('/')
+            return urlunparse((parsed.scheme, parsed.netloc, path, parsed.params, parsed.query, parsed.fragment))
+    except Exception:
+        pass
+    return url
+
+
+def clean_url_candidate(url: str) -> str:
+    """Strip obvious surrounding/trailing punctuation from extracted URLs."""
+    cleaned = (url or '').strip()
+    if not cleaned:
+        return cleaned
+
+    # Strip common wrappers
+    cleaned = cleaned.strip(' \t\r\n')
+    cleaned = cleaned.strip('"\'<>[]()')
+
+    # Strip trailing punctuation and escape artifacts
+    cleaned = cleaned.rstrip('.,;:!?)\\\'"')
+    cleaned = cleaned.rstrip('"')
+
+    # Strip leading punctuation artifacts
+    cleaned = cleaned.lstrip('("\'<')
+
+    return cleaned
 
 
 def fetch_content(url: str) -> str:
@@ -131,6 +166,43 @@ def fetch_content(url: str) -> str:
             return response.read().decode('utf-8', errors='replace')
 
 
+def find_html_sources() -> list[str]:
+    """Find HTML content from other extractors in the snapshot directory."""
+    search_patterns = [
+        'readability/content.html',
+        '*_readability/content.html',
+        'mercury/content.html',
+        '*_mercury/content.html',
+        'singlefile/singlefile.html',
+        '*_singlefile/singlefile.html',
+        'singlefile/*.html',
+        '*_singlefile/*.html',
+        'dom/output.html',
+        '*_dom/output.html',
+        'dom/*.html',
+        '*_dom/*.html',
+        'wget/**/*.html',
+        '*_wget/**/*.html',
+        'wget/**/*.htm',
+        '*_wget/**/*.htm',
+        'wget/**/*.htm*',
+        '*_wget/**/*.htm*',
+    ]
+
+    sources: list[str] = []
+    for base in (Path.cwd(), Path.cwd().parent):
+        for pattern in search_patterns:
+            for match in base.glob(pattern):
+                if not match.is_file() or match.stat().st_size == 0:
+                    continue
+                try:
+                    sources.append(match.read_text(errors='ignore'))
+                except Exception:
+                    continue
+
+    return sources
+
+
 @click.command()
 @click.option('--url', required=True, help='HTML URL to parse')
 @click.option('--snapshot-id', required=False, help='Parent Snapshot UUID')
@@ -138,6 +210,13 @@ def fetch_content(url: str) -> str:
 @click.option('--depth', type=int, default=0, help='Current depth level')
 def main(url: str, snapshot_id: str = None, crawl_id: str = None, depth: int = 0):
     """Parse HTML and extract href URLs."""
+    env_depth = os.environ.get('SNAPSHOT_DEPTH')
+    if env_depth is not None:
+        try:
+            depth = int(env_depth)
+        except Exception:
+            pass
+    crawl_id = crawl_id or os.environ.get('CRAWL_ID')
 
     # Skip only if parse_dom_outlinks already ran AND found URLs (it uses Chrome for better coverage)
     # If parse_dom_outlinks ran but found nothing, we still try static HTML parsing as fallback
@@ -145,32 +224,38 @@ def main(url: str, snapshot_id: str = None, crawl_id: str = None, depth: int = 0
         click.echo(f'Skipping parse_html_urls - parse_dom_outlinks already extracted URLs')
         sys.exit(0)
 
-    try:
-        content = fetch_content(url)
-    except Exception as e:
-        click.echo(f'Failed to fetch {url}: {e}', err=True)
-        sys.exit(1)
-
-    # Parse HTML for hrefs
-    parser = HrefParser()
-    try:
-        parser.feed(content)
-    except Exception as e:
-        click.echo(f'Failed to parse HTML: {e}', err=True)
-        sys.exit(1)
+    contents = find_html_sources()
+    if not contents:
+        try:
+            contents = [fetch_content(url)]
+        except Exception as e:
+            click.echo(f'Failed to fetch {url}: {e}', err=True)
+            sys.exit(1)
 
     urls_found = set()
-    for href in parser.urls:
-        # Normalize URL
-        normalized = normalize_url(href, root_url=url)
-
-        # Only include http/https URLs
-        if normalized.lower().startswith('http://') or normalized.lower().startswith('https://'):
-            # Skip the source URL itself
-            if normalized != url:
-                urls_found.add(unescape(normalized))
-
-    # Emit Snapshot records to stdout (JSONL)
+    for content in contents:
+        # Parse HTML for hrefs
+        parser = HrefParser()
+        try:
+            parser.feed(content)
+        except Exception:
+            pass
+
+        for href in parser.urls:
+            normalized = normalize_url(href, root_url=url)
+            if normalized.lower().startswith('http://') or normalized.lower().startswith('https://'):
+                if normalized != url:
+                    urls_found.add(unescape(normalized))
+
+        # Also capture explicit URLs in the HTML text
+        for match in URL_REGEX.findall(content):
+            normalized = normalize_url(match, root_url=url)
+            if normalized.lower().startswith('http://') or normalized.lower().startswith('https://'):
+                if normalized != url:
+                    urls_found.add(unescape(normalized))
+
+    # Emit Snapshot records to stdout (JSONL) and urls.jsonl for crawl system
+    records = []
     for found_url in sorted(urls_found):
         record = {
             'type': 'Snapshot',
@@ -183,8 +268,12 @@ def main(url: str, snapshot_id: str = None, crawl_id: str = None, depth: int = 0
         if crawl_id:
             record['crawl_id'] = crawl_id
 
+        records.append(record)
         print(json.dumps(record))
 
+    if records:
+        URLS_FILE.write_text('\n'.join(json.dumps(r) for r in records) + '\n')
+
     # Emit ArchiveResult record to mark completion
     status = 'succeeded' if urls_found else 'skipped'
     output_str = f'Found {len(urls_found)} URLs' if urls_found else 'No URLs found'

+ 1 - 1
archivebox/plugins/parse_html_urls/templates/icon.html

@@ -1 +1 @@
-🔗
+<span class="abx-output-icon abx-output-icon--parse_html_urls" title="HTML URLs"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><path d="M8 9l-3 3 3 3"/><path d="M16 9l3 3-3 3"/><path d="M10 20l4-16"/></svg></span>

+ 7 - 0
archivebox/plugins/parse_jsonl_urls/on_Snapshot__74_parse_jsonl_urls.py

@@ -132,6 +132,13 @@ def fetch_content(url: str) -> str:
 @click.option('--depth', type=int, default=0, help='Current depth level')
 def main(url: str, snapshot_id: str = None, crawl_id: str = None, depth: int = 0):
     """Parse JSONL bookmark file and extract URLs."""
+    env_depth = os.environ.get('SNAPSHOT_DEPTH')
+    if env_depth is not None:
+        try:
+            depth = int(env_depth)
+        except Exception:
+            pass
+    crawl_id = crawl_id or os.environ.get('CRAWL_ID')
 
     try:
         content = fetch_content(url)

+ 1 - 1
archivebox/plugins/parse_jsonl_urls/templates/icon.html

@@ -1 +1 @@
-📋
+<span class="abx-output-icon abx-output-icon--parse_jsonl_urls" title="JSONL URLs"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><path d="M8 4H5v16h3"/><path d="M16 4h3v16h-3"/><circle cx="12" cy="8" r="1" fill="currentColor" stroke="none"/><circle cx="12" cy="12" r="1" fill="currentColor" stroke="none"/><circle cx="12" cy="16" r="1" fill="currentColor" stroke="none"/></svg></span>

+ 7 - 0
archivebox/plugins/parse_netscape_urls/on_Snapshot__73_parse_netscape_urls.py

@@ -168,6 +168,13 @@ def fetch_content(url: str) -> str:
 @click.option('--depth', type=int, default=0, help='Current depth level')
 def main(url: str, snapshot_id: str = None, crawl_id: str = None, depth: int = 0):
     """Parse Netscape bookmark HTML and extract URLs."""
+    env_depth = os.environ.get('SNAPSHOT_DEPTH')
+    if env_depth is not None:
+        try:
+            depth = int(env_depth)
+        except Exception:
+            pass
+    crawl_id = crawl_id or os.environ.get('CRAWL_ID')
 
     try:
         content = fetch_content(url)

+ 1 - 1
archivebox/plugins/parse_netscape_urls/templates/icon.html

@@ -1 +1 @@
-🔖
+<span class="abx-output-icon abx-output-icon--parse_netscape_urls" title="Netscape Bookmarks"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><path d="M6 4h12v16l-6-4-6 4z"/></svg></span>

+ 7 - 0
archivebox/plugins/parse_rss_urls/on_Snapshot__72_parse_rss_urls.py

@@ -56,6 +56,13 @@ def fetch_content(url: str) -> str:
 @click.option('--depth', type=int, default=0, help='Current depth level')
 def main(url: str, snapshot_id: str = None, crawl_id: str = None, depth: int = 0):
     """Parse RSS/Atom feed and extract article URLs."""
+    env_depth = os.environ.get('SNAPSHOT_DEPTH')
+    if env_depth is not None:
+        try:
+            depth = int(env_depth)
+        except Exception:
+            pass
+    crawl_id = crawl_id or os.environ.get('CRAWL_ID')
 
     if feedparser is None:
         click.echo('feedparser library not installed', err=True)

+ 1 - 1
archivebox/plugins/parse_rss_urls/templates/icon.html

@@ -1 +1 @@
-📡
+<span class="abx-output-icon abx-output-icon--parse_rss_urls" title="RSS"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><circle cx="5" cy="19" r="1.5" fill="currentColor" stroke="none"/><path d="M5 11a8 8 0 0 1 8 8"/><path d="M5 5a14 14 0 0 1 14 14"/></svg></span>

+ 7 - 0
archivebox/plugins/parse_txt_urls/on_Snapshot__71_parse_txt_urls.py

@@ -105,6 +105,13 @@ def fetch_content(url: str) -> str:
 @click.option('--depth', type=int, default=0, help='Current depth level')
 def main(url: str, snapshot_id: str = None, crawl_id: str = None, depth: int = 0):
     """Parse plain text and extract URLs."""
+    env_depth = os.environ.get('SNAPSHOT_DEPTH')
+    if env_depth is not None:
+        try:
+            depth = int(env_depth)
+        except Exception:
+            pass
+    crawl_id = crawl_id or os.environ.get('CRAWL_ID')
 
     try:
         content = fetch_content(url)

Some files were not shown because too many files changed in this diff