Pārlūkot izejas kodu

rename archive_org to archivedotorg, add BinaryWorker, fix config pass-through

Nick Sweeting 1 mēnesi atpakaļ
vecāks
revīzija
7ceaeae2d9
32 mainīti faili ar 1111 papildinājumiem un 110 dzēšanām
  1. 17 0
      .claude/settings.local.json
  2. 1 1
      archivebox/base_models/models.py
  3. 40 2
      archivebox/cli/archivebox_run.py
  4. 24 2
      archivebox/config/configset.py
  5. 1 1
      archivebox/core/forms.py
  6. 1 1
      archivebox/core/migrations/0007_archiveresult.py
  7. 1 1
      archivebox/core/migrations/0011_auto_20210216_1331.py
  8. 1 1
      archivebox/core/migrations/0021_auto_20220914_0934.py
  9. 1 1
      archivebox/core/migrations/0022_auto_20231023_2008.py
  10. 1 1
      archivebox/core/models.py
  11. 4 1
      archivebox/core/settings.py
  12. 17 0
      archivebox/crawls/migrations/0004_remove_crawl_output_dir.py
  13. 21 55
      archivebox/hooks.py
  14. 17 0
      archivebox/machine/migrations/0011_remove_binary_output_dir.py
  15. 1 1
      archivebox/personas/apps.py
  16. 2 1
      archivebox/personas/migrations/0001_initial.py
  17. 19 0
      archivebox/personas/migrations/0002_alter_persona_id.py
  18. 2 0
      archivebox/personas/models.py
  19. 0 0
      archivebox/plugins/archivedotorg/config.json
  20. 4 4
      archivebox/plugins/archivedotorg/on_Snapshot__13_archivedotorg.py
  21. 0 0
      archivebox/plugins/archivedotorg/templates/icon.html
  22. 0 0
      archivebox/plugins/archivedotorg/templates/thumbnail.html
  23. 4 4
      archivebox/plugins/archivedotorg/tests/test_archivedotorg.py
  24. 13 4
      archivebox/plugins/screenshot/on_Snapshot__51_screenshot.js
  25. 0 3
      archivebox/plugins/screenshot/tests/test_screenshot.py
  26. 1 1
      archivebox/plugins/ytdlp/config.json
  27. 256 0
      archivebox/tests/test_cli_run_binary_worker.py
  28. 508 0
      archivebox/tests/test_worker_config_propagation.py
  29. 48 15
      archivebox/workers/orchestrator.py
  30. 102 8
      archivebox/workers/worker.py
  31. 2 0
      bin/test_plugins.sh
  32. 2 2
      old/TODO_hook_architecture.md

+ 17 - 0
.claude/settings.local.json

@@ -1,6 +1,9 @@
 {
   "permissions": {
     "allow": [
+      "Read(**)",
+      "Glob(**)",
+      "Grep(**)",
       "Bash(python -m archivebox:*)",
       "Bash(ls:*)",
       "Bash(xargs:*)",
@@ -29,5 +32,19 @@
       "Bash(done)",
       "Bash(coverage erase:*)"
     ]
+  },
+  "hooks": {
+    "PreToolUse": [
+      {
+        "matcher": "Bash",
+        "hooks": [
+          {
+            "type": "command",
+            "command": "REPO_ROOT=$(git rev-parse --show-toplevel 2>/dev/null); if [ -n \"$REPO_ROOT\" ] && [ \"$PWD\" != \"$REPO_ROOT\" ]; then echo \"ERROR: Not in repo root ($REPO_ROOT). Current dir: $PWD\" >&2; exit 1; fi",
+            "statusMessage": "Checking working directory..."
+          }
+        ]
+      }
+    ]
   }
 }

+ 1 - 1
archivebox/base_models/models.py

@@ -128,4 +128,4 @@ class ModelWithOutputDir(ModelWithUUID):
 
     @property
     def output_dir(self) -> Path:
-        raise NotImplementedError(f'{self.__class__.__name__} must implement output_dir property')
+        raise NotImplementedError(f"{self.__class__.__name__} must implement output_dir property")

+ 40 - 2
archivebox/cli/archivebox_run.py

@@ -59,10 +59,11 @@ def process_stdin_records() -> int:
     """
     from django.utils import timezone
 
-    from archivebox.misc.jsonl import read_stdin, write_record, TYPE_CRAWL, TYPE_SNAPSHOT, TYPE_ARCHIVERESULT
+    from archivebox.misc.jsonl import read_stdin, write_record, TYPE_CRAWL, TYPE_SNAPSHOT, TYPE_ARCHIVERESULT, TYPE_BINARY
     from archivebox.base_models.models import get_or_create_system_user_pk
     from archivebox.core.models import Snapshot, ArchiveResult
     from archivebox.crawls.models import Crawl
+    from archivebox.machine.models import Binary
     from archivebox.workers.orchestrator import Orchestrator
 
     records = list(read_stdin())
@@ -137,6 +138,26 @@ def process_stdin_records() -> int:
                     output_records.append(archiveresult.to_json())
                     queued_count += 1
 
+            elif record_type == TYPE_BINARY:
+                # Binary records - create or update and queue for installation
+                if record_id:
+                    # Existing binary - re-queue
+                    try:
+                        binary = Binary.objects.get(id=record_id)
+                    except Binary.DoesNotExist:
+                        binary = Binary.from_json(record)
+                else:
+                    # New binary - create it
+                    binary = Binary.from_json(record)
+
+                if binary:
+                    binary.retry_at = timezone.now()
+                    if binary.status != Binary.StatusChoices.INSTALLED:
+                        binary.status = Binary.StatusChoices.QUEUED
+                    binary.save()
+                    output_records.append(binary.to_json())
+                    queued_count += 1
+
             else:
                 # Unknown type - pass through
                 output_records.append(record)
@@ -222,7 +243,8 @@ def run_snapshot_worker(snapshot_id: str) -> int:
 @click.option('--daemon', '-d', is_flag=True, help="Run forever (don't exit on idle)")
 @click.option('--crawl-id', help="Run orchestrator for specific crawl only")
 @click.option('--snapshot-id', help="Run worker for specific snapshot only")
-def main(daemon: bool, crawl_id: str, snapshot_id: str):
[email protected]('--binary-id', help="Run worker for specific binary only")
+def main(daemon: bool, crawl_id: str, snapshot_id: str, binary_id: str):
     """
     Process queued work.
 
@@ -231,11 +253,27 @@ def main(daemon: bool, crawl_id: str, snapshot_id: str):
     - No args + TTY: Run orchestrator for all work
     - --crawl-id: Run orchestrator for that crawl only
     - --snapshot-id: Run worker for that snapshot only
+    - --binary-id: Run worker for that binary only
     """
     # Snapshot worker mode
     if snapshot_id:
         sys.exit(run_snapshot_worker(snapshot_id))
 
+    # Binary worker mode
+    if binary_id:
+        from archivebox.workers.worker import BinaryWorker
+        try:
+            worker = BinaryWorker(binary_id=binary_id, worker_id=0)
+            worker.runloop()
+            sys.exit(0)
+        except KeyboardInterrupt:
+            sys.exit(0)
+        except Exception as e:
+            rprint(f'[red]Worker error: {type(e).__name__}: {e}[/red]', file=sys.stderr)
+            import traceback
+            traceback.print_exc()
+            sys.exit(1)
+
     # Crawl worker mode
     if crawl_id:
         from archivebox.workers.worker import CrawlWorker

+ 24 - 2
archivebox/config/configset.py

@@ -123,6 +123,7 @@ def get_config(
     user: Any = None,
     crawl: Any = None,
     snapshot: Any = None,
+    archiveresult: Any = None,
     machine: Any = None,
 ) -> Dict[str, Any]:
     """
@@ -145,11 +146,26 @@ def get_config(
         user: User object with config JSON field
         crawl: Crawl object with config JSON field
         snapshot: Snapshot object with config JSON field
+        archiveresult: ArchiveResult object (auto-fetches snapshot)
         machine: Machine object with config JSON field (defaults to Machine.current())
 
+    Note: Objects are auto-fetched from relationships if not provided:
+        - snapshot auto-fetched from archiveresult.snapshot
+        - crawl auto-fetched from snapshot.crawl
+        - user auto-fetched from crawl.created_by
+
     Returns:
         Merged config dict
     """
+    # Auto-fetch related objects from relationships
+    if snapshot is None and archiveresult and hasattr(archiveresult, "snapshot"):
+        snapshot = archiveresult.snapshot
+
+    if crawl is None and snapshot and hasattr(snapshot, "crawl"):
+        crawl = snapshot.crawl
+
+    if user is None and crawl and hasattr(crawl, "created_by"):
+        user = crawl.created_by
     from archivebox.config.constants import CONSTANTS
     from archivebox.config.common import (
         SHELL_CONFIG,
@@ -197,12 +213,18 @@ def get_config(
     if machine and hasattr(machine, "config") and machine.config:
         config.update(machine.config)
 
-    # Override with environment variables
+    # Override with environment variables (for keys that exist in config)
     for key in config:
         env_val = os.environ.get(key)
         if env_val is not None:
             config[key] = _parse_env_value(env_val, config.get(key))
 
+    # Also add NEW environment variables (not yet in config)
+    # This is important for worker subprocesses that receive config via Process.env
+    for key, value in os.environ.items():
+        if key.isupper() and key not in config:  # Only uppercase keys (config convention)
+            config[key] = _parse_env_value(value, None)
+
     # Also check plugin config aliases in environment
     try:
         from archivebox.hooks import discover_plugin_configs
@@ -335,7 +357,7 @@ DEFAULT_WORKER_CONCURRENCY = {
     "title": 5,
     "favicon": 5,
     "headers": 5,
-    "archive_org": 2,
+    "archivedotorg": 2,
     "readability": 3,
     "mercury": 3,
     "git": 2,

+ 1 - 1
archivebox/core/forms.py

@@ -147,7 +147,7 @@ class AddLinkForm(forms.Form):
             'screenshot', 'seo', 'singlefile', 'ssl', 'staticfile', 'title'
         }
         archiving = {
-            'archive_org', 'favicon', 'forumdl', 'gallerydl', 'git',
+            'archivedotorg', 'favicon', 'forumdl', 'gallerydl', 'git',
             'htmltotext', 'media', 'mercury', 'papersdl', 'readability', 'wget'
         }
         parsing = {

+ 1 - 1
archivebox/core/migrations/0007_archiveresult.py

@@ -120,7 +120,7 @@ class Migration(migrations.Migration):
                 ('output', models.CharField(max_length=512)),
                 ('start_ts', models.DateTimeField()),
                 ('end_ts', models.DateTimeField()),
-                ('extractor', models.CharField(choices=[('title', 'title'), ('favicon', 'favicon'), ('wget', 'wget'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('readability', 'readability'), ('mercury', 'mercury'), ('git', 'git'), ('media', 'media'), ('headers', 'headers'), ('archive_org', 'archive_org')], max_length=32)),
+                ('extractor', models.CharField(choices=[('title', 'title'), ('favicon', 'favicon'), ('wget', 'wget'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('readability', 'readability'), ('mercury', 'mercury'), ('git', 'git'), ('media', 'media'), ('headers', 'headers'), ('archivedotorg', 'archivedotorg')], max_length=32)),
                 ('snapshot', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='core.Snapshot')),
             ],
         ),

+ 1 - 1
archivebox/core/migrations/0011_auto_20210216_1331.py

@@ -19,6 +19,6 @@ class Migration(migrations.Migration):
         migrations.AlterField(
             model_name='archiveresult',
             name='extractor',
-            field=models.CharField(choices=[('title', 'title'), ('favicon', 'favicon'), ('headers', 'headers'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('wget', 'wget'), ('readability', 'readability'), ('mercury', 'mercury'), ('git', 'git'), ('media', 'media'), ('archive_org', 'archive_org')], max_length=32),
+            field=models.CharField(choices=[('title', 'title'), ('favicon', 'favicon'), ('headers', 'headers'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('wget', 'wget'), ('readability', 'readability'), ('mercury', 'mercury'), ('git', 'git'), ('media', 'media'), ('archivedotorg', 'archivedotorg')], max_length=32),
         ),
     ]

+ 1 - 1
archivebox/core/migrations/0021_auto_20220914_0934.py

@@ -13,6 +13,6 @@ class Migration(migrations.Migration):
         migrations.AlterField(
             model_name='archiveresult',
             name='extractor',
-            field=models.CharField(choices=[('favicon', 'favicon'), ('headers', 'headers'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('wget', 'wget'), ('title', 'title'), ('readability', 'readability'), ('mercury', 'mercury'), ('git', 'git'), ('media', 'media'), ('archive_org', 'archive_org')], max_length=32),
+            field=models.CharField(choices=[('favicon', 'favicon'), ('headers', 'headers'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('wget', 'wget'), ('title', 'title'), ('readability', 'readability'), ('mercury', 'mercury'), ('git', 'git'), ('media', 'media'), ('archivedotorg', 'archivedotorg')], max_length=32),
         ),
     ]

+ 1 - 1
archivebox/core/migrations/0022_auto_20231023_2008.py

@@ -13,6 +13,6 @@ class Migration(migrations.Migration):
         migrations.AlterField(
             model_name='archiveresult',
             name='extractor',
-            field=models.CharField(choices=[('favicon', 'favicon'), ('headers', 'headers'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('wget', 'wget'), ('title', 'title'), ('readability', 'readability'), ('mercury', 'mercury'), ('htmltotext', 'htmltotext'), ('git', 'git'), ('media', 'media'), ('archive_org', 'archive_org')], max_length=32),
+            field=models.CharField(choices=[('favicon', 'favicon'), ('headers', 'headers'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('wget', 'wget'), ('title', 'title'), ('readability', 'readability'), ('mercury', 'mercury'), ('htmltotext', 'htmltotext'), ('git', 'git'), ('media', 'media'), ('archivedotorg', 'archivedotorg')], max_length=32),
         ),
     ]

+ 1 - 1
archivebox/core/models.py

@@ -1973,7 +1973,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
         canonical = {
             'index_path': 'index.html',
             'google_favicon_path': FAVICON_PROVIDER.format(self.domain),
-            'archive_org_path': f'https://web.archive.org/web/{self.base_url}',
+            'archivedotorg_path': f'https://web.archive.org/web/{self.base_url}',
         }
 
         # Scan each ArchiveResult's output directory for the best file

+ 4 - 1
archivebox/core/settings.py

@@ -206,7 +206,10 @@ DATABASES = {
 }
 MIGRATION_MODULES = {"signal_webhooks": None}
 
-# as much as I'd love this to be a UUID or ULID field, it's not supported yet as of Django 5.0
+# Django requires DEFAULT_AUTO_FIELD to subclass AutoField (BigAutoField, SmallAutoField, etc.)
+# Cannot use UUIDField here until Django 6.0 introduces DEFAULT_PK_FIELD setting
+# For now: manually add `id = models.UUIDField(primary_key=True, default=uuid7, ...)` to all models
+# OR inherit from ModelWithUUID base class which provides UUID primary key
 DEFAULT_AUTO_FIELD = "django.db.models.BigAutoField"
 
 

+ 17 - 0
archivebox/crawls/migrations/0004_remove_crawl_output_dir.py

@@ -0,0 +1,17 @@
+# Generated by Django 6.0 on 2026-01-05 01:09
+
+from django.db import migrations
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('crawls', '0003_remove_crawlschedule_num_uses_failed_and_more'),
+    ]
+
+    operations = [
+        migrations.RemoveField(
+            model_name='crawl',
+            name='output_dir',
+        ),
+    ]

+ 21 - 55
archivebox/hooks.py

@@ -370,20 +370,34 @@ def run_hook(
         from archivebox.machine.models import Machine
         machine = Machine.current()
         if machine and machine.config:
-            machine_path = machine.config.get('config/PATH')
+            machine_path = machine.config.get('PATH')
             if machine_path:
                 # Prepend LIB_BIN_DIR to machine PATH as well
                 if lib_bin_dir and not machine_path.startswith(f'{lib_bin_dir}:'):
                     env['PATH'] = f'{lib_bin_dir}:{machine_path}'
                 else:
                     env['PATH'] = machine_path
-            # Also set NODE_MODULES_DIR if configured
-            node_modules_dir = machine.config.get('config/NODE_MODULES_DIR')
-            if node_modules_dir:
-                env['NODE_MODULES_DIR'] = node_modules_dir
     except Exception:
         pass  # Fall back to system PATH if Machine not available
 
+    # Set NODE_PATH for Node.js module resolution
+    # Priority: config dict > Machine.config > derive from LIB_DIR
+    node_path = config.get('NODE_PATH')
+    if not node_path and lib_dir:
+        # Derive from LIB_DIR/npm/node_modules
+        node_modules_dir = Path(lib_dir) / 'npm' / 'node_modules'
+        if node_modules_dir.exists():
+            node_path = str(node_modules_dir)
+    if not node_path:
+        try:
+            # Fallback to Machine.config
+            node_path = machine.config.get('NODE_MODULES_DIR')
+        except Exception:
+            pass
+    if node_path:
+        env['NODE_PATH'] = node_path
+        env['NODE_MODULES_DIR'] = node_path  # For backwards compatibility
+
     # Export all config values to environment (already merged by get_config())
     for key, value in config.items():
         if value is None:
@@ -414,56 +428,8 @@ def run_hook(
             timeout=timeout,
         )
 
-        # Build environment from config (Process._build_env() expects self.env dict)
-        # We need to set env on the process before launching
-        process.env = {}
-        for key, value in config.items():
-            if value is None:
-                continue
-            elif isinstance(value, bool):
-                process.env[key] = 'true' if value else 'false'
-            elif isinstance(value, (list, dict)):
-                process.env[key] = json.dumps(value)
-            else:
-                process.env[key] = str(value)
-
-        # Add base paths to env
-        process.env['DATA_DIR'] = str(getattr(settings, 'DATA_DIR', Path.cwd()))
-        process.env['ARCHIVE_DIR'] = str(getattr(settings, 'ARCHIVE_DIR', Path.cwd() / 'archive'))
-        process.env.setdefault('MACHINE_ID', getattr(settings, 'MACHINE_ID', '') or os.environ.get('MACHINE_ID', ''))
-
-        # Add LIB_DIR and LIB_BIN_DIR
-        lib_dir = config.get('LIB_DIR', getattr(settings, 'LIB_DIR', None))
-        lib_bin_dir = config.get('LIB_BIN_DIR', getattr(settings, 'LIB_BIN_DIR', None))
-        if lib_dir:
-            process.env['LIB_DIR'] = str(lib_dir)
-        if not lib_bin_dir and lib_dir:
-            lib_bin_dir = Path(lib_dir) / 'bin'
-        if lib_bin_dir:
-            process.env['LIB_BIN_DIR'] = str(lib_bin_dir)
-
-        # Set PATH from Machine.config if available
-        try:
-            if machine and machine.config:
-                machine_path = machine.config.get('config/PATH')
-                if machine_path:
-                    # Prepend LIB_BIN_DIR to machine PATH as well
-                    if lib_bin_dir and not machine_path.startswith(f'{lib_bin_dir}:'):
-                        process.env['PATH'] = f'{lib_bin_dir}:{machine_path}'
-                    else:
-                        process.env['PATH'] = machine_path
-                elif lib_bin_dir:
-                    # Just prepend to current PATH
-                    current_path = os.environ.get('PATH', '')
-                    if not current_path.startswith(f'{lib_bin_dir}:'):
-                        process.env['PATH'] = f'{lib_bin_dir}:{current_path}' if current_path else str(lib_bin_dir)
-
-                # Also set NODE_MODULES_DIR if configured
-                node_modules_dir = machine.config.get('config/NODE_MODULES_DIR')
-                if node_modules_dir:
-                    process.env['NODE_MODULES_DIR'] = node_modules_dir
-        except Exception:
-            pass  # Fall back to system PATH if Machine not available
+        # Copy the env dict we already built (includes os.environ + all customizations)
+        process.env = env.copy()
 
         # Save env before launching
         process.save()

+ 17 - 0
archivebox/machine/migrations/0011_remove_binary_output_dir.py

@@ -0,0 +1,17 @@
+# Generated by Django 6.0 on 2026-01-05 01:09
+
+from django.db import migrations
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('machine', '0010_alter_process_process_type'),
+    ]
+
+    operations = [
+        migrations.RemoveField(
+            model_name='binary',
+            name='output_dir',
+        ),
+    ]

+ 1 - 1
archivebox/personas/apps.py

@@ -1,7 +1,7 @@
 from django.apps import AppConfig
 
 
-class SessionsConfig(AppConfig):
+class PersonasConfig(AppConfig):
     default_auto_field = "django.db.models.BigAutoField"
     name = "archivebox.personas"
     label = "personas"

+ 2 - 1
archivebox/personas/migrations/0001_initial.py

@@ -1,6 +1,7 @@
 # Generated by Django 6.0 on 2025-12-31 09:06
 
 import archivebox.base_models.models
+from archivebox.uuid_compat import uuid7
 import django.db.models.deletion
 import django.utils.timezone
 from django.conf import settings
@@ -19,7 +20,7 @@ class Migration(migrations.Migration):
         migrations.CreateModel(
             name='Persona',
             fields=[
-                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
+                ('id', models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)),
                 ('config', models.JSONField(blank=True, default=dict, null=True)),
                 ('name', models.CharField(max_length=64, unique=True)),
                 ('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)),

+ 19 - 0
archivebox/personas/migrations/0002_alter_persona_id.py

@@ -0,0 +1,19 @@
+# Generated by Django 6.0 on 2026-01-05 01:09
+
+import uuid
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('personas', '0001_initial'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='persona',
+            name='id',
+            field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
+        ),
+    ]

+ 2 - 0
archivebox/personas/models.py

@@ -19,6 +19,7 @@ from django.conf import settings
 from django.utils import timezone
 
 from archivebox.base_models.models import ModelWithConfig, get_or_create_system_user_pk
+from archivebox.uuid_compat import uuid7
 
 if TYPE_CHECKING:
     from django.db.models import QuerySet
@@ -44,6 +45,7 @@ class Persona(ModelWithConfig):
         persona.CHROME_USER_DATA_DIR  # -> Path to chrome_user_data
     """
 
+    id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
     name = models.CharField(max_length=64, unique=True)
     created_at = models.DateTimeField(default=timezone.now, db_index=True)
     created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk)

+ 0 - 0
archivebox/plugins/archive_org/config.json → archivebox/plugins/archivedotorg/config.json


+ 4 - 4
archivebox/plugins/archive_org/on_Snapshot__13_archive_org.py → archivebox/plugins/archivedotorg/on_Snapshot__13_archivedotorg.py

@@ -2,7 +2,7 @@
 """
 Submit a URL to archive.org for archiving.
 
-Usage: on_Snapshot__archive_org.py --url=<url> --snapshot-id=<uuid>
+Usage: on_Snapshot__archivedotorg.py --url=<url> --snapshot-id=<uuid>
 Output: Writes archive.org.txt to $PWD with the archived URL
 
 Environment variables:
@@ -25,7 +25,7 @@ import rich_click as click
 
 
 # Extractor metadata
-PLUGIN_NAME = 'archive_org'
+PLUGIN_NAME = 'archivedotorg'
 OUTPUT_DIR = '.'
 OUTPUT_FILE = 'archive.org.txt'
 
@@ -41,7 +41,7 @@ def get_env_int(name: str, default: int = 0) -> int:
         return default
 
 
-def submit_to_archive_org(url: str) -> tuple[bool, str | None, str]:
+def submit_to_archivedotorg(url: str) -> tuple[bool, str | None, str]:
     """
     Submit URL to archive.org Wayback Machine.
 
@@ -113,7 +113,7 @@ def main(url: str, snapshot_id: str):
 
     try:
         # Run extraction
-        success, output, error = submit_to_archive_org(url)
+        success, output, error = submit_to_archivedotorg(url)
 
         if success:
             # Success - emit ArchiveResult with output file

+ 0 - 0
archivebox/plugins/archive_org/templates/icon.html → archivebox/plugins/archivedotorg/templates/icon.html


+ 0 - 0
archivebox/plugins/archive_org/templates/thumbnail.html → archivebox/plugins/archivedotorg/templates/thumbnail.html


+ 4 - 4
archivebox/plugins/archive_org/tests/test_archive_org.py → archivebox/plugins/archivedotorg/tests/test_archivedotorg.py

@@ -1,5 +1,5 @@
 """
-Integration tests for archive_org plugin
+Integration tests for archivedotorg plugin
 
 Tests verify standalone archive.org extractor execution.
 """
@@ -12,13 +12,13 @@ from pathlib import Path
 import pytest
 
 PLUGIN_DIR = Path(__file__).parent.parent
-ARCHIVEDOTORG_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_archive_org.*'), None)
+ARCHIVEDOTORG_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_archivedotorg.*'), None)
 TEST_URL = 'https://example.com'
 
 def test_hook_script_exists():
     assert ARCHIVEDOTORG_HOOK.exists()
 
-def test_submits_to_archive_org():
+def test_submits_to_archivedotorg():
     with tempfile.TemporaryDirectory() as tmpdir:
         result = subprocess.run(
             [sys.executable, str(ARCHIVEDOTORG_HOOK), '--url', TEST_URL, '--snapshot-id', 'test789'],
@@ -49,7 +49,7 @@ def test_submits_to_archive_org():
             assert not result_json, "Should NOT emit JSONL on transient error"
             assert result.stderr, "Should have error message in stderr"
 
-def test_config_save_archive_org_false_skips():
+def test_config_save_archivedotorg_false_skips():
     with tempfile.TemporaryDirectory() as tmpdir:
         import os
         env = os.environ.copy()

+ 13 - 4
archivebox/plugins/screenshot/on_Snapshot__51_screenshot.js

@@ -26,11 +26,20 @@ const {
     readCdpUrl,
 } = require('../chrome/chrome_utils.js');
 
+// Flush V8 coverage before exit (needed for NODE_V8_COVERAGE to capture early exits)
+function flushCoverageAndExit(exitCode) {
+    if (process.env.NODE_V8_COVERAGE) {
+        const v8 = require('v8');
+        v8.takeCoverage();
+    }
+    process.exit(exitCode);
+}
+
 // Check if screenshot is enabled BEFORE requiring puppeteer
 if (!getEnvBool('SCREENSHOT_ENABLED', true)) {
     console.error('Skipping screenshot (SCREENSHOT_ENABLED=False)');
     // Temporary failure (config disabled) - NO JSONL emission
-    process.exit(0);
+    flushCoverageAndExit(0);
 }
 
 // Now safe to require puppeteer
@@ -135,7 +144,7 @@ async function main() {
 
     if (!url || !snapshotId) {
         console.error('Usage: on_Snapshot__51_screenshot.js --url=<url> --snapshot-id=<uuid>');
-        process.exit(1);
+        flushCoverageAndExit(1);
     }
 
     // Check if staticfile extractor already handled this (permanent skip)
@@ -147,7 +156,7 @@ async function main() {
             status: 'skipped',
             output_str: 'staticfile already handled',
         }));
-        process.exit(0);
+        flushCoverageAndExit(0);
     }
 
     // Take screenshot (throws on error)
@@ -166,5 +175,5 @@ async function main() {
 main().catch(e => {
     // Transient error - emit NO JSONL
     console.error(`ERROR: ${e.message}`);
-    process.exit(1);
+    flushCoverageAndExit(1);
 });

+ 0 - 3
archivebox/plugins/screenshot/tests/test_screenshot.py

@@ -226,9 +226,6 @@ def test_config_save_screenshot_false_skips():
         print(f"[DEBUG RESULT] Exit code: {result.returncode}")
         print(f"[DEBUG RESULT] Stderr: {result.stderr[:200]}")
 
-        # FORCE FAILURE to verify test actually runs
-        assert False, f"FORCED FAILURE - NODE_V8_COVERAGE={'NODE_V8_COVERAGE' in env} value={env.get('NODE_V8_COVERAGE', 'NOTSET')}"
-
         assert result.returncode == 0, f"Should exit 0 when feature disabled: {result.stderr}"
 
         # Feature disabled - temporary failure, should NOT emit JSONL

+ 1 - 1
archivebox/plugins/ytdlp/config.json

@@ -74,7 +74,7 @@
         "--geo-bypass",
         "--add-metadata",
         "--no-progress",
-        "--remote-components ejs:github",
+        "--remote-components=ejs:github",
         "-o",
         "%(title)s.%(ext)s"
       ],

+ 256 - 0
archivebox/tests/test_cli_run_binary_worker.py

@@ -0,0 +1,256 @@
+"""
+Tests for BinaryWorker processing Binary queue.
+
+Tests cover:
+- BinaryWorker is spawned by Orchestrator when Binary queue has work
+- Binary hooks (on_Binary__*) actually run and install binaries
+- Binary status transitions from QUEUED -> INSTALLED
+- BinaryWorker exits after idle timeout
+"""
+
+import json
+import sqlite3
+import time
+
+from archivebox.tests.conftest import (
+    run_archivebox_cmd,
+    parse_jsonl_output,
+)
+
+
class TestBinaryWorkerSpawning:
    """Tests for BinaryWorker lifecycle.

    Each test pipes a Binary JSONL record into `archivebox run` via stdin,
    then reads the sqlite index directly to observe what the worker did.
    """

    def test_binary_worker_spawns_when_binary_queued(self, initialized_archive):
        """Orchestrator spawns BinaryWorker when Binary queue has work."""
        # Queue a Binary via the CLI; the env provider detects the system python.
        record = {'type': 'Binary', 'name': 'python3', 'binproviders': 'env'}
        stdout, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin=json.dumps(record),
            data_dir=initialized_archive,
            timeout=30,
        )
        assert code == 0, f"Failed to create Binary: {stderr}"

        # Inspect the DB directly: the row must exist whether or not install finished.
        db = sqlite3.connect(initialized_archive / 'index.sqlite3')
        try:
            rows = db.execute(
                "SELECT name, status, abspath FROM machine_binary WHERE name='python3'"
            ).fetchall()
        finally:
            db.close()

        assert len(rows) >= 1, "Binary was not created in database"
        name, status, abspath = rows[0]
        assert name == 'python3'
        # Status should be INSTALLED after BinaryWorker processed it
        # (or QUEUED if worker timed out before installing)
        assert status in ['installed', 'queued']

    def test_binary_hooks_actually_run(self, initialized_archive):
        """Binary installation hooks (on_Binary__*) run and update abspath."""
        # python3 is guaranteed to exist on the host, so env detection can succeed.
        record = {'type': 'Binary', 'name': 'python3', 'binproviders': 'env'}
        stdout, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin=json.dumps(record),
            data_dir=initialized_archive,
            timeout=30,
        )
        assert code == 0, f"Failed to process Binary: {stderr}"

        db = sqlite3.connect(initialized_archive / 'index.sqlite3')
        try:
            row = db.execute(
                "SELECT name, status, abspath, version FROM machine_binary WHERE name='python3'"
            ).fetchone()
        finally:
            db.close()

        assert row is not None, "Binary not found in database"
        name, status, abspath, version = row

        # Only assert on abspath/version when the install actually completed.
        if status == 'installed':
            assert abspath, f"Binary installed but abspath is empty: {abspath}"
            assert '/python3' in abspath or '\\python3' in abspath, \
                f"abspath doesn't look like a python3 path: {abspath}"
            assert version, f"Binary installed but version is empty: {version}"

    def test_binary_status_transitions(self, initialized_archive):
        """Binary status correctly transitions QUEUED -> INSTALLED."""
        record = {'type': 'Binary', 'name': 'python3', 'binproviders': 'env'}
        stdout, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin=json.dumps(record),
            data_dir=initialized_archive,
            timeout=30,
        )
        assert code == 0

        db = sqlite3.connect(initialized_archive / 'index.sqlite3')
        try:
            row = db.execute(
                "SELECT status FROM machine_binary WHERE name='python3'"
            ).fetchone()
        finally:
            db.close()

        assert row is not None
        # Should be installed (or queued if worker timed out)
        assert row[0] in ['installed', 'queued']
+
+
class TestBinaryWorkerHooks:
    """Tests for specific Binary hook providers."""

    def test_env_provider_hook_detects_system_binary(self, initialized_archive):
        """on_Binary__15_env_install.py hook detects system binaries."""
        record = {'type': 'Binary', 'name': 'python3', 'binproviders': 'env'}
        stdout, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin=json.dumps(record),
            data_dir=initialized_archive,
            timeout=30,
        )
        assert code == 0

        # Look only at rows the env provider actually finished installing.
        db = sqlite3.connect(initialized_archive / 'index.sqlite3')
        try:
            row = db.execute(
                "SELECT binprovider, abspath FROM machine_binary WHERE name='python3' AND status='installed'"
            ).fetchone()
        finally:
            db.close()

        # Row may be absent if the worker timed out before installing.
        if row:
            binprovider, abspath = row
            assert binprovider == 'env', f"Expected env provider, got: {binprovider}"
            assert abspath, "abspath should be populated by env provider"

    def test_multiple_binaries_processed_in_batch(self, initialized_archive):
        """BinaryWorker processes multiple queued binaries."""
        # Submit two Binary records as newline-delimited JSONL on stdin.
        records = [
            {'type': 'Binary', 'name': 'python3', 'binproviders': 'env'},
            {'type': 'Binary', 'name': 'curl', 'binproviders': 'env'},
        ]
        stdout, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin='\n'.join(json.dumps(r) for r in records),
            data_dir=initialized_archive,
            timeout=45,
        )
        assert code == 0

        db = sqlite3.connect(initialized_archive / 'index.sqlite3')
        try:
            rows = db.execute(
                "SELECT name FROM machine_binary WHERE name IN ('python3', 'curl')"
            ).fetchall()
        finally:
            db.close()

        # NOTE(review): only >=1 is asserted even though both records were
        # submitted — intentionally tolerant of slow/partial installs.
        assert len(rows) >= 1, "At least one binary should be created"
+
+
class TestBinaryWorkerEdgeCases:
    """Tests for edge cases and error handling."""

    def test_nonexistent_binary_stays_queued(self, initialized_archive):
        """Binary that doesn't exist stays queued (doesn't fail permanently)."""
        record = {
            'type': 'Binary',
            'name': 'nonexistent-binary-xyz-12345',
            'binproviders': 'env',
        }
        stdout, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin=json.dumps(record),
            data_dir=initialized_archive,
            timeout=30,
        )
        # Command should still succeed (orchestrator doesn't fail on binary install failures)
        assert code == 0

        db = sqlite3.connect(initialized_archive / 'index.sqlite3')
        try:
            row = db.execute(
                "SELECT status FROM machine_binary WHERE name='nonexistent-binary-xyz-12345'"
            ).fetchone()
        finally:
            db.close()

        if row:
            status = row[0]
            # Should stay queued since installation failed
            assert status == 'queued', f"Expected queued, got: {status}"

    def test_binary_worker_respects_machine_isolation(self, initialized_archive):
        """BinaryWorker only processes binaries for current machine."""
        # This is implicitly tested by other tests - Binary.objects.filter(machine=current)
        # ensures only current machine's binaries are processed
        record = {'type': 'Binary', 'name': 'python3', 'binproviders': 'env'}
        stdout, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin=json.dumps(record),
            data_dir=initialized_archive,
            timeout=30,
        )
        assert code == 0

        # Verify the FK back to the machine row was populated.
        db = sqlite3.connect(initialized_archive / 'index.sqlite3')
        try:
            row = db.execute(
                "SELECT machine_id FROM machine_binary WHERE name='python3'"
            ).fetchone()
        finally:
            db.close()

        assert row is not None
        machine_id = row[0]
        assert machine_id, "machine_id should be set on Binary"

+ 508 - 0
archivebox/tests/test_worker_config_propagation.py

@@ -246,6 +246,68 @@ print(snapshot.id)
             "Expected debug output not found in stderr"
         print("✓ Config debug output found in stderr")
 
+        # Verify precedence order: snapshot > crawl > user > persona > env > machine > file > defaults
+        verify_precedence_script = f"""
+import os
+os.environ['DATA_DIR'] = '{data_dir}'
+
+from archivebox.config.django import setup_django
+setup_django()
+
+from archivebox.core.models import Snapshot
+from archivebox.config.configset import get_config
+
+snapshot = Snapshot.objects.get(id='{snapshot_id}')
+
+# Test precedence by getting config at different levels
+print("\\nTesting config precedence order:")
+
+# 1. Just defaults (lowest priority)
+config_defaults = get_config()
+print(f"  Defaults only: TIMEOUT={{config_defaults.get('TIMEOUT')}}")
+
+# 2. With machine config
+from archivebox.machine.models import Machine
+machine = Machine.current()
+config_machine = get_config(machine=machine)
+custom_machine = config_machine.get('CUSTOM_MACHINE_KEY')
+print(f"  + Machine: CUSTOM_MACHINE_KEY={{custom_machine}}")
+
+# 3. With crawl config
+config_crawl = get_config(crawl=snapshot.crawl)
+print(f"  + Crawl: TIMEOUT={{config_crawl.get('TIMEOUT')}} (should be 777 from crawl.config)")
+assert config_crawl.get('TIMEOUT') == 777, f"Expected 777 from crawl, got {{config_crawl.get('TIMEOUT')}}"
+
+# 4. With snapshot config (highest priority)
+config_snapshot = get_config(snapshot=snapshot)
+print(f"  + Snapshot: TIMEOUT={{config_snapshot.get('TIMEOUT')}} (should be 555 from snapshot.config)")
+assert config_snapshot.get('TIMEOUT') == 555, f"Expected 555 from snapshot, got {{config_snapshot.get('TIMEOUT')}}"
+
+# Verify snapshot config overrides crawl config
+assert config_snapshot.get('CUSTOM_CRAWL_KEY') == 'from_crawl_json', "Crawl config should be present"
+assert config_snapshot.get('CUSTOM_SNAPSHOT_KEY') == 'from_snapshot_json', "Snapshot config should be present"
+assert config_snapshot.get('CUSTOM_MACHINE_KEY') == 'from_machine_config', "Machine config should be present"
+
+print("\\n✓ Config precedence order verified: snapshot > crawl > machine > defaults")
+"""
+        result = subprocess.run(
+            ['python', '-c', verify_precedence_script],
+            cwd=str(data_dir.parent),
+            env={
+                **os.environ,
+                'DATA_DIR': str(data_dir),
+                'USE_COLOR': 'False',
+            },
+            capture_output=True,
+            timeout=30,
+        )
+
+        print(result.stdout.decode())
+        if result.returncode != 0:
+            print("\nPrecedence verification error:")
+            print(result.stderr.decode())
+        assert result.returncode == 0, f"Precedence verification failed: {result.stderr.decode()}"
+
         # Verify config values were actually used by checking ArchiveResults
         verify_script = f"""
 import os
@@ -475,7 +537,453 @@ print("\\n✓ All config values correctly parsed from environment")
         print("="*80 + "\n")
 
 
def test_parent_environment_preserved_in_hooks() -> None:
    """
    Test that parent environment variables are preserved in hook execution.

    This test catches the bug where we built env=os.environ.copy() but then
    clobbered it with process.env={}, losing all parent environment.

    Also verifies:
    - NODE_PATH is correctly derived from LIB_DIR/npm/node_modules
    - LIB_BIN_DIR is correctly derived and added to PATH
    """

    # Work in a throwaway archive so the test leaves no state behind.
    with tempfile.TemporaryDirectory() as tmpdir:
        data_dir = Path(tmpdir) / 'test_archive'
        data_dir.mkdir()

        print(f"\n{'='*80}")
        print(f"Test: Parent Environment Preserved in Hooks")
        print(f"DATA_DIR: {data_dir}")
        print(f"{'='*80}\n")

        # Initialize archive
        print("Step 1: Initialize archive")
        result = subprocess.run(
            ['python', '-m', 'archivebox', 'init'],
            cwd=str(data_dir),
            env={
                **os.environ,
                'DATA_DIR': str(data_dir),
                'USE_COLOR': 'False',
            },
            capture_output=True,
            timeout=60,
        )
        assert result.returncode == 0, f"Init failed: {result.stderr.decode()}"
        print(f"✓ Archive initialized\n")

        # Create snapshot
        # The snapshot is created by a child python process (not in-process)
        # so Django setup happens against the temp DATA_DIR only.
        print("Step 2: Create Snapshot")
        create_snapshot_script = f"""
import os
os.environ['DATA_DIR'] = '{data_dir}'

from archivebox.config.django import setup_django
setup_django()

from django.utils import timezone
from archivebox.core.models import Snapshot
from archivebox.crawls.models import Crawl

crawl = Crawl.objects.create(
    urls='https://example.com',
    status='queued',
    retry_at=timezone.now()
)

snapshot = Snapshot.objects.create(
    url='https://example.com',
    crawl=crawl,
    status='queued',
    retry_at=timezone.now()
)
print(snapshot.id)
"""
        result = subprocess.run(
            ['python', '-c', create_snapshot_script],
            cwd=str(data_dir.parent),
            env={
                **os.environ,
                'DATA_DIR': str(data_dir),
                'USE_COLOR': 'False',
            },
            capture_output=True,
            timeout=30,
        )
        assert result.returncode == 0, f"Create snapshot failed: {result.stderr.decode()}"
        # The script may print setup noise first; the snapshot id is the last line.
        snapshot_id = result.stdout.decode().strip().split('\n')[-1]
        print(f"✓ Created snapshot {snapshot_id}\n")

        # Run SnapshotWorker with custom parent environment variable
        print("Step 3: Run SnapshotWorker with TEST_PARENT_ENV_VAR in parent process")
        result = subprocess.run(
            ['python', '-m', 'archivebox', 'run', '--snapshot-id', snapshot_id],
            cwd=str(data_dir),
            env={
                **os.environ,
                'DATA_DIR': str(data_dir),
                'USE_COLOR': 'False',
                'TEST_PARENT_ENV_VAR': 'preserved_from_parent',  # This should reach the hook
                'PLUGINS': 'favicon',  # Use existing plugin (favicon is simple and fast)
            },
            capture_output=True,
            timeout=120,
        )

        # stdout is captured but unused; stderr is echoed below for debugging.
        stdout = result.stdout.decode()
        stderr = result.stderr.decode()

        print("\n--- SnapshotWorker stderr (first 50 lines) ---")
        print('\n'.join(stderr.split('\n')[:50]))
        print("--- End stderr ---\n")

        # Verify hooks ran by checking Process records
        # (Process rows persist the exact env each hook subprocess received.)
        print("Step 4: Verify environment variables in hook Process records")
        verify_env_script = f"""
import os
os.environ['DATA_DIR'] = '{data_dir}'

from archivebox.config.django import setup_django
setup_django()

from archivebox.machine.models import Process
from archivebox.core.models import Snapshot
import json

snapshot = Snapshot.objects.get(id='{snapshot_id}')

# Find hook processes for this snapshot
hook_processes = Process.objects.filter(
    process_type=Process.TypeChoices.HOOK,
    pwd__contains=str(snapshot.id)
).order_by('-created_at')

print(f"Found {{hook_processes.count()}} hook processes")

if hook_processes.count() == 0:
    print("ERROR: No hook processes found!")
    import sys
    sys.exit(1)

# Check the first hook process environment
hook_process = hook_processes.first()
print(f"\\nChecking hook: {{hook_process.cmd}}")
print(f"Hook env keys: {{len(hook_process.env)}} total")

# Verify TEST_PARENT_ENV_VAR was preserved
test_parent = hook_process.env.get('TEST_PARENT_ENV_VAR')
print(f"  TEST_PARENT_ENV_VAR: {{test_parent}}")
assert test_parent == 'preserved_from_parent', f"Expected 'preserved_from_parent', got {{test_parent}}"

# Verify LIB_DIR is set
lib_dir = hook_process.env.get('LIB_DIR')
print(f"  LIB_DIR: {{lib_dir}}")
assert lib_dir is not None, "LIB_DIR not set"

# Verify LIB_BIN_DIR is derived
lib_bin_dir = hook_process.env.get('LIB_BIN_DIR')
print(f"  LIB_BIN_DIR: {{lib_bin_dir}}")
if lib_dir:
    assert lib_bin_dir is not None, "LIB_BIN_DIR not derived from LIB_DIR"
    assert lib_bin_dir.endswith('/bin'), f"LIB_BIN_DIR should end with /bin, got {{lib_bin_dir}}"

# Verify LIB_BIN_DIR is in PATH
path = hook_process.env.get('PATH')
if lib_bin_dir:
    assert lib_bin_dir in path, f"LIB_BIN_DIR not in PATH. LIB_BIN_DIR={{lib_bin_dir}}, PATH={{path[:200]}}..."

# Verify NODE_PATH is set
node_path = hook_process.env.get('NODE_PATH')
node_modules_dir = hook_process.env.get('NODE_MODULES_DIR')
print(f"  NODE_PATH: {{node_path}}")
print(f"  NODE_MODULES_DIR: {{node_modules_dir}}")
if node_path:
    # Should also have NODE_MODULES_DIR for backwards compatibility
    assert node_modules_dir == node_path, f"NODE_MODULES_DIR should match NODE_PATH"

print("\\n✓ All environment checks passed")
"""
        result = subprocess.run(
            ['python', '-c', verify_env_script],
            cwd=str(data_dir.parent),
            env={
                **os.environ,
                'DATA_DIR': str(data_dir),
                'USE_COLOR': 'False',
            },
            capture_output=True,
            timeout=30,
        )

        print(result.stdout.decode())
        if result.returncode != 0:
            print("\nVerification error:")
            print(result.stderr.decode())

        assert result.returncode == 0, f"Environment verification failed: {result.stderr.decode()}"

        print("\n" + "="*80)
        print("✓ TEST PASSED: Parent environment preserved in hooks")
        print("  - Custom parent env vars reach hooks")
        print("  - LIB_DIR propagated correctly")
        print("  - LIB_BIN_DIR derived and added to PATH")
        print("  - NODE_PATH/NODE_MODULES_DIR set when available")
        print("="*80 + "\n")
+
+
def test_config_auto_fetch_relationships() -> None:
    """
    Test that get_config() auto-fetches related objects from relationships.

    Verifies:
    - snapshot auto-fetched from archiveresult.snapshot
    - crawl auto-fetched from snapshot.crawl
    - user auto-fetched from crawl.created_by
    """
    # NOTE(review): the docstring mentions user auto-fetch from
    # crawl.created_by, but the embedded script below only exercises the
    # archiveresult -> snapshot -> crawl chain — confirm or extend.

    with tempfile.TemporaryDirectory() as tmpdir:
        data_dir = Path(tmpdir) / 'test_archive'
        data_dir.mkdir()

        print(f"\n{'='*80}")
        print(f"Test: Config Auto-Fetch Relationships")
        print(f"DATA_DIR: {data_dir}")
        print(f"{'='*80}\n")

        # Initialize archive
        print("Step 1: Initialize archive")
        result = subprocess.run(
            ['python', '-m', 'archivebox', 'init'],
            cwd=str(data_dir),
            env={
                **os.environ,
                'DATA_DIR': str(data_dir),
                'USE_COLOR': 'False',
            },
            capture_output=True,
            timeout=60,
        )
        assert result.returncode == 0, f"Init failed: {result.stderr.decode()}"
        print(f"✓ Archive initialized\n")

        # Create objects with config at each level
        # All creation + assertions run in one child process so Django is
        # configured against the temp DATA_DIR, never the test runner's.
        print("Step 2: Create Crawl -> Snapshot -> ArchiveResult with config at each level")
        create_objects_script = f"""
import os
os.environ['DATA_DIR'] = '{data_dir}'

from archivebox.config.django import setup_django
setup_django()

from django.utils import timezone
from archivebox.crawls.models import Crawl
from archivebox.core.models import Snapshot, ArchiveResult
from archivebox.config.configset import get_config

# Create crawl with config
crawl = Crawl.objects.create(
    urls='https://example.com',
    status='queued',
    retry_at=timezone.now(),
    config={{
        'CRAWL_KEY': 'from_crawl',
        'TIMEOUT': 777,
    }}
)

# Create snapshot with config
snapshot = Snapshot.objects.create(
    url='https://example.com',
    crawl=crawl,
    status='queued',
    retry_at=timezone.now(),
    config={{
        'SNAPSHOT_KEY': 'from_snapshot',
        'TIMEOUT': 555,
    }}
)

# Create ArchiveResult
ar = ArchiveResult.objects.create(
    snapshot=snapshot,
    plugin='test',
    hook_name='test_hook',
    status=ArchiveResult.StatusChoices.STARTED
)

print(f"Created: crawl={{crawl.id}}, snapshot={{snapshot.id}}, ar={{ar.id}}")

# Test 1: Auto-fetch crawl from snapshot
print("\\nTest 1: get_config(snapshot=snapshot) auto-fetches crawl")
config = get_config(snapshot=snapshot)
assert config.get('TIMEOUT') == 555, f"Expected 555 from snapshot, got {{config.get('TIMEOUT')}}"
assert config.get('SNAPSHOT_KEY') == 'from_snapshot', f"Expected from_snapshot, got {{config.get('SNAPSHOT_KEY')}}"
assert config.get('CRAWL_KEY') == 'from_crawl', f"Expected from_crawl, got {{config.get('CRAWL_KEY')}}"
print("✓ Snapshot config (TIMEOUT=555) overrides crawl config (TIMEOUT=777)")
print("✓ Both snapshot.config and crawl.config values present")

# Test 2: Auto-fetch snapshot from archiveresult
print("\\nTest 2: get_config(archiveresult=ar) auto-fetches snapshot and crawl")
config_from_ar = get_config(archiveresult=ar)
assert config_from_ar.get('TIMEOUT') == 555, f"Expected 555, got {{config_from_ar.get('TIMEOUT')}}"
assert config_from_ar.get('SNAPSHOT_KEY') == 'from_snapshot', f"Expected from_snapshot"
assert config_from_ar.get('CRAWL_KEY') == 'from_crawl', f"Expected from_crawl"
print("✓ Auto-fetched snapshot from ar.snapshot")
print("✓ Auto-fetched crawl from snapshot.crawl")

# Test 3: Precedence without auto-fetch (explicit crawl only)
print("\\nTest 3: get_config(crawl=crawl) without snapshot")
config_crawl_only = get_config(crawl=crawl)
assert config_crawl_only.get('TIMEOUT') == 777, f"Expected 777 from crawl, got {{config_crawl_only.get('TIMEOUT')}}"
assert config_crawl_only.get('CRAWL_KEY') == 'from_crawl'
assert config_crawl_only.get('SNAPSHOT_KEY') is None, "Should not have snapshot config"
print("✓ Crawl-only config has TIMEOUT=777")
print("✓ No snapshot config values present")

print("\\n✓ All auto-fetch tests passed")
"""

        result = subprocess.run(
            ['python', '-c', create_objects_script],
            cwd=str(data_dir.parent),
            env={
                **os.environ,
                'DATA_DIR': str(data_dir),
                'USE_COLOR': 'False',
            },
            capture_output=True,
            timeout=30,
        )

        print(result.stdout.decode())
        if result.returncode != 0:
            print("\nAuto-fetch test error:")
            print(result.stderr.decode())

        assert result.returncode == 0, f"Auto-fetch test failed: {result.stderr.decode()}"

        print("\n" + "="*80)
        print("✓ TEST PASSED: Config auto-fetches related objects correctly")
        print("  - archiveresult → snapshot → crawl → user")
        print("  - Precedence preserved during auto-fetch")
        print("="*80 + "\n")
+
+
def test_config_precedence_with_environment_vars() -> None:
    """
    Test that config precedence order is correct when environment vars are set.

    Documented order (highest to lowest):
    1. snapshot.config
    2. crawl.config
    3. user.config
    4. persona config
    5. environment variables  <-- LOWER priority than snapshot/crawl
    6. machine.config
    7. config file
    8. plugin defaults
    9. core defaults

    This test verifies snapshot.config overrides environment variables.
    """

    with tempfile.TemporaryDirectory() as tmpdir:
        data_dir = Path(tmpdir) / 'test_archive'
        data_dir.mkdir()

        print(f"\n{'='*80}")
        print(f"Test: Config Precedence with Environment Variables")
        print(f"DATA_DIR: {data_dir}")
        print(f"{'='*80}\n")

        # Initialize
        result = subprocess.run(
            ['python', '-m', 'archivebox', 'init'],
            cwd=str(data_dir),
            env={**os.environ, 'DATA_DIR': str(data_dir), 'USE_COLOR': 'False'},
            capture_output=True,
            timeout=60,
        )
        assert result.returncode == 0
        print("✓ Archive initialized\n")

        # Test with environment variable set
        # TIMEOUT is set via os.environ inside the child script (not via the
        # subprocess env= kwarg) so it is in place before Django setup runs.
        print("Step 1: Test with TIMEOUT=999 in environment")
        test_script = f"""
import os
os.environ['DATA_DIR'] = '{data_dir}'
os.environ['TIMEOUT'] = '999'  # Set env var

from archivebox.config.django import setup_django
setup_django()

from django.utils import timezone
from archivebox.crawls.models import Crawl
from archivebox.core.models import Snapshot
from archivebox.config.configset import get_config

# Create crawl with TIMEOUT=777
crawl = Crawl.objects.create(
    urls='https://example.com',
    status='queued',
    retry_at=timezone.now(),
    config={{'TIMEOUT': 777}}
)

# Create snapshot with TIMEOUT=555
snapshot = Snapshot.objects.create(
    url='https://example.com',
    crawl=crawl,
    status='queued',
    retry_at=timezone.now(),
    config={{'TIMEOUT': 555}}
)

# Get config with all sources
config = get_config(snapshot=snapshot)

print(f"Environment: TIMEOUT={{os.environ.get('TIMEOUT')}}")
print(f"Crawl config: TIMEOUT={{crawl.config.get('TIMEOUT')}}")
print(f"Snapshot config: TIMEOUT={{snapshot.config.get('TIMEOUT')}}")
print(f"Merged config: TIMEOUT={{config.get('TIMEOUT')}}")

# Snapshot should override both crawl AND environment
expected = 555
actual = config.get('TIMEOUT')
if actual != expected:
    print(f"\\n❌ PRECEDENCE BUG: Expected {{expected}}, got {{actual}}")
    print(f"   Snapshot.config should have highest priority!")
    import sys
    sys.exit(1)

print(f"\\n✓ snapshot.config ({{expected}}) correctly overrides env var (999) and crawl.config (777)")
"""

        # No env= here: the child inherits the parent environment and the
        # script sets DATA_DIR/TIMEOUT itself before importing Django.
        result = subprocess.run(
            ['python', '-c', test_script],
            cwd=str(data_dir.parent),
            capture_output=True,
            timeout=30,
        )

        print(result.stdout.decode())
        if result.returncode != 0:
            print("\nPrecedence bug detected:")
            print(result.stderr.decode())

        assert result.returncode == 0, f"Precedence test failed: {result.stderr.decode()}"

        print("\n" + "="*80)
        print("✓ TEST PASSED: Snapshot config correctly overrides environment variables")
        print("="*80 + "\n")
+
+
if __name__ == '__main__':
    # Run as standalone script (without pytest): execute each test in sequence
    test_config_propagation_through_worker_hierarchy()
    test_config_environment_variable_parsing()
    test_parent_environment_preserved_in_hooks()
    test_config_auto_fetch_relationships()
    test_config_precedence_with_environment_vars()

+ 48 - 15
archivebox/workers/orchestrator.py

@@ -36,7 +36,7 @@ from django.utils import timezone
 from rich import print
 
 from archivebox.misc.logging_util import log_worker_event
-from .worker import Worker, CrawlWorker
+from .worker import Worker, BinaryWorker, CrawlWorker
 
 
 def _run_orchestrator_process(exit_on_idle: bool) -> None:
@@ -63,13 +63,14 @@ class Orchestrator:
     - Each SnapshotWorker runs hooks sequentially for its snapshot
     """
 
-    # Only CrawlWorker - SnapshotWorkers are spawned by CrawlWorker subprocess, not by Orchestrator
-    WORKER_TYPES: list[Type[Worker]] = [CrawlWorker]
+    # BinaryWorker (singleton daemon) and CrawlWorker - SnapshotWorkers are spawned by CrawlWorker subprocess, not by Orchestrator
+    WORKER_TYPES: list[Type[Worker]] = [BinaryWorker, CrawlWorker]
 
     # Configuration
     POLL_INTERVAL: float = 2.0  # How often to check for new work (seconds)
     IDLE_TIMEOUT: int = 3  # Exit after N idle ticks (0 = never exit)
     MAX_CRAWL_WORKERS: int = 8  # Max crawls processing simultaneously
+    MAX_BINARY_WORKERS: int = 1  # Max binaries installing simultaneously (sequential only)
 
     def __init__(self, exit_on_idle: bool = True, crawl_id: str | None = None):
         self.exit_on_idle = exit_on_idle
@@ -194,15 +195,23 @@ class Orchestrator:
         return len(WorkerClass.get_running_workers())
     
     def should_spawn_worker(self, WorkerClass: Type[Worker], queue_count: int) -> bool:
-        """Determine if we should spawn a new CrawlWorker."""
+        """Determine if we should spawn a new worker."""
         if queue_count == 0:
             return False
 
-        # Check CrawlWorker limit
+        # Get appropriate limit based on worker type
+        if WorkerClass.name == 'crawl':
+            max_workers = self.MAX_CRAWL_WORKERS
+        elif WorkerClass.name == 'binary':
+            max_workers = self.MAX_BINARY_WORKERS  # Force sequential: only 1 binary at a time
+        else:
+            max_workers = 1  # Default for unknown types
+
+        # Check worker limit
         running_workers = WorkerClass.get_running_workers()
         running_count = len(running_workers)
 
-        if running_count >= self.MAX_CRAWL_WORKERS:
+        if running_count >= max_workers:
             return False
 
         # Check if we already have enough workers for the queue size
@@ -285,14 +294,35 @@ class Orchestrator:
     
     def check_queues_and_spawn_workers(self) -> dict[str, int]:
         """
-        Check Crawl queue and spawn CrawlWorkers as needed.
+        Check Binary and Crawl queues and spawn workers as needed.
         Returns dict of queue sizes.
         """
         from archivebox.crawls.models import Crawl
+        from archivebox.machine.models import Binary, Machine
 
         queue_sizes = {}
 
-        # Only check Crawl queue
+        # Check Binary queue
+        machine = Machine.current()
+        binary_queue = Binary.objects.filter(
+            machine=machine,
+            status=Binary.StatusChoices.QUEUED,
+            retry_at__lte=timezone.now()
+        ).order_by('retry_at')
+        binary_count = binary_queue.count()
+        queue_sizes['binary'] = binary_count
+
+        # Spawn BinaryWorker if needed (one worker per binary, up to MAX_BINARY_WORKERS)
+        if self.should_spawn_worker(BinaryWorker, binary_count):
+            # Get next binary to process
+            binary = binary_queue.first()
+            if binary:
+                BinaryWorker.start(binary_id=str(binary.id))
+
+        # Check if any BinaryWorkers are still running
+        running_binary_workers = len(BinaryWorker.get_running_workers())
+
+        # Check Crawl queue
         crawl_queue = Crawl.objects.filter(
             retry_at__lte=timezone.now()
         ).exclude(
@@ -307,12 +337,15 @@ class Orchestrator:
         crawl_count = crawl_queue.count()
         queue_sizes['crawl'] = crawl_count
 
-        # Spawn CrawlWorker if needed
-        if self.should_spawn_worker(CrawlWorker, crawl_count):
-            # Claim next crawl
-            crawl = crawl_queue.first()
-            if crawl and self._claim_crawl(crawl):
-                CrawlWorker.start(crawl_id=str(crawl.id))
+        # CRITICAL: Only spawn CrawlWorkers if binary queue is empty AND no BinaryWorkers running
+        # This ensures all binaries are installed before snapshots start processing
+        if binary_count == 0 and running_binary_workers == 0:
+            # Spawn CrawlWorker if needed
+            if self.should_spawn_worker(CrawlWorker, crawl_count):
+                # Claim next crawl
+                crawl = crawl_queue.first()
+                if crawl and self._claim_crawl(crawl):
+                    CrawlWorker.start(crawl_id=str(crawl.id))
 
         return queue_sizes
 
@@ -328,7 +361,7 @@ class Orchestrator:
         )
 
         return updated == 1
-    
+
     def has_pending_work(self, queue_sizes: dict[str, int]) -> bool:
         """Check if any queue has pending work."""
         return any(count > 0 for count in queue_sizes.values())

+ 102 - 8
archivebox/workers/worker.py

@@ -323,6 +323,20 @@ class Worker:
             pwd = Path(snapshot.output_dir)  # Run in snapshot's output directory
             env = get_config(snapshot=snapshot)
 
+        elif cls.name == 'binary':
+            # BinaryWorker processes a specific binary installation
+            binary_id = kwargs.get('binary_id')
+            if not binary_id:
+                raise ValueError("BinaryWorker requires binary_id")
+
+            from archivebox.machine.models import Binary
+            binary = Binary.objects.get(id=binary_id)
+
+            cmd = [sys.executable, '-m', 'archivebox', 'run', '--binary-id', str(binary_id)]
+            pwd = Path(settings.DATA_DIR) / 'machines' / str(Machine.current().id) / 'binaries' / binary.name / str(binary.id)
+            pwd.mkdir(parents=True, exist_ok=True)
+            env = get_config()
+
         else:
             raise ValueError(f"Unknown worker type: {cls.name}")
 
@@ -654,16 +668,8 @@ class SnapshotWorker(Worker):
             hooks = discover_hooks('Snapshot', config=config)
             hooks = sorted(hooks, key=lambda h: h.name)  # Sort by name (includes step prefix)
 
-            import sys
-            print(f'[SnapshotWorker] Discovered {len(hooks)} hooks for snapshot {self.snapshot.url}', file=sys.stderr, flush=True)
-            if hooks:
-                print(f'[SnapshotWorker] First 5 hooks: {[h.name for h in hooks[:5]]}', file=sys.stderr, flush=True)
-            else:
-                print(f'[SnapshotWorker] WARNING: No hooks discovered! Config keys: {list(config.keys())[:10]}...', file=sys.stderr, flush=True)
-
             # Execute each hook sequentially
             for hook_path in hooks:
-                print(f'[SnapshotWorker] Running hook: {hook_path.name}', file=sys.stderr, flush=True)
                 hook_name = hook_path.name
                 plugin = self._extract_plugin_name(hook_name)
                 hook_step = extract_step(hook_name)
@@ -829,8 +835,96 @@ class SnapshotWorker(Worker):
         return name
 
 
+class BinaryWorker(Worker):
+    """
+    Worker that processes a specific Binary installation.
+
+    Like CrawlWorker and SnapshotWorker, BinaryWorker:
+    - Processes one specific binary (specified by binary_id)
+    - Installs it via Binary.run() which runs on_Binary__* hooks
+    - Exits when done
+
+    Orchestrator spawns BinaryWorkers sequentially (MAX_BINARY_WORKERS=1) to avoid
+    conflicts during binary installations.
+    """
+
+    name: ClassVar[str] = 'binary'
+    MAX_TICK_TIME: ClassVar[int] = 600  # 10 minutes for binary installations
+    MAX_CONCURRENT_TASKS: ClassVar[int] = 1  # One binary per worker
+
+    def __init__(self, binary_id: str, worker_id: int = 0):
+        self.binary_id = binary_id
+        super().__init__(worker_id=worker_id)
+
+    def get_model(self):
+        from archivebox.machine.models import Binary
+        return Binary
+
+    def get_next_item(self):
+        """Get the specific binary to install."""
+        from archivebox.machine.models import Binary
+
+        try:
+            return Binary.objects.get(id=self.binary_id)
+        except Binary.DoesNotExist:
+            return None
+
+    def runloop(self) -> None:
+        """Install the specified binary."""
+        import sys
+
+        self.on_startup()
+
+        try:
+            binary = self.get_next_item()
+
+            if not binary:
+                log_worker_event(
+                    worker_type='BinaryWorker',
+                    event=f'Binary {self.binary_id} not found',
+                    indent_level=1,
+                    pid=self.pid,
+                )
+                return
+
+            print(f'[cyan]🔧 BinaryWorker installing: {binary.name}[/cyan]', file=sys.stderr)
+
+            # Tick the state machine to trigger installation
+            # This calls BinaryMachine.on_install() -> Binary.run() -> on_Binary__* hooks
+            binary.sm.tick()
+
+            # Check result
+            binary.refresh_from_db()
+            if binary.status == Binary.StatusChoices.INSTALLED:
+                log_worker_event(
+                    worker_type='BinaryWorker',
+                    event=f'Installed: {binary.name} -> {binary.abspath}',
+                    indent_level=1,
+                    pid=self.pid,
+                )
+            else:
+                log_worker_event(
+                    worker_type='BinaryWorker',
+                    event=f'Installation pending: {binary.name} (status={binary.status})',
+                    indent_level=1,
+                    pid=self.pid,
+                )
+
+        except Exception as e:
+            log_worker_event(
+                worker_type='BinaryWorker',
+                event=f'Failed to install binary',
+                indent_level=1,
+                pid=self.pid,
+                error=e,
+            )
+        finally:
+            self.on_shutdown()
+
+
 # Populate the registry
 WORKER_TYPES.update({
+    'binary': BinaryWorker,
     'crawl': CrawlWorker,
     'snapshot': SnapshotWorker,
 })

+ 2 - 0
bin/test_plugins.sh

@@ -239,6 +239,8 @@ for test_dir in $TEST_DIRS; do
     PYTEST_CMD="python -m pytest $test_dir -p no:django -v --tb=short"
     if [ "$ENABLE_COVERAGE" = true ]; then
         PYTEST_CMD="$PYTEST_CMD --cov=$plugin_name --cov-append --cov-branch"
+        echo "[DEBUG] NODE_V8_COVERAGE before pytest: $NODE_V8_COVERAGE"
+        python -c "import os; print('[DEBUG BASH->PYTHON] NODE_V8_COVERAGE:', os.environ.get('NODE_V8_COVERAGE', 'NOT_SET'))"
     fi
 
     if eval "$PYTEST_CMD" 2>&1 | grep -v "^platform\|^cachedir\|^rootdir\|^configfile\|^plugins:" | tail -100; then

+ 2 - 2
old/TODO_hook_architecture.md

@@ -1878,7 +1878,7 @@ Updated `archivebox/core/statemachines.py`:
 |--------|------|--------|-------|
 | favicon | `on_Snapshot__11_favicon.py` | ✅ UPDATED | Now outputs clean JSONL |
 | git | `on_Snapshot__12_git.py` | ✅ UPDATED | Now outputs clean JSONL with cmd |
-| archive_org | `on_Snapshot__13_archive_org.py` | ✅ UPDATED | Now outputs clean JSONL |
+| archivedotorg | `on_Snapshot__13_archivedotorg.py` | ✅ UPDATED | Now outputs clean JSONL |
 | title | `on_Snapshot__32_title.js` | ✅ UPDATED | Now outputs clean JSONL |
 | singlefile | `on_Snapshot__37_singlefile.py` | ✅ UPDATED | Now outputs clean JSONL with cmd |
 | wget | `on_Snapshot__50_wget.py` | ✅ UPDATED | Now outputs clean JSONL with cmd |
@@ -1930,7 +1930,7 @@ The following hooks have been renamed with `.bg.` suffix:
 - `archivebox/core/migrations/0030_migrate_output_field.py` (new)
 
 ### Plugins Updated (Python Hooks)
-- `archivebox/plugins/archive_org/on_Snapshot__13_archive_org.py`
+- `archivebox/plugins/archivedotorg/on_Snapshot__13_archivedotorg.py`
 - `archivebox/plugins/favicon/on_Snapshot__11_favicon.py`
 - `archivebox/plugins/git/on_Snapshot__12_git.py`
 - `archivebox/plugins/media/on_Snapshot__51_media.py`