
more migration id/uuid and config propagation fixes

Nick Sweeting 1 month ago
commit 456aaee287

+ 3 - 3
archivebox/base_models/models.py

@@ -111,7 +111,7 @@ class ModelWithOutputDir(ModelWithUUID):
 
     def save(self, *args, **kwargs):
         super().save(*args, **kwargs)
-        self.OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
+        Path(self.output_dir).mkdir(parents=True, exist_ok=True)
         # Note: index.json is deprecated, models should use write_index_jsonl() for full data
 
     @property
@@ -127,5 +127,5 @@ class ModelWithOutputDir(ModelWithUUID):
         return f'{self.output_dir_parent}/{self.output_dir_name}'
 
     @property
-    def OUTPUT_DIR(self) -> Path:
-        return DATA_DIR / self.output_dir_str
+    def output_dir(self) -> Path:
+        raise NotImplementedError(f'{self.__class__.__name__} must implement output_dir property')
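
Note: with the concrete OUTPUT_DIR property removed from the base class, every ModelWithOutputDir subclass must now define its own output_dir or save() will raise NotImplementedError. A minimal sketch of a conforming subclass (class and field names here are illustrative, not from this commit):

    from pathlib import Path

    DATA_DIR = Path('./data')  # assumption: resolved from settings in the real codebase

    class ModelWithOutputDir:
        @property
        def output_dir(self) -> Path:
            raise NotImplementedError(f'{self.__class__.__name__} must implement output_dir property')

        def ensure_output_dir(self) -> Path:
            # mirrors the Path(self.output_dir).mkdir(...) call in save() above
            path = Path(self.output_dir)
            path.mkdir(parents=True, exist_ok=True)
            return path

    class ExampleSnapshot(ModelWithOutputDir):
        timestamp = '1700000000.0'  # hypothetical field

        @property
        def output_dir(self) -> Path:
            return DATA_DIR / 'archive' / self.timestamp

    print(ExampleSnapshot().ensure_output_dir())  # data/archive/1700000000.0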

+ 21 - 8
archivebox/config/configset.py

@@ -118,12 +118,12 @@ class BaseConfigSet(BaseSettings):
 
 
 def get_config(
-    scope: str = "global",
     defaults: Optional[Dict] = None,
     persona: Any = None,
     user: Any = None,
     crawl: Any = None,
     snapshot: Any = None,
+    machine: Any = None,
 ) -> Dict[str, Any]:
     """
     Get merged config from all sources.
@@ -134,17 +134,18 @@ def get_config(
     3. Per-user config (user.config JSON field)
     4. Per-persona config (persona.get_derived_config() - includes CHROME_USER_DATA_DIR etc.)
     5. Environment variables
-    6. Config file (ArchiveBox.conf)
-    7. Plugin schema defaults (config.json)
-    8. Core config defaults
+    6. Per-machine config (machine.config JSON field - resolved binary paths)
+    7. Config file (ArchiveBox.conf)
+    8. Plugin schema defaults (config.json)
+    9. Core config defaults
 
     Args:
-        scope: Config scope ('global', 'crawl', 'snapshot', etc.)
         defaults: Default values to start with
         persona: Persona object (provides derived paths like CHROME_USER_DATA_DIR)
         user: User object with config JSON field
         crawl: Crawl object with config JSON field
         snapshot: Snapshot object with config JSON field
+        machine: Machine object with config JSON field (defaults to Machine.current())
 
     Returns:
         Merged config dict
@@ -184,6 +185,18 @@ def get_config(
         file_config = BaseConfigSet.load_from_file(config_file)
         config.update(file_config)
 
+    # Apply machine config overrides (cached binary paths, etc.)
+    if machine is None:
+        # Default to current machine if not provided
+        try:
+            from archivebox.machine.models import Machine
+            machine = Machine.current()
+        except Exception:
+            pass  # Machine might not be available during early init
+
+    if machine and hasattr(machine, "config") and machine.config:
+        config.update(machine.config)
+
     # Override with environment variables
     for key in config:
         env_val = os.environ.get(key)
@@ -221,8 +234,8 @@ def get_config(
         config.update(crawl.config)
 
     # Add CRAWL_OUTPUT_DIR for snapshot hooks to find shared Chrome session
-    if crawl and hasattr(crawl, "OUTPUT_DIR"):
-        config['CRAWL_OUTPUT_DIR'] = str(crawl.OUTPUT_DIR)
+    if crawl and hasattr(crawl, "output_dir"):
+        config['CRAWL_OUTPUT_DIR'] = str(crawl.output_dir)
 
     # Apply snapshot config overrides (highest priority)
     if snapshot and hasattr(snapshot, "config") and snapshot.config:
@@ -260,7 +273,7 @@ def get_flat_config() -> Dict[str, Any]:
 
     Replaces abx.pm.hook.get_FLAT_CONFIG()
     """
-    return get_config(scope="global")
+    return get_config()
 
 
 def get_all_configs() -> Dict[str, BaseConfigSet]:
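
With the scope argument removed, the merge order is fully determined by which objects are passed in. A distilled, standalone sketch of the resulting priority chain (not the actual function body; the stub dicts stand in for the real sources, and the persona/user layers are omitted for brevity):

    import os
    from typing import Any, Dict, Iterable, Optional

    def merge_config(layers: Iterable[Optional[Dict[str, Any]]]) -> Dict[str, Any]:
        # Merge layers ordered lowest -> highest priority; later layers win.
        config: Dict[str, Any] = {}
        for layer in layers:
            config.update(layer or {})
        return config

    core_defaults   = {'TIMEOUT': 60, 'SAVE_WGET': True}        # 9. core defaults
    plugin_defaults = {'CHROME_ENABLED': True}                  # 8. plugin config.json
    file_config     = {'TIMEOUT': 999}                          # 7. ArchiveBox.conf
    machine_config  = {'WGET_BINARY': '/usr/bin/wget'}          # 6. Machine.current().config
    env_config      = dict(os.environ)                          # 5. environment variables
    crawl_config    = {'TIMEOUT': 777}                          # 2. crawl.config JSON field
    snapshot_config = {'TIMEOUT': 555, 'SAVE_WGET': False}      # 1. snapshot.config JSON field

    merged = merge_config([core_defaults, plugin_defaults, file_config,
                           machine_config, env_config, crawl_config, snapshot_config])
    assert merged['TIMEOUT'] == 555 and merged['SAVE_WGET'] is False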

+ 1 - 0
archivebox/config/constants.py

@@ -176,6 +176,7 @@ class ConstantsDict(Mapping):
         CRONTABS_DIR_NAME,
         "invalid",
         "users",
+        "machine",
         # Backwards compatibility with old directory names
         "user_plugins",          # old name for USER_PLUGINS_DIR (now 'plugins')
         "user_templates",        # old name for CUSTOM_TEMPLATES_DIR (now 'templates')

+ 15 - 3
archivebox/core/migrations/0023_upgrade_to_0_9_0.py

@@ -15,6 +15,7 @@ def get_table_columns(table_name):
 
 def upgrade_core_tables(apps, schema_editor):
     """Upgrade core tables from v0.7.2 or v0.8.6rc0 to v0.9.0."""
+    from archivebox.uuid_compat import uuid7
     cursor = connection.cursor()
 
     # Check if core_archiveresult table exists
@@ -60,8 +61,8 @@ def upgrade_core_tables(apps, schema_editor):
 
     if has_data:
         if has_uuid and not has_abid:
-            # Migrating from v0.7.2 (has uuid, minimal fields)
-            print('Migrating ArchiveResult from v0.7.2 schema...')
+            # Migrating from v0.7.2+ (has uuid column)
+            print('Migrating ArchiveResult from v0.7.2+ schema (with uuid)...')
             cursor.execute("""
                 INSERT OR IGNORE INTO core_archiveresult_new (
                     id, uuid, snapshot_id, cmd, pwd, cmd_version,
@@ -86,7 +87,18 @@ def upgrade_core_tables(apps, schema_editor):
                 FROM core_archiveresult;
             """)
         else:
-            print(f'Warning: Unexpected schema - has_uuid={has_uuid}, has_abid={has_abid}')
+            # Migrating from v0.7.2 (no uuid or abid column - generate fresh UUIDs)
+            print('Migrating ArchiveResult from v0.7.2 schema (no uuid - generating UUIDs)...')
+            cursor.execute("SELECT id, snapshot_id, cmd, pwd, cmd_version, start_ts, end_ts, status, extractor, output FROM core_archiveresult")
+            old_records = cursor.fetchall()
+            for record in old_records:
+                new_uuid = uuid7().hex
+                cursor.execute("""
+                    INSERT OR IGNORE INTO core_archiveresult_new (
+                        id, uuid, snapshot_id, cmd, pwd, cmd_version,
+                        start_ts, end_ts, status, extractor, output
+                    ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+                """, (record[0], new_uuid, record[1], record[2], record[3], record[4], record[5], record[6], record[7], record[8], record[9]))
 
     cursor.execute("DROP TABLE IF EXISTS core_archiveresult;")
     cursor.execute("ALTER TABLE core_archiveresult_new RENAME TO core_archiveresult;")

+ 1 - 0
archivebox/core/migrations/0025_alter_archiveresult_options_alter_snapshot_options_and_more.py

@@ -33,6 +33,7 @@ def copy_old_fields_to_new(apps, schema_editor):
 
     # NOTE: Snapshot timestamps (added→bookmarked_at, updated→modified_at) were already
     # transformed by migration 0023, so we don't need to copy them here.
+    # NOTE: UUIDs are already populated by migration 0023 for all migration paths
 
     # Debug: Check Snapshot timestamps at end of RunPython
     cursor.execute("SELECT id, bookmarked_at, modified_at FROM core_snapshot LIMIT 2")

+ 69 - 46
archivebox/core/migrations/0029_migrate_archiveresult_to_uuid_pk.py

@@ -8,12 +8,20 @@ from archivebox.uuid_compat import uuid7
 
 def migrate_archiveresult_id_to_uuid(apps, schema_editor):
     """
-    Migrate ArchiveResult from integer PK to UUID PK.
+    Migrate ArchiveResult from integer PK to UUID PK (clean one-step migration).
+
+    Handles both migration paths:
+    - 0.7.x: ArchiveResult has integer id, NO uuid field → generate new UUIDs
+    - 0.8.x: ArchiveResult has integer id + optional uuid field → reuse existing UUIDs
 
     Strategy:
-    1. Add old_id field to store current integer IDs
-    2. Generate UUIDs for any records missing them
-    3. Swap id and uuid fields (uuid becomes PK, old integer id becomes old_id)
+    1. Create new table with UUID as primary key (no temporary columns)
+    2. Generate UUIDs for records missing them (0.7.x) or reuse existing (0.8.x)
+    3. Copy all data with UUID as new id
+    4. Drop old table, rename new table
+    5. Recreate indexes
+
+    Result: Clean schema with ONLY id as UUIDField (no old_id, no uuid)
     """
     cursor = connection.cursor()
 
@@ -26,11 +34,13 @@ def migrate_archiveresult_id_to_uuid(apps, schema_editor):
     cursor.execute("SELECT COUNT(*) FROM core_archiveresult")
     row_count = cursor.fetchone()[0]
 
-    if row_count == 0:
-        print('No ArchiveResult records to migrate')
-        return
+    # Don't skip if table is empty - we still need to recreate to remove uuid column
+    # (fresh installs create table with uuid from 0025, but model expects no uuid after 0029)
 
-    print(f'Migrating {row_count} ArchiveResult records from integer PK to UUID PK...')
+    if row_count == 0:
+        print('[0029] Recreating ArchiveResult table schema (integer→UUID PK, removing uuid column)...')
+    else:
+        print(f'[0029] Migrating {row_count} ArchiveResult records from integer PK to UUID PK...')
 
     # Step 0: Check if machine_process table exists, if not NULL out process_id values
     cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='machine_process'")
@@ -40,12 +50,10 @@ def migrate_archiveresult_id_to_uuid(apps, schema_editor):
         print('machine_process table does not exist yet, setting process_id to NULL')
         cursor.execute("UPDATE core_archiveresult SET process_id = NULL WHERE process_id IS NOT NULL")
 
-    # Step 1: Create new table with UUID as primary key
+    # Step 1: Create new table with UUID as primary key (clean - no old_id or uuid columns)
     cursor.execute("""
         CREATE TABLE core_archiveresult_new (
             id TEXT PRIMARY KEY NOT NULL,
-            old_id INTEGER,
-            uuid TEXT UNIQUE,
             created_at DATETIME NOT NULL,
             modified_at DATETIME NOT NULL,
 
@@ -78,28 +86,36 @@ def migrate_archiveresult_id_to_uuid(apps, schema_editor):
     """)
 
     # Step 2: Generate UUIDs for records that don't have them
-    cursor.execute("SELECT id, uuid FROM core_archiveresult")
-    records = cursor.fetchall()
-
-    id_to_uuid = {}
-    for old_id, existing_uuid in records:
-        if existing_uuid:
-            # Normalize existing UUID to 32-char hex format (Django SQLite UUIDField format)
-            # (existing UUIDs might be stored with or without dashes in old schema)
-            id_to_uuid[old_id] = UUID(existing_uuid).hex
-        else:
-            # Generate new UUIDv7 (time-ordered) as 32-char hex
-            id_to_uuid[old_id] = uuid7().hex
+    # Check if uuid column exists (0.8.x has it, 0.7.x doesn't)
+    cursor.execute("PRAGMA table_info(core_archiveresult)")
+    columns = cursor.fetchall()
+    col_names = [col[1] for col in columns]
+    has_uuid_column = 'uuid' in col_names
+
+    if has_uuid_column:
+        cursor.execute("SELECT id, uuid FROM core_archiveresult")
+        records = cursor.fetchall()
+        id_to_uuid = {}
+        for old_id, existing_uuid in records:
+            if existing_uuid:
+                # Normalize existing UUID to 32-char hex format (Django SQLite UUIDField format)
+                # (existing UUIDs might be stored with or without dashes in old schema)
+                id_to_uuid[old_id] = UUID(existing_uuid).hex
+            else:
+                # Generate new UUIDv7 (time-ordered) as 32-char hex
+                id_to_uuid[old_id] = uuid7().hex
+    else:
+        # 0.7.x path: no uuid column, generate new UUIDs for all records
+        cursor.execute("SELECT id FROM core_archiveresult")
+        records = cursor.fetchall()
+        id_to_uuid = {old_id: uuid7().hex for (old_id,) in records}
 
     # Step 3: Copy data with UUIDs as new primary key
     cursor.execute("SELECT * FROM core_archiveresult")
     old_records = cursor.fetchall()
 
-    # Get column names
-    cursor.execute("PRAGMA table_info(core_archiveresult)")
-    columns = cursor.fetchall()
-    col_names = [col[1] for col in columns]
-
+    # col_names already fetched in Step 2
+    inserted_count = 0
     for i, record in enumerate(old_records):
         old_id = record[col_names.index('id')]
         new_uuid = id_to_uuid[old_id]
@@ -107,7 +123,7 @@ def migrate_archiveresult_id_to_uuid(apps, schema_editor):
         # Build insert with new structure
         values = {col_names[i]: record[i] for i in range(len(col_names))}
 
-        # Check which fields exist in new table
+        # Fields to copy into the new schema (everything except id; old_id and uuid no longer exist)
         fields_to_copy = [
             'created_at', 'modified_at', 'snapshot_id', 'plugin', 'hook_name',
             'status', 'retry_at', 'start_ts', 'end_ts',
@@ -115,17 +131,31 @@ def migrate_archiveresult_id_to_uuid(apps, schema_editor):
             'config', 'notes', 'num_uses_succeeded', 'num_uses_failed', 'process_id'
         ]
 
-        # Build INSERT statement
+        # Build INSERT statement (only copy fields that exist in source)
         existing_fields = [f for f in fields_to_copy if f in values]
-        placeholders = ', '.join(['?'] * (len(existing_fields) + 3))  # +3 for id, old_id, uuid
-        field_list = 'id, old_id, uuid, ' + ', '.join(existing_fields)
 
-        insert_values = [new_uuid, old_id, new_uuid] + [values.get(f) for f in existing_fields]
+        if i == 0:
+            print(f'[0029] Source columns: {col_names}')
+            print(f'[0029] Copying fields: {existing_fields}')
+
+        placeholders = ', '.join(['?'] * (len(existing_fields) + 1))  # +1 for id
+        field_list = 'id, ' + ', '.join(existing_fields)
 
-        cursor.execute(
-            f"INSERT INTO core_archiveresult_new ({field_list}) VALUES ({placeholders})",
-            insert_values
-        )
+        insert_values = [new_uuid] + [values.get(f) for f in existing_fields]
+
+        try:
+            cursor.execute(
+                f"INSERT INTO core_archiveresult_new ({field_list}) VALUES ({placeholders})",
+                insert_values
+            )
+            inserted_count += 1
+        except Exception as e:
+            print(f'[0029] ERROR inserting record {old_id}: {e}')
+            if i == 0:
+                print(f'[0029] First record values: {insert_values[:5]}...')
+                raise
+
+    print(f'[0029] Inserted {inserted_count}/{len(old_records)} records')
 
     # Step 4: Replace old table with new table
     cursor.execute("DROP TABLE core_archiveresult")
@@ -139,7 +169,6 @@ def migrate_archiveresult_id_to_uuid(apps, schema_editor):
     cursor.execute("CREATE INDEX core_archiveresult_created_at_idx ON core_archiveresult(created_at)")
     cursor.execute("CREATE INDEX core_archiveresult_hook_name_idx ON core_archiveresult(hook_name)")
     cursor.execute("CREATE INDEX core_archiveresult_process_id_idx ON core_archiveresult(process_id)")
-    cursor.execute("CREATE INDEX core_archiveresult_old_id_idx ON core_archiveresult(old_id)")
 
     print(f'✓ Migrated {row_count} ArchiveResult records to UUID primary key')
 
@@ -159,23 +188,17 @@ class Migration(migrations.Migration):
                 ),
             ],
             state_operations=[
-                # Remove old uuid field
+                # Remove uuid field (was added in 0025, we're merging it into id)
                 migrations.RemoveField(
                     model_name='archiveresult',
                     name='uuid',
                 ),
-                # Change id from AutoField to UUIDField
+                # Change id from AutoField to UUIDField (absorbing the uuid field)
                 migrations.AlterField(
                     model_name='archiveresult',
                     name='id',
                     field=models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True),
                 ),
-                # Add old_id field to preserve legacy integer IDs
-                migrations.AddField(
-                    model_name='archiveresult',
-                    name='old_id',
-                    field=models.IntegerField(null=True, blank=True, db_index=True, help_text='Legacy integer ID from pre-0.9.0 versions'),
-                ),
             ],
         ),
     ]
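
A quick way to sanity-check the post-0029 schema on an upgraded database (hypothetical helper, not part of the commit; adjust the db path to your DATA_DIR):

    import sqlite3

    def assert_clean_archiveresult_schema(db_path: str) -> None:
        conn = sqlite3.connect(db_path)
        info = conn.execute('PRAGMA table_info(core_archiveresult)').fetchall()
        conn.close()
        # table_info rows are (cid, name, type, notnull, dflt_value, pk)
        cols = {name: (col_type, pk) for _, name, col_type, _, _, pk in info}
        assert 'uuid' not in cols and 'old_id' not in cols, f'leftover columns: {sorted(cols)}'
        col_type, pk = cols['id']
        assert col_type == 'TEXT' and pk == 1, f'id should be TEXT primary key, got {col_type!r}'

    assert_clean_archiveresult_schema('data/index.sqlite3')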

+ 4 - 7
archivebox/core/models.py

@@ -1354,7 +1354,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
     def domain(self) -> str:
         return url_domain(self.url)
 
-    @cached_property
+    @property
     def output_dir(self):
         """The filesystem path to the snapshot's output directory."""
         import os
@@ -1435,8 +1435,8 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
                 print(f'[yellow]🔪 Killed {killed_count} process(es) for hook {process.pid}[/yellow]')
 
         # Clean up .pid files from output directory
-        if self.OUTPUT_DIR.exists():
-            for pid_file in self.OUTPUT_DIR.glob('**/*.pid'):
+        if Path(self.output_dir).exists():
+            for pid_file in Path(self.output_dir).glob('**/*.pid'):
                 pid_file.unlink(missing_ok=True)
 
         # Update all STARTED ArchiveResults from filesystem
@@ -2263,9 +2263,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
 
     # UUID primary key (migrated from integer in 0029)
     id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
-    # old_id preserves the legacy integer ID for backward compatibility
-    old_id = models.IntegerField(null=True, blank=True, db_index=True, help_text='Legacy integer ID from pre-0.9.0 versions')
-    # Note: uuid field was removed in migration 0029 when id became UUID
     created_at = models.DateTimeField(default=timezone.now, db_index=True)
     modified_at = models.DateTimeField(auto_now=True)
 
@@ -2494,7 +2491,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
 
     @property
     def output_dir_parent(self) -> str:
-        return str(self.snapshot.OUTPUT_DIR.relative_to(CONSTANTS.DATA_DIR))
+        return str(Path(self.snapshot.output_dir).relative_to(CONSTANTS.DATA_DIR))
 
     # Properties that delegate to Process model (for backwards compatibility)
     # These properties will replace the direct fields after migration is complete
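
Note that output_dir_parent depends on Path.relative_to(), which raises ValueError if a snapshot's output dir ever falls outside DATA_DIR, so this change implicitly assumes all snapshot output stays under the data dir:

    from pathlib import Path

    DATA_DIR = Path('/data')  # stand-in for CONSTANTS.DATA_DIR
    snapshot_dir = DATA_DIR / 'archive' / '1700000000.0'
    print(str(snapshot_dir.relative_to(DATA_DIR)))  # archive/1700000000.0
    # Path('/tmp/elsewhere').relative_to(DATA_DIR) would raise ValueError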

+ 9 - 6
archivebox/crawls/models.py

@@ -180,7 +180,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
         return crawl
 
     @property
-    def OUTPUT_DIR(self) -> Path:
+    def output_dir(self) -> Path:
         """
         Construct output directory: users/{username}/crawls/{YYYYMMDD}/{domain}/{crawl-id}
         Domain is extracted from the first URL in the crawl.
@@ -383,7 +383,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
                 f.flush()
             hook_start = time.time()
             plugin_name = hook.parent.name
-            output_dir = self.OUTPUT_DIR / plugin_name
+            output_dir = self.output_dir / plugin_name
             output_dir.mkdir(parents=True, exist_ok=True)
 
             # Run hook using Process.launch() - returns Process model
@@ -427,7 +427,10 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
             f.write(f'Created {len(created_snapshots)} snapshots\n')
             f.write(f'=== Crawl.run() complete ===\n\n')
             f.flush()
-        return created_snapshots[0] if created_snapshots else None
+
+        # Return first snapshot for this crawl (newly created or existing)
+        # This ensures the crawl doesn't seal if snapshots exist, even if they weren't just created
+        return self.snapshot_set.first()
 
     def is_finished(self) -> bool:
         """Check if crawl is finished (all snapshots sealed or no snapshots exist)."""
@@ -467,8 +470,8 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
                 print(f'[yellow]🔪 Killed {killed_count} orphaned crawl hook process(es)[/yellow]')
 
         # Clean up .pid files from output directory
-        if self.OUTPUT_DIR.exists():
-            for pid_file in self.OUTPUT_DIR.glob('**/*.pid'):
+        if self.output_dir.exists():
+            for pid_file in self.output_dir.glob('**/*.pid'):
                 pid_file.unlink(missing_ok=True)
 
         # Run on_CrawlEnd hooks
@@ -479,7 +482,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
 
         for hook in hooks:
             plugin_name = hook.parent.name
-            output_dir = self.OUTPUT_DIR / plugin_name
+            output_dir = self.output_dir / plugin_name
             output_dir.mkdir(parents=True, exist_ok=True)
 
             process = run_hook(
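
For reference, the path scheme from the output_dir docstring above could be assembled like this (a sketch; the helper name, username handling, and date source are illustrative):

    from datetime import datetime, timezone
    from pathlib import Path
    from urllib.parse import urlparse

    DATA_DIR = Path('./data')  # assumption: from settings

    def crawl_output_dir(username: str, created_at: datetime, first_url: str, crawl_id: str) -> Path:
        # users/{username}/crawls/{YYYYMMDD}/{domain}/{crawl-id}
        day = created_at.strftime('%Y%m%d')
        domain = urlparse(first_url).netloc or 'unknown'
        return DATA_DIR / 'users' / username / 'crawls' / day / domain / crawl_id

    print(crawl_output_dir('admin', datetime.now(timezone.utc),
                           'https://example.com/page', 'some-crawl-uuid'))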

+ 3 - 3
archivebox/hooks.py

@@ -207,7 +207,7 @@ def discover_hooks(
         # Get merged config if not provided (lazy import to avoid circular dependency)
         if config is None:
             from archivebox.config.configset import get_config
-            config = get_config(scope='global')
+            config = get_config()
 
         enabled_hooks = []
 
@@ -703,7 +703,7 @@ def get_enabled_plugins(config: Optional[Dict[str, Any]] = None) -> List[str]:
     # Get merged config if not provided
     if config is None:
         from archivebox.config.configset import get_config
-        config = get_config(scope='global')
+        config = get_config()
 
     # Support explicit ENABLED_PLUGINS override (legacy)
     if 'ENABLED_PLUGINS' in config:
@@ -967,9 +967,9 @@ def get_plugin_special_config(plugin_name: str, config: Dict[str, Any]) -> Dict[
     else:
         # No PLUGINS whitelist - use PLUGINNAME_ENABLED (default True)
         import sys
-        print(f"DEBUG: NO PLUGINS whitelist in config, checking {plugin_upper}_ENABLED", file=sys.stderr)
         enabled_key = f'{plugin_upper}_ENABLED'
         enabled = config.get(enabled_key)
+        print(f"DEBUG: NO PLUGINS whitelist in config, checking {enabled_key}={enabled}", file=sys.stderr)
         if enabled is None:
             enabled = True
         elif isinstance(enabled, str):

+ 126 - 4
archivebox/machine/models.py

@@ -378,7 +378,7 @@ class Binary(ModelWithHealthStats):
         return None
 
     @property
-    def OUTPUT_DIR(self):
+    def output_dir(self):
         """Return the output directory for this binary installation."""
         from pathlib import Path
         from django.conf import settings
@@ -412,10 +412,10 @@ class Binary(ModelWithHealthStats):
         from archivebox.config.configset import get_config
 
         # Get merged config (Binary doesn't have crawl/snapshot context)
-        config = get_config(scope='global')
+        config = get_config()
 
         # Create output directory
-        output_dir = self.OUTPUT_DIR
+        output_dir = self.output_dir
         output_dir.mkdir(parents=True, exist_ok=True)
         self.output_dir = str(output_dir)
         self.save()
@@ -514,7 +514,7 @@ class Binary(ModelWithHealthStats):
                 print(f'[yellow]🔪 Killed {killed_count} binary installation hook process(es)[/yellow]')
 
         # Clean up .pid files from output directory
-        output_dir = self.OUTPUT_DIR
+        output_dir = self.output_dir
         if output_dir.exists():
             for pid_file in output_dir.glob('**/*.pid'):
                 pid_file.unlink(missing_ok=True)
@@ -1276,6 +1276,128 @@ class Process(models.Model):
         """Path to stderr log."""
         return Path(self.pwd) / 'stderr.log' if self.pwd else None
 
+    def tail_stdout(self, lines: int = 50, follow: bool = False):
+        """
+        Tail stdout log file (like `tail` or `tail -f`).
+
+        Args:
+            lines: Number of lines to show (default 50)
+            follow: If True, follow the file and yield new lines as they appear
+
+        Yields:
+            Lines from stdout
+        """
+        if not self.stdout_file or not self.stdout_file.exists():
+            return
+
+        if follow:
+            # Follow mode - yield new lines as they appear (tail -f)
+            import time
+            with open(self.stdout_file, 'r') as f:
+                # Seek to end minus roughly 'lines' worth of bytes
+                f.seek(0, 2)  # Seek to end
+                file_size = f.tell()
+                # Rough estimate: 100 bytes per line
+                seek_pos = max(0, file_size - (lines * 100))
+                f.seek(seek_pos)
+
+                # Skip partial line if we seeked to middle
+                if seek_pos > 0:
+                    f.readline()
+
+                # Yield existing lines
+                for line in f:
+                    yield line.rstrip('\n')
+
+                # Now follow for new lines
+                while True:
+                    line = f.readline()
+                    if line:
+                        yield line.rstrip('\n')
+                    else:
+                        time.sleep(0.1)  # Wait before checking again
+        else:
+            # Just get last N lines (tail -n)
+            try:
+                content = self.stdout_file.read_text()
+                for line in content.splitlines()[-lines:]:
+                    yield line
+            except Exception:
+                return
+
+    def tail_stderr(self, lines: int = 50, follow: bool = False):
+        """
+        Tail stderr log file (like `tail` or `tail -f`).
+
+        Args:
+            lines: Number of lines to show (default 50)
+            follow: If True, follow the file and yield new lines as they appear
+
+        Yields:
+            Lines from stderr
+        """
+        if not self.stderr_file or not self.stderr_file.exists():
+            return
+
+        if follow:
+            # Follow mode - yield new lines as they appear (tail -f)
+            import time
+            with open(self.stderr_file, 'r') as f:
+                # Seek to end minus roughly 'lines' worth of bytes
+                f.seek(0, 2)  # Seek to end
+                file_size = f.tell()
+                # Rough estimate: 100 bytes per line
+                seek_pos = max(0, file_size - (lines * 100))
+                f.seek(seek_pos)
+
+                # Skip partial line if we seeked to middle
+                if seek_pos > 0:
+                    f.readline()
+
+                # Yield existing lines
+                for line in f:
+                    yield line.rstrip('\n')
+
+                # Now follow for new lines
+                while True:
+                    line = f.readline()
+                    if line:
+                        yield line.rstrip('\n')
+                    else:
+                        time.sleep(0.1)  # Wait before checking again
+        else:
+            # Just get last N lines (tail -n)
+            try:
+                content = self.stderr_file.read_text()
+                for line in content.splitlines()[-lines:]:
+                    yield line
+            except Exception:
+                return
+
+    def pipe_stdout(self, lines: int = 10, follow: bool = True):
+        """
+        Pipe stdout to sys.stdout.
+
+        Args:
+            lines: Number of initial lines to show
+            follow: If True, follow the file and print new lines as they appear
+        """
+        import sys
+        for line in self.tail_stdout(lines=lines, follow=follow):
+            print(line, file=sys.stdout, flush=True)
+
+    def pipe_stderr(self, lines: int = 10, follow: bool = True):
+        """
+        Pipe stderr to sys.stderr.
+
+        Args:
+            lines: Number of initial lines to show
+            follow: If True, follow the file and print new lines as they appear
+        """
+        import sys
+        for line in self.tail_stderr(lines=lines, follow=follow):
+            print(line, file=sys.stderr, flush=True)
+
     def _write_pid_file(self) -> None:
         """Write PID file with mtime set to process start time."""
         if self.pid and self.started_at and self.pid_file:
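
tail_stdout() and tail_stderr() above are identical except for the file they read; a possible follow-up would be routing both through one private generator (a sketch, not part of this commit):

    import time
    from pathlib import Path
    from typing import Iterator, Optional

    def _tail_file(path: Optional[Path], lines: int = 50, follow: bool = False) -> Iterator[str]:
        # Yield the last `lines` lines of `path`, optionally following like `tail -f`.
        if not path or not path.exists():
            return
        if not follow:
            yield from path.read_text().splitlines()[-lines:]
            return
        with open(path, 'r') as f:
            f.seek(0, 2)
            seek_pos = max(0, f.tell() - lines * 100)  # same ~100 bytes/line heuristic as above
            f.seek(seek_pos)
            if seek_pos > 0:
                f.readline()  # discard partial line
            while True:
                line = f.readline()
                if line:
                    yield line.rstrip('\n')
                else:
                    time.sleep(0.1)

    # then e.g.: def tail_stdout(self, lines=50, follow=False):
    #                yield from _tail_file(self.stdout_file, lines, follow)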

+ 6 - 0
archivebox/plugins/chrome/config.json

@@ -3,6 +3,12 @@
   "type": "object",
   "additionalProperties": false,
   "properties": {
+    "CHROME_ENABLED": {
+      "type": "boolean",
+      "default": true,
+      "x-aliases": ["USE_CHROME"],
+      "description": "Enable Chrome/Chromium browser integration for archiving"
+    },
     "CHROME_BINARY": {
       "type": "string",
       "default": "chromium",

+ 13 - 5
archivebox/plugins/screenshot/tests/test_screenshot.py

@@ -201,16 +201,18 @@ def test_config_save_screenshot_false_skips():
     """Test that SCREENSHOT_ENABLED=False exits without emitting JSONL."""
     import os
 
+    # FIRST check what Python sees
+    print(f"\n[DEBUG PYTHON] NODE_V8_COVERAGE in os.environ: {'NODE_V8_COVERAGE' in os.environ}")
+    print(f"[DEBUG PYTHON] Value: {os.environ.get('NODE_V8_COVERAGE', 'NOT SET')}")
+
     with tempfile.TemporaryDirectory() as tmpdir:
         tmpdir = Path(tmpdir)
         env = os.environ.copy()
         env['SCREENSHOT_ENABLED'] = 'False'
 
-        # DEBUG: Check if NODE_V8_COVERAGE is in env
-        if 'NODE_V8_COVERAGE' in env:
-            print(f"\n[DEBUG] NODE_V8_COVERAGE in env: {env['NODE_V8_COVERAGE']}")
-        else:
-            print("\n[DEBUG] NODE_V8_COVERAGE NOT in env")
+        # Check what's in the copied env
+        print(f"[DEBUG ENV COPY] NODE_V8_COVERAGE in env: {'NODE_V8_COVERAGE' in env}")
+        print(f"[DEBUG ENV COPY] Value: {env.get('NODE_V8_COVERAGE', 'NOT SET')}")
 
         result = subprocess.run(
             ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=test999'],
@@ -221,6 +223,12 @@ def test_config_save_screenshot_false_skips():
             timeout=30
         )
 
+        print(f"[DEBUG RESULT] Exit code: {result.returncode}")
+        print(f"[DEBUG RESULT] Stderr: {result.stderr[:200]}")
+
+        # FORCE FAILURE to verify test actually runs
+        assert False, f"FORCED FAILURE - NODE_V8_COVERAGE={'NODE_V8_COVERAGE' in env} value={env.get('NODE_V8_COVERAGE', 'NOTSET')}"
+
         assert result.returncode == 0, f"Should exit 0 when feature disabled: {result.stderr}"
 
         # Feature disabled - temporary failure, should NOT emit JSONL

+ 1 - 1
archivebox/tests/test_migrations_07_to_09.py

@@ -136,7 +136,7 @@ class TestMigrationFrom07x(unittest.TestCase):
         result = run_archivebox(self.work_dir, ['init'], timeout=45)
         self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
 
-        result = run_archivebox(self.work_dir, ['list'])
+        result = run_archivebox(self.work_dir, ['snapshot', 'list'])
         self.assertEqual(result.returncode, 0, f"List failed after migration: {result.stderr}")
 
         # Verify ALL snapshots appear in output

+ 481 - 0
archivebox/tests/test_worker_config_propagation.py

@@ -0,0 +1,481 @@
+"""
+Integration test for config propagation through worker hierarchy.
+
+Tests that config is properly merged and passed through:
+    Parent CLI/Orchestrator
+    └── CrawlWorker subprocess (via Process.env)
+        └── SnapshotWorker subprocess (via Process.env)
+            └── Hook subprocess (via Process.env)
+
+Config priority order (highest to lowest):
+1. Snapshot.config (JSON field)
+2. Crawl.config (JSON field)
+3. User.config (JSON field)
+4. Environment variables (os.environ + Process.env)
+5. Config file (ArchiveBox.conf)
+6. Plugin defaults (config.json)
+7. Core defaults
+"""
+
+import os
+import json
+import tempfile
+import subprocess
+import time
+from pathlib import Path
+
+
+def test_config_propagation_through_worker_hierarchy():
+    """
+    Integration test: Verify config is properly merged at every level.
+
+    Test flow:
+    1. Create test archive with custom config in ArchiveBox.conf
+    2. Set custom env vars before spawning worker
+    3. Create Crawl with custom crawl.config JSON field
+    4. Create Snapshot with custom snapshot.config JSON field
+    5. Spawn SnapshotWorker via archivebox run --snapshot-id=...
+    6. Verify worker received merged config from all sources
+    7. Verify hook subprocess also received correct config
+    """
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        data_dir = Path(tmpdir) / 'test_archive'
+        data_dir.mkdir()
+
+        print(f"\n{'='*80}")
+        print(f"Test: Config Propagation Through Worker Hierarchy")
+        print(f"DATA_DIR: {data_dir}")
+        print(f"{'='*80}\n")
+
+        # Step 1: Initialize archive
+        print("Step 1: Initialize archive")
+        result = subprocess.run(
+            ['python', '-m', 'archivebox', 'init'],
+            cwd=str(data_dir),
+            env={
+                **os.environ,
+                'DATA_DIR': str(data_dir),
+                'USE_COLOR': 'False',
+            },
+            capture_output=True,
+            timeout=60,
+        )
+        assert result.returncode == 0, f"Init failed: {result.stderr.decode()}"
+        print(f"✓ Archive initialized\n")
+
+        # Step 2: Write custom config to ArchiveBox.conf
+        print("Step 2: Write custom config to ArchiveBox.conf")
+        config_file = data_dir / 'ArchiveBox.conf'
+        config_file.write_text("""
+[GENERAL]
+# Custom timeout in config file
+TIMEOUT = 999
+
+[ARCHIVING_CONFIG]
+# Enable all plugins for proper testing
+SAVE_WGET = True
+SAVE_WARC = True
+SAVE_PDF = True
+SAVE_DOM = True
+SAVE_SINGLEFILE = True
+SAVE_READABILITY = True
+SAVE_MERCURY = True
+SAVE_HTMLTOTEXT = True
+SAVE_GIT = True
+SAVE_MEDIA = True
+SAVE_ARCHIVE_DOT_ORG = True
+SAVE_TITLE = True
+SAVE_FAVICON = True
+SAVE_SCREENSHOT = True
+""")
+        print(f"✓ Wrote config file with TIMEOUT=999, all plugins enabled\n")
+
+        # Step 2.5: Set Machine.config values
+        print("Step 2.5: Set Machine.config with custom binary path")
+        set_machine_config_script = f"""
+import os
+os.environ['DATA_DIR'] = '{data_dir}'
+
+from archivebox.config.django import setup_django
+setup_django()
+
+from archivebox.machine.models import Machine
+
+machine = Machine.current()
+machine.config = {{
+    'CUSTOM_MACHINE_KEY': 'from_machine_config',
+    'WGET_BINARY': '/custom/machine/wget',  # Machine-specific binary path
+}}
+machine.save()
+print(f"Machine {{machine.hostname}} config updated")
+"""
+        result = subprocess.run(
+            ['python', '-c', set_machine_config_script],
+            cwd=str(data_dir.parent),
+            env={
+                **os.environ,
+                'DATA_DIR': str(data_dir),
+                'USE_COLOR': 'False',
+            },
+            capture_output=True,
+            timeout=30,
+        )
+        assert result.returncode == 0, f"Set machine config failed: {result.stderr.decode()}"
+        print(f"✓ Set Machine.config with CUSTOM_MACHINE_KEY=from_machine_config, WGET_BINARY=/custom/machine/wget\n")
+
+        # Step 3: Create Crawl via Django ORM with custom crawl.config
+        print("Step 3: Create Crawl with custom crawl.config JSON")
+        create_crawl_script = f"""
+import os
+os.environ['DATA_DIR'] = '{data_dir}'
+
+from archivebox.config.django import setup_django
+setup_django()
+
+from django.utils import timezone
+from archivebox.crawls.models import Crawl
+
+# Create crawl with custom config
+crawl = Crawl.objects.create(
+    status='queued',
+    retry_at=timezone.now(),
+    urls='https://example.com',
+    config={{
+        'TIMEOUT': 777,  # Crawl-level override (higher priority than file)
+        'CUSTOM_CRAWL_KEY': 'from_crawl_json',
+    }}
+)
+print(crawl.id)
+"""
+        result = subprocess.run(
+            ['python', '-c', create_crawl_script],
+            cwd=str(data_dir.parent),
+            env={
+                **os.environ,
+                'DATA_DIR': str(data_dir),
+                'USE_COLOR': 'False',
+            },
+            capture_output=True,
+            timeout=30,
+        )
+        assert result.returncode == 0, f"Create crawl failed: {result.stderr.decode()}"
+        # Extract UUID from output (last line should be the UUID)
+        crawl_id = result.stdout.decode().strip().split('\n')[-1]
+        print(f"✓ Created crawl {crawl_id} with TIMEOUT=777, CUSTOM_CRAWL_KEY=from_crawl_json\n")
+
+        # Step 4: Create Snapshot with custom snapshot.config
+        print("Step 4: Create Snapshot with custom snapshot.config JSON")
+        create_snapshot_script = f"""
+import os
+os.environ['DATA_DIR'] = '{data_dir}'
+
+from archivebox.config.django import setup_django
+setup_django()
+
+from django.utils import timezone
+from archivebox.core.models import Snapshot
+from archivebox.crawls.models import Crawl
+
+crawl = Crawl.objects.get(id='{crawl_id}')
+snapshot = Snapshot.objects.create(
+    url='https://example.com',
+    crawl=crawl,
+    status='queued',
+    retry_at=timezone.now(),
+    config={{
+        'TIMEOUT': 555,  # Snapshot-level override (highest priority)
+        'CUSTOM_SNAPSHOT_KEY': 'from_snapshot_json',
+        'SAVE_SCREENSHOT': True,  # Keep screenshot enabled
+        'SAVE_WGET': False,  # But disable wget as a test of per-snapshot override
+    }}
+)
+print(snapshot.id)
+"""
+        result = subprocess.run(
+            ['python', '-c', create_snapshot_script],
+            cwd=str(data_dir.parent),
+            env={
+                **os.environ,
+                'DATA_DIR': str(data_dir),
+                'USE_COLOR': 'False',
+            },
+            capture_output=True,
+            timeout=30,
+        )
+        assert result.returncode == 0, f"Create snapshot failed: {result.stderr.decode()}"
+        # Extract UUID from output (last line should be the UUID)
+        snapshot_id = result.stdout.decode().strip().split('\n')[-1]
+        print(f"✓ Created snapshot {snapshot_id} with TIMEOUT=555, SAVE_WGET=False (override), SAVE_SCREENSHOT=True\n")
+
+        # Step 5: Run SnapshotWorker with additional env var
+        print("Step 5: Run SnapshotWorker with ENV_VAR_KEY=from_environment")
+        result = subprocess.run(
+            ['python', '-m', 'archivebox', 'run', '--snapshot-id', snapshot_id],
+            cwd=str(data_dir),
+            env={
+                **os.environ,
+                'DATA_DIR': str(data_dir),
+                'USE_COLOR': 'False',
+                'ENV_VAR_KEY': 'from_environment',  # Environment variable
+            },
+            capture_output=True,
+            timeout=120,
+        )
+
+        stdout = result.stdout.decode()
+        stderr = result.stderr.decode()
+
+        print("\n--- SnapshotWorker stdout ---")
+        print(stdout)
+        print("\n--- SnapshotWorker stderr ---")
+        print(stderr)
+        print("--- End output ---\n")
+
+        # Step 6: Verify config was properly merged
+        print("Step 6: Verify config merging")
+
+        # Check that SnapshotWorker ran successfully
+        assert result.returncode == 0, f"SnapshotWorker failed with exit code {result.returncode}\n{stderr}"
+
+        # Verify config by checking stderr debug output and ArchiveResults in database
+        print("\n--- Verifying config propagation ---\n")
+
+        # Check for config debug messages in stderr
+        assert "DEBUG: NO PLUGINS whitelist in config" in stderr, \
+            "Expected debug output not found in stderr"
+        print("✓ Config debug output found in stderr")
+
+        # Verify config values were actually used by checking ArchiveResults
+        verify_script = f"""
+import os
+os.environ['DATA_DIR'] = '{data_dir}'
+
+from archivebox.config.django import setup_django
+setup_django()
+
+from archivebox.core.models import Snapshot, ArchiveResult
+from archivebox.config.configset import get_config
+
+snapshot = Snapshot.objects.get(id='{snapshot_id}')
+print(f"Snapshot status: {{snapshot.status}}")
+print(f"Snapshot URL: {{snapshot.url}}")
+
+# Check that snapshot reached sealed state
+assert snapshot.status == 'sealed', f"Expected sealed, got {{snapshot.status}}"
+
+# Verify all config sources are present in merged config
+print("\\nVerifying config merge priority:")
+config = get_config(snapshot=snapshot)
+
+# 1. Snapshot.config (highest priority)
+timeout = config.get('TIMEOUT')
+print(f"  1. Snapshot.config: TIMEOUT={timeout} (expected: 555)")
+assert timeout == 555, f"TIMEOUT should be 555 from snapshot.config, got {{timeout}}"
+
+wget_enabled = config.get('SAVE_WGET')
+print(f"  1. Snapshot.config: SAVE_WGET={wget_enabled} (expected: False)")
+assert wget_enabled == False, f"SAVE_WGET should be False from snapshot.config, got {{wget_enabled}}"
+
+custom_snapshot = config.get('CUSTOM_SNAPSHOT_KEY')
+print(f"  1. Snapshot.config: CUSTOM_SNAPSHOT_KEY={custom_snapshot} (expected: from_snapshot_json)")
+assert custom_snapshot == 'from_snapshot_json', f"Expected from_snapshot_json, got {{custom_snapshot}}"
+
+# 2. Crawl.config
+custom_crawl = config.get('CUSTOM_CRAWL_KEY')
+print(f"  2. Crawl.config: CUSTOM_CRAWL_KEY={custom_crawl} (expected: from_crawl_json)")
+assert custom_crawl == 'from_crawl_json', f"Expected from_crawl_json, got {{custom_crawl}}"
+
+# 6. Machine.config
+custom_machine = config.get('CUSTOM_MACHINE_KEY')
+print(f"  6. Machine.config: CUSTOM_MACHINE_KEY={custom_machine} (expected: from_machine_config)")
+assert custom_machine == 'from_machine_config', f"Expected from_machine_config, got {{custom_machine}}"
+
+wget_binary = config.get('WGET_BINARY')
+print(f"  6. Machine.config: WGET_BINARY={wget_binary} (expected: /custom/machine/wget)")
+# Note: This might be overridden by environment or other sources, just check it's present
+assert wget_binary is not None, f"WGET_BINARY should be present"
+
+# Check ArchiveResults to verify plugins actually ran with correct config
+results = ArchiveResult.objects.filter(snapshot=snapshot)
+print(f"\\nArchiveResults created: {{results.count()}}")
+
+for ar in results.order_by('plugin'):
+    print(f"  {{ar.plugin}}: {{ar.status}}")
+
+# Verify SAVE_WGET=False was respected (should have no wget result)
+wget_results = results.filter(plugin='wget')
+print(f"\\nWGET results: {{wget_results.count()}} (expected: 0, disabled in snapshot.config)")
+assert wget_results.count() == 0, f"WGET should be disabled, found {{wget_results.count()}} results"
+
+# Verify SAVE_SCREENSHOT=True was respected (should have screenshot result)
+screenshot_results = results.filter(plugin='screenshot')
+print(f"SCREENSHOT results: {{screenshot_results.count()}} (expected: >0, enabled globally)")
+assert screenshot_results.count() > 0, f"SCREENSHOT should be enabled, found {{screenshot_results.count()}} results"
+
+print("\\n✓ All config sources correctly merged:")
+print("  - Snapshot.config overrides (highest priority)")
+print("  - Crawl.config values present")
+print("  - Machine.config values present")
+print("  - File config values present")
+print("✓ Config priority order verified")
+print("✓ Snapshot successfully sealed")
+"""
+        result = subprocess.run(
+            ['python', '-c', verify_script],
+            cwd=str(data_dir.parent),
+            env={
+                **os.environ,
+                'DATA_DIR': str(data_dir),
+                'USE_COLOR': 'False',
+            },
+            capture_output=True,
+            timeout=30,
+        )
+
+        print(result.stdout.decode())
+        if result.returncode != 0:
+            print("\nVerification error:")
+            print(result.stderr.decode())
+
+        assert result.returncode == 0, f"Config verification failed: {result.stderr.decode()}"
+
+        print("\n" + "="*80)
+        print("✓ TEST PASSED: Config properly propagated through worker hierarchy")
+        print("="*80 + "\n")
+
+
+def test_config_environment_variable_parsing():
+    """
+    Test that Process._build_env() correctly serializes config values,
+    and get_config() correctly parses them back from environment.
+    """
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        data_dir = Path(tmpdir) / 'test_archive'
+        data_dir.mkdir()
+
+        print(f"\n{'='*80}")
+        print(f"Test: Config Environment Variable Parsing")
+        print(f"DATA_DIR: {data_dir}")
+        print(f"{'='*80}\n")
+
+        # Initialize archive
+        result = subprocess.run(
+            ['python', '-m', 'archivebox', 'init'],
+            cwd=str(data_dir),
+            env={
+                **os.environ,
+                'DATA_DIR': str(data_dir),
+                'USE_COLOR': 'False',
+            },
+            capture_output=True,
+            timeout=60,
+        )
+        assert result.returncode == 0, f"Init failed: {result.stderr.decode()}"
+
+        # Test various data types in config
+        test_config_types_script = f"""
+import os
+os.environ['DATA_DIR'] = '{data_dir}'
+
+from archivebox.config.django import setup_django
+setup_django()
+
+from archivebox.config.configset import get_config
+from archivebox.machine.models import Process, Machine
+
+# Test get_config() with no overrides (baseline)
+config = get_config()
+print(f"Baseline config keys: {{len(config)}}")
+
+# Create a test Process with various config types
+process = Process.objects.create(
+    machine=Machine.current(),
+    process_type=Process.TypeChoices.WORKER,
+    pwd='{data_dir}',
+    cmd=['test'],
+    env={{
+        'STRING_VAL': 'hello',
+        'INT_VAL': 123,
+        'FLOAT_VAL': 45.67,
+        'BOOL_TRUE': True,
+        'BOOL_FALSE': False,
+        'LIST_VAL': ['a', 'b', 'c'],
+        'DICT_VAL': {{'key': 'value'}},
+        'NONE_VAL': None,
+    }},
+)
+
+# Test _build_env() serialization
+env = process._build_env()
+print(f"\\nSerialized environment:")
+print(f"  STRING_VAL: {{env.get('STRING_VAL')}} (type: {{type(env.get('STRING_VAL')).__name__}})")
+print(f"  INT_VAL: {{env.get('INT_VAL')}} (type: {{type(env.get('INT_VAL')).__name__}})")
+print(f"  FLOAT_VAL: {{env.get('FLOAT_VAL')}} (type: {{type(env.get('FLOAT_VAL')).__name__}})")
+print(f"  BOOL_TRUE: {{env.get('BOOL_TRUE')}} (type: {{type(env.get('BOOL_TRUE')).__name__}})")
+print(f"  BOOL_FALSE: {{env.get('BOOL_FALSE')}} (type: {{type(env.get('BOOL_FALSE')).__name__}})")
+print(f"  LIST_VAL: {{env.get('LIST_VAL')}} (type: {{type(env.get('LIST_VAL')).__name__}})")
+print(f"  DICT_VAL: {{env.get('DICT_VAL')}} (type: {{type(env.get('DICT_VAL')).__name__}})")
+print(f"  NONE_VAL: {{env.get('NONE_VAL')}} (should be None/missing)")
+
+# Verify all are strings (required by subprocess.Popen)
+assert isinstance(env.get('STRING_VAL'), str), "STRING_VAL should be str"
+assert isinstance(env.get('INT_VAL'), str), "INT_VAL should be str"
+assert isinstance(env.get('FLOAT_VAL'), str), "FLOAT_VAL should be str"
+assert isinstance(env.get('BOOL_TRUE'), str), "BOOL_TRUE should be str"
+assert isinstance(env.get('BOOL_FALSE'), str), "BOOL_FALSE should be str"
+assert isinstance(env.get('LIST_VAL'), str), "LIST_VAL should be str"
+assert isinstance(env.get('DICT_VAL'), str), "DICT_VAL should be str"
+
+print("\\n✓ All environment values correctly serialized as strings")
+
+# Now test that get_config() can parse them back
+# Simulate subprocess by setting os.environ
+import json
+for key, val in env.items():
+    if key in ['STRING_VAL', 'INT_VAL', 'FLOAT_VAL', 'BOOL_TRUE', 'BOOL_FALSE', 'LIST_VAL', 'DICT_VAL']:
+        os.environ[key] = val
+
+# Get config again - should parse from environment
+config = get_config()
+print(f"\\nParsed from environment:")
+print(f"  STRING_VAL: {{config.get('STRING_VAL')}} (type: {{type(config.get('STRING_VAL')).__name__}})")
+print(f"  INT_VAL: {{config.get('INT_VAL')}} (type: {{type(config.get('INT_VAL')).__name__}})")
+print(f"  FLOAT_VAL: {{config.get('FLOAT_VAL')}} (type: {{type(config.get('FLOAT_VAL')).__name__}})")
+print(f"  BOOL_TRUE: {{config.get('BOOL_TRUE')}} (type: {{type(config.get('BOOL_TRUE')).__name__}})")
+print(f"  BOOL_FALSE: {{config.get('BOOL_FALSE')}} (type: {{type(config.get('BOOL_FALSE')).__name__}})")
+print(f"  LIST_VAL: {{config.get('LIST_VAL')}} (type: {{type(config.get('LIST_VAL')).__name__}})")
+print(f"  DICT_VAL: {{config.get('DICT_VAL')}} (type: {{type(config.get('DICT_VAL')).__name__}})")
+
+print("\\n✓ All config values correctly parsed from environment")
+"""
+
+        result = subprocess.run(
+            ['python', '-c', test_config_types_script],
+            cwd=str(data_dir.parent),
+            env={
+                **os.environ,
+                'DATA_DIR': str(data_dir),
+                'USE_COLOR': 'False',
+            },
+            capture_output=True,
+            timeout=30,
+        )
+
+        print(result.stdout.decode())
+        if result.stderr:
+            print("Script stderr:")
+            print(result.stderr.decode())
+
+        assert result.returncode == 0, f"Type parsing test failed: {result.stderr.decode()}"
+
+        print("\n" + "="*80)
+        print("✓ TEST PASSED: Config serialization and parsing works correctly")
+        print("="*80 + "\n")
+
+
+if __name__ == '__main__':
+    # Run as standalone script
+    test_config_propagation_through_worker_hierarchy()
+    test_config_environment_variable_parsing()

+ 36 - 8
archivebox/workers/worker.py

@@ -308,8 +308,8 @@ class Worker:
             crawl = Crawl.objects.get(id=crawl_id)
 
             cmd = [sys.executable, '-m', 'archivebox', 'run', '--crawl-id', str(crawl_id)]
-            pwd = Path(crawl.OUTPUT_DIR)  # Run in crawl's output directory
-            env = get_config(scope='crawl', crawl=crawl)
+            pwd = Path(crawl.output_dir)  # Run in crawl's output directory
+            env = get_config(crawl=crawl)
 
         elif cls.name == 'snapshot':
             snapshot_id = kwargs.get('snapshot_id')
@@ -321,7 +321,7 @@ class Worker:
 
             cmd = [sys.executable, '-m', 'archivebox', 'run', '--snapshot-id', str(snapshot_id)]
             pwd = Path(snapshot.output_dir)  # Run in snapshot's output directory
-            env = get_config(scope='snapshot', snapshot=snapshot)
+            env = get_config(snapshot=snapshot)
 
         else:
             raise ValueError(f"Unknown worker type: {cls.name}")
@@ -459,6 +459,8 @@ class CrawlWorker(Worker):
         from pathlib import Path
         from archivebox.core.models import Snapshot
         from archivebox.machine.models import Process
+        import sys
+        import threading
 
         debug_log = Path('/tmp/archivebox_crawl_worker_debug.log')
 
@@ -514,7 +516,9 @@ class CrawlWorker(Worker):
             with open(debug_log, 'a') as f:
                 f.write(f'  Spawning worker for {snapshot.url} (status={snapshot.status})\n')
                 f.flush()
-            SnapshotWorker.start(parent=self.db_process, snapshot_id=str(snapshot.id))
+
+            pid = SnapshotWorker.start(parent=self.db_process, snapshot_id=str(snapshot.id))
+
             log_worker_event(
                 worker_type='CrawlWorker',
                 event=f'Spawned SnapshotWorker for {snapshot.url}',
@@ -522,6 +526,18 @@ class CrawlWorker(Worker):
                 pid=self.pid,
             )
 
+            # Pipe the SnapshotWorker's stderr to our stderr so we can see what's happening
+            # Get the Process record that was just created
+            worker_process = Process.objects.filter(pid=pid).first()
+            if worker_process:
+                # Pipe stderr in background thread so it doesn't block
+                def pipe_worker_stderr():
+                    for line in worker_process.tail_stderr(lines=0, follow=True):
+                        print(f'  [SnapshotWorker] {line}', file=sys.stderr, flush=True)
+
+                thread = threading.Thread(target=pipe_worker_stderr, daemon=True)
+                thread.start()
+
     def _is_crawl_finished(self) -> bool:
         """Check if all snapshots are sealed."""
         from pathlib import Path
@@ -626,16 +642,28 @@ class SnapshotWorker(Worker):
         """Execute all hooks sequentially."""
         from archivebox.hooks import discover_hooks, is_background_hook, extract_step
         from archivebox.core.models import ArchiveResult
+        from archivebox.config.configset import get_config
 
         self.on_startup()
 
         try:
+            # Get merged config (includes env vars passed via Process.env, snapshot.config, defaults, etc.)
+            config = get_config(snapshot=self.snapshot)
+
             # Discover all hooks for this snapshot
-            hooks = discover_hooks('Snapshot', config=self.snapshot.config)
+            hooks = discover_hooks('Snapshot', config=config)
             hooks = sorted(hooks, key=lambda h: h.name)  # Sort by name (includes step prefix)
 
+            import sys
+            print(f'[SnapshotWorker] Discovered {len(hooks)} hooks for snapshot {self.snapshot.url}', file=sys.stderr, flush=True)
+            if hooks:
+                print(f'[SnapshotWorker] First 5 hooks: {[h.name for h in hooks[:5]]}', file=sys.stderr, flush=True)
+            else:
+                print(f'[SnapshotWorker] WARNING: No hooks discovered! Config keys: {list(config.keys())[:10]}...', file=sys.stderr, flush=True)
+
             # Execute each hook sequentially
             for hook_path in hooks:
+                print(f'[SnapshotWorker] Running hook: {hook_path.name}', file=sys.stderr, flush=True)
                 hook_name = hook_path.name
                 plugin = self._extract_plugin_name(hook_name)
                 hook_step = extract_step(hook_name)
@@ -661,7 +689,7 @@ class SnapshotWorker(Worker):
                     ar.save(update_fields=['status', 'start_ts', 'modified_at'])
 
                 # Fork and run the hook
-                process = self._run_hook(hook_path, ar)
+                process = self._run_hook(hook_path, ar, config)
 
                 if is_background:
                     # Track but don't wait
@@ -698,7 +726,7 @@ class SnapshotWorker(Worker):
         finally:
             self.on_shutdown()
 
-    def _run_hook(self, hook_path: Path, ar: Any) -> Any:
+    def _run_hook(self, hook_path: Path, ar: Any, config: dict) -> Any:
         """Fork and run a hook using Process model, return Process."""
         from archivebox.hooks import run_hook
 
@@ -710,7 +738,7 @@ class SnapshotWorker(Worker):
         process = run_hook(
             script=hook_path,
             output_dir=output_dir,
-            config=self.snapshot.config,
+            config=config,
             timeout=120,
             parent=self.db_process,
             url=str(self.snapshot.url),

+ 1 - 1
bin/test_plugins.sh

@@ -179,7 +179,7 @@ if [ "$ENABLE_COVERAGE" = true ]; then
     export NODE_V8_COVERAGE="$ROOT_DIR/coverage/js"
 
     echo "Python coverage: enabled (subprocess support)"
-    echo "JavaScript coverage: enabled (NODE_V8_COVERAGE)"
+    echo "JavaScript coverage: enabled (NODE_V8_COVERAGE=$NODE_V8_COVERAGE)"
     echo ""
 fi