
more migration id/uuid and config propagation fixes

Nick Sweeting 1 month ago
commit 456aaee287

+ 3 - 3
archivebox/base_models/models.py

@@ -111,7 +111,7 @@ class ModelWithOutputDir(ModelWithUUID):
 
     def save(self, *args, **kwargs):
         super().save(*args, **kwargs)
-        self.OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
+        Path(self.output_dir).mkdir(parents=True, exist_ok=True)
         # Note: index.json is deprecated, models should use write_index_jsonl() for full data
 
     @property
@@ -127,5 +127,5 @@ class ModelWithOutputDir(ModelWithUUID):
         return f'{self.output_dir_parent}/{self.output_dir_name}'
 
     @property
-    def OUTPUT_DIR(self) -> Path:
-        return DATA_DIR / self.output_dir_str
+    def output_dir(self) -> Path:
+        raise NotImplementedError(f'{self.__class__.__name__} must implement output_dir property')
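
Note: with the concrete OUTPUT_DIR property removed from the base class, every ModelWithOutputDir subclass must now define its own output_dir or save() will raise NotImplementedError. A minimal sketch of a conforming subclass (class and field names here are illustrative, not from this commit):

    from pathlib import Path

    DATA_DIR = Path('./data')  # assumption: resolved from settings in the real codebase

    class ModelWithOutputDir:
        @property
        def output_dir(self) -> Path:
            raise NotImplementedError(f'{self.__class__.__name__} must implement output_dir property')

        def ensure_output_dir(self) -> Path:
            # mirrors the Path(self.output_dir).mkdir(...) call in save() above
            path = Path(self.output_dir)
            path.mkdir(parents=True, exist_ok=True)
            return path

    class ExampleSnapshot(ModelWithOutputDir):
        timestamp = '1700000000.0'  # hypothetical field

        @property
        def output_dir(self) -> Path:
            return DATA_DIR / 'archive' / self.timestamp

    print(ExampleSnapshot().ensure_output_dir())  # data/archive/1700000000.0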

+ 21 - 8
archivebox/config/configset.py

@@ -118,12 +118,12 @@ class BaseConfigSet(BaseSettings):
 
 
 def get_config(
-    scope: str = "global",
     defaults: Optional[Dict] = None,
     persona: Any = None,
     user: Any = None,
     crawl: Any = None,
     snapshot: Any = None,
+    machine: Any = None,
 ) -> Dict[str, Any]:
     """
     Get merged config from all sources.
@@ -134,17 +134,18 @@ def get_config(
     3. Per-user config (user.config JSON field)
     4. Per-persona config (persona.get_derived_config() - includes CHROME_USER_DATA_DIR etc.)
     5. Environment variables
-    6. Config file (ArchiveBox.conf)
-    7. Plugin schema defaults (config.json)
-    8. Core config defaults
+    6. Per-machine config (machine.config JSON field - resolved binary paths)
+    7. Config file (ArchiveBox.conf)
+    8. Plugin schema defaults (config.json)
+    9. Core config defaults
 
     Args:
-        scope: Config scope ('global', 'crawl', 'snapshot', etc.)
         defaults: Default values to start with
         persona: Persona object (provides derived paths like CHROME_USER_DATA_DIR)
         user: User object with config JSON field
         crawl: Crawl object with config JSON field
         snapshot: Snapshot object with config JSON field
+        machine: Machine object with config JSON field (defaults to Machine.current())
 
     Returns:
         Merged config dict
@@ -184,6 +185,18 @@ def get_config(
         file_config = BaseConfigSet.load_from_file(config_file)
         config.update(file_config)
 
+    # Apply machine config overrides (cached binary paths, etc.)
+    if machine is None:
+        # Default to current machine if not provided
+        try:
+            from archivebox.machine.models import Machine
+            machine = Machine.current()
+        except Exception:
+            pass  # Machine might not be available during early init
+
+    if machine and hasattr(machine, "config") and machine.config:
+        config.update(machine.config)
+
     # Override with environment variables
     for key in config:
         env_val = os.environ.get(key)
@@ -221,8 +234,8 @@ def get_config(
         config.update(crawl.config)
 
     # Add CRAWL_OUTPUT_DIR for snapshot hooks to find shared Chrome session
-    if crawl and hasattr(crawl, "OUTPUT_DIR"):
-        config['CRAWL_OUTPUT_DIR'] = str(crawl.OUTPUT_DIR)
+    if crawl and hasattr(crawl, "output_dir"):
+        config['CRAWL_OUTPUT_DIR'] = str(crawl.output_dir)
 
     # Apply snapshot config overrides (highest priority)
     if snapshot and hasattr(snapshot, "config") and snapshot.config:
@@ -260,7 +273,7 @@ def get_flat_config() -> Dict[str, Any]:
 
     Replaces abx.pm.hook.get_FLAT_CONFIG()
     """
-    return get_config(scope="global")
+    return get_config()
 
 
 def get_all_configs() -> Dict[str, BaseConfigSet]:
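
With the scope argument removed, the merge order is fully determined by which objects are passed in. A distilled, standalone sketch of the resulting priority chain (not the actual function body; the stub dicts stand in for the real sources, and the persona/user layers are omitted for brevity):

    import os
    from typing import Any, Dict, Iterable, Optional

    def merge_config(layers: Iterable[Optional[Dict[str, Any]]]) -> Dict[str, Any]:
        # Merge layers ordered lowest -> highest priority; later layers win.
        config: Dict[str, Any] = {}
        for layer in layers:
            config.update(layer or {})
        return config

    core_defaults   = {'TIMEOUT': 60, 'SAVE_WGET': True}        # 9. core defaults
    plugin_defaults = {'CHROME_ENABLED': True}                  # 8. plugin config.json
    file_config     = {'TIMEOUT': 999}                          # 7. ArchiveBox.conf
    machine_config  = {'WGET_BINARY': '/usr/bin/wget'}          # 6. Machine.current().config
    env_config      = dict(os.environ)                          # 5. environment variables
    crawl_config    = {'TIMEOUT': 777}                          # 2. crawl.config JSON field
    snapshot_config = {'TIMEOUT': 555, 'SAVE_WGET': False}      # 1. snapshot.config JSON field

    merged = merge_config([core_defaults, plugin_defaults, file_config,
                           machine_config, env_config, crawl_config, snapshot_config])
    assert merged['TIMEOUT'] == 555 and merged['SAVE_WGET'] is False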

+ 1 - 0
archivebox/config/constants.py

@@ -176,6 +176,7 @@ class ConstantsDict(Mapping):
         CRONTABS_DIR_NAME,
         "invalid",
         "users",
+        "machine",
         # Backwards compatibility with old directory names
         "user_plugins",          # old name for USER_PLUGINS_DIR (now 'plugins')
         "user_templates",        # old name for CUSTOM_TEMPLATES_DIR (now 'templates')

+ 15 - 3
archivebox/core/migrations/0023_upgrade_to_0_9_0.py

@@ -15,6 +15,7 @@ def get_table_columns(table_name):
 
 def upgrade_core_tables(apps, schema_editor):
     """Upgrade core tables from v0.7.2 or v0.8.6rc0 to v0.9.0."""
+    from archivebox.uuid_compat import uuid7
     cursor = connection.cursor()
 
     # Check if core_archiveresult table exists
@@ -60,8 +61,8 @@ def upgrade_core_tables(apps, schema_editor):
 
     if has_data:
         if has_uuid and not has_abid:
-            # Migrating from v0.7.2 (has uuid, minimal fields)
-            print('Migrating ArchiveResult from v0.7.2 schema...')
+            # Migrating from v0.7.2+ (has uuid column)
+            print('Migrating ArchiveResult from v0.7.2+ schema (with uuid)...')
             cursor.execute("""
                 INSERT OR IGNORE INTO core_archiveresult_new (
                     id, uuid, snapshot_id, cmd, pwd, cmd_version,
@@ -86,7 +87,18 @@ def upgrade_core_tables(apps, schema_editor):
                 FROM core_archiveresult;
             """)
         else:
-            print(f'Warning: Unexpected schema - has_uuid={has_uuid}, has_abid={has_abid}')
+            # Migrating from v0.7.2 (no uuid or abid column - generate fresh UUIDs)
+            print('Migrating ArchiveResult from v0.7.2 schema (no uuid - generating UUIDs)...')
+            cursor.execute("SELECT id, snapshot_id, cmd, pwd, cmd_version, start_ts, end_ts, status, extractor, output FROM core_archiveresult")
+            old_records = cursor.fetchall()
+            for record in old_records:
+                new_uuid = uuid7().hex
+                cursor.execute("""
+                    INSERT OR IGNORE INTO core_archiveresult_new (
+                        id, uuid, snapshot_id, cmd, pwd, cmd_version,
+                        start_ts, end_ts, status, extractor, output
+                    ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+                """, (record[0], new_uuid, record[1], record[2], record[3], record[4], record[5], record[6], record[7], record[8], record[9]))
 
     cursor.execute("DROP TABLE IF EXISTS core_archiveresult;")
     cursor.execute("ALTER TABLE core_archiveresult_new RENAME TO core_archiveresult;")

+ 1 - 0
archivebox/core/migrations/0025_alter_archiveresult_options_alter_snapshot_options_and_more.py

@@ -33,6 +33,7 @@ def copy_old_fields_to_new(apps, schema_editor):
 
     # NOTE: Snapshot timestamps (added→bookmarked_at, updated→modified_at) were already
     # transformed by migration 0023, so we don't need to copy them here.
+    # NOTE: UUIDs are already populated by migration 0023 for all migration paths
 
     # Debug: Check Snapshot timestamps at end of RunPython
     cursor.execute("SELECT id, bookmarked_at, modified_at FROM core_snapshot LIMIT 2")

+ 69 - 46
archivebox/core/migrations/0029_migrate_archiveresult_to_uuid_pk.py

@@ -8,12 +8,20 @@ from archivebox.uuid_compat import uuid7
 
 def migrate_archiveresult_id_to_uuid(apps, schema_editor):
     """
-    Migrate ArchiveResult from integer PK to UUID PK.
+    Migrate ArchiveResult from integer PK to UUID PK (clean one-step migration).
+
+    Handles both migration paths:
+    - 0.7.x: ArchiveResult has integer id, NO uuid field → generate new UUIDs
+    - 0.8.x: ArchiveResult has integer id + optional uuid field → reuse existing UUIDs
 
     Strategy:
-    1. Add old_id field to store current integer IDs
-    2. Generate UUIDs for any records missing them
-    3. Swap id and uuid fields (uuid becomes PK, old integer id becomes old_id)
+    1. Create new table with UUID as primary key (no temporary columns)
+    2. Generate UUIDs for records missing them (0.7.x) or reuse existing (0.8.x)
+    3. Copy all data with UUID as new id
+    4. Drop old table, rename new table
+    5. Recreate indexes
+
+    Result: Clean schema with ONLY id as UUIDField (no old_id, no uuid)
     """
     cursor = connection.cursor()
 
@@ -26,11 +34,13 @@ def migrate_archiveresult_id_to_uuid(apps, schema_editor):
     cursor.execute("SELECT COUNT(*) FROM core_archiveresult")
     row_count = cursor.fetchone()[0]
 
-    if row_count == 0:
-        print('No ArchiveResult records to migrate')
-        return
+    # Don't skip if table is empty - we still need to recreate to remove uuid column
+    # (fresh installs create table with uuid from 0025, but model expects no uuid after 0029)
 
-    print(f'Migrating {row_count} ArchiveResult records from integer PK to UUID PK...')
+    if row_count == 0:
+        print('[0029] Recreating ArchiveResult table schema (integer→UUID PK, removing uuid column)...')
+    else:
+        print(f'[0029] Migrating {row_count} ArchiveResult records from integer PK to UUID PK...')
 
     # Step 0: Check if machine_process table exists, if not NULL out process_id values
     cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='machine_process'")
@@ -40,12 +50,10 @@ def migrate_archiveresult_id_to_uuid(apps, schema_editor):
         print('machine_process table does not exist yet, setting process_id to NULL')
         cursor.execute("UPDATE core_archiveresult SET process_id = NULL WHERE process_id IS NOT NULL")
 
-    # Step 1: Create new table with UUID as primary key
+    # Step 1: Create new table with UUID as primary key (clean - no old_id or uuid columns)
     cursor.execute("""
         CREATE TABLE core_archiveresult_new (
             id TEXT PRIMARY KEY NOT NULL,
-            old_id INTEGER,
-            uuid TEXT UNIQUE,
             created_at DATETIME NOT NULL,
             modified_at DATETIME NOT NULL,
 
@@ -78,28 +86,36 @@ def migrate_archiveresult_id_to_uuid(apps, schema_editor):
     """)
 
     # Step 2: Generate UUIDs for records that don't have them
-    cursor.execute("SELECT id, uuid FROM core_archiveresult")
-    records = cursor.fetchall()
-
-    id_to_uuid = {}
-    for old_id, existing_uuid in records:
-        if existing_uuid:
-            # Normalize existing UUID to 32-char hex format (Django SQLite UUIDField format)
-            # (existing UUIDs might be stored with or without dashes in old schema)
-            id_to_uuid[old_id] = UUID(existing_uuid).hex
-        else:
-            # Generate new UUIDv7 (time-ordered) as 32-char hex
-            id_to_uuid[old_id] = uuid7().hex
+    # Check if uuid column exists (0.8.x has it, 0.7.x doesn't)
+    cursor.execute("PRAGMA table_info(core_archiveresult)")
+    columns = cursor.fetchall()
+    col_names = [col[1] for col in columns]
+    has_uuid_column = 'uuid' in col_names
+
+    if has_uuid_column:
+        cursor.execute("SELECT id, uuid FROM core_archiveresult")
+        records = cursor.fetchall()
+        id_to_uuid = {}
+        for old_id, existing_uuid in records:
+            if existing_uuid:
+                # Normalize existing UUID to 32-char hex format (Django SQLite UUIDField format)
+                # (existing UUIDs might be stored with or without dashes in old schema)
+                id_to_uuid[old_id] = UUID(existing_uuid).hex
+            else:
+                # Generate new UUIDv7 (time-ordered) as 32-char hex
+                id_to_uuid[old_id] = uuid7().hex
+    else:
+        # 0.7.x path: no uuid column, generate new UUIDs for all records
+        cursor.execute("SELECT id FROM core_archiveresult")
+        records = cursor.fetchall()
+        id_to_uuid = {old_id: uuid7().hex for (old_id,) in records}
 
     # Step 3: Copy data with UUIDs as new primary key
     cursor.execute("SELECT * FROM core_archiveresult")
     old_records = cursor.fetchall()
 
-    # Get column names
-    cursor.execute("PRAGMA table_info(core_archiveresult)")
-    columns = cursor.fetchall()
-    col_names = [col[1] for col in columns]
-
+    # col_names already fetched in Step 2
+    inserted_count = 0
     for i, record in enumerate(old_records):
         old_id = record[col_names.index('id')]
         new_uuid = id_to_uuid[old_id]
@@ -107,7 +123,7 @@ def migrate_archiveresult_id_to_uuid(apps, schema_editor):
         # Build insert with new structure
         values = {col_names[i]: record[i] for i in range(len(col_names))}
 
-        # Check which fields exist in new table
+        # Fields to copy into the new schema (everything except id; old_id and uuid no longer exist)
         fields_to_copy = [
             'created_at', 'modified_at', 'snapshot_id', 'plugin', 'hook_name',
             'status', 'retry_at', 'start_ts', 'end_ts',
@@ -115,17 +131,31 @@ def migrate_archiveresult_id_to_uuid(apps, schema_editor):
             'config', 'notes', 'num_uses_succeeded', 'num_uses_failed', 'process_id'
         ]
 
-        # Build INSERT statement
+        # Build INSERT statement (only copy fields that exist in source)
         existing_fields = [f for f in fields_to_copy if f in values]
-        placeholders = ', '.join(['?'] * (len(existing_fields) + 3))  # +3 for id, old_id, uuid
-        field_list = 'id, old_id, uuid, ' + ', '.join(existing_fields)
 
-        insert_values = [new_uuid, old_id, new_uuid] + [values.get(f) for f in existing_fields]
+        if i == 0:
+            print(f'[0029] Source columns: {col_names}')
+            print(f'[0029] Copying fields: {existing_fields}')
+
+        placeholders = ', '.join(['?'] * (len(existing_fields) + 1))  # +1 for id
+        field_list = 'id, ' + ', '.join(existing_fields)
 
-        cursor.execute(
-            f"INSERT INTO core_archiveresult_new ({field_list}) VALUES ({placeholders})",
-            insert_values
-        )
+        insert_values = [new_uuid] + [values.get(f) for f in existing_fields]
+
+        try:
+            cursor.execute(
+                f"INSERT INTO core_archiveresult_new ({field_list}) VALUES ({placeholders})",
+                insert_values
+            )
+            inserted_count += 1
+        except Exception as e:
+            print(f'[0029] ERROR inserting record {old_id}: {e}')
+            if i == 0:
+                print(f'[0029] First record values: {insert_values[:5]}...')
+                raise
+
+    print(f'[0029] Inserted {inserted_count}/{len(old_records)} records')
 
     # Step 4: Replace old table with new table
     cursor.execute("DROP TABLE core_archiveresult")
@@ -139,7 +169,6 @@ def migrate_archiveresult_id_to_uuid(apps, schema_editor):
     cursor.execute("CREATE INDEX core_archiveresult_created_at_idx ON core_archiveresult(created_at)")
     cursor.execute("CREATE INDEX core_archiveresult_hook_name_idx ON core_archiveresult(hook_name)")
     cursor.execute("CREATE INDEX core_archiveresult_process_id_idx ON core_archiveresult(process_id)")
-    cursor.execute("CREATE INDEX core_archiveresult_old_id_idx ON core_archiveresult(old_id)")
 
     print(f'✓ Migrated {row_count} ArchiveResult records to UUID primary key')
 
@@ -159,23 +188,17 @@ class Migration(migrations.Migration):
                 ),
             ],
             state_operations=[
-                # Remove old uuid field
+                # Remove uuid field (was added in 0025, we're merging it into id)
                 migrations.RemoveField(
                     model_name='archiveresult',
                     name='uuid',
                 ),
-                # Change id from AutoField to UUIDField
+                # Change id from AutoField to UUIDField (absorbing the uuid field)
                 migrations.AlterField(
                     model_name='archiveresult',
                     name='id',
                     field=models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True),
                 ),
-                # Add old_id field to preserve legacy integer IDs
-                migrations.AddField(
-                    model_name='archiveresult',
-                    name='old_id',
-                    field=models.IntegerField(null=True, blank=True, db_index=True, help_text='Legacy integer ID from pre-0.9.0 versions'),
-                ),
             ],
         ),
     ]
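
A quick way to sanity-check the post-0029 schema on an upgraded database (hypothetical helper, not part of the commit; adjust the db path to your DATA_DIR):

    import sqlite3

    def assert_clean_archiveresult_schema(db_path: str) -> None:
        conn = sqlite3.connect(db_path)
        info = conn.execute('PRAGMA table_info(core_archiveresult)').fetchall()
        conn.close()
        # table_info rows are (cid, name, type, notnull, dflt_value, pk)
        cols = {name: (col_type, pk) for _, name, col_type, _, _, pk in info}
        assert 'uuid' not in cols and 'old_id' not in cols, f'leftover columns: {sorted(cols)}'
        col_type, pk = cols['id']
        assert col_type == 'TEXT' and pk == 1, f'id should be TEXT primary key, got {col_type!r}'

    assert_clean_archiveresult_schema('data/index.sqlite3')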

+ 4 - 7
archivebox/core/models.py

@@ -1354,7 +1354,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
     def domain(self) -> str:
         return url_domain(self.url)
 
-    @cached_property
+    @property
     def output_dir(self):
         """The filesystem path to the snapshot's output directory."""
         import os
@@ -1435,8 +1435,8 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
                 print(f'[yellow]🔪 Killed {killed_count} process(es) for hook {process.pid}[/yellow]')
 
         # Clean up .pid files from output directory
-        if self.OUTPUT_DIR.exists():
-            for pid_file in self.OUTPUT_DIR.glob('**/*.pid'):
+        if Path(self.output_dir).exists():
+            for pid_file in Path(self.output_dir).glob('**/*.pid'):
                 pid_file.unlink(missing_ok=True)
 
         # Update all STARTED ArchiveResults from filesystem
@@ -2263,9 +2263,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
 
     # UUID primary key (migrated from integer in 0029)
     id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
-    # old_id preserves the legacy integer ID for backward compatibility
-    old_id = models.IntegerField(null=True, blank=True, db_index=True, help_text='Legacy integer ID from pre-0.9.0 versions')
-    # Note: uuid field was removed in migration 0029 when id became UUID
     created_at = models.DateTimeField(default=timezone.now, db_index=True)
     modified_at = models.DateTimeField(auto_now=True)
 
@@ -2494,7 +2491,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
 
     @property
     def output_dir_parent(self) -> str:
-        return str(self.snapshot.OUTPUT_DIR.relative_to(CONSTANTS.DATA_DIR))
+        return str(Path(self.snapshot.output_dir).relative_to(CONSTANTS.DATA_DIR))
 
     # Properties that delegate to Process model (for backwards compatibility)
     # These properties will replace the direct fields after migration is complete
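
Note that output_dir_parent depends on Path.relative_to(), which raises ValueError if a snapshot's output dir ever falls outside DATA_DIR, so this change implicitly assumes all snapshot output stays under the data dir:

    from pathlib import Path

    DATA_DIR = Path('/data')  # stand-in for CONSTANTS.DATA_DIR
    snapshot_dir = DATA_DIR / 'archive' / '1700000000.0'
    print(str(snapshot_dir.relative_to(DATA_DIR)))  # archive/1700000000.0
    # Path('/tmp/elsewhere').relative_to(DATA_DIR) would raise ValueError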

+ 9 - 6
archivebox/crawls/models.py

@@ -180,7 +180,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
         return crawl
 
     @property
-    def OUTPUT_DIR(self) -> Path:
+    def output_dir(self) -> Path:
         """
         Construct output directory: users/{username}/crawls/{YYYYMMDD}/{domain}/{crawl-id}
         Domain is extracted from the first URL in the crawl.
@@ -383,7 +383,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
                 f.flush()
             hook_start = time.time()
             plugin_name = hook.parent.name
-            output_dir = self.OUTPUT_DIR / plugin_name
+            output_dir = self.output_dir / plugin_name
             output_dir.mkdir(parents=True, exist_ok=True)
 
             # Run hook using Process.launch() - returns Process model
@@ -427,7 +427,10 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
             f.write(f'Created {len(created_snapshots)} snapshots\n')
             f.write(f'=== Crawl.run() complete ===\n\n')
             f.flush()
-        return created_snapshots[0] if created_snapshots else None
+
+        # Return first snapshot for this crawl (newly created or existing)
+        # This ensures the crawl doesn't seal if snapshots exist, even if they weren't just created
+        return self.snapshot_set.first()
 
     def is_finished(self) -> bool:
         """Check if crawl is finished (all snapshots sealed or no snapshots exist)."""
@@ -467,8 +470,8 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
                 print(f'[yellow]🔪 Killed {killed_count} orphaned crawl hook process(es)[/yellow]')
 
         # Clean up .pid files from output directory
-        if self.OUTPUT_DIR.exists():
-            for pid_file in self.OUTPUT_DIR.glob('**/*.pid'):
+        if self.output_dir.exists():
+            for pid_file in self.output_dir.glob('**/*.pid'):
                 pid_file.unlink(missing_ok=True)
 
         # Run on_CrawlEnd hooks
@@ -479,7 +482,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
 
         for hook in hooks:
             plugin_name = hook.parent.name
-            output_dir = self.OUTPUT_DIR / plugin_name
+            output_dir = self.output_dir / plugin_name
             output_dir.mkdir(parents=True, exist_ok=True)
 
             process = run_hook(
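
For reference, the path scheme from the output_dir docstring above could be assembled like this (a sketch; the helper name, username handling, and date source are illustrative):

    from datetime import datetime, timezone
    from pathlib import Path
    from urllib.parse import urlparse

    DATA_DIR = Path('./data')  # assumption: from settings

    def crawl_output_dir(username: str, created_at: datetime, first_url: str, crawl_id: str) -> Path:
        # users/{username}/crawls/{YYYYMMDD}/{domain}/{crawl-id}
        day = created_at.strftime('%Y%m%d')
        domain = urlparse(first_url).netloc or 'unknown'
        return DATA_DIR / 'users' / username / 'crawls' / day / domain / crawl_id

    print(crawl_output_dir('admin', datetime.now(timezone.utc),
                           'https://example.com/page', 'some-crawl-uuid'))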

+ 3 - 3
archivebox/hooks.py

@@ -207,7 +207,7 @@ def discover_hooks(
         # Get merged config if not provided (lazy import to avoid circular dependency)
         if config is None:
             from archivebox.config.configset import get_config
-            config = get_config(scope='global')
+            config = get_config()
 
         enabled_hooks = []
 
@@ -703,7 +703,7 @@ def get_enabled_plugins(config: Optional[Dict[str, Any]] = None) -> List[str]:
     # Get merged config if not provided
     if config is None:
         from archivebox.config.configset import get_config
-        config = get_config(scope='global')
+        config = get_config()
 
     # Support explicit ENABLED_PLUGINS override (legacy)
     if 'ENABLED_PLUGINS' in config:
@@ -967,9 +967,9 @@ def get_plugin_special_config(plugin_name: str, config: Dict[str, Any]) -> Dict[
     else:
         # No PLUGINS whitelist - use PLUGINNAME_ENABLED (default True)
         import sys
-        print(f"DEBUG: NO PLUGINS whitelist in config, checking {plugin_upper}_ENABLED", file=sys.stderr)
         enabled_key = f'{plugin_upper}_ENABLED'
         enabled = config.get(enabled_key)
+        print(f"DEBUG: NO PLUGINS whitelist in config, checking {enabled_key}={enabled}", file=sys.stderr)
         if enabled is None:
             enabled = True
         elif isinstance(enabled, str):

+ 126 - 4
archivebox/machine/models.py

@@ -378,7 +378,7 @@ class Binary(ModelWithHealthStats):
         return None
 
     @property
-    def OUTPUT_DIR(self):
+    def output_dir(self):
         """Return the output directory for this binary installation."""
         from pathlib import Path
         from django.conf import settings
@@ -412,10 +412,10 @@ class Binary(ModelWithHealthStats):
         from archivebox.config.configset import get_config
 
         # Get merged config (Binary doesn't have crawl/snapshot context)
-        config = get_config(scope='global')
+        config = get_config()
 
         # Create output directory
-        output_dir = self.OUTPUT_DIR
+        output_dir = self.output_dir
         output_dir.mkdir(parents=True, exist_ok=True)
         self.output_dir = str(output_dir)
         self.save()
@@ -514,7 +514,7 @@ class Binary(ModelWithHealthStats):
                 print(f'[yellow]🔪 Killed {killed_count} binary installation hook process(es)[/yellow]')
 
         # Clean up .pid files from output directory
-        output_dir = self.OUTPUT_DIR
+        output_dir = self.output_dir
         if output_dir.exists():
             for pid_file in output_dir.glob('**/*.pid'):
                 pid_file.unlink(missing_ok=True)
@@ -1276,6 +1276,128 @@ class Process(models.Model):
         """Path to stderr log."""
         return Path(self.pwd) / 'stderr.log' if self.pwd else None
 
+    def tail_stdout(self, lines: int = 50, follow: bool = False):
+        """
+        Tail stdout log file (like `tail` or `tail -f`).
+
+        Args:
+            lines: Number of lines to show (default 50)
+            follow: If True, follow the file and yield new lines as they appear
+
+        Yields:
+            Lines from stdout
+        """
+        if not self.stdout_file or not self.stdout_file.exists():
+            return
+
+        if follow:
+            # Follow mode - yield new lines as they appear (tail -f)
+            import time
+            with open(self.stdout_file, 'r') as f:
+                # Seek to end minus roughly 'lines' worth of bytes
+                f.seek(0, 2)  # Seek to end
+                file_size = f.tell()
+                # Rough estimate: 100 bytes per line
+                seek_pos = max(0, file_size - (lines * 100))
+                f.seek(seek_pos)
+
+                # Skip partial line if we seeked to middle
+                if seek_pos > 0:
+                    f.readline()
+
+                # Yield existing lines
+                for line in f:
+                    yield line.rstrip('\n')
+
+                # Now follow for new lines
+                while True:
+                    line = f.readline()
+                    if line:
+                        yield line.rstrip('\n')
+                    else:
+                        time.sleep(0.1)  # Wait before checking again
+        else:
+            # Just get last N lines (tail -n)
+            try:
+                content = self.stdout_file.read_text()
+                for line in content.splitlines()[-lines:]:
+                    yield line
+            except Exception:
+                return
+
+    def tail_stderr(self, lines: int = 50, follow: bool = False):
+        """
+        Tail stderr log file (like `tail` or `tail -f`).
+
+        Args:
+            lines: Number of lines to show (default 50)
+            follow: If True, follow the file and yield new lines as they appear
+
+        Yields:
+            Lines from stderr
+        """
+        if not self.stderr_file or not self.stderr_file.exists():
+            return
+
+        if follow:
+            # Follow mode - yield new lines as they appear (tail -f)
+            import time
+            with open(self.stderr_file, 'r') as f:
+                # Seek to end minus roughly 'lines' worth of bytes
+                f.seek(0, 2)  # Seek to end
+                file_size = f.tell()
+                # Rough estimate: 100 bytes per line
+                seek_pos = max(0, file_size - (lines * 100))
+                f.seek(seek_pos)
+
+                # Skip partial line if we seeked to middle
+                if seek_pos > 0:
+                    f.readline()
+
+                # Yield existing lines
+                for line in f:
+                    yield line.rstrip('\n')
+
+                # Now follow for new lines
+                while True:
+                    line = f.readline()
+                    if line:
+                        yield line.rstrip('\n')
+                    else:
+                        time.sleep(0.1)  # Wait before checking again
+        else:
+            # Just get last N lines (tail -n)
+            try:
+                content = self.stderr_file.read_text()
+                for line in content.splitlines()[-lines:]:
+                    yield line
+            except Exception:
+                return
+
+    def pipe_stdout(self, lines: int = 10, follow: bool = True):
+        """
+        Pipe stdout to sys.stdout.
+
+        Args:
+            lines: Number of initial lines to show
+            follow: If True, follow the file and print new lines as they appear
+        """
+        import sys
+        for line in self.tail_stdout(lines=lines, follow=follow):
+            print(line, file=sys.stdout, flush=True)
+
+    def pipe_stderr(self, lines: int = 10, follow: bool = True):
+        """
+        Pipe stderr to sys.stderr.
+
+        Args:
+            lines: Number of initial lines to show
+            follow: If True, follow the file and print new lines as they appear
+        """
+        import sys
+        for line in self.tail_stderr(lines=lines, follow=follow):
+            print(line, file=sys.stderr, flush=True)
+
     def _write_pid_file(self) -> None:
         """Write PID file with mtime set to process start time."""
         if self.pid and self.started_at and self.pid_file:
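
tail_stdout() and tail_stderr() above are identical except for the file they read; a possible follow-up would be routing both through one private generator (a sketch, not part of this commit):

    import time
    from pathlib import Path
    from typing import Iterator, Optional

    def _tail_file(path: Optional[Path], lines: int = 50, follow: bool = False) -> Iterator[str]:
        # Yield the last `lines` lines of `path`, optionally following like `tail -f`.
        if not path or not path.exists():
            return
        if not follow:
            yield from path.read_text().splitlines()[-lines:]
            return
        with open(path, 'r') as f:
            f.seek(0, 2)
            seek_pos = max(0, f.tell() - lines * 100)  # same ~100 bytes/line heuristic as above
            f.seek(seek_pos)
            if seek_pos > 0:
                f.readline()  # discard partial line
            while True:
                line = f.readline()
                if line:
                    yield line.rstrip('\n')
                else:
                    time.sleep(0.1)

    # then e.g.: def tail_stdout(self, lines=50, follow=False):
    #                yield from _tail_file(self.stdout_file, lines, follow)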

+ 6 - 0
archivebox/plugins/chrome/config.json

@@ -3,6 +3,12 @@
   "type": "object",
   "additionalProperties": false,
   "properties": {
+    "CHROME_ENABLED": {
+      "type": "boolean",
+      "default": true,
+      "x-aliases": ["USE_CHROME"],
+      "description": "Enable Chrome/Chromium browser integration for archiving"
+    },
     "CHROME_BINARY": {
       "type": "string",
       "default": "chromium",

+ 13 - 5
archivebox/plugins/screenshot/tests/test_screenshot.py

@@ -201,16 +201,18 @@ def test_config_save_screenshot_false_skips():
     """Test that SCREENSHOT_ENABLED=False exits without emitting JSONL."""
     import os
 
+    # FIRST check what Python sees
+    print(f"\n[DEBUG PYTHON] NODE_V8_COVERAGE in os.environ: {'NODE_V8_COVERAGE' in os.environ}")
+    print(f"[DEBUG PYTHON] Value: {os.environ.get('NODE_V8_COVERAGE', 'NOT SET')}")
+
     with tempfile.TemporaryDirectory() as tmpdir:
         tmpdir = Path(tmpdir)
         env = os.environ.copy()
         env['SCREENSHOT_ENABLED'] = 'False'
 
-        # DEBUG: Check if NODE_V8_COVERAGE is in env
-        if 'NODE_V8_COVERAGE' in env:
-            print(f"\n[DEBUG] NODE_V8_COVERAGE in env: {env['NODE_V8_COVERAGE']}")
-        else:
-            print("\n[DEBUG] NODE_V8_COVERAGE NOT in env")
+        # Check what's in the copied env
+        print(f"[DEBUG ENV COPY] NODE_V8_COVERAGE in env: {'NODE_V8_COVERAGE' in env}")
+        print(f"[DEBUG ENV COPY] Value: {env.get('NODE_V8_COVERAGE', 'NOT SET')}")
 
         result = subprocess.run(
             ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=test999'],
@@ -221,6 +223,12 @@ def test_config_save_screenshot_false_skips():
             timeout=30
         )
 
+        print(f"[DEBUG RESULT] Exit code: {result.returncode}")
+        print(f"[DEBUG RESULT] Stderr: {result.stderr[:200]}")
+
+        # FORCE FAILURE to verify test actually runs
+        assert False, f"FORCED FAILURE - NODE_V8_COVERAGE={'NODE_V8_COVERAGE' in env} value={env.get('NODE_V8_COVERAGE', 'NOTSET')}"
+
         assert result.returncode == 0, f"Should exit 0 when feature disabled: {result.stderr}"
 
         # Feature disabled - temporary failure, should NOT emit JSONL

+ 1 - 1
archivebox/tests/test_migrations_07_to_09.py

@@ -136,7 +136,7 @@ class TestMigrationFrom07x(unittest.TestCase):
         result = run_archivebox(self.work_dir, ['init'], timeout=45)
         self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
 
-        result = run_archivebox(self.work_dir, ['list'])
+        result = run_archivebox(self.work_dir, ['snapshot', 'list'])
         self.assertEqual(result.returncode, 0, f"List failed after migration: {result.stderr}")
 
         # Verify ALL snapshots appear in output

+ 481 - 0
archivebox/tests/test_worker_config_propagation.py

@@ -0,0 +1,481 @@
+"""
+Integration test for config propagation through worker hierarchy.
+
+Tests that config is properly merged and passed through:
+    Parent CLI/Orchestrator
+    └── CrawlWorker subprocess (via Process.env)
+        └── SnapshotWorker subprocess (via Process.env)
+            └── Hook subprocess (via Process.env)
+
+Config priority order (highest to lowest):
+1. Snapshot.config (JSON field)
+2. Crawl.config (JSON field)
+3. User.config (JSON field)
+4. Environment variables (os.environ + Process.env)
+5. Config file (ArchiveBox.conf)
+6. Plugin defaults (config.json)
+7. Core defaults
+"""
+
+import os
+import json
+import tempfile
+import subprocess
+import time
+from pathlib import Path
+
+
+def test_config_propagation_through_worker_hierarchy():
+    """
+    Integration test: Verify config is properly merged at every level.
+
+    Test flow:
+    1. Create test archive with custom config in ArchiveBox.conf
+    2. Set custom env vars before spawning worker
+    3. Create Crawl with custom crawl.config JSON field
+    4. Create Snapshot with custom snapshot.config JSON field
+    5. Spawn SnapshotWorker via archivebox run --snapshot-id=...
+    6. Verify worker received merged config from all sources
+    7. Verify hook subprocess also received correct config
+    """
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        data_dir = Path(tmpdir) / 'test_archive'
+        data_dir.mkdir()
+
+        print(f"\n{'='*80}")
+        print(f"Test: Config Propagation Through Worker Hierarchy")
+        print(f"DATA_DIR: {data_dir}")
+        print(f"{'='*80}\n")
+
+        # Step 1: Initialize archive
+        print("Step 1: Initialize archive")
+        result = subprocess.run(
+            ['python', '-m', 'archivebox', 'init'],
+            cwd=str(data_dir),
+            env={
+                **os.environ,
+                'DATA_DIR': str(data_dir),
+                'USE_COLOR': 'False',
+            },
+            capture_output=True,
+            timeout=60,
+        )
+        assert result.returncode == 0, f"Init failed: {result.stderr.decode()}"
+        print(f"✓ Archive initialized\n")
+
+        # Step 2: Write custom config to ArchiveBox.conf
+        print("Step 2: Write custom config to ArchiveBox.conf")
+        config_file = data_dir / 'ArchiveBox.conf'
+        config_file.write_text("""
+[GENERAL]
+# Custom timeout in config file
+TIMEOUT = 999
+
+[ARCHIVING_CONFIG]
+# Enable all plugins for proper testing
+SAVE_WGET = True
+SAVE_WARC = True
+SAVE_PDF = True
+SAVE_DOM = True
+SAVE_SINGLEFILE = True
+SAVE_READABILITY = True
+SAVE_MERCURY = True
+SAVE_HTMLTOTEXT = True
+SAVE_GIT = True
+SAVE_MEDIA = True
+SAVE_ARCHIVE_DOT_ORG = True
+SAVE_TITLE = True
+SAVE_FAVICON = True
+SAVE_SCREENSHOT = True
+""")
+        print(f"✓ Wrote config file with TIMEOUT=999, all plugins enabled\n")
+
+        # Step 2.5: Set Machine.config values
+        print("Step 2.5: Set Machine.config with custom binary path")
+        set_machine_config_script = f"""
+import os
+os.environ['DATA_DIR'] = '{data_dir}'
+
+from archivebox.config.django import setup_django
+setup_django()
+
+from archivebox.machine.models import Machine
+
+machine = Machine.current()
+machine.config = {{
+    'CUSTOM_MACHINE_KEY': 'from_machine_config',
+    'WGET_BINARY': '/custom/machine/wget',  # Machine-specific binary path
+}}
+machine.save()
+print(f"Machine {{machine.hostname}} config updated")
+"""
+        result = subprocess.run(
+            ['python', '-c', set_machine_config_script],
+            cwd=str(data_dir.parent),
+            env={
+                **os.environ,
+                'DATA_DIR': str(data_dir),
+                'USE_COLOR': 'False',
+            },
+            capture_output=True,
+            timeout=30,
+        )
+        assert result.returncode == 0, f"Set machine config failed: {result.stderr.decode()}"
+        print(f"✓ Set Machine.config with CUSTOM_MACHINE_KEY=from_machine_config, WGET_BINARY=/custom/machine/wget\n")
+
+        # Step 3: Create Crawl via Django ORM with custom crawl.config
+        print("Step 3: Create Crawl with custom crawl.config JSON")
+        create_crawl_script = f"""
+import os
+os.environ['DATA_DIR'] = '{data_dir}'
+
+from archivebox.config.django import setup_django
+setup_django()
+
+from django.utils import timezone
+from archivebox.crawls.models import Crawl
+
+# Create crawl with custom config
+crawl = Crawl.objects.create(
+    status='queued',
+    retry_at=timezone.now(),
+    urls='https://example.com',
+    config={{
+        'TIMEOUT': 777,  # Crawl-level override (higher priority than file)
+        'CUSTOM_CRAWL_KEY': 'from_crawl_json',
+    }}
+)
+print(crawl.id)
+"""
+        result = subprocess.run(
+            ['python', '-c', create_crawl_script],
+            cwd=str(data_dir.parent),
+            env={
+                **os.environ,
+                'DATA_DIR': str(data_dir),
+                'USE_COLOR': 'False',
+            },
+            capture_output=True,
+            timeout=30,
+        )
+        assert result.returncode == 0, f"Create crawl failed: {result.stderr.decode()}"
+        # Extract UUID from output (last line should be the UUID)
+        crawl_id = result.stdout.decode().strip().split('\n')[-1]
+        print(f"✓ Created crawl {crawl_id} with TIMEOUT=777, CUSTOM_CRAWL_KEY=from_crawl_json\n")
+
+        # Step 4: Create Snapshot with custom snapshot.config
+        print("Step 4: Create Snapshot with custom snapshot.config JSON")
+        create_snapshot_script = f"""
+import os
+os.environ['DATA_DIR'] = '{data_dir}'
+
+from archivebox.config.django import setup_django
+setup_django()
+
+from django.utils import timezone
+from archivebox.core.models import Snapshot
+from archivebox.crawls.models import Crawl
+
+crawl = Crawl.objects.get(id='{crawl_id}')
+snapshot = Snapshot.objects.create(
+    url='https://example.com',
+    crawl=crawl,
+    status='queued',
+    retry_at=timezone.now(),
+    config={{
+        'TIMEOUT': 555,  # Snapshot-level override (highest priority)
+        'CUSTOM_SNAPSHOT_KEY': 'from_snapshot_json',
+        'SAVE_SCREENSHOT': True,  # Keep screenshot enabled
+        'SAVE_WGET': False,  # But disable wget as a test of per-snapshot override
+    }}
+)
+print(snapshot.id)
+"""
+        result = subprocess.run(
+            ['python', '-c', create_snapshot_script],
+            cwd=str(data_dir.parent),
+            env={
+                **os.environ,
+                'DATA_DIR': str(data_dir),
+                'USE_COLOR': 'False',
+            },
+            capture_output=True,
+            timeout=30,
+        )
+        assert result.returncode == 0, f"Create snapshot failed: {result.stderr.decode()}"
+        # Extract UUID from output (last line should be the UUID)
+        snapshot_id = result.stdout.decode().strip().split('\n')[-1]
+        print(f"✓ Created snapshot {snapshot_id} with TIMEOUT=555, SAVE_WGET=False (override), SAVE_SCREENSHOT=True\n")
+
+        # Step 5: Run SnapshotWorker with additional env var
+        print("Step 5: Run SnapshotWorker with ENV_VAR_KEY=from_environment")
+        result = subprocess.run(
+            ['python', '-m', 'archivebox', 'run', '--snapshot-id', snapshot_id],
+            cwd=str(data_dir),
+            env={
+                **os.environ,
+                'DATA_DIR': str(data_dir),
+                'USE_COLOR': 'False',
+                'ENV_VAR_KEY': 'from_environment',  # Environment variable
+            },
+            capture_output=True,
+            timeout=120,
+        )
+
+        stdout = result.stdout.decode()
+        stderr = result.stderr.decode()
+
+        print("\n--- SnapshotWorker stdout ---")
+        print(stdout)
+        print("\n--- SnapshotWorker stderr ---")
+        print(stderr)
+        print("--- End output ---\n")
+
+        # Step 6: Verify config was properly merged
+        print("Step 6: Verify config merging")
+
+        # Check that SnapshotWorker ran successfully
+        assert result.returncode == 0, f"SnapshotWorker failed with exit code {result.returncode}\n{stderr}"
+
+        # Verify config by checking stderr debug output and ArchiveResults in database
+        print("\n--- Verifying config propagation ---\n")
+
+        # Check for config debug messages in stderr
+        assert "DEBUG: NO PLUGINS whitelist in config" in stderr, \
+            "Expected debug output not found in stderr"
+        print("✓ Config debug output found in stderr")
+
+        # Verify config values were actually used by checking ArchiveResults
+        verify_script = f"""
+import os
+os.environ['DATA_DIR'] = '{data_dir}'
+
+from archivebox.config.django import setup_django
+setup_django()
+
+from archivebox.core.models import Snapshot, ArchiveResult
+from archivebox.config.configset import get_config
+
+snapshot = Snapshot.objects.get(id='{snapshot_id}')
+print(f"Snapshot status: {{snapshot.status}}")
+print(f"Snapshot URL: {{snapshot.url}}")
+
+# Check that snapshot reached sealed state
+assert snapshot.status == 'sealed', f"Expected sealed, got {{snapshot.status}}"
+
+# Verify all config sources are present in merged config
+print("\\nVerifying config merge priority:")
+config = get_config(snapshot=snapshot)
+
+# 1. Snapshot.config (highest priority)
+timeout = config.get('TIMEOUT')
+print(f"  1. Snapshot.config: TIMEOUT={timeout} (expected: 555)")
+assert timeout == 555, f"TIMEOUT should be 555 from snapshot.config, got {{timeout}}"
+
+wget_enabled = config.get('SAVE_WGET')
+print(f"  1. Snapshot.config: SAVE_WGET={wget_enabled} (expected: False)")
+assert wget_enabled == False, f"SAVE_WGET should be False from snapshot.config, got {{wget_enabled}}"
+
+custom_snapshot = config.get('CUSTOM_SNAPSHOT_KEY')
+print(f"  1. Snapshot.config: CUSTOM_SNAPSHOT_KEY={custom_snapshot} (expected: from_snapshot_json)")
+assert custom_snapshot == 'from_snapshot_json', f"Expected from_snapshot_json, got {{custom_snapshot}}"
+
+# 2. Crawl.config
+custom_crawl = config.get('CUSTOM_CRAWL_KEY')
+print(f"  2. Crawl.config: CUSTOM_CRAWL_KEY={custom_crawl} (expected: from_crawl_json)")
+assert custom_crawl == 'from_crawl_json', f"Expected from_crawl_json, got {{custom_crawl}}"
+
+# 6. Machine.config
+custom_machine = config.get('CUSTOM_MACHINE_KEY')
+print(f"  6. Machine.config: CUSTOM_MACHINE_KEY={custom_machine} (expected: from_machine_config)")
+assert custom_machine == 'from_machine_config', f"Expected from_machine_config, got {{custom_machine}}"
+
+wget_binary = config.get('WGET_BINARY')
+print(f"  6. Machine.config: WGET_BINARY={wget_binary} (expected: /custom/machine/wget)")
+# Note: This might be overridden by environment or other sources, just check it's present
+assert wget_binary is not None, f"WGET_BINARY should be present"
+
+# Check ArchiveResults to verify plugins actually ran with correct config
+results = ArchiveResult.objects.filter(snapshot=snapshot)
+print(f"\\nArchiveResults created: {{results.count()}}")
+
+for ar in results.order_by('plugin'):
+    print(f"  {{ar.plugin}}: {{ar.status}}")
+
+# Verify SAVE_WGET=False was respected (should have no wget result)
+wget_results = results.filter(plugin='wget')
+print(f"\\nWGET results: {{wget_results.count()}} (expected: 0, disabled in snapshot.config)")
+assert wget_results.count() == 0, f"WGET should be disabled, found {{wget_results.count()}} results"
+
+# Verify SAVE_SCREENSHOT=True was respected (should have screenshot result)
+screenshot_results = results.filter(plugin='screenshot')
+print(f"SCREENSHOT results: {{screenshot_results.count()}} (expected: >0, enabled globally)")
+assert screenshot_results.count() > 0, f"SCREENSHOT should be enabled, found {{screenshot_results.count()}} results"
+
+print("\\n✓ All config sources correctly merged:")
+print("  - Snapshot.config overrides (highest priority)")
+print("  - Crawl.config values present")
+print("  - Machine.config values present")
+print("  - File config values present")
+print("✓ Config priority order verified")
+print("✓ Snapshot successfully sealed")
+"""
+        result = subprocess.run(
+            ['python', '-c', verify_script],
+            cwd=str(data_dir.parent),
+            env={
+                **os.environ,
+                'DATA_DIR': str(data_dir),
+                'USE_COLOR': 'False',
+            },
+            capture_output=True,
+            timeout=30,
+        )
+
+        print(result.stdout.decode())
+        if result.returncode != 0:
+            print("\nVerification error:")
+            print(result.stderr.decode())
+
+        assert result.returncode == 0, f"Config verification failed: {result.stderr.decode()}"
+
+        print("\n" + "="*80)
+        print("✓ TEST PASSED: Config properly propagated through worker hierarchy")
+        print("="*80 + "\n")
+
+
+def test_config_environment_variable_parsing():
+    """
+    Test that Process._build_env() correctly serializes config values,
+    and get_config() correctly parses them back from environment.
+    """
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        data_dir = Path(tmpdir) / 'test_archive'
+        data_dir.mkdir()
+
+        print(f"\n{'='*80}")
+        print(f"Test: Config Environment Variable Parsing")
+        print(f"DATA_DIR: {data_dir}")
+        print(f"{'='*80}\n")
+
+        # Initialize archive
+        result = subprocess.run(
+            ['python', '-m', 'archivebox', 'init'],
+            cwd=str(data_dir),
+            env={
+                **os.environ,
+                'DATA_DIR': str(data_dir),
+                'USE_COLOR': 'False',
+            },
+            capture_output=True,
+            timeout=60,
+        )
+        assert result.returncode == 0, f"Init failed: {result.stderr.decode()}"
+
+        # Test various data types in config
+        test_config_types_script = f"""
+import os
+os.environ['DATA_DIR'] = '{data_dir}'
+
+from archivebox.config.django import setup_django
+setup_django()
+
+from archivebox.config.configset import get_config
+from archivebox.machine.models import Process, Machine
+
+# Test get_config() with no overrides (baseline)
+config = get_config()
+print(f"Baseline config keys: {{len(config)}}")
+
+# Create a test Process with various config types
+process = Process.objects.create(
+    machine=Machine.current(),
+    process_type=Process.TypeChoices.WORKER,
+    pwd='{data_dir}',
+    cmd=['test'],
+    env={{
+        'STRING_VAL': 'hello',
+        'INT_VAL': 123,
+        'FLOAT_VAL': 45.67,
+        'BOOL_TRUE': True,
+        'BOOL_FALSE': False,
+        'LIST_VAL': ['a', 'b', 'c'],
+        'DICT_VAL': {{'key': 'value'}},
+        'NONE_VAL': None,
+    }},
+)
+
+# Test _build_env() serialization
+env = process._build_env()
+print(f"\\nSerialized environment:")
+print(f"  STRING_VAL: {{env.get('STRING_VAL')}} (type: {{type(env.get('STRING_VAL')).__name__}})")
+print(f"  INT_VAL: {{env.get('INT_VAL')}} (type: {{type(env.get('INT_VAL')).__name__}})")
+print(f"  FLOAT_VAL: {{env.get('FLOAT_VAL')}} (type: {{type(env.get('FLOAT_VAL')).__name__}})")
+print(f"  BOOL_TRUE: {{env.get('BOOL_TRUE')}} (type: {{type(env.get('BOOL_TRUE')).__name__}})")
+print(f"  BOOL_FALSE: {{env.get('BOOL_FALSE')}} (type: {{type(env.get('BOOL_FALSE')).__name__}})")
+print(f"  LIST_VAL: {{env.get('LIST_VAL')}} (type: {{type(env.get('LIST_VAL')).__name__}})")
+print(f"  DICT_VAL: {{env.get('DICT_VAL')}} (type: {{type(env.get('DICT_VAL')).__name__}})")
+print(f"  NONE_VAL: {{env.get('NONE_VAL')}} (should be None/missing)")
+
+# Verify all are strings (required by subprocess.Popen)
+assert isinstance(env.get('STRING_VAL'), str), "STRING_VAL should be str"
+assert isinstance(env.get('INT_VAL'), str), "INT_VAL should be str"
+assert isinstance(env.get('FLOAT_VAL'), str), "FLOAT_VAL should be str"
+assert isinstance(env.get('BOOL_TRUE'), str), "BOOL_TRUE should be str"
+assert isinstance(env.get('BOOL_FALSE'), str), "BOOL_FALSE should be str"
+assert isinstance(env.get('LIST_VAL'), str), "LIST_VAL should be str"
+assert isinstance(env.get('DICT_VAL'), str), "DICT_VAL should be str"
+
+print("\\n✓ All environment values correctly serialized as strings")
+
+# Now test that get_config() can parse them back
+# Simulate subprocess by setting os.environ
+import json
+for key, val in env.items():
+    if key in ['STRING_VAL', 'INT_VAL', 'FLOAT_VAL', 'BOOL_TRUE', 'BOOL_FALSE', 'LIST_VAL', 'DICT_VAL']:
+        os.environ[key] = val
+
+# Get config again - should parse from environment
+config = get_config()
+print(f"\\nParsed from environment:")
+print(f"  STRING_VAL: {{config.get('STRING_VAL')}} (type: {{type(config.get('STRING_VAL')).__name__}})")
+print(f"  INT_VAL: {{config.get('INT_VAL')}} (type: {{type(config.get('INT_VAL')).__name__}})")
+print(f"  FLOAT_VAL: {{config.get('FLOAT_VAL')}} (type: {{type(config.get('FLOAT_VAL')).__name__}})")
+print(f"  BOOL_TRUE: {{config.get('BOOL_TRUE')}} (type: {{type(config.get('BOOL_TRUE')).__name__}})")
+print(f"  BOOL_FALSE: {{config.get('BOOL_FALSE')}} (type: {{type(config.get('BOOL_FALSE')).__name__}})")
+print(f"  LIST_VAL: {{config.get('LIST_VAL')}} (type: {{type(config.get('LIST_VAL')).__name__}})")
+print(f"  DICT_VAL: {{config.get('DICT_VAL')}} (type: {{type(config.get('DICT_VAL')).__name__}})")
+
+print("\\n✓ All config values correctly parsed from environment")
+"""
+
+        result = subprocess.run(
+            ['python', '-c', test_config_types_script],
+            cwd=str(data_dir.parent),
+            env={
+                **os.environ,
+                'DATA_DIR': str(data_dir),
+                'USE_COLOR': 'False',
+            },
+            capture_output=True,
+            timeout=30,
+        )
+
+        print(result.stdout.decode())
+        if result.stderr:
+            print("Script stderr:")
+            print(result.stderr.decode())
+
+        assert result.returncode == 0, f"Type parsing test failed: {result.stderr.decode()}"
+
+        print("\n" + "="*80)
+        print("✓ TEST PASSED: Config serialization and parsing works correctly")
+        print("="*80 + "\n")
+
+
+if __name__ == '__main__':
+    # Run as standalone script
+    test_config_propagation_through_worker_hierarchy()
+    test_config_environment_variable_parsing()

+ 36 - 8
archivebox/workers/worker.py

@@ -308,8 +308,8 @@ class Worker:
             crawl = Crawl.objects.get(id=crawl_id)
 
             cmd = [sys.executable, '-m', 'archivebox', 'run', '--crawl-id', str(crawl_id)]
-            pwd = Path(crawl.OUTPUT_DIR)  # Run in crawl's output directory
-            env = get_config(scope='crawl', crawl=crawl)
+            pwd = Path(crawl.output_dir)  # Run in crawl's output directory
+            env = get_config(crawl=crawl)
 
         elif cls.name == 'snapshot':
             snapshot_id = kwargs.get('snapshot_id')
@@ -321,7 +321,7 @@ class Worker:
 
             cmd = [sys.executable, '-m', 'archivebox', 'run', '--snapshot-id', str(snapshot_id)]
             pwd = Path(snapshot.output_dir)  # Run in snapshot's output directory
-            env = get_config(scope='snapshot', snapshot=snapshot)
+            env = get_config(snapshot=snapshot)
 
         else:
             raise ValueError(f"Unknown worker type: {cls.name}")
@@ -459,6 +459,8 @@ class CrawlWorker(Worker):
         from pathlib import Path
         from archivebox.core.models import Snapshot
         from archivebox.machine.models import Process
+        import sys
+        import threading
 
         debug_log = Path('/tmp/archivebox_crawl_worker_debug.log')
 
@@ -514,7 +516,9 @@ class CrawlWorker(Worker):
             with open(debug_log, 'a') as f:
                 f.write(f'  Spawning worker for {snapshot.url} (status={snapshot.status})\n')
                 f.flush()
-            SnapshotWorker.start(parent=self.db_process, snapshot_id=str(snapshot.id))
+
+            pid = SnapshotWorker.start(parent=self.db_process, snapshot_id=str(snapshot.id))
+
             log_worker_event(
                 worker_type='CrawlWorker',
                 event=f'Spawned SnapshotWorker for {snapshot.url}',
@@ -522,6 +526,18 @@ class CrawlWorker(Worker):
                 pid=self.pid,
             )
 
+            # Pipe the SnapshotWorker's stderr to our stderr so we can see what's happening
+            # Get the Process record that was just created
+            worker_process = Process.objects.filter(pid=pid).first()
+            if worker_process:
+                # Pipe stderr in background thread so it doesn't block
+                def pipe_worker_stderr():
+                    for line in worker_process.tail_stderr(lines=0, follow=True):
+                        print(f'  [SnapshotWorker] {line}', file=sys.stderr, flush=True)
+
+                thread = threading.Thread(target=pipe_worker_stderr, daemon=True)
+                thread.start()
+
     def _is_crawl_finished(self) -> bool:
         """Check if all snapshots are sealed."""
         from pathlib import Path
@@ -626,16 +642,28 @@ class SnapshotWorker(Worker):
         """Execute all hooks sequentially."""
         from archivebox.hooks import discover_hooks, is_background_hook, extract_step
         from archivebox.core.models import ArchiveResult
+        from archivebox.config.configset import get_config
 
         self.on_startup()
 
         try:
+            # Get merged config (includes env vars passed via Process.env, snapshot.config, defaults, etc.)
+            config = get_config(snapshot=self.snapshot)
+
             # Discover all hooks for this snapshot
-            hooks = discover_hooks('Snapshot', config=self.snapshot.config)
+            hooks = discover_hooks('Snapshot', config=config)
             hooks = sorted(hooks, key=lambda h: h.name)  # Sort by name (includes step prefix)
 
+            import sys
+            print(f'[SnapshotWorker] Discovered {len(hooks)} hooks for snapshot {self.snapshot.url}', file=sys.stderr, flush=True)
+            if hooks:
+                print(f'[SnapshotWorker] First 5 hooks: {[h.name for h in hooks[:5]]}', file=sys.stderr, flush=True)
+            else:
+                print(f'[SnapshotWorker] WARNING: No hooks discovered! Config keys: {list(config.keys())[:10]}...', file=sys.stderr, flush=True)
+
             # Execute each hook sequentially
             for hook_path in hooks:
+                print(f'[SnapshotWorker] Running hook: {hook_path.name}', file=sys.stderr, flush=True)
                 hook_name = hook_path.name
                 plugin = self._extract_plugin_name(hook_name)
                 hook_step = extract_step(hook_name)
@@ -661,7 +689,7 @@ class SnapshotWorker(Worker):
                     ar.save(update_fields=['status', 'start_ts', 'modified_at'])
 
                 # Fork and run the hook
-                process = self._run_hook(hook_path, ar)
+                process = self._run_hook(hook_path, ar, config)
 
                 if is_background:
                     # Track but don't wait
@@ -698,7 +726,7 @@ class SnapshotWorker(Worker):
         finally:
             self.on_shutdown()
 
-    def _run_hook(self, hook_path: Path, ar: Any) -> Any:
+    def _run_hook(self, hook_path: Path, ar: Any, config: dict) -> Any:
         """Fork and run a hook using Process model, return Process."""
         from archivebox.hooks import run_hook
 
@@ -710,7 +738,7 @@ class SnapshotWorker(Worker):
         process = run_hook(
             script=hook_path,
             output_dir=output_dir,
-            config=self.snapshot.config,
+            config=config,
             timeout=120,
             parent=self.db_process,
             url=str(self.snapshot.url),

+ 1 - 1
bin/test_plugins.sh

@@ -179,7 +179,7 @@ if [ "$ENABLE_COVERAGE" = true ]; then
     export NODE_V8_COVERAGE="$ROOT_DIR/coverage/js"
 
     echo "Python coverage: enabled (subprocess support)"
-    echo "JavaScript coverage: enabled (NODE_V8_COVERAGE)"
+    echo "JavaScript coverage: enabled (NODE_V8_COVERAGE=$NODE_V8_COVERAGE)"
     echo ""
 fi