use full dotted paths for all archivebox imports, add migrations and more fixes

Nick Sweeting, 1 month ago · commit f4e7820533
61 changed files with 1082 additions and 2985 deletions
  1. + 2 - 2  README.md
  2. + 4 - 4  archivebox/__init__.py
  3. + 1 - 0  archivebox/api/apps.py
  4. + 1 - 1  archivebox/api/v1_workers.py
  5. + 2 - 2  archivebox/base_models/models.py
  6. + 1 - 1  archivebox/cli/archivebox_add.py
  7. + 1 - 1  archivebox/cli/archivebox_crawl.py
  8. + 1 - 1  archivebox/cli/archivebox_extract.py
  9. + 3 - 5  archivebox/cli/archivebox_init.py
  10. + 1 - 1  archivebox/cli/archivebox_install.py
  11. + 1 - 1  archivebox/cli/archivebox_orchestrator.py
  12. + 1 - 1  archivebox/cli/archivebox_server.py
  13. + 1 - 1  archivebox/cli/archivebox_snapshot.py
  14. + 2 - 2  archivebox/cli/archivebox_update.py
  15. + 1 - 1  archivebox/cli/tests.py
  16. + 1 - 1  archivebox/cli/tests_piping.py
  17. + 23 - 0  archivebox/config/configset.py
  18. + 4 - 0  archivebox/core/apps.py
  19. + 57 - 0  archivebox/core/migrations/0024_b_clear_config_fields.py
  20. + 28 - 0  archivebox/core/migrations/0024_c_disable_fk_checks.py
  21. + 93 - 0  archivebox/core/migrations/0024_d_fix_crawls_config.py
  22. + 1 - 3  archivebox/core/migrations/0024_snapshot_crawl.py
  23. + 104 - 73  archivebox/core/migrations/0026_remove_archiveresult_output_dir_and_more.py
  24. + 74 - 63  archivebox/core/migrations/0029_archiveresult_hook_fields.py
  25. + 39 - 20  archivebox/core/migrations/0030_migrate_output_field.py
  26. + 56 - 37  archivebox/core/migrations/0032_alter_archiveresult_binary_and_more.py
  27. + 30 - 15  archivebox/core/migrations/0033_rename_extractor_add_hook_name.py
  28. + 22 - 8  archivebox/core/migrations/0034_snapshot_current_step.py
  29. + 20 - 12  archivebox/core/migrations/0035_snapshot_crawl_non_nullable_remove_created_by.py
  30. + 12 - 4  archivebox/core/migrations/0036_remove_archiveresult_created_by.py
  31. + 44 - 0  archivebox/core/migrations/0037_remove_archiveresult_output_dir_and_more.py
  32. + 84 - 0  archivebox/core/migrations/0038_fix_missing_columns.py
  33. + 30 - 0  archivebox/core/migrations/0039_fix_num_uses_values.py
  34. + 1 - 1  archivebox/core/models.py
  35. + 0 - 2638  archivebox/core/models.py.bak
  36. + 15 - 3  archivebox/core/templatetags/core_tags.py
  37. + 1 - 1  archivebox/core/views.py
  38. + 5 - 0  archivebox/crawls/apps.py
  39. + 55 - 32  archivebox/crawls/migrations/0002_drop_seed_model.py
  40. + 13 - 4  archivebox/crawls/migrations/0003_alter_crawl_output_dir.py
  41. + 12 - 4  archivebox/crawls/migrations/0004_alter_crawl_output_dir.py
  42. + 28 - 0  archivebox/crawls/migrations/0005_drop_seed_id_column.py
  43. + 35 - 0  archivebox/crawls/migrations/0006_alter_crawl_config_alter_crawl_output_dir_and_more.py
  44. + 3 - 4  archivebox/crawls/models.py
  45. + 5 - 0  archivebox/machine/apps.py
  46. + 6 - 0  archivebox/machine/migrations/0001_squashed.py
  47. + 104 - 0  archivebox/machine/migrations/0005_binary_binproviders_binary_output_dir_and_more.py
  48. + 3 - 3  archivebox/machine/models.py
  49. + 1 - 0  archivebox/personas/apps.py
  50. + 1 - 1  archivebox/personas/models.py
  51. + 3 - 3  archivebox/plugins/search_backend_ripgrep/tests/test_ripgrep_detection.py
  52. + 5 - 2  archivebox/templates/core/snapshot.html
  53. + 0 - 13  archivebox/tests/test_hooks.py
  54. + 1 - 1  archivebox/tests/test_migrations_08_to_09.py
  55. + 18 - 9  archivebox/tests/test_migrations_helpers.py
  56. + 1 - 0  archivebox/workers/apps.py
  57. + 2 - 2  docker-compose.yml
  58. + 2 - 1  pyproject.toml
  59. + 1 - 1  tests/fixtures.py
  60. + 2 - 2  tests/test_recursive_crawl.py
  61. + 14 - 0  uv.lock

+ 2 - 2
README.md

@@ -763,7 +763,7 @@ The configuration is documented here: **[Configuration Wiki](https://github.com/
 <br/>
 TIMEOUT=240                # default: 60    add more seconds on slower networks
 CHECK_SSL_VALIDITY=False   # default: True  False = allow saving URLs w/ bad SSL
-SAVE_ARCHIVE_DOT_ORG=False # default: True  False = disable Archive.org saving
+SAVE_ARCHIVEDOTORG=False # default: True  False = disable Archive.org saving
 MAX_MEDIA_SIZE=1500m       # default: 750m  raise/lower youtubedl output size
 <br/>
 PUBLIC_INDEX=True          # default: True  whether anon users can view index
@@ -959,7 +959,7 @@ archivebox add 'https://docs.google.com/document/d/12345somePrivateDocument'
 archivebox add 'https://vimeo.com/somePrivateVideo'
 
 # without first disabling saving to Archive.org:
-archivebox config --set SAVE_ARCHIVE_DOT_ORG=False  # disable saving all URLs in Archive.org
+archivebox config --set SAVE_ARCHIVEDOTORG=False  # disable saving all URLs in Archive.org
 
 # restrict the main index, Snapshot content, and Add Page to authenticated users as-needed:
 archivebox config --set PUBLIC_INDEX=False

+ 4 - 4
archivebox/__init__.py

@@ -26,10 +26,10 @@ ASCII_LOGO = """
 
 PACKAGE_DIR = Path(__file__).resolve().parent
 
-# Add PACKAGE_DIR to sys.path - required for Django migrations to import models
-# Migrations reference models like 'machine.Binary' which need to be importable
-if str(PACKAGE_DIR) not in sys.path:
-    sys.path.append(str(PACKAGE_DIR))
+# # Add PACKAGE_DIR to sys.path - required for Django migrations to import models
+# # Migrations reference models like 'machine.Binary' which need to be importable
+# if str(PACKAGE_DIR) not in sys.path:
+#     sys.path.append(str(PACKAGE_DIR))
 
 os.environ['DJANGO_SETTINGS_MODULE'] = 'archivebox.core.settings'
 os.environ['TZ'] = 'UTC'

+ 1 - 0
archivebox/api/apps.py

@@ -5,6 +5,7 @@ from django.apps import AppConfig
 
 class APIConfig(AppConfig):
     name = 'archivebox.api'
+    label = 'api'
 
 
 def register_admin(admin_site):

+ 1 - 1
archivebox/api/v1_workers.py

@@ -94,7 +94,7 @@ class OrchestratorSchema(Schema):
 @router.get("/orchestrator", response=OrchestratorSchema, url_name="get_orchestrator")
 def get_orchestrator(request):
     """Get the orchestrator status and all worker queues."""
-    from workers.orchestrator import Orchestrator
+    from archivebox.workers.orchestrator import Orchestrator
     from workers.worker import CrawlWorker, SnapshotWorker, ArchiveResultWorker
 
     orchestrator = Orchestrator()

+ 2 - 2
archivebox/base_models/models.py

@@ -73,7 +73,7 @@ class ModelWithUUID(models.Model):
         return f'/api/v1/docs#/{self._meta.app_label.title()}%20Models/api_v1_{self._meta.app_label}_get_{self._meta.db_table}'
 
     def as_json(self, keys: Iterable[str] = ()) -> dict:
-        default_keys = ('id', 'created_at', 'modified_at', 'created_by_id')
+        default_keys = ('id', 'created_at', 'modified_at')
         return {key: getattr(self, key) for key in (keys or default_keys) if hasattr(self, key)}
 
 
@@ -119,7 +119,7 @@ class ModelWithHealthStats(models.Model):
 
 class ModelWithConfig(models.Model):
     """Mixin for models with a JSON config field."""
-    config = models.JSONField(default=dict, null=False, blank=False, editable=True)
+    config = models.JSONField(default=dict, null=True, blank=True, editable=True)
 
     class Meta:
         abstract = True

+ 1 - 1
archivebox/cli/archivebox_add.py

@@ -56,7 +56,7 @@ def add(urls: str | list[str],
     from archivebox.core.models import Snapshot
     from archivebox.crawls.models import Crawl
     from archivebox.base_models.models import get_or_create_system_user_pk
-    from workers.orchestrator import Orchestrator
+    from archivebox.workers.orchestrator import Orchestrator
 
     created_by_id = created_by_id or get_or_create_system_user_pk()
 

+ 1 - 1
archivebox/cli/archivebox_crawl.py

@@ -78,7 +78,7 @@ def discover_outlinks(
     from archivebox.core.models import Snapshot, ArchiveResult
     from archivebox.crawls.models import Crawl
     from archivebox.config import CONSTANTS
-    from workers.orchestrator import Orchestrator
+    from archivebox.workers.orchestrator import Orchestrator
 
     created_by_id = get_or_create_system_user_pk()
     is_tty = sys.stdout.isatty()

+ 1 - 1
archivebox/cli/archivebox_extract.py

@@ -96,7 +96,7 @@ def run_plugins(
         TYPE_SNAPSHOT, TYPE_ARCHIVERESULT
     )
     from archivebox.core.models import Snapshot, ArchiveResult
-    from workers.orchestrator import Orchestrator
+    from archivebox.workers.orchestrator import Orchestrator
 
     is_tty = sys.stdout.isatty()
 

+ 3 - 5
archivebox/cli/archivebox_init.py

@@ -13,11 +13,9 @@ from archivebox.misc.util import docstring, enforce_types
 
 
 @enforce_types
-def init(force: bool=False, quick: bool=False, install: bool=False, setup: bool=False) -> None:
+def init(force: bool=False, quick: bool=False, install: bool=False) -> None:
     """Initialize a new ArchiveBox collection in the current directory"""
     
-    install = install or setup
-    
     from archivebox.config import CONSTANTS, VERSION, DATA_DIR
     from archivebox.config.common import SERVER_CONFIG
     from archivebox.config.collection import write_config_file
@@ -128,7 +126,8 @@ def init(force: bool=False, quick: bool=False, install: bool=False, setup: bool=
                 print(f'    [yellow]√ Added {len(orphaned_data_dir_links)} orphaned links from existing archive directories.[/yellow]')
 
             if pending_links:
-                Snapshot.objects.create_from_dicts(list(pending_links.values()))
+                for link_dict in pending_links.values():
+                    Snapshot.from_jsonl(link_dict)
 
             # Hint for orphaned snapshot directories
             print()
@@ -187,7 +186,6 @@ def init(force: bool=False, quick: bool=False, install: bool=False, setup: bool=
 @click.option('--force', '-f', is_flag=True, help='Ignore unrecognized files in current directory and initialize anyway')
 @click.option('--quick', '-q', is_flag=True, help='Run any updates or migrations without rechecking all snapshot dirs')
 @click.option('--install', '-s', is_flag=True, help='Automatically install dependencies and extras used for archiving')
-@click.option('--setup', '-s', is_flag=True, help='DEPRECATED: equivalent to --install')
 @docstring(init.__doc__)
 def main(**kwargs) -> None:
     init(**kwargs)

+ 1 - 1
archivebox/cli/archivebox_install.py

@@ -85,7 +85,7 @@ def install(dry_run: bool=False) -> None:
     print()
 
     # Run the crawl synchronously (this triggers on_Crawl hooks)
-    from workers.orchestrator import Orchestrator
+    from archivebox.workers.orchestrator import Orchestrator
     orchestrator = Orchestrator(exit_on_idle=True)
     orchestrator.runloop()
 

+ 1 - 1
archivebox/cli/archivebox_orchestrator.py

@@ -37,7 +37,7 @@ def orchestrator(daemon: bool = False, watch: bool = False) -> int:
         0: All work completed successfully
         1: Error occurred
     """
-    from workers.orchestrator import Orchestrator
+    from archivebox.workers.orchestrator import Orchestrator
     
     if Orchestrator.is_running():
         print('[yellow]Orchestrator is already running[/yellow]')

+ 1 - 1
archivebox/cli/archivebox_server.py

@@ -74,7 +74,7 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,),
             tail_multiple_worker_logs,
             is_port_in_use,
         )
-        from workers.orchestrator import Orchestrator
+        from archivebox.workers.orchestrator import Orchestrator
         import sys
 
         # Check if port is already in use

+ 1 - 1
archivebox/cli/archivebox_snapshot.py

@@ -163,7 +163,7 @@ def create_snapshots(
 
     # If --plugins is passed, run the orchestrator for those plugins
     if plugins:
-        from workers.orchestrator import Orchestrator
+        from archivebox.workers.orchestrator import Orchestrator
         rprint(f'[blue]Running plugins: {plugins or "all"}...[/blue]', file=sys.stderr)
         orchestrator = Orchestrator(exit_on_idle=True)
         orchestrator.runloop()

+ 2 - 2
archivebox/cli/archivebox_update.py

@@ -160,7 +160,7 @@ def process_all_db_snapshots(batch_size: int = 100) -> dict:
     total = Snapshot.objects.count()
     print(f'[*] Processing {total} snapshots from database...')
 
-    for snapshot in Snapshot.objects.iterator():
+    for snapshot in Snapshot.objects.iterator(chunk_size=batch_size):
         # Reconcile index.json with DB
         snapshot.reconcile_with_index_json()
 
@@ -209,7 +209,7 @@ def process_filtered_snapshots(
     total = snapshots.count()
     print(f'[*] Found {total} matching snapshots')
 
-    for snapshot in snapshots.iterator():
+    for snapshot in snapshots.iterator(chunk_size=batch_size):
         # Reconcile index.json with DB
         snapshot.reconcile_with_index_json()
 

+ 1 - 1
archivebox/cli/tests.py

@@ -17,7 +17,7 @@ TEST_CONFIG = {
 
     'DATA_DIR': 'data.tests',
     
-    'SAVE_ARCHIVE_DOT_ORG': 'False',
+    'SAVE_ARCHIVEDOTORG': 'False',
     'SAVE_TITLE': 'False',
     
     'USE_CURL': 'False',

+ 1 - 1
archivebox/cli/tests_piping.py

@@ -32,7 +32,7 @@ from unittest.mock import patch, MagicMock
 TEST_CONFIG = {
     'USE_COLOR': 'False',
     'SHOW_PROGRESS': 'False',
-    'SAVE_ARCHIVE_DOT_ORG': 'False',
+    'SAVE_ARCHIVEDOTORG': 'False',
     'SAVE_TITLE': 'True',  # Fast extractor
     'SAVE_FAVICON': 'False',
     'SAVE_WGET': 'False',

+ 23 - 0
archivebox/config/configset.py

@@ -216,6 +216,29 @@ def get_config(
     if snapshot and hasattr(snapshot, "config") and snapshot.config:
         config.update(snapshot.config)
 
+    # Normalize all aliases to canonical names (after all sources merged)
+    # This handles aliases that came from user/crawl/snapshot configs, not just env
+    try:
+        from archivebox.hooks import discover_plugin_configs
+        plugin_configs = discover_plugin_configs()
+        aliases_to_normalize = {}  # {alias_key: canonical_key}
+
+        # Build alias mapping from all plugin schemas
+        for plugin_name, schema in plugin_configs.items():
+            for canonical_key, prop_schema in schema.get('properties', {}).items():
+                for alias in prop_schema.get('x-aliases', []):
+                    aliases_to_normalize[alias] = canonical_key
+
+        # Normalize: copy alias values to canonical keys (aliases take precedence)
+        for alias_key, canonical_key in aliases_to_normalize.items():
+            if alias_key in config:
+                # Alias exists - copy to canonical key (overwriting any default)
+                config[canonical_key] = config[alias_key]
+                # Remove alias from config to keep it clean
+                del config[alias_key]
+    except ImportError:
+        pass
+
     return config
 
 

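The alias pass added above only depends on the merged config dict and the `x-aliases` lists in the plugin JSON schemas. A minimal standalone sketch of the same normalization, using a hypothetical plugin schema and config values rather than the real ArchiveBox schemas:

    # Hypothetical schema/config values, illustrating the normalization pass above.
    plugin_configs = {
        'archivedotorg': {
            'properties': {
                'SAVE_ARCHIVEDOTORG': {'type': 'boolean', 'x-aliases': ['SAVE_ARCHIVE_DOT_ORG']},
            },
        },
    }
    config = {'SAVE_ARCHIVE_DOT_ORG': False, 'TIMEOUT': 60}

    # Build {alias: canonical} from every plugin schema, then fold aliases into canonical keys.
    aliases_to_normalize = {
        alias: canonical
        for schema in plugin_configs.values()
        for canonical, prop in schema.get('properties', {}).items()
        for alias in prop.get('x-aliases', [])
    }
    for alias, canonical in aliases_to_normalize.items():
        if alias in config:
            config[canonical] = config.pop(alias)   # alias value overrides any default

    assert config == {'TIMEOUT': 60, 'SAVE_ARCHIVEDOTORG': False}
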
+ 4 - 0
archivebox/core/apps.py

@@ -5,8 +5,12 @@ from django.apps import AppConfig
 
 class CoreConfig(AppConfig):
     name = 'archivebox.core'
+    label = 'core'
 
     def ready(self):
         """Register the archivebox.core.admin_site as the main django admin site"""
         from archivebox.core.admin_site import register_admin_site
         register_admin_site()
+
+        # Import models to register state machines with the registry
+        from archivebox.core import models  # noqa: F401

+ 57 - 0
archivebox/core/migrations/0024_b_clear_config_fields.py

@@ -0,0 +1,57 @@
+# Data migration to clear config fields that may contain invalid JSON
+# This runs before 0025 to prevent CHECK constraint failures
+
+from django.db import migrations
+
+
+def clear_config_fields(apps, schema_editor):
+    """Clear all config fields in related tables to avoid JSON validation errors."""
+    db_alias = schema_editor.connection.alias
+
+    # Disable foreign key checks temporarily to allow updates
+    with schema_editor.connection.cursor() as cursor:
+        cursor.execute("PRAGMA foreign_keys=OFF")
+
+    tables_to_clear = [
+        ('crawls_seed', 'config'),
+        ('crawls_crawl', 'config'),
+        ('crawls_crawlschedule', 'config'),
+        ('machine_machine', 'stats'),
+        ('machine_machine', 'config'),
+    ]
+
+    for table_info in tables_to_clear:
+        if table_info is None:
+            continue
+        table_name, field_name = table_info
+
+        try:
+            with schema_editor.connection.cursor() as cursor:
+                # Check if table exists first
+                cursor.execute(f"SELECT name FROM sqlite_master WHERE type='table' AND name='{table_name}'")
+                if not cursor.fetchone():
+                    print(f"  Skipping {table_name}.{field_name}: table does not exist")
+                    continue
+
+                # Set all to empty JSON object
+                cursor.execute(f"UPDATE {table_name} SET {field_name} = '{{}}' WHERE {field_name} IS NOT NULL")
+                print(f"  Cleared {field_name} in {table_name}: {cursor.rowcount} rows")
+        except Exception as e:
+            print(f"  Skipping {table_name}.{field_name}: {e}")
+
+    # Re-enable foreign key checks
+    with schema_editor.connection.cursor() as cursor:
+        cursor.execute("PRAGMA foreign_keys=ON")
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('core', '0023_new_schema'),
+        ('crawls', '0001_initial'),
+        ('machine', '0001_squashed'),
+    ]
+
+    operations = [
+        migrations.RunPython(clear_config_fields, reverse_code=migrations.RunPython.noop),
+    ]

+ 28 - 0
archivebox/core/migrations/0024_c_disable_fk_checks.py

@@ -0,0 +1,28 @@
+# Disable foreign key checks before 0025 to prevent CHECK constraint validation errors
+
+from django.db import migrations
+
+
+def disable_fk_checks(apps, schema_editor):
+    """Temporarily disable foreign key checks."""
+    with schema_editor.connection.cursor() as cursor:
+        cursor.execute("PRAGMA foreign_keys=OFF")
+        print("  Disabled foreign key checks")
+
+
+def enable_fk_checks(apps, schema_editor):
+    """Re-enable foreign key checks."""
+    with schema_editor.connection.cursor() as cursor:
+        cursor.execute("PRAGMA foreign_keys=ON")
+        print("  Enabled foreign key checks")
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('core', '0024_b_clear_config_fields'),
+    ]
+
+    operations = [
+        migrations.RunPython(disable_fk_checks, reverse_code=enable_fk_checks),
+    ]

+ 93 - 0
archivebox/core/migrations/0024_d_fix_crawls_config.py

@@ -0,0 +1,93 @@
+# Fix crawls_crawl config field to avoid CHECK constraint errors during table rebuilds
+
+from django.db import migrations
+
+
+def fix_crawls_config(apps, schema_editor):
+    """
+    Rebuild crawls_crawl table to fix CHECK constraints and make seed_id nullable.
+    Only runs for UPGRADES from 0.8.x (when crawls.0001_initial didn't exist yet).
+    For fresh installs, crawls.0001_initial creates the correct schema.
+    """
+    with schema_editor.connection.cursor() as cursor:
+        # Check if this is an upgrade from old 0.8.x or a fresh install
+        # In fresh installs, crawls.0001_initial was applied, creating seed FK
+        # In upgrades, the table was created by old migrations before 0001_initial existed
+        cursor.execute("""
+            SELECT COUNT(*) FROM django_migrations
+            WHERE app='crawls' AND name='0001_initial'
+        """)
+        has_crawls_0001 = cursor.fetchone()[0] > 0
+
+        if has_crawls_0001:
+            # Fresh install - crawls.0001_initial already created the correct schema
+            # Just clear config to avoid CHECK constraint issues
+            print("  Fresh install detected - clearing config field only")
+            try:
+                cursor.execute('UPDATE "crawls_crawl" SET "config" = NULL')
+            except Exception as e:
+                print(f"  Skipping config clear: {e}")
+            return
+
+        # Upgrade from 0.8.x - rebuild table to make seed_id nullable and remove CHECK constraint
+        print("  Upgrading from 0.8.x - rebuilding crawls_crawl table")
+        cursor.execute("PRAGMA foreign_keys=OFF")
+
+        # Backup
+        cursor.execute("CREATE TABLE crawls_crawl_backup AS SELECT * FROM crawls_crawl")
+
+        # Recreate without config CHECK constraint, with nullable seed_id
+        cursor.execute("DROP TABLE crawls_crawl")
+        cursor.execute("""
+            CREATE TABLE "crawls_crawl" (
+                "num_uses_failed" integer unsigned NOT NULL CHECK ("num_uses_failed" >= 0),
+                "num_uses_succeeded" integer unsigned NOT NULL CHECK ("num_uses_succeeded" >= 0),
+                "id" char(32) NOT NULL PRIMARY KEY,
+                "created_at" datetime NOT NULL,
+                "modified_at" datetime NOT NULL,
+                "urls" text NOT NULL,
+                "config" text,
+                "max_depth" smallint unsigned NOT NULL CHECK ("max_depth" >= 0),
+                "tags_str" varchar(1024) NOT NULL,
+                "persona_id" char(32) NULL,
+                "label" varchar(64) NOT NULL,
+                "notes" text NOT NULL,
+                "output_dir" varchar(512) NOT NULL,
+                "status" varchar(15) NOT NULL,
+                "retry_at" datetime NULL,
+                "created_by_id" integer NOT NULL REFERENCES "auth_user" ("id") DEFERRABLE INITIALLY DEFERRED,
+                "seed_id" char(32) NULL DEFAULT NULL,
+                "schedule_id" char(32) NULL REFERENCES "crawls_crawlschedule" ("id") DEFERRABLE INITIALLY DEFERRED
+            )
+        """)
+
+        # Restore data
+        cursor.execute("""
+            INSERT INTO "crawls_crawl" (
+                "num_uses_failed", "num_uses_succeeded", "id", "created_at", "modified_at",
+                "urls", "config", "max_depth", "tags_str", "persona_id", "label", "notes",
+                "output_dir", "status", "retry_at", "created_by_id", "seed_id", "schedule_id"
+            )
+            SELECT
+                "num_uses_failed", "num_uses_succeeded", "id", "created_at", "modified_at",
+                "urls", "config", "max_depth", "tags_str", "persona_id", "label", "notes",
+                "output_dir", "status", "retry_at", "created_by_id", "seed_id", "schedule_id"
+            FROM crawls_crawl_backup
+        """)
+
+        cursor.execute("DROP TABLE crawls_crawl_backup")
+
+        # NULL out config to avoid any invalid JSON
+        cursor.execute('UPDATE "crawls_crawl" SET "config" = NULL')
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('core', '0024_c_disable_fk_checks'),
+        ('crawls', '0001_initial'),
+    ]
+
+    operations = [
+        migrations.RunPython(fix_crawls_config, reverse_code=migrations.RunPython.noop),
+    ]

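The upgrade-vs-fresh-install check above boils down to a single query against Django's migration ledger. A standalone sketch of the same check, assuming a plain sqlite3 connection to an existing index database (the path is illustrative):

    import sqlite3

    def is_fresh_install(db_path: str = 'index.sqlite3') -> bool:
        """True if crawls.0001_initial was already applied (fresh 0.9.x install),
        False if crawls_crawl was created by the older 0.8.x core migrations."""
        with sqlite3.connect(db_path) as conn:
            (count,) = conn.execute(
                "SELECT COUNT(*) FROM django_migrations "
                "WHERE app='crawls' AND name='0001_initial'"
            ).fetchone()
        return count > 0
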
+ 1 - 3
archivebox/core/migrations/0024_snapshot_crawl.py

@@ -8,9 +8,7 @@ import django.db.models.deletion
 class Migration(migrations.Migration):
 
     dependencies = [
-        ('core', '0023_new_schema'),
-        ('crawls', '0001_initial'),
-        ('machine', '0001_squashed'),
+        ('core', '0024_d_fix_crawls_config'),
     ]
 
     operations = [

+ 104 - 73
archivebox/core/migrations/0026_remove_archiveresult_output_dir_and_more.py

@@ -10,6 +10,13 @@ from django.db import migrations, models
 
 def populate_archiveresult_uuids(apps, schema_editor):
     """Generate unique UUIDs for ArchiveResults that don't have one."""
+    # Check if uuid column exists before trying to populate it
+    with schema_editor.connection.cursor() as cursor:
+        cursor.execute("PRAGMA table_info(core_archiveresult)")
+        columns = [row[1] for row in cursor.fetchall()]
+        if 'uuid' not in columns:
+            return  # uuid column doesn't exist, skip this data migration
+
     ArchiveResult = apps.get_model('core', 'ArchiveResult')
     for result in ArchiveResult.objects.filter(uuid__isnull=True):
         result.uuid = uuid_compat.uuid7()
@@ -21,6 +28,22 @@ def reverse_populate_uuids(apps, schema_editor):
     pass
 
 
+def remove_output_dir_if_exists(apps, schema_editor):
+    """Remove output_dir columns if they exist."""
+    with schema_editor.connection.cursor() as cursor:
+        # Check and remove from core_archiveresult
+        cursor.execute("PRAGMA table_info(core_archiveresult)")
+        columns = [row[1] for row in cursor.fetchall()]
+        if 'output_dir' in columns:
+            cursor.execute("ALTER TABLE core_archiveresult DROP COLUMN output_dir")
+
+        # Check and remove from core_snapshot
+        cursor.execute("PRAGMA table_info(core_snapshot)")
+        columns = [row[1] for row in cursor.fetchall()]
+        if 'output_dir' in columns:
+            cursor.execute("ALTER TABLE core_snapshot DROP COLUMN output_dir")
+
+
 class Migration(migrations.Migration):
 
     dependencies = [
@@ -33,82 +56,90 @@ class Migration(migrations.Migration):
         migrations.RunPython(populate_archiveresult_uuids, reverse_populate_uuids),
 
         # Remove output_dir fields (not needed, computed from snapshot)
-        migrations.RemoveField(
-            model_name='archiveresult',
-            name='output_dir',
-        ),
-        migrations.RemoveField(
-            model_name='snapshot',
-            name='output_dir',
-        ),
+        migrations.RunPython(remove_output_dir_if_exists, reverse_code=migrations.RunPython.noop),
 
-        # Archiveresult field alterations
-        migrations.AlterField(
-            model_name='archiveresult',
-            name='created_at',
-            field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
-        ),
-        migrations.AlterField(
-            model_name='archiveresult',
-            name='created_by',
-            field=models.ForeignKey(default=None, on_delete=django.db.models.deletion.CASCADE, related_name='archiveresult_set', to=settings.AUTH_USER_MODEL),
-        ),
-        migrations.AlterField(
-            model_name='archiveresult',
-            name='extractor',
-            field=models.CharField(db_index=True, max_length=32),
-        ),
-        migrations.AlterField(
-            model_name='archiveresult',
-            name='id',
-            field=models.AutoField(editable=False, primary_key=True, serialize=False),
-        ),
-        migrations.AlterField(
-            model_name='archiveresult',
-            name='status',
-            field=models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('backoff', 'Waiting to retry'), ('succeeded', 'Succeeded'), ('failed', 'Failed'), ('skipped', 'Skipped')], db_index=True, default='queued', max_length=15),
-        ),
+        # Update Django's migration state to match 0.9.x schema
+        # Database already has correct types from 0.8.x, just update state
+        migrations.SeparateDatabaseAndState(
+            state_operations=[
+                # Archiveresult field alterations
+                migrations.AlterField(
+                    model_name='archiveresult',
+                    name='created_at',
+                    field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
+                ),
+                migrations.AlterField(
+                    model_name='archiveresult',
+                    name='created_by',
+                    field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, related_name='archiveresult_set', to=settings.AUTH_USER_MODEL),
+                ),
+                migrations.AlterField(
+                    model_name='archiveresult',
+                    name='extractor',
+                    field=models.CharField(db_index=True, max_length=32),
+                ),
+                # Convert id from AutoField to UUIDField (database already has UUID CHAR(32))
+                migrations.AlterField(
+                    model_name='archiveresult',
+                    name='id',
+                    field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
+                ),
+                migrations.AlterField(
+                    model_name='archiveresult',
+                    name='status',
+                    field=models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('backoff', 'Waiting to retry'), ('succeeded', 'Succeeded'), ('failed', 'Failed'), ('skipped', 'Skipped')], db_index=True, default='queued', max_length=15),
+                ),
 
-        # Snapshot field alterations
-        migrations.AlterField(
-            model_name='snapshot',
-            name='bookmarked_at',
-            field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
-        ),
-        migrations.AlterField(
-            model_name='snapshot',
-            name='created_at',
-            field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
-        ),
-        migrations.AlterField(
-            model_name='snapshot',
-            name='created_by',
-            field=models.ForeignKey(default=None, on_delete=django.db.models.deletion.CASCADE, related_name='snapshot_set', to=settings.AUTH_USER_MODEL),
-        ),
-        migrations.AlterField(
-            model_name='snapshot',
-            name='downloaded_at',
-            field=models.DateTimeField(blank=True, db_index=True, default=None, editable=False, null=True),
-        ),
-        migrations.AlterField(
-            model_name='snapshot',
-            name='id',
-            field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
+                # Snapshot field alterations
+                migrations.AlterField(
+                    model_name='snapshot',
+                    name='bookmarked_at',
+                    field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
+                ),
+                migrations.AlterField(
+                    model_name='snapshot',
+                    name='created_at',
+                    field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
+                ),
+                migrations.AlterField(
+                    model_name='snapshot',
+                    name='created_by',
+                    field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, related_name='snapshot_set', to=settings.AUTH_USER_MODEL),
+                ),
+                migrations.AlterField(
+                    model_name='snapshot',
+                    name='downloaded_at',
+                    field=models.DateTimeField(blank=True, db_index=True, default=None, editable=False, null=True),
+                ),
+                migrations.AlterField(
+                    model_name='snapshot',
+                    name='id',
+                    field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
+                ),
+            ],
+            database_operations=[
+                # No actual database changes needed - schema is already correct from 0.8.x
+            ],
         ),
 
-        # SnapshotTag and Tag alterations
-        migrations.AlterField(
-            model_name='snapshottag',
-            name='id',
-            field=models.AutoField(primary_key=True, serialize=False),
-        ),
-        migrations.AlterField(
-            model_name='tag',
-            name='created_by',
-            field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, related_name='tag_set', to=settings.AUTH_USER_MODEL),
-        ),
-        migrations.AlterUniqueTogether(
-            name='snapshottag',
-            unique_together={('snapshot', 'tag')},
+        # SnapshotTag and Tag alterations - state only, DB already correct
+        migrations.SeparateDatabaseAndState(
+            state_operations=[
+                migrations.AlterField(
+                    model_name='snapshottag',
+                    name='id',
+                    field=models.AutoField(primary_key=True, serialize=False),
+                ),
+                migrations.AlterField(
+                    model_name='tag',
+                    name='created_by',
+                    field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, related_name='tag_set', to=settings.AUTH_USER_MODEL),
+                ),
+                migrations.AlterUniqueTogether(
+                    name='snapshottag',
+                    unique_together={('snapshot', 'tag')},
+                ),
+            ],
+            database_operations=[],
         ),
     ]

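This migration and most of the later ones lean on `migrations.SeparateDatabaseAndState`: the `state_operations` update the model state Django tracks for subsequent migrations, while only the `database_operations` (often empty, or hand-written SQL) actually run against SQLite. That way Django never schedules the full table rebuild that AlterField/RemoveField would otherwise trigger, which is what re-applies the old CHECK and NOT NULL constraints this commit is working around. A minimal sketch of the shape, using a hypothetical field name rather than one of the real operations above:

    from django.db import migrations, models

    class Migration(migrations.Migration):
        dependencies = [('core', '0026_remove_archiveresult_output_dir_and_more')]  # illustrative anchor

        operations = [
            migrations.SeparateDatabaseAndState(
                # Recorded in Django's migration state so later migrations and the ORM
                # see the new field definition...
                state_operations=[
                    migrations.AddField(
                        model_name='archiveresult',
                        name='example_note',          # hypothetical field, for illustration only
                        field=models.TextField(blank=True, default=''),
                    ),
                ],
                # ...while the only SQL actually executed is an additive ALTER TABLE,
                # which SQLite applies without copying the table.
                database_operations=[
                    migrations.RunSQL(
                        sql="ALTER TABLE core_archiveresult ADD COLUMN example_note TEXT DEFAULT '';",
                        reverse_sql="ALTER TABLE core_archiveresult DROP COLUMN example_note;",
                    ),
                ],
            ),
        ]
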
+ 74 - 63
archivebox/core/migrations/0029_archiveresult_hook_fields.py

@@ -13,68 +13,79 @@ class Migration(migrations.Migration):
     ]
 
     operations = [
-        # Add new output fields (keep old 'output' temporarily for migration)
-        migrations.AddField(
-            model_name='archiveresult',
-            name='output_str',
-            field=models.TextField(
-                blank=True,
-                default='',
-                help_text='Human-readable output summary (e.g., "Downloaded 5 files")'
-            ),
-        ),
-
-        migrations.AddField(
-            model_name='archiveresult',
-            name='output_json',
-            field=models.JSONField(
-                null=True,
-                blank=True,
-                default=None,
-                help_text='Structured metadata (headers, redirects, etc.) - should NOT duplicate ArchiveResult fields'
-            ),
-        ),
-
-        migrations.AddField(
-            model_name='archiveresult',
-            name='output_files',
-            field=models.JSONField(
-                default=dict,
-                help_text='Dict of {relative_path: {metadata}} - values are empty dicts for now, extensible for future metadata'
-            ),
-        ),
-
-        migrations.AddField(
-            model_name='archiveresult',
-            name='output_size',
-            field=models.BigIntegerField(
-                default=0,
-                help_text='Total recursive size in bytes of all output files'
-            ),
-        ),
-
-        migrations.AddField(
-            model_name='archiveresult',
-            name='output_mimetypes',
-            field=models.CharField(
-                max_length=512,
-                blank=True,
-                default='',
-                help_text='CSV of mimetypes sorted by size descending'
-            ),
-        ),
-
-        # Add binary FK (optional)
-        migrations.AddField(
-            model_name='archiveresult',
-            name='binary',
-            field=models.ForeignKey(
-                'machine.Binary',
-                on_delete=models.SET_NULL,
-                null=True,
-                blank=True,
-                related_name='archiveresults',
-                help_text='Primary binary used by this hook (optional)'
-            ),
+        # Add new output fields using SeparateDatabaseAndState to avoid table rebuilds
+        migrations.SeparateDatabaseAndState(
+            state_operations=[
+                migrations.AddField(
+                    model_name='archiveresult',
+                    name='output_str',
+                    field=models.TextField(
+                        blank=True,
+                        default='',
+                        help_text='Human-readable output summary (e.g., "Downloaded 5 files")'
+                    ),
+                ),
+                migrations.AddField(
+                    model_name='archiveresult',
+                    name='output_json',
+                    field=models.JSONField(
+                        null=True,
+                        blank=True,
+                        default=None,
+                        help_text='Structured metadata (headers, redirects, etc.) - should NOT duplicate ArchiveResult fields'
+                    ),
+                ),
+                migrations.AddField(
+                    model_name='archiveresult',
+                    name='output_files',
+                    field=models.JSONField(
+                        default=dict,
+                        help_text='Dict of {relative_path: {metadata}} - values are empty dicts for now, extensible for future metadata'
+                    ),
+                ),
+                migrations.AddField(
+                    model_name='archiveresult',
+                    name='output_size',
+                    field=models.BigIntegerField(
+                        default=0,
+                        help_text='Total recursive size in bytes of all output files'
+                    ),
+                ),
+                migrations.AddField(
+                    model_name='archiveresult',
+                    name='output_mimetypes',
+                    field=models.CharField(
+                        max_length=512,
+                        blank=True,
+                        default='',
+                        help_text='CSV of mimetypes sorted by size descending'
+                    ),
+                ),
+                migrations.AddField(
+                    model_name='archiveresult',
+                    name='binary',
+                    field=models.ForeignKey(
+                        'machine.Binary',
+                        on_delete=models.SET_NULL,
+                        null=True,
+                        blank=True,
+                        related_name='archiveresults',
+                        help_text='Primary binary used by this hook (optional)'
+                    ),
+                ),
+            ],
+            database_operations=[
+                migrations.RunSQL(
+                    sql="""
+                        ALTER TABLE core_archiveresult ADD COLUMN output_str TEXT DEFAULT '';
+                        ALTER TABLE core_archiveresult ADD COLUMN output_json TEXT;
+                        ALTER TABLE core_archiveresult ADD COLUMN output_files TEXT DEFAULT '{}';
+                        ALTER TABLE core_archiveresult ADD COLUMN output_size BIGINT DEFAULT 0;
+                        ALTER TABLE core_archiveresult ADD COLUMN output_mimetypes VARCHAR(512) DEFAULT '';
+                        ALTER TABLE core_archiveresult ADD COLUMN binary_id CHAR(32) REFERENCES machine_binary(id);
+                    """,
+                    reverse_sql=migrations.RunSQL.noop,
+                ),
+            ],
         ),
     ]

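On SQLite, ADD COLUMN is a metadata-only change, so the six hand-written ALTER TABLE statements above avoid the table copy that Django's default AddField path would schedule. A quick way to sanity-check the result after migrating, using the same PRAGMA that migrations 0026 and 0038 rely on (the database path here is an assumption, not the canonical location):

    import sqlite3

    with sqlite3.connect('data/index.sqlite3') as conn:   # path is illustrative
        columns = {row[1] for row in conn.execute('PRAGMA table_info(core_archiveresult)')}

    expected = {'output_str', 'output_json', 'output_files',
                'output_size', 'output_mimetypes', 'binary_id'}
    print('missing columns:', expected - columns or 'none')
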
+ 39 - 20
archivebox/core/migrations/0030_migrate_output_field.py

@@ -12,27 +12,46 @@ def migrate_output_field(apps, schema_editor):
     Logic:
     - If output contains JSON {...}, move to output_json
     - Otherwise, move to output_str
-    """
-    ArchiveResult = apps.get_model('core', 'ArchiveResult')
-
-    for ar in ArchiveResult.objects.all().iterator():
-        old_output = ar.output or ''
 
-        # Case 1: JSON output
-        if old_output.strip().startswith('{'):
-            try:
-                parsed = json.loads(old_output)
-                ar.output_json = parsed
-                ar.output_str = ''
-            except json.JSONDecodeError:
-                # Not valid JSON, treat as string
-                ar.output_str = old_output
-
-        # Case 2: File path or plain string
-        else:
-            ar.output_str = old_output
-
-        ar.save(update_fields=['output_str', 'output_json'])
+    Use raw SQL to avoid CHECK constraint issues during migration.
+    """
+    # Use raw SQL to migrate data without triggering CHECK constraints
+    with schema_editor.connection.cursor() as cursor:
+        # Get all archive results
+        cursor.execute("""
+            SELECT id, output FROM core_archiveresult
+        """)
+
+        for row in cursor.fetchall():
+            ar_id, old_output = row
+            old_output = old_output or ''
+
+            # Case 1: JSON output
+            if old_output.strip().startswith('{'):
+                try:
+                    # Validate it's actual JSON
+                    parsed = json.loads(old_output)
+                    # Update with JSON - cast to JSON to satisfy CHECK constraint
+                    json_str = json.dumps(parsed)
+                    cursor.execute("""
+                        UPDATE core_archiveresult
+                        SET output_str = '', output_json = json(?)
+                        WHERE id = ?
+                    """, (json_str, ar_id))
+                except json.JSONDecodeError:
+                    # Not valid JSON, treat as string
+                    cursor.execute("""
+                        UPDATE core_archiveresult
+                        SET output_str = ?, output_json = NULL
+                        WHERE id = ?
+                    """, (old_output, ar_id))
+            # Case 2: File path or plain string
+            else:
+                cursor.execute("""
+                    UPDATE core_archiveresult
+                    SET output_str = ?, output_json = NULL
+                    WHERE id = ?
+                """, (old_output, ar_id))
 
 
 def reverse_migrate(apps, schema_editor):

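The branching above is easier to see on concrete values. A pure-Python rendering of the same classification, for illustration only and not part of the migration itself:

    import json

    def split_legacy_output(old_output):
        """Return (output_str, output_json) the way 0030 partitions the old 'output' field."""
        old_output = old_output or ''
        if old_output.strip().startswith('{'):
            try:
                return '', json.loads(old_output)       # valid JSON -> output_json
            except json.JSONDecodeError:
                return old_output, None                 # JSON-looking but invalid -> output_str
        return old_output, None                         # file path or plain text -> output_str

    assert split_legacy_output('{"status_code": 200}') == ('', {'status_code': 200})
    assert split_legacy_output('warc/archive.warc.gz') == ('warc/archive.warc.gz', None)
    assert split_legacy_output(None) == ('', None)
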
+ 56 - 37
archivebox/core/migrations/0032_alter_archiveresult_binary_and_more.py

@@ -16,43 +16,62 @@ class Migration(migrations.Migration):
     ]
 
     operations = [
-        migrations.AlterField(
-            model_name='archiveresult',
-            name='binary',
-            field=models.ForeignKey(blank=True, help_text='Primary binary used by this hook', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='archiveresults', to='machine.binary'),
+        # Update Django's state only - database already has correct schema from 0029
+        migrations.SeparateDatabaseAndState(
+            state_operations=[
+                migrations.AlterField(
+                    model_name='archiveresult',
+                    name='binary',
+                    field=models.ForeignKey(blank=True, help_text='Primary binary used by this hook', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='archiveresults', to='machine.binary'),
+                ),
+                migrations.AlterField(
+                    model_name='archiveresult',
+                    name='output_files',
+                    field=models.JSONField(default=dict, help_text='Dict of {relative_path: {metadata}}'),
+                ),
+                migrations.AlterField(
+                    model_name='archiveresult',
+                    name='output_json',
+                    field=models.JSONField(blank=True, default=None, help_text='Structured metadata (headers, redirects, etc.)', null=True),
+                ),
+                migrations.AlterField(
+                    model_name='archiveresult',
+                    name='output_mimetypes',
+                    field=models.CharField(blank=True, default='', help_text='CSV of mimetypes sorted by size', max_length=512),
+                ),
+                migrations.AlterField(
+                    model_name='archiveresult',
+                    name='output_size',
+                    field=models.BigIntegerField(default=0, help_text='Total bytes of all output files'),
+                ),
+                migrations.AlterField(
+                    model_name='archiveresult',
+                    name='output_str',
+                    field=models.TextField(blank=True, default='', help_text='Human-readable output summary'),
+                ),
+                migrations.AlterField(
+                    model_name='archiveresult',
+                    name='uuid',
+                    field=models.UUIDField(blank=True, db_index=True, default=uuid_compat.uuid7, null=True),
+                ),
+            ],
+            database_operations=[
+                # No database changes needed - columns already exist with correct types
+            ],
         ),
-        migrations.AlterField(
-            model_name='archiveresult',
-            name='output_files',
-            field=models.JSONField(default=dict, help_text='Dict of {relative_path: {metadata}}'),
-        ),
-        migrations.AlterField(
-            model_name='archiveresult',
-            name='output_json',
-            field=models.JSONField(blank=True, default=None, help_text='Structured metadata (headers, redirects, etc.)', null=True),
-        ),
-        migrations.AlterField(
-            model_name='archiveresult',
-            name='output_mimetypes',
-            field=models.CharField(blank=True, default='', help_text='CSV of mimetypes sorted by size', max_length=512),
-        ),
-        migrations.AlterField(
-            model_name='archiveresult',
-            name='output_size',
-            field=models.BigIntegerField(default=0, help_text='Total bytes of all output files'),
-        ),
-        migrations.AlterField(
-            model_name='archiveresult',
-            name='output_str',
-            field=models.TextField(blank=True, default='', help_text='Human-readable output summary'),
-        ),
-        migrations.AlterField(
-            model_name='archiveresult',
-            name='uuid',
-            field=models.UUIDField(blank=True, db_index=True, default=uuid_compat.uuid7, null=True),
-        ),
-        migrations.AddConstraint(
-            model_name='snapshot',
-            constraint=models.UniqueConstraint(fields=('timestamp',), name='unique_timestamp'),
+        # Add unique constraint without table rebuild
+        migrations.SeparateDatabaseAndState(
+            state_operations=[
+                migrations.AddConstraint(
+                    model_name='snapshot',
+                    constraint=models.UniqueConstraint(fields=('timestamp',), name='unique_timestamp'),
+                ),
+            ],
+            database_operations=[
+                migrations.RunSQL(
+                    sql="CREATE UNIQUE INDEX IF NOT EXISTS unique_timestamp ON core_snapshot (timestamp);",
+                    reverse_sql="DROP INDEX IF EXISTS unique_timestamp;",
+                ),
+            ],
         ),
     ]

+ 30 - 15
archivebox/core/migrations/0033_rename_extractor_add_hook_name.py

@@ -10,20 +10,35 @@ class Migration(migrations.Migration):
     ]
 
     operations = [
-        migrations.RenameField(
-            model_name='archiveresult',
-            old_name='extractor',
-            new_name='plugin',
-        ),
-        migrations.AddField(
-            model_name='archiveresult',
-            name='hook_name',
-            field=models.CharField(
-                blank=True,
-                default='',
-                max_length=255,
-                db_index=True,
-                help_text='Full filename of the hook that executed (e.g., on_Snapshot__50_wget.py)'
-            ),
+        # Use SeparateDatabaseAndState to avoid table rebuilds that would re-add CHECK constraints
+        migrations.SeparateDatabaseAndState(
+            state_operations=[
+                migrations.RenameField(
+                    model_name='archiveresult',
+                    old_name='extractor',
+                    new_name='plugin',
+                ),
+                migrations.AddField(
+                    model_name='archiveresult',
+                    name='hook_name',
+                    field=models.CharField(
+                        blank=True,
+                        default='',
+                        max_length=255,
+                        db_index=True,
+                        help_text='Full filename of the hook that executed (e.g., on_Snapshot__50_wget.py)'
+                    ),
+                ),
+            ],
+            database_operations=[
+                migrations.RunSQL(
+                    sql="""
+                        ALTER TABLE core_archiveresult RENAME COLUMN extractor TO plugin;
+                        ALTER TABLE core_archiveresult ADD COLUMN hook_name VARCHAR(255) DEFAULT '' NOT NULL;
+                        CREATE INDEX IF NOT EXISTS core_archiveresult_hook_name_idx ON core_archiveresult (hook_name);
+                    """,
+                    reverse_sql=migrations.RunSQL.noop,
+                ),
+            ],
         ),
     ]

+ 22 - 8
archivebox/core/migrations/0034_snapshot_current_step.py

@@ -11,13 +11,27 @@ class Migration(migrations.Migration):
     ]
 
     operations = [
-        migrations.AddField(
-            model_name='snapshot',
-            name='current_step',
-            field=models.PositiveSmallIntegerField(
-                default=0,
-                db_index=True,
-                help_text='Current hook step being executed (0-9). Used for sequential hook execution.'
-            ),
+        # Use SeparateDatabaseAndState to avoid table rebuild that would fail on config NOT NULL constraint
+        migrations.SeparateDatabaseAndState(
+            state_operations=[
+                migrations.AddField(
+                    model_name='snapshot',
+                    name='current_step',
+                    field=models.PositiveSmallIntegerField(
+                        default=0,
+                        db_index=True,
+                        help_text='Current hook step being executed (0-9). Used for sequential hook execution.'
+                    ),
+                ),
+            ],
+            database_operations=[
+                migrations.RunSQL(
+                    sql="""
+                        ALTER TABLE core_snapshot ADD COLUMN current_step SMALLINT UNSIGNED DEFAULT 0 NOT NULL;
+                        CREATE INDEX IF NOT EXISTS core_snapshot_current_step_idx ON core_snapshot (current_step);
+                    """,
+                    reverse_sql=migrations.RunSQL.noop,
+                ),
+            ],
         ),
     ]

+ 20 - 12
archivebox/core/migrations/0035_snapshot_crawl_non_nullable_remove_created_by.py

@@ -54,7 +54,7 @@ class Migration(migrations.Migration):
 
     dependencies = [
         ('core', '0034_snapshot_current_step'),
-        ('crawls', '0004_alter_crawl_output_dir'),
+        ('crawls', '0005_drop_seed_id_column'),
     ]
 
     operations = [
@@ -64,16 +64,24 @@ class Migration(migrations.Migration):
             reverse_code=migrations.RunPython.noop,
         ),
 
-        # Step 2: Make crawl non-nullable
-        migrations.AlterField(
-            model_name='snapshot',
-            name='crawl',
-            field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='snapshot_set', to='crawls.crawl'),
-        ),
-
-        # Step 3: Remove created_by field
-        migrations.RemoveField(
-            model_name='snapshot',
-            name='created_by',
+        # Step 2 & 3: Update Django's state only - leave created_by_id column in database (unused but harmless)
+        migrations.SeparateDatabaseAndState(
+            state_operations=[
+                # Make crawl non-nullable
+                migrations.AlterField(
+                    model_name='snapshot',
+                    name='crawl',
+                    field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='snapshot_set', to='crawls.crawl'),
+                ),
+                # Remove created_by field from Django's state
+                migrations.RemoveField(
+                    model_name='snapshot',
+                    name='created_by',
+                ),
+            ],
+            database_operations=[
+                # No database changes - crawl_id already exists and NOT NULL constraint will be enforced by model
+                # created_by_id column remains in database but is unused
+            ],
         ),
     ]

+ 12 - 4
archivebox/core/migrations/0036_remove_archiveresult_created_by.py

@@ -10,10 +10,18 @@ class Migration(migrations.Migration):
     ]
 
     operations = [
-        # Remove created_by field from ArchiveResult
+        # Remove created_by field from ArchiveResult (state only)
         # No data migration needed - created_by can be accessed via snapshot.crawl.created_by
-        migrations.RemoveField(
-            model_name='archiveresult',
-            name='created_by',
+        # Leave created_by_id column in database (unused but harmless, avoids table rebuild)
+        migrations.SeparateDatabaseAndState(
+            state_operations=[
+                migrations.RemoveField(
+                    model_name='archiveresult',
+                    name='created_by',
+                ),
+            ],
+            database_operations=[
+                # No database changes - leave created_by_id column in place to avoid table rebuild
+            ],
         ),
     ]

+ 44 - 0
archivebox/core/migrations/0037_remove_archiveresult_output_dir_and_more.py

@@ -0,0 +1,44 @@
+# Generated by Django 6.0 on 2025-12-29 06:45
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('core', '0036_remove_archiveresult_created_by'),
+    ]
+
+    operations = [
+        # Update Django's state only - database columns remain for backwards compat
+        migrations.SeparateDatabaseAndState(
+            state_operations=[
+                migrations.RemoveField(
+                    model_name='archiveresult',
+                    name='output_dir',
+                ),
+                migrations.RemoveField(
+                    model_name='snapshot',
+                    name='output_dir',
+                ),
+                migrations.AlterField(
+                    model_name='archiveresult',
+                    name='config',
+                    field=models.JSONField(blank=True, default=dict, null=True),
+                ),
+                migrations.AlterField(
+                    model_name='snapshot',
+                    name='config',
+                    field=models.JSONField(blank=True, default=dict, null=True),
+                ),
+                migrations.AlterField(
+                    model_name='snapshot',
+                    name='tags',
+                    field=models.ManyToManyField(blank=True, related_name='snapshot_set', through='core.SnapshotTag', through_fields=('snapshot', 'tag'), to='core.tag'),
+                ),
+            ],
+            database_operations=[
+                # No database changes - columns remain in place to avoid table rebuilds
+            ],
+        ),
+    ]

+ 84 - 0
archivebox/core/migrations/0038_fix_missing_columns.py

@@ -0,0 +1,84 @@
+# Add missing columns to ArchiveResult and remove created_by_id from Snapshot
+
+from django.db import migrations, models, connection
+import django.utils.timezone
+
+
+def add_columns_if_not_exist(apps, schema_editor):
+    """Add columns to ArchiveResult only if they don't already exist."""
+    with connection.cursor() as cursor:
+        # Get existing columns
+        cursor.execute("PRAGMA table_info(core_archiveresult)")
+        existing_columns = {row[1] for row in cursor.fetchall()}
+
+        # Add num_uses_failed if it doesn't exist
+        if 'num_uses_failed' not in existing_columns:
+            cursor.execute("ALTER TABLE core_archiveresult ADD COLUMN num_uses_failed integer unsigned NOT NULL DEFAULT 0 CHECK (num_uses_failed >= 0)")
+
+        # Add num_uses_succeeded if it doesn't exist
+        if 'num_uses_succeeded' not in existing_columns:
+            cursor.execute("ALTER TABLE core_archiveresult ADD COLUMN num_uses_succeeded integer unsigned NOT NULL DEFAULT 0 CHECK (num_uses_succeeded >= 0)")
+
+        # Add config if it doesn't exist
+        if 'config' not in existing_columns:
+            cursor.execute("ALTER TABLE core_archiveresult ADD COLUMN config text NULL")
+
+        # Add retry_at if it doesn't exist
+        if 'retry_at' not in existing_columns:
+            cursor.execute("ALTER TABLE core_archiveresult ADD COLUMN retry_at datetime NULL")
+            cursor.execute("CREATE INDEX IF NOT EXISTS core_archiveresult_retry_at_idx ON core_archiveresult(retry_at)")
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('core', '0037_remove_archiveresult_output_dir_and_more'),
+    ]
+
+    operations = [
+        # Add missing columns to ArchiveResult
+        migrations.SeparateDatabaseAndState(
+            state_operations=[
+                migrations.AddField(
+                    model_name='archiveresult',
+                    name='num_uses_failed',
+                    field=models.PositiveIntegerField(default=0),
+                ),
+                migrations.AddField(
+                    model_name='archiveresult',
+                    name='num_uses_succeeded',
+                    field=models.PositiveIntegerField(default=0),
+                ),
+                migrations.AddField(
+                    model_name='archiveresult',
+                    name='config',
+                    field=models.JSONField(blank=True, default=dict, null=True),
+                ),
+                migrations.AddField(
+                    model_name='archiveresult',
+                    name='retry_at',
+                    field=models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, null=True),
+                ),
+            ],
+            database_operations=[
+                migrations.RunPython(add_columns_if_not_exist, reverse_code=migrations.RunPython.noop),
+            ],
+        ),
+
+        # Drop created_by_id from Snapshot (database only, already removed from model in 0035)
+        migrations.SeparateDatabaseAndState(
+            state_operations=[
+                # No state changes - field already removed in 0035
+            ],
+            database_operations=[
+                migrations.RunSQL(
+                    sql="""
+                        -- Drop index first, then column
+                        DROP INDEX IF EXISTS core_snapshot_created_by_id_6dbd6149;
+                        ALTER TABLE core_snapshot DROP COLUMN created_by_id;
+                    """,
+                    reverse_sql=migrations.RunSQL.noop,
+                ),
+            ],
+        ),
+    ]

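Two details worth noting about 0038: the PRAGMA table_info guard makes the column additions idempotent regardless of which earlier migrations a given database has already seen, and the raw ALTER TABLE ... DROP COLUMN at the end is only supported by SQLite 3.35.0 and newer. A hedged sketch of a version-checked RunPython variant of that drop; the helper name is made up and the snippet is illustrative, not part of the commit:

    def drop_created_by_if_supported(apps, schema_editor):
        """Only attempt DROP COLUMN on SQLite builds that support it (>= 3.35)."""
        with schema_editor.connection.cursor() as cursor:
            cursor.execute("SELECT sqlite_version()")
            version = tuple(int(p) for p in cursor.fetchone()[0].split('.')[:2])
            if version >= (3, 35):
                cursor.execute("DROP INDEX IF EXISTS core_snapshot_created_by_id_6dbd6149")
                cursor.execute("ALTER TABLE core_snapshot DROP COLUMN created_by_id")
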
+ 30 - 0
archivebox/core/migrations/0039_fix_num_uses_values.py

@@ -0,0 +1,30 @@
+# Fix num_uses_failed, num_uses_succeeded, and depth values that were stored as strings instead of integers
+
+from django.db import migrations
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('core', '0038_fix_missing_columns'),
+    ]
+
+    operations = [
+        # Fix values that were inserted as literal strings (e.g. the column name) instead of integers
+        migrations.RunSQL(
+            sql="""
+                UPDATE core_snapshot
+                SET num_uses_failed = 0
+                WHERE typeof(num_uses_failed) = 'text' OR num_uses_failed = 'num_uses_failed';
+
+                UPDATE core_snapshot
+                SET num_uses_succeeded = 0
+                WHERE typeof(num_uses_succeeded) = 'text' OR num_uses_succeeded = 'num_uses_succeeded';
+
+                UPDATE core_snapshot
+                SET depth = 0
+                WHERE typeof(depth) = 'text' OR depth = 'depth';
+            """,
+            reverse_sql=migrations.RunSQL.noop,
+        ),
+    ]

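After 0039 runs, reusing the same typeof() predicate as a count is a quick sanity check that no text values remain; this assumes a Django shell opened inside the collection (e.g. via archivebox shell):

    from django.db import connection

    with connection.cursor() as cursor:
        cursor.execute(
            "SELECT COUNT(*) FROM core_snapshot "
            "WHERE typeof(num_uses_failed) = 'text' "
            "   OR typeof(num_uses_succeeded) = 'text' "
            "   OR typeof(depth) = 'text'"
        )
        print('rows still holding text values:', cursor.fetchone()[0])
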
+ 1 - 1
archivebox/core/models.py

@@ -911,7 +911,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
         )
 
         merged = 0
-        for dup in duplicates.iterator():
+        for dup in duplicates.iterator(chunk_size=500):
             snapshots = list(
                 cls.objects
                 .filter(url=dup['url'], timestamp=dup['timestamp'])

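For context on the one-line change above: .iterator() streams rows without populating the queryset cache, and chunk_size (2000 by default) controls how many rows are fetched per batch, so pinning it to 500 keeps memory bounded when a collection has many duplicate url/timestamp pairs. A standalone sketch of the same pattern, assuming a Django shell inside an ArchiveBox data dir:

    from django.db.models import Count
    from archivebox.core.models import Snapshot

    duplicates = (
        Snapshot.objects
        .values('url', 'timestamp')
        .annotate(count=Count('id'))
        .filter(count__gt=1)
    )
    # fetch 500 rows per batch instead of the default 2000
    for dup in duplicates.iterator(chunk_size=500):
        print(dup['url'], dup['count'])
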
+ 0 - 2638
archivebox/core/models.py.bak

@@ -1,2638 +0,0 @@
-__package__ = 'archivebox.core'
-
-from typing import Optional, Dict, Iterable, Any, List, TYPE_CHECKING
-from archivebox.uuid_compat import uuid7
-from datetime import datetime, timedelta
-from django_stubs_ext.db.models import TypedModelMeta
-
-import os
-import json
-from pathlib import Path
-
-from statemachine import State, registry
-
-from django.db import models
-from django.db.models import QuerySet, Value, Case, When, IntegerField
-from django.utils.functional import cached_property
-from django.utils.text import slugify
-from django.utils import timezone
-from django.core.cache import cache
-from django.urls import reverse, reverse_lazy
-from django.contrib import admin
-from django.conf import settings
-
-from archivebox.config import CONSTANTS
-from archivebox.misc.system import get_dir_size, atomic_write
-from archivebox.misc.util import parse_date, base_url, domain as url_domain, to_json, ts_to_date_str, urlencode, htmlencode, urldecode
-from archivebox.misc.hashing import get_dir_info
-from archivebox.hooks import (
-    EXTRACTOR_INDEXING_PRECEDENCE,
-    get_plugins, get_plugin_name, get_plugin_icon,
-    DEFAULT_PLUGIN_ICONS,
-)
-from archivebox.base_models.models import (
-    ModelWithUUID, ModelWithSerializers, ModelWithOutputDir,
-    ModelWithConfig, ModelWithNotes, ModelWithHealthStats,
-    get_or_create_system_user_pk,
-)
-from workers.models import ModelWithStateMachine, BaseStateMachine
-from workers.tasks import bg_archive_snapshot
-from archivebox.crawls.models import Crawl
-from archivebox.machine.models import NetworkInterface, Binary
-
-
-
-class Tag(ModelWithSerializers):
-    # Keep AutoField for compatibility with main branch migrations
-    # Don't use UUIDField here - requires complex FK transformation
-    id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID')
-    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False, related_name='tag_set')
-    created_at = models.DateTimeField(default=timezone.now, db_index=True, null=True)
-    modified_at = models.DateTimeField(auto_now=True)
-    name = models.CharField(unique=True, blank=False, max_length=100)
-    slug = models.SlugField(unique=True, blank=False, max_length=100, editable=False)
-
-    snapshot_set: models.Manager['Snapshot']
-
-    class Meta(TypedModelMeta):
-        verbose_name = "Tag"
-        verbose_name_plural = "Tags"
-
-    def __str__(self):
-        return self.name
-
-    def save(self, *args, **kwargs):
-        is_new = self._state.adding
-        if is_new:
-            self.slug = slugify(self.name)
-            existing = set(Tag.objects.filter(slug__startswith=self.slug).values_list("slug", flat=True))
-            i = None
-            while True:
-                slug = f"{slugify(self.name)}_{i}" if i else slugify(self.name)
-                if slug not in existing:
-                    self.slug = slug
-                    break
-                i = (i or 0) + 1
-        super().save(*args, **kwargs)
-
-        if is_new:
-            from archivebox.misc.logging_util import log_worker_event
-            log_worker_event(
-                worker_type='DB',
-                event='Created Tag',
-                indent_level=0,
-                metadata={
-                    'id': self.id,
-                    'name': self.name,
-                    'slug': self.slug,
-                },
-            )
-
-    @property
-    def api_url(self) -> str:
-        return reverse_lazy('api-1:get_tag', args=[self.id])
-
-    @staticmethod
-    def from_jsonl(record: Dict[str, Any], overrides: Dict[str, Any] = None):
-        """
-        Create/update Tag from JSONL record.
-
-        Args:
-            record: JSONL record with 'name' field
-            overrides: Optional dict with 'snapshot' to auto-attach tag
-
-        Returns:
-            Tag instance or None
-        """
-        from archivebox.misc.jsonl import get_or_create_tag
-
-        try:
-            tag = get_or_create_tag(record)
-
-            # Auto-attach to snapshot if in overrides
-            if overrides and 'snapshot' in overrides and tag:
-                overrides['snapshot'].tags.add(tag)
-
-            return tag
-        except ValueError:
-            return None
-
-
-class SnapshotTag(models.Model):
-    id = models.AutoField(primary_key=True)
-    snapshot = models.ForeignKey('Snapshot', db_column='snapshot_id', on_delete=models.CASCADE, to_field='id')
-    tag = models.ForeignKey(Tag, db_column='tag_id', on_delete=models.CASCADE, to_field='id')
-
-    class Meta:
-        db_table = 'core_snapshot_tags'
-        unique_together = [('snapshot', 'tag')]
-
-
-class SnapshotQuerySet(models.QuerySet):
-    """Custom QuerySet for Snapshot model with export methods that persist through .filter() etc."""
-
-    # =========================================================================
-    # Filtering Methods
-    # =========================================================================
-
-    FILTER_TYPES = {
-        'exact': lambda pattern: models.Q(url=pattern),
-        'substring': lambda pattern: models.Q(url__icontains=pattern),
-        'regex': lambda pattern: models.Q(url__iregex=pattern),
-        'domain': lambda pattern: models.Q(url__istartswith=f"http://{pattern}") | models.Q(url__istartswith=f"https://{pattern}") | models.Q(url__istartswith=f"ftp://{pattern}"),
-        'tag': lambda pattern: models.Q(tags__name=pattern),
-        'timestamp': lambda pattern: models.Q(timestamp=pattern),
-    }
-
-    def filter_by_patterns(self, patterns: List[str], filter_type: str = 'exact') -> 'SnapshotQuerySet':
-        """Filter snapshots by URL patterns using specified filter type"""
-        from archivebox.misc.logging import stderr
-
-        q_filter = models.Q()
-        for pattern in patterns:
-            try:
-                q_filter = q_filter | self.FILTER_TYPES[filter_type](pattern)
-            except KeyError:
-                stderr()
-                stderr(f'[X] Got invalid pattern for --filter-type={filter_type}:', color='red')
-                stderr(f'    {pattern}')
-                raise SystemExit(2)
-        return self.filter(q_filter)
-
-    def search(self, patterns: List[str]) -> 'SnapshotQuerySet':
-        """Search snapshots using the configured search backend"""
-        from archivebox.config.common import SEARCH_BACKEND_CONFIG
-        from archivebox.search import query_search_index
-        from archivebox.misc.logging import stderr
-
-        if not SEARCH_BACKEND_CONFIG.USE_SEARCHING_BACKEND:
-            stderr()
-            stderr('[X] The search backend is not enabled, set config.USE_SEARCHING_BACKEND = True', color='red')
-            raise SystemExit(2)
-
-        qsearch = self.none()
-        for pattern in patterns:
-            try:
-                qsearch |= query_search_index(pattern)
-            except:
-                raise SystemExit(2)
-        return self.all() & qsearch
-
-    # =========================================================================
-    # Export Methods
-    # =========================================================================
-
-    def to_json(self, with_headers: bool = False) -> str:
-        """Generate JSON index from snapshots"""
-        import sys
-        from datetime import datetime, timezone as tz
-        from archivebox.config import VERSION
-        from archivebox.config.common import SERVER_CONFIG
-
-        MAIN_INDEX_HEADER = {
-            'info': 'This is an index of site data archived by ArchiveBox: The self-hosted web archive.',
-            'schema': 'archivebox.index.json',
-            'copyright_info': SERVER_CONFIG.FOOTER_INFO,
-            'meta': {
-                'project': 'ArchiveBox',
-                'version': VERSION,
-                'git_sha': VERSION,
-                'website': 'https://ArchiveBox.io',
-                'docs': 'https://github.com/ArchiveBox/ArchiveBox/wiki',
-                'source': 'https://github.com/ArchiveBox/ArchiveBox',
-                'issues': 'https://github.com/ArchiveBox/ArchiveBox/issues',
-                'dependencies': {},
-            },
-        } if with_headers else {}
-
-        snapshot_dicts = [s.to_dict(extended=True) for s in self.iterator(chunk_size=500)]
-
-        if with_headers:
-            output = {
-                **MAIN_INDEX_HEADER,
-                'num_links': len(snapshot_dicts),
-                'updated': datetime.now(tz.utc),
-                'last_run_cmd': sys.argv,
-                'links': snapshot_dicts,
-            }
-        else:
-            output = snapshot_dicts
-        return to_json(output, indent=4, sort_keys=True)
-
-    def to_csv(self, cols: Optional[List[str]] = None, header: bool = True, separator: str = ',', ljust: int = 0) -> str:
-        """Generate CSV output from snapshots"""
-        cols = cols or ['timestamp', 'is_archived', 'url']
-        header_str = separator.join(col.ljust(ljust) for col in cols) if header else ''
-        row_strs = (s.to_csv(cols=cols, ljust=ljust, separator=separator) for s in self.iterator(chunk_size=500))
-        return '\n'.join((header_str, *row_strs))
-
-    def to_html(self, with_headers: bool = True) -> str:
-        """Generate main index HTML from snapshots"""
-        from datetime import datetime, timezone as tz
-        from django.template.loader import render_to_string
-        from archivebox.config import VERSION
-        from archivebox.config.common import SERVER_CONFIG
-        from archivebox.config.version import get_COMMIT_HASH
-
-        template = 'static_index.html' if with_headers else 'minimal_index.html'
-        snapshot_list = list(self.iterator(chunk_size=500))
-
-        return render_to_string(template, {
-            'version': VERSION,
-            'git_sha': get_COMMIT_HASH() or VERSION,
-            'num_links': str(len(snapshot_list)),
-            'date_updated': datetime.now(tz.utc).strftime('%Y-%m-%d'),
-            'time_updated': datetime.now(tz.utc).strftime('%Y-%m-%d %H:%M'),
-            'links': snapshot_list,
-            'FOOTER_INFO': SERVER_CONFIG.FOOTER_INFO,
-        })
-
-
-class SnapshotManager(models.Manager.from_queryset(SnapshotQuerySet)):
-    """Manager for Snapshot model - uses SnapshotQuerySet for chainable methods"""
-
-    def filter(self, *args, **kwargs):
-        domain = kwargs.pop('domain', None)
-        qs = super().filter(*args, **kwargs)
-        if domain:
-            qs = qs.filter(url__icontains=f'://{domain}')
-        return qs
-
-    def get_queryset(self):
-        return super().get_queryset().prefetch_related('tags', 'archiveresult_set')
-
-    # =========================================================================
-    # Import Methods
-    # =========================================================================
-
-    def remove(self, atomic: bool = False) -> tuple:
-        """Remove snapshots from the database"""
-        from django.db import transaction
-        if atomic:
-            with transaction.atomic():
-                return self.delete()
-        return self.delete()
-
-
-class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, ModelWithStateMachine):
-    id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
-    created_at = models.DateTimeField(default=timezone.now, db_index=True)
-    modified_at = models.DateTimeField(auto_now=True)
-
-    url = models.URLField(unique=False, db_index=True)  # URLs can appear in multiple crawls
-    timestamp = models.CharField(max_length=32, unique=True, db_index=True, editable=False)
-    bookmarked_at = models.DateTimeField(default=timezone.now, db_index=True)
-    crawl: Crawl = models.ForeignKey(Crawl, on_delete=models.CASCADE, null=False, related_name='snapshot_set', db_index=True)  # type: ignore[assignment]
-    parent_snapshot = models.ForeignKey('self', on_delete=models.SET_NULL, null=True, blank=True, related_name='child_snapshots', db_index=True, help_text='Parent snapshot that discovered this URL (for recursive crawling)')
-
-    title = models.CharField(max_length=512, null=True, blank=True, db_index=True)
-    downloaded_at = models.DateTimeField(default=None, null=True, editable=False, db_index=True, blank=True)
-    depth = models.PositiveSmallIntegerField(default=0, db_index=True)  # 0 for root snapshot, 1+ for discovered URLs
-    fs_version = models.CharField(max_length=10, default='0.9.0', help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). Used to trigger lazy migration on save().')
-    current_step = models.PositiveSmallIntegerField(default=0, db_index=True, help_text='Current hook step being executed (0-9). Used for sequential hook execution.')
-
-    retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)
-    status = ModelWithStateMachine.StatusField(choices=ModelWithStateMachine.StatusChoices, default=ModelWithStateMachine.StatusChoices.QUEUED)
-    config = models.JSONField(default=dict, null=False, blank=False, editable=True)
-    notes = models.TextField(blank=True, null=False, default='')
-    output_dir = models.FilePathField(path=CONSTANTS.ARCHIVE_DIR, recursive=True, match='.*', default=None, null=True, blank=True, editable=True)
-
-    tags = models.ManyToManyField(Tag, blank=True, through=SnapshotTag, related_name='snapshot_set', through_fields=('snapshot', 'tag'))
-
-    state_machine_name = 'core.models.SnapshotMachine'
-    state_field_name = 'status'
-    retry_at_field_name = 'retry_at'
-    StatusChoices = ModelWithStateMachine.StatusChoices
-    active_state = StatusChoices.STARTED
-
-    objects = SnapshotManager()
-    archiveresult_set: models.Manager['ArchiveResult']
-
-    class Meta(TypedModelMeta):
-        verbose_name = "Snapshot"
-        verbose_name_plural = "Snapshots"
-        constraints = [
-            # Allow same URL in different crawls, but not duplicates within same crawl
-            models.UniqueConstraint(fields=['url', 'crawl'], name='unique_url_per_crawl'),
-            # Global timestamp uniqueness for 1:1 symlink mapping
-            models.UniqueConstraint(fields=['timestamp'], name='unique_timestamp'),
-        ]
-
-    def __str__(self):
-        return f'[{self.id}] {self.url[:64]}'
-
-    def save(self, *args, **kwargs):
-        is_new = self._state.adding
-        if not self.bookmarked_at:
-            self.bookmarked_at = self.created_at or timezone.now()
-        if not self.timestamp:
-            self.timestamp = str(self.bookmarked_at.timestamp())
-
-        # Migrate filesystem if needed (happens automatically on save)
-        if self.pk and self.fs_migration_needed:
-            from django.db import transaction
-            with transaction.atomic():
-                # Walk through migration chain automatically
-                current = self.fs_version
-                target = self._fs_current_version()
-
-                while current != target:
-                    next_ver = self._fs_next_version(current)
-                    method = f'_fs_migrate_from_{current.replace(".", "_")}_to_{next_ver.replace(".", "_")}'
-
-                    # Only run if method exists (most are no-ops)
-                    if hasattr(self, method):
-                        getattr(self, method)()
-
-                    current = next_ver
-
-                # Update version (still in transaction)
-                self.fs_version = target
-
-        super().save(*args, **kwargs)
-        if self.crawl and self.url not in self.crawl.urls:
-            self.crawl.urls += f'\n{self.url}'
-            self.crawl.save()
-
-        if is_new:
-            from archivebox.misc.logging_util import log_worker_event
-            log_worker_event(
-                worker_type='DB',
-                event='Created Snapshot',
-                indent_level=2,
-                url=self.url,
-                metadata={
-                    'id': str(self.id),
-                    'crawl_id': str(self.crawl_id) if self.crawl_id else None,
-                    'depth': self.depth,
-                    'status': self.status,
-                },
-            )
-
-    # =========================================================================
-    # Filesystem Migration Methods
-    # =========================================================================
-
-    @staticmethod
-    def _fs_current_version() -> str:
-        """Get current ArchiveBox filesystem version (normalized to x.x.0 format)"""
-        from archivebox.config import VERSION
-        # Normalize version to x.x.0 format (e.g., "0.9.0rc1" -> "0.9.0")
-        parts = VERSION.split('.')
-        if len(parts) >= 2:
-            major, minor = parts[0], parts[1]
-            # Strip any non-numeric suffix from minor version
-            minor = ''.join(c for c in minor if c.isdigit())
-            return f'{major}.{minor}.0'
-        return '0.9.0'  # Fallback if version parsing fails
-
-    @property
-    def fs_migration_needed(self) -> bool:
-        """Check if snapshot needs filesystem migration"""
-        return self.fs_version != self._fs_current_version()
-
-    def _fs_next_version(self, version: str) -> str:
-        """Get next version in migration chain"""
-        chain = ['0.7.0', '0.8.0', '0.9.0']
-        try:
-            idx = chain.index(version)
-            return chain[idx + 1] if idx + 1 < len(chain) else self._fs_current_version()
-        except ValueError:
-            # Unknown version - skip to current
-            return self._fs_current_version()
-
-    def _fs_migrate_from_0_7_0_to_0_8_0(self):
-        """Migration from 0.7.0 to 0.8.0 layout (no-op)"""
-        # 0.7 and 0.8 both used archive/<timestamp>
-        # Nothing to do!
-        pass
-
-    def _fs_migrate_from_0_8_0_to_0_9_0(self):
-        """
-        Migrate from flat to nested structure.
-
-        0.8.x: archive/{timestamp}/
-        0.9.x: users/{user}/snapshots/YYYYMMDD/{domain}/{uuid}/
-
-        Transaction handling:
-        1. Copy files INSIDE transaction
-        2. Create symlink INSIDE transaction
-        3. Update fs_version INSIDE transaction (done by save())
-        4. Exit transaction (DB commit)
-        5. Delete old files OUTSIDE transaction (after commit)
-        """
-        import shutil
-        from django.db import transaction
-
-        old_dir = self.get_storage_path_for_version('0.8.0')
-        new_dir = self.get_storage_path_for_version('0.9.0')
-
-        if not old_dir.exists() or old_dir == new_dir or new_dir.exists():
-            return
-
-        new_dir.mkdir(parents=True, exist_ok=True)
-
-        # Copy all files (idempotent)
-        for old_file in old_dir.rglob('*'):
-            if not old_file.is_file():
-                continue
-
-            rel_path = old_file.relative_to(old_dir)
-            new_file = new_dir / rel_path
-
-            # Skip if already copied
-            if new_file.exists() and new_file.stat().st_size == old_file.stat().st_size:
-                continue
-
-            new_file.parent.mkdir(parents=True, exist_ok=True)
-            shutil.copy2(old_file, new_file)
-
-        # Verify all copied
-        old_files = {f.relative_to(old_dir): f.stat().st_size
-                     for f in old_dir.rglob('*') if f.is_file()}
-        new_files = {f.relative_to(new_dir): f.stat().st_size
-                     for f in new_dir.rglob('*') if f.is_file()}
-
-        if old_files.keys() != new_files.keys():
-            missing = old_files.keys() - new_files.keys()
-            raise Exception(f"Migration incomplete: missing {missing}")
-
-        # Create backwards-compat symlink (INSIDE transaction)
-        symlink_path = CONSTANTS.ARCHIVE_DIR / self.timestamp
-        if symlink_path.is_symlink():
-            symlink_path.unlink()
-
-        if not symlink_path.exists() or symlink_path == old_dir:
-            symlink_path.symlink_to(new_dir, target_is_directory=True)
-
-        # Schedule old directory deletion AFTER transaction commits
-        transaction.on_commit(lambda: self._cleanup_old_migration_dir(old_dir))
-
-    def _cleanup_old_migration_dir(self, old_dir: Path):
-        """
-        Delete old directory after successful migration.
-        Called via transaction.on_commit() after DB commit succeeds.
-        """
-        import shutil
-        import logging
-
-        if old_dir.exists() and not old_dir.is_symlink():
-            try:
-                shutil.rmtree(old_dir)
-            except Exception as e:
-                # Log but don't raise - migration succeeded, this is just cleanup
-                logging.getLogger('archivebox.migration').warning(
-                    f"Could not remove old migration directory {old_dir}: {e}"
-                )
-
-    # =========================================================================
-    # Path Calculation and Migration Helpers
-    # =========================================================================
-
-    @staticmethod
-    def extract_domain_from_url(url: str) -> str:
-        """
-        Extract domain from URL for 0.9.x path structure.
-        Uses full hostname with sanitized special chars.
-
-        Examples:
-            https://example.com:8080 → example.com_8080
-            https://sub.example.com → sub.example.com
-            file:///path → localhost
-            data:text/html → data
-        """
-        from urllib.parse import urlparse
-
-        try:
-            parsed = urlparse(url)
-
-            if parsed.scheme in ('http', 'https'):
-                if parsed.port:
-                    return f"{parsed.hostname}_{parsed.port}".replace(':', '_')
-                return parsed.hostname or 'unknown'
-            elif parsed.scheme == 'file':
-                return 'localhost'
-            elif parsed.scheme:
-                return parsed.scheme
-            else:
-                return 'unknown'
-        except Exception:
-            return 'unknown'
-
-    def get_storage_path_for_version(self, version: str) -> Path:
-        """
-        Calculate storage path for specific filesystem version.
-        Centralizes path logic so it's reusable.
-
-        0.7.x/0.8.x: archive/{timestamp}
-        0.9.x: users/{username}/snapshots/YYYYMMDD/{domain}/{uuid}/
-        """
-        from datetime import datetime
-
-        if version in ('0.7.0', '0.8.0'):
-            return CONSTANTS.ARCHIVE_DIR / self.timestamp
-
-        elif version in ('0.9.0', '1.0.0'):
-            username = self.crawl.created_by.username
-
-            # Use created_at for date grouping (fallback to timestamp)
-            if self.created_at:
-                date_str = self.created_at.strftime('%Y%m%d')
-            else:
-                date_str = datetime.fromtimestamp(float(self.timestamp)).strftime('%Y%m%d')
-
-            domain = self.extract_domain_from_url(self.url)
-
-            return (
-                CONSTANTS.DATA_DIR / 'users' / username / 'snapshots' /
-                date_str / domain / str(self.id)
-            )
-        else:
-            # Unknown version - use current
-            return self.get_storage_path_for_version(self._fs_current_version())
-
-    # =========================================================================
-    # Loading and Creation from Filesystem (Used by archivebox update ONLY)
-    # =========================================================================
-
-    @classmethod
-    def load_from_directory(cls, snapshot_dir: Path) -> Optional['Snapshot']:
-        """
-        Load existing Snapshot from DB by reading index.json.
-
-        Reads index.json, extracts url+timestamp, queries DB.
-        Returns existing Snapshot or None if not found/invalid.
-        Does NOT create new snapshots.
-
-        ONLY used by: archivebox update (for orphan detection)
-        """
-        import json
-
-        index_path = snapshot_dir / 'index.json'
-        if not index_path.exists():
-            return None
-
-        try:
-            with open(index_path) as f:
-                data = json.load(f)
-        except:
-            return None
-
-        url = data.get('url')
-        if not url:
-            return None
-
-        # Get timestamp - prefer index.json, fallback to folder name
-        timestamp = cls._select_best_timestamp(
-            index_timestamp=data.get('timestamp'),
-            folder_name=snapshot_dir.name
-        )
-
-        if not timestamp:
-            return None
-
-        # Look up existing
-        try:
-            return cls.objects.get(url=url, timestamp=timestamp)
-        except cls.DoesNotExist:
-            return None
-        except cls.MultipleObjectsReturned:
-            # Should not happen with unique constraint
-            return cls.objects.filter(url=url, timestamp=timestamp).first()
-
-    @classmethod
-    def create_from_directory(cls, snapshot_dir: Path) -> Optional['Snapshot']:
-        """
-        Create new Snapshot from orphaned directory.
-
-        Validates timestamp, ensures uniqueness.
-        Returns new UNSAVED Snapshot or None if invalid.
-
-        ONLY used by: archivebox update (for orphan import)
-        """
-        import json
-
-        index_path = snapshot_dir / 'index.json'
-        if not index_path.exists():
-            return None
-
-        try:
-            with open(index_path) as f:
-                data = json.load(f)
-        except:
-            return None
-
-        url = data.get('url')
-        if not url:
-            return None
-
-        # Get and validate timestamp
-        timestamp = cls._select_best_timestamp(
-            index_timestamp=data.get('timestamp'),
-            folder_name=snapshot_dir.name
-        )
-
-        if not timestamp:
-            return None
-
-        # Ensure uniqueness (reuses existing logic from create_or_update_from_dict)
-        timestamp = cls._ensure_unique_timestamp(url, timestamp)
-
-        # Detect version
-        fs_version = cls._detect_fs_version_from_index(data)
-
-        return cls(
-            url=url,
-            timestamp=timestamp,
-            title=data.get('title', ''),
-            fs_version=fs_version,
-            created_by_id=get_or_create_system_user_pk(),
-        )
-
-    @staticmethod
-    def _select_best_timestamp(index_timestamp: str, folder_name: str) -> Optional[str]:
-        """
-        Select best timestamp from index.json vs folder name.
-
-        Validates range (1995-2035).
-        Prefers index.json if valid.
-        """
-        def is_valid_timestamp(ts):
-            try:
-                ts_int = int(float(ts))
-                # 1995-01-01 to 2035-12-31
-                return 788918400 <= ts_int <= 2082758400
-            except:
-                return False
-
-        index_valid = is_valid_timestamp(index_timestamp) if index_timestamp else False
-        folder_valid = is_valid_timestamp(folder_name)
-
-        if index_valid:
-            return str(int(float(index_timestamp)))
-        elif folder_valid:
-            return str(int(float(folder_name)))
-        else:
-            return None
-
-    @classmethod
-    def _ensure_unique_timestamp(cls, url: str, timestamp: str) -> str:
-        """
-        Ensure timestamp is globally unique.
-        If collision with different URL, increment by 1 until unique.
-
-        NOTE: Logic already exists in create_or_update_from_dict (line 266-267)
-        This is just an extracted, reusable version.
-        """
-        while cls.objects.filter(timestamp=timestamp).exclude(url=url).exists():
-            timestamp = str(int(float(timestamp)) + 1)
-        return timestamp
-
-    @staticmethod
-    def _detect_fs_version_from_index(data: dict) -> str:
-        """
-        Detect fs_version from index.json structure.
-
-        - Has fs_version field: use it
-        - Has history dict: 0.7.0
-        - Has archive_results list: 0.8.0
-        - Default: 0.7.0
-        """
-        if 'fs_version' in data:
-            return data['fs_version']
-        if 'history' in data and 'archive_results' not in data:
-            return '0.7.0'
-        if 'archive_results' in data:
-            return '0.8.0'
-        return '0.7.0'
-
-    # =========================================================================
-    # Index.json Reconciliation
-    # =========================================================================
-
-    def reconcile_with_index_json(self):
-        """
-        Merge index.json with DB. DB is source of truth.
-
-        - Title: longest non-URL
-        - Tags: union
-        - ArchiveResults: keep both (by plugin+start_ts)
-
-        Writes back in 0.9.x format.
-
-        Used by: archivebox update (to sync index.json with DB)
-        """
-        import json
-
-        index_path = Path(self.output_dir) / 'index.json'
-
-        index_data = {}
-        if index_path.exists():
-            try:
-                with open(index_path) as f:
-                    index_data = json.load(f)
-            except:
-                pass
-
-        # Merge title
-        self._merge_title_from_index(index_data)
-
-        # Merge tags
-        self._merge_tags_from_index(index_data)
-
-        # Merge ArchiveResults
-        self._merge_archive_results_from_index(index_data)
-
-        # Write back
-        self.write_index_json()
-
-    def _merge_title_from_index(self, index_data: dict):
-        """Merge title - prefer longest non-URL title."""
-        index_title = index_data.get('title', '').strip()
-        db_title = self.title or ''
-
-        candidates = [t for t in [index_title, db_title] if t and t != self.url]
-        if candidates:
-            best_title = max(candidates, key=len)
-            if self.title != best_title:
-                self.title = best_title
-
-    def _merge_tags_from_index(self, index_data: dict):
-        """Merge tags - union of both sources."""
-        from django.db import transaction
-
-        index_tags = set(index_data.get('tags', '').split(',')) if index_data.get('tags') else set()
-        index_tags = {t.strip() for t in index_tags if t.strip()}
-
-        db_tags = set(self.tags.values_list('name', flat=True))
-
-        new_tags = index_tags - db_tags
-        if new_tags:
-            with transaction.atomic():
-                for tag_name in new_tags:
-                    tag, _ = Tag.objects.get_or_create(name=tag_name)
-                    self.tags.add(tag)
-
-    def _merge_archive_results_from_index(self, index_data: dict):
-        """Merge ArchiveResults - keep both (by plugin+start_ts)."""
-        existing = {
-            (ar.plugin, ar.start_ts): ar
-            for ar in ArchiveResult.objects.filter(snapshot=self)
-        }
-
-        # Handle 0.8.x format (archive_results list)
-        for result_data in index_data.get('archive_results', []):
-            self._create_archive_result_if_missing(result_data, existing)
-
-        # Handle 0.7.x format (history dict)
-        if 'history' in index_data and isinstance(index_data['history'], dict):
-            for plugin, result_list in index_data['history'].items():
-                if isinstance(result_list, list):
-                    for result_data in result_list:
-                        # Support both old 'extractor' and new 'plugin' keys for backwards compat
-                        result_data['plugin'] = result_data.get('plugin') or result_data.get('extractor') or plugin
-                        self._create_archive_result_if_missing(result_data, existing)
-
-    def _create_archive_result_if_missing(self, result_data: dict, existing: dict):
-        """Create ArchiveResult if not already in DB."""
-        from dateutil import parser
-
-        # Support both old 'extractor' and new 'plugin' keys for backwards compat
-        plugin = result_data.get('plugin') or result_data.get('extractor', '')
-        if not plugin:
-            return
-
-        start_ts = None
-        if result_data.get('start_ts'):
-            try:
-                start_ts = parser.parse(result_data['start_ts'])
-            except:
-                pass
-
-        if (plugin, start_ts) in existing:
-            return
-
-        try:
-            end_ts = None
-            if result_data.get('end_ts'):
-                try:
-                    end_ts = parser.parse(result_data['end_ts'])
-                except:
-                    pass
-
-            ArchiveResult.objects.create(
-                snapshot=self,
-                plugin=plugin,
-                hook_name=result_data.get('hook_name', ''),
-                status=result_data.get('status', 'failed'),
-                output_str=result_data.get('output', ''),
-                cmd=result_data.get('cmd', []),
-                pwd=result_data.get('pwd', str(self.output_dir)),
-                start_ts=start_ts,
-                end_ts=end_ts,
-                created_by=self.crawl.created_by,
-            )
-        except:
-            pass
-
-    def write_index_json(self):
-        """Write index.json in 0.9.x format."""
-        import json
-
-        index_path = Path(self.output_dir) / 'index.json'
-
-        data = {
-            'url': self.url,
-            'timestamp': self.timestamp,
-            'title': self.title or '',
-            'tags': ','.join(sorted(self.tags.values_list('name', flat=True))),
-            'fs_version': self.fs_version,
-            'bookmarked_at': self.bookmarked_at.isoformat() if self.bookmarked_at else None,
-            'created_at': self.created_at.isoformat() if self.created_at else None,
-            'archive_results': [
-                {
-                    'plugin': ar.plugin,
-                    'status': ar.status,
-                    'start_ts': ar.start_ts.isoformat() if ar.start_ts else None,
-                    'end_ts': ar.end_ts.isoformat() if ar.end_ts else None,
-                    'output': ar.output_str or '',
-                    'cmd': ar.cmd if isinstance(ar.cmd, list) else [],
-                    'pwd': ar.pwd,
-                }
-                for ar in ArchiveResult.objects.filter(snapshot=self).order_by('start_ts')
-            ],
-        }
-
-        index_path.parent.mkdir(parents=True, exist_ok=True)
-        with open(index_path, 'w') as f:
-            json.dump(data, f, indent=2, sort_keys=True)
-
-    # =========================================================================
-    # Snapshot Utilities
-    # =========================================================================
-
-    @staticmethod
-    def move_directory_to_invalid(snapshot_dir: Path):
-        """
-        Move invalid directory to data/invalid/YYYYMMDD/.
-
-        Used by: archivebox update (when encountering invalid directories)
-        """
-        from datetime import datetime
-        import shutil
-
-        invalid_dir = CONSTANTS.DATA_DIR / 'invalid' / datetime.now().strftime('%Y%m%d')
-        invalid_dir.mkdir(parents=True, exist_ok=True)
-
-        dest = invalid_dir / snapshot_dir.name
-        counter = 1
-        while dest.exists():
-            dest = invalid_dir / f"{snapshot_dir.name}_{counter}"
-            counter += 1
-
-        try:
-            shutil.move(str(snapshot_dir), str(dest))
-        except:
-            pass
-
-    @classmethod
-    def find_and_merge_duplicates(cls) -> int:
-        """
-        Find and merge snapshots with same url:timestamp.
-        Returns count of duplicate sets merged.
-
-        Used by: archivebox update (Phase 3: deduplication)
-        """
-        from django.db.models import Count
-
-        duplicates = (
-            cls.objects
-            .values('url', 'timestamp')
-            .annotate(count=Count('id'))
-            .filter(count__gt=1)
-        )
-
-        merged = 0
-        for dup in duplicates.iterator():
-            snapshots = list(
-                cls.objects
-                .filter(url=dup['url'], timestamp=dup['timestamp'])
-                .order_by('created_at')  # Keep oldest
-            )
-
-            if len(snapshots) > 1:
-                try:
-                    cls._merge_snapshots(snapshots)
-                    merged += 1
-                except:
-                    pass
-
-        return merged
-
-    @classmethod
-    def _merge_snapshots(cls, snapshots: list['Snapshot']):
-        """
-        Merge exact duplicates.
-        Keep oldest, union files + ArchiveResults.
-        """
-        import shutil
-
-        keeper = snapshots[0]
-        duplicates = snapshots[1:]
-
-        keeper_dir = Path(keeper.output_dir)
-
-        for dup in duplicates:
-            dup_dir = Path(dup.output_dir)
-
-            # Merge files
-            if dup_dir.exists() and dup_dir != keeper_dir:
-                for dup_file in dup_dir.rglob('*'):
-                    if not dup_file.is_file():
-                        continue
-
-                    rel = dup_file.relative_to(dup_dir)
-                    keeper_file = keeper_dir / rel
-
-                    if not keeper_file.exists():
-                        keeper_file.parent.mkdir(parents=True, exist_ok=True)
-                        shutil.copy2(dup_file, keeper_file)
-
-                try:
-                    shutil.rmtree(dup_dir)
-                except:
-                    pass
-
-            # Merge tags
-            for tag in dup.tags.all():
-                keeper.tags.add(tag)
-
-            # Move ArchiveResults
-            ArchiveResult.objects.filter(snapshot=dup).update(snapshot=keeper)
-
-            # Delete
-            dup.delete()
-
-    # =========================================================================
-    # Output Directory Properties
-    # =========================================================================
-
-    @property
-    def output_dir_parent(self) -> str:
-        return 'archive'
-
-    @property
-    def output_dir_name(self) -> str:
-        return str(self.timestamp)
-
-    def archive(self, overwrite=False, methods=None):
-        return bg_archive_snapshot(self, overwrite=overwrite, methods=methods)
-
-    @admin.display(description='Tags')
-    def tags_str(self, nocache=True) -> str | None:
-        calc_tags_str = lambda: ','.join(sorted(tag.name for tag in self.tags.all()))
-        if hasattr(self, '_prefetched_objects_cache') and 'tags' in self._prefetched_objects_cache:
-            return calc_tags_str()
-        cache_key = f'{self.pk}-tags'
-        return cache.get_or_set(cache_key, calc_tags_str) if not nocache else calc_tags_str()
-
-    def icons(self) -> str:
-        """Generate HTML icons showing which extractor plugins have succeeded for this snapshot"""
-        from django.utils.html import format_html, mark_safe
-
-        cache_key = f'result_icons:{self.pk}:{(self.downloaded_at or self.modified_at or self.created_at or self.bookmarked_at).timestamp()}'
-
-        def calc_icons():
-            if hasattr(self, '_prefetched_objects_cache') and 'archiveresult_set' in self._prefetched_objects_cache:
-                archive_results = {r.plugin: r for r in self.archiveresult_set.all() if r.status == "succeeded" and (r.output_files or r.output_str)}
-            else:
-                # Filter for results that have either output_files or output_str
-                from django.db.models import Q
-                archive_results = {r.plugin: r for r in self.archiveresult_set.filter(
-                    Q(status="succeeded") & (Q(output_files__isnull=False) | ~Q(output_str=''))
-                )}
-
-            path = self.archive_path
-            canon = self.canonical_outputs()
-            output = ""
-            output_template = '<a href="/{}/{}" class="exists-{}" title="{}">{}</a> &nbsp;'
-
-            # Get all plugins from hooks system (sorted by numeric prefix)
-            all_plugins = [get_plugin_name(e) for e in get_plugins()]
-
-            for plugin in all_plugins:
-                result = archive_results.get(plugin)
-                existing = result and result.status == 'succeeded' and (result.output_files or result.output_str)
-                icon = get_plugin_icon(plugin)
-                output += format_html(
-                    output_template,
-                    path,
-                    canon.get(plugin, plugin + '/'),
-                    str(bool(existing)),
-                    plugin,
-                    icon
-                )
-
-            return format_html('<span class="files-icons" style="font-size: 1.1em; opacity: 0.8; min-width: 240px; display: inline-block">{}</span>', mark_safe(output))
-
-        cache_result = cache.get(cache_key)
-        if cache_result:
-            return cache_result
-
-        fresh_result = calc_icons()
-        cache.set(cache_key, fresh_result, timeout=60 * 60 * 24)
-        return fresh_result
-
-    @property
-    def api_url(self) -> str:
-        return reverse_lazy('api-1:get_snapshot', args=[self.id])
-
-    def get_absolute_url(self):
-        return f'/{self.archive_path}'
-
-    @cached_property
-    def domain(self) -> str:
-        return url_domain(self.url)
-
-    @cached_property
-    def output_dir(self):
-        """The filesystem path to the snapshot's output directory."""
-        import os
-
-        current_path = self.get_storage_path_for_version(self.fs_version)
-
-        if current_path.exists():
-            return str(current_path)
-
-        # Check for backwards-compat symlink
-        old_path = CONSTANTS.ARCHIVE_DIR / self.timestamp
-        if old_path.is_symlink():
-            return str(Path(os.readlink(old_path)).resolve())
-        elif old_path.exists():
-            return str(old_path)
-
-        return str(current_path)
-
-    @cached_property
-    def archive_path(self):
-        return f'{CONSTANTS.ARCHIVE_DIR_NAME}/{self.timestamp}'
-
-    @cached_property
-    def archive_size(self):
-        try:
-            return get_dir_size(self.output_dir)[0]
-        except Exception:
-            return 0
-
-    def save_tags(self, tags: Iterable[str] = ()) -> None:
-        tags_id = [Tag.objects.get_or_create(name=tag)[0].pk for tag in tags if tag.strip()]
-        self.tags.clear()
-        self.tags.add(*tags_id)
-
-    def pending_archiveresults(self) -> QuerySet['ArchiveResult']:
-        return self.archiveresult_set.exclude(status__in=ArchiveResult.FINAL_OR_ACTIVE_STATES)
-
-    def run(self) -> list['ArchiveResult']:
-        """
-        Execute snapshot by creating pending ArchiveResults for all enabled hooks.
-
-        Called by: SnapshotMachine.enter_started()
-
-        Hook Lifecycle:
-            1. discover_hooks('Snapshot') → finds all plugin hooks
-            2. For each hook:
-               - Create ArchiveResult with status=QUEUED
-               - Store hook_name (e.g., 'on_Snapshot__50_wget.py')
-            3. ArchiveResults execute independently via ArchiveResultMachine
-            4. Hook execution happens in ArchiveResult.run(), NOT here
-
-        Returns:
-            list[ArchiveResult]: Newly created pending results
-        """
-        return self.create_pending_archiveresults()
-
-    def cleanup(self):
-        """
-        Clean up background ArchiveResult hooks.
-
-        Called by the state machine when entering the 'sealed' state.
-        Kills any background hooks and finalizes their ArchiveResults.
-        """
-        from archivebox.hooks import kill_process
-
-        # Kill any background ArchiveResult hooks
-        if not self.OUTPUT_DIR.exists():
-            return
-
-        # Find all .pid files in this snapshot's output directory
-        for pid_file in self.OUTPUT_DIR.glob('**/*.pid'):
-            kill_process(pid_file, validate=True)
-
-        # Update all STARTED ArchiveResults from filesystem
-        results = self.archiveresult_set.filter(status=ArchiveResult.StatusChoices.STARTED)
-        for ar in results:
-            ar.update_from_output()
-
-    def has_running_background_hooks(self) -> bool:
-        """
-        Check if any ArchiveResult background hooks are still running.
-
-        Used by state machine to determine if snapshot is finished.
-        """
-        from archivebox.hooks import process_is_alive
-
-        if not self.OUTPUT_DIR.exists():
-            return False
-
-        for plugin_dir in self.OUTPUT_DIR.iterdir():
-            if not plugin_dir.is_dir():
-                continue
-            pid_file = plugin_dir / 'hook.pid'
-            if process_is_alive(pid_file):
-                return True
-
-        return False
-
-    @staticmethod
-    def from_jsonl(record: Dict[str, Any], overrides: Dict[str, Any] = None, queue_for_extraction: bool = True):
-        """
-        Create/update Snapshot from JSONL record or dict.
-
-        Unified method that handles:
-        - ID-based patching: {"id": "...", "title": "new title"}
-        - URL-based create/update: {"url": "...", "title": "...", "tags": "..."}
-        - Auto-creates Crawl if not provided
-        - Optionally queues for extraction
-
-        Args:
-            record: Dict with 'url' (for create) or 'id' (for patch), plus other fields
-            overrides: Dict with 'crawl', 'snapshot' (parent), 'created_by_id'
-            queue_for_extraction: If True, sets status=QUEUED and retry_at (default: True)
-
-        Returns:
-            Snapshot instance or None
-        """
-        import re
-        from django.utils import timezone
-        from archivebox.misc.util import parse_date
-        from archivebox.base_models.models import get_or_create_system_user_pk
-        from archivebox.config.common import GENERAL_CONFIG
-
-        overrides = overrides or {}
-
-        # If 'id' is provided, lookup and patch that specific snapshot
-        snapshot_id = record.get('id')
-        if snapshot_id:
-            try:
-                snapshot = Snapshot.objects.get(id=snapshot_id)
-
-                # Generically update all fields present in record
-                update_fields = []
-                for field_name, value in record.items():
-                    # Skip internal fields
-                    if field_name in ('id', 'type'):
-                        continue
-
-                    # Skip if field doesn't exist on model
-                    if not hasattr(snapshot, field_name):
-                        continue
-
-                    # Special parsing for date fields
-                    if field_name in ('bookmarked_at', 'retry_at', 'created_at', 'modified_at'):
-                        if value and isinstance(value, str):
-                            value = parse_date(value)
-
-                    # Update field if value is provided and different
-                    if value is not None and getattr(snapshot, field_name) != value:
-                        setattr(snapshot, field_name, value)
-                        update_fields.append(field_name)
-
-                if update_fields:
-                    snapshot.save(update_fields=update_fields + ['modified_at'])
-
-                return snapshot
-            except Snapshot.DoesNotExist:
-                # ID not found, fall through to create-by-URL logic
-                pass
-
-        url = record.get('url')
-        if not url:
-            return None
-
-        # Determine or create crawl (every snapshot must have a crawl)
-        crawl = overrides.get('crawl')
-        parent_snapshot = overrides.get('snapshot')  # Parent snapshot
-        created_by_id = overrides.get('created_by_id') or (parent_snapshot.crawl.created_by_id if parent_snapshot else None) or get_or_create_system_user_pk()
-
-        # If no crawl provided, inherit from parent or auto-create one
-        if not crawl:
-            if parent_snapshot:
-                # Inherit crawl from parent snapshot
-                crawl = parent_snapshot.crawl
-            else:
-                # Auto-create a single-URL crawl
-                from archivebox.crawls.models import Crawl
-                from archivebox.config import CONSTANTS
-
-                timestamp_str = timezone.now().strftime("%Y-%m-%d__%H-%M-%S")
-                sources_file = CONSTANTS.SOURCES_DIR / f'{timestamp_str}__auto_crawl.txt'
-                sources_file.parent.mkdir(parents=True, exist_ok=True)
-                sources_file.write_text(url)
-
-                crawl = Crawl.objects.create(
-                    urls=url,
-                    max_depth=0,
-                    label=f'auto-created for {url[:50]}',
-                    created_by_id=created_by_id,
-                )
-
-        # Parse tags
-        tags_str = record.get('tags', '')
-        tag_list = []
-        if tags_str:
-            tag_list = list(dict.fromkeys(
-                tag.strip() for tag in re.split(GENERAL_CONFIG.TAG_SEPARATOR_PATTERN, tags_str)
-                if tag.strip()
-            ))
-
-        # Get most recent snapshot with this URL (URLs can exist in multiple crawls)
-        snapshot = Snapshot.objects.filter(url=url).order_by('-created_at').first()
-
-        title = record.get('title')
-        timestamp = record.get('timestamp')
-
-        if snapshot:
-            # Update existing snapshot
-            if title and (not snapshot.title or len(title) > len(snapshot.title or '')):
-                snapshot.title = title
-                snapshot.save(update_fields=['title', 'modified_at'])
-        else:
-            # Create new snapshot
-            if timestamp:
-                while Snapshot.objects.filter(timestamp=timestamp).exists():
-                    timestamp = str(float(timestamp) + 1.0)
-
-            snapshot = Snapshot.objects.create(
-                url=url,
-                timestamp=timestamp,
-                title=title,
-                crawl=crawl,
-            )
-
-        # Update tags
-        if tag_list:
-            existing_tags = set(snapshot.tags.values_list('name', flat=True))
-            new_tags = set(tag_list) | existing_tags
-            snapshot.save_tags(new_tags)
-
-        # Queue for extraction and update additional fields
-        update_fields = []
-
-        if queue_for_extraction:
-            snapshot.status = Snapshot.StatusChoices.QUEUED
-            snapshot.retry_at = timezone.now()
-            update_fields.extend(['status', 'retry_at'])
-
-        # Update additional fields if provided
-        for field_name in ('depth', 'parent_snapshot_id', 'crawl_id', 'bookmarked_at'):
-            value = record.get(field_name)
-            if value is not None and getattr(snapshot, field_name) != value:
-                setattr(snapshot, field_name, value)
-                update_fields.append(field_name)
-
-        if update_fields:
-            snapshot.save(update_fields=update_fields + ['modified_at'])
-
-        return snapshot
-
-    def create_pending_archiveresults(self) -> list['ArchiveResult']:
-        """
-        Create ArchiveResult records for all enabled hooks.
-
-        Uses the hooks system to discover available hooks from:
-        - archivebox/plugins/*/on_Snapshot__*.{py,sh,js}
-        - data/plugins/*/on_Snapshot__*.{py,sh,js}
-
-        Creates one ArchiveResult per hook (not per plugin), with hook_name set.
-        This enables step-based execution where all hooks in a step can run in parallel.
-        """
-        from archivebox.hooks import discover_hooks
-
-        hooks = discover_hooks('Snapshot')
-        archiveresults = []
-
-        for hook_path in hooks:
-            hook_name = hook_path.name  # e.g., 'on_Snapshot__50_wget.py'
-            plugin = hook_path.parent.name  # e.g., 'wget'
-
-            # Check if AR already exists for this specific hook
-            if ArchiveResult.objects.filter(snapshot=self, hook_name=hook_name).exists():
-                continue
-
-            archiveresult, created = ArchiveResult.objects.get_or_create(
-                snapshot=self,
-                hook_name=hook_name,
-                defaults={
-                    'plugin': plugin,
-                    'status': ArchiveResult.INITIAL_STATE,
-                    'retry_at': timezone.now(),
-                    'created_by_id': self.crawl.created_by_id,
-                },
-            )
-            if archiveresult.status == ArchiveResult.INITIAL_STATE:
-                archiveresults.append(archiveresult)
-
-        return archiveresults
-
-    def advance_step_if_ready(self) -> bool:
-        """
-        Advance current_step if all foreground hooks in current step are finished.
-
-        Called by the state machine to check if step can advance.
-        Background hooks (.bg) don't block step advancement.
-
-        Step advancement rules:
-        - All foreground ARs in current step must be finished (SUCCEEDED/FAILED/SKIPPED)
-        - Background ARs (hook_name contains '.bg.') are ignored for advancement
-        - When ready, increments current_step by 1 (up to 9)
-
-        Returns:
-            True if step was advanced, False if not ready or already at step 9.
-        """
-        from archivebox.hooks import extract_step, is_background_hook
-
-        if self.current_step >= 9:
-            return False  # Already at final step
-
-        # Get all ARs for current step that are foreground
-        current_step_ars = self.archiveresult_set.filter(
-            hook_name__isnull=False
-        ).exclude(hook_name='')
-
-        # Check each AR in current step
-        for ar in current_step_ars:
-            ar_step = extract_step(ar.hook_name)
-            if ar_step != self.current_step:
-                continue  # Not in current step
-
-            if is_background_hook(ar.hook_name):
-                continue  # Background hooks don't block
-
-            # Foreground hook in current step - check if finished
-            if ar.status not in ArchiveResult.FINAL_OR_ACTIVE_STATES:
-                # Still pending/queued - can't advance
-                return False
-
-            if ar.status == ArchiveResult.StatusChoices.STARTED:
-                # Still running - can't advance
-                return False
-
-        # All foreground hooks in current step are finished - advance!
-        self.current_step += 1
-        self.save(update_fields=['current_step', 'modified_at'])
-        return True
-
-    def is_finished_processing(self) -> bool:
-        """
-        Check if this snapshot has finished processing.
-
-        Used by SnapshotMachine.is_finished() to determine if snapshot is complete.
-
-        Returns:
-            True if all archiveresults are finished (or no work to do), False otherwise.
-        """
-        # if no archiveresults exist yet, it's not finished
-        if not self.archiveresult_set.exists():
-            return False
-
-        # Try to advance step if ready (handles step-based hook execution)
-        # This will increment current_step when all foreground hooks in current step are done
-        while self.advance_step_if_ready():
-            pass  # Keep advancing until we can't anymore
-
-        # if archiveresults exist but are still pending, it's not finished
-        if self.pending_archiveresults().exists():
-            return False
-
-        # Don't wait for background hooks - they'll be cleaned up on entering sealed state
-        # Background hooks in STARTED state are excluded by pending_archiveresults()
-        # (STARTED is in FINAL_OR_ACTIVE_STATES) so once all results are FINAL or ACTIVE,
-        # we can transition to sealed and cleanup() will kill the background hooks
-
-        # otherwise archiveresults exist and are all finished, so it's finished
-        return True
-
-    def retry_failed_archiveresults(self, retry_at: Optional['timezone.datetime'] = None) -> int:
-        """
-        Reset failed/skipped ArchiveResults to queued for retry.
-
-        This enables seamless retry of the entire extraction pipeline:
-        - Resets FAILED and SKIPPED results to QUEUED
-        - Sets retry_at so workers pick them up
-        - Plugins run in order (numeric prefix)
-        - Each plugin checks its dependencies at runtime
-
-        Dependency handling (e.g., chrome_session → screenshot):
-        - Plugins check if required outputs exist before running
-        - If dependency output missing → plugin returns 'skipped'
-        - On retry, if dependency now succeeds → dependent can run
-
-        Returns count of ArchiveResults reset.
-        """
-        retry_at = retry_at or timezone.now()
-
-        count = self.archiveresult_set.filter(
-            status__in=[
-                ArchiveResult.StatusChoices.FAILED,
-                ArchiveResult.StatusChoices.SKIPPED,
-            ]
-        ).update(
-            status=ArchiveResult.StatusChoices.QUEUED,
-            retry_at=retry_at,
-            output=None,
-            start_ts=None,
-            end_ts=None,
-        )
-
-        # Also reset the snapshot and current_step so it gets re-checked from the beginning
-        if count > 0:
-            self.status = self.StatusChoices.STARTED
-            self.retry_at = retry_at
-            self.current_step = 0  # Reset to step 0 for retry
-            self.save(update_fields=['status', 'retry_at', 'current_step', 'modified_at'])
-
-        return count
-
-    # =========================================================================
-    # URL Helper Properties (migrated from Link schema)
-    # =========================================================================
-
-    @cached_property
-    def url_hash(self) -> str:
-        from hashlib import sha256
-        return sha256(self.url.encode()).hexdigest()[:8]
-
-    @cached_property
-    def scheme(self) -> str:
-        return self.url.split('://')[0]
-
-    @cached_property
-    def path(self) -> str:
-        parts = self.url.split('://', 1)
-        return '/' + parts[1].split('/', 1)[1] if len(parts) > 1 and '/' in parts[1] else '/'
-
-    @cached_property
-    def basename(self) -> str:
-        return self.path.split('/')[-1]
-
-    @cached_property
-    def extension(self) -> str:
-        basename = self.basename
-        return basename.split('.')[-1] if '.' in basename else ''
-
-    @cached_property
-    def base_url(self) -> str:
-        return f'{self.scheme}://{self.domain}'
-
-    @cached_property
-    def is_static(self) -> bool:
-        static_extensions = {'.pdf', '.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg', '.mp4', '.mp3', '.wav', '.webm'}
-        return any(self.url.lower().endswith(ext) for ext in static_extensions)
-
-    @cached_property
-    def is_archived(self) -> bool:
-        output_paths = (
-            self.domain,
-            'output.html',
-            'output.pdf',
-            'screenshot.png',
-            'singlefile.html',
-            'readability/content.html',
-            'mercury/content.html',
-            'htmltotext.txt',
-            'media',
-            'git',
-        )
-        return any((Path(self.output_dir) / path).exists() for path in output_paths)
-
-    # =========================================================================
-    # Date/Time Properties (migrated from Link schema)
-    # =========================================================================
-
-    @cached_property
-    def bookmarked_date(self) -> Optional[str]:
-        max_ts = (timezone.now() + timedelta(days=30)).timestamp()
-        if self.timestamp and self.timestamp.replace('.', '').isdigit():
-            if 0 < float(self.timestamp) < max_ts:
-                return self._ts_to_date_str(datetime.fromtimestamp(float(self.timestamp)))
-            return str(self.timestamp)
-        return None
-
-    @cached_property
-    def downloaded_datestr(self) -> Optional[str]:
-        return self._ts_to_date_str(self.downloaded_at) if self.downloaded_at else None
-
-    @cached_property
-    def archive_dates(self) -> List[datetime]:
-        return [
-            result.start_ts
-            for result in self.archiveresult_set.all()
-            if result.start_ts
-        ]
-
-    @cached_property
-    def oldest_archive_date(self) -> Optional[datetime]:
-        dates = self.archive_dates
-        return min(dates) if dates else None
-
-    @cached_property
-    def newest_archive_date(self) -> Optional[datetime]:
-        dates = self.archive_dates
-        return max(dates) if dates else None
-
-    @cached_property
-    def num_outputs(self) -> int:
-        return self.archiveresult_set.filter(status='succeeded').count()
-
-    @cached_property
-    def num_failures(self) -> int:
-        return self.archiveresult_set.filter(status='failed').count()
-
-    # =========================================================================
-    # Output Path Methods (migrated from Link schema)
-    # =========================================================================
-
-    def canonical_outputs(self) -> Dict[str, Optional[str]]:
-        """
-        Intelligently discover the best output file for each plugin.
-        Uses actual ArchiveResult data and filesystem scanning with smart heuristics.
-        """
-        FAVICON_PROVIDER = 'https://www.google.com/s2/favicons?domain={}'
-
-        # Mimetypes that can be embedded/previewed in an iframe
-        IFRAME_EMBEDDABLE_EXTENSIONS = {
-            'html', 'htm', 'pdf', 'txt', 'md', 'json', 'jsonl',
-            'png', 'jpg', 'jpeg', 'gif', 'webp', 'svg', 'ico',
-            'mp4', 'webm', 'mp3', 'opus', 'ogg', 'wav',
-        }
-
-        MIN_DISPLAY_SIZE = 15_000  # 15KB - filter out tiny files
-        MAX_SCAN_FILES = 50  # Don't scan massive directories
-
-        def find_best_output_in_dir(dir_path: Path, plugin_name: str) -> Optional[str]:
-            """Find the best representative file in a plugin's output directory"""
-            if not dir_path.exists() or not dir_path.is_dir():
-                return None
-
-            candidates = []
-            file_count = 0
-
-            # Special handling for media plugin - look for thumbnails
-            is_media_dir = plugin_name == 'media'
-
-            # Scan for suitable files
-            for file_path in dir_path.rglob('*'):
-                file_count += 1
-                if file_count > MAX_SCAN_FILES:
-                    break
-
-                if file_path.is_dir() or file_path.name.startswith('.'):
-                    continue
-
-                ext = file_path.suffix.lstrip('.').lower()
-                if ext not in IFRAME_EMBEDDABLE_EXTENSIONS:
-                    continue
-
-                try:
-                    size = file_path.stat().st_size
-                except OSError:
-                    continue
-
-                # For media dir, allow smaller image files (thumbnails are often < 15KB)
-                min_size = 5_000 if (is_media_dir and ext in ('png', 'jpg', 'jpeg', 'webp', 'gif')) else MIN_DISPLAY_SIZE
-                if size < min_size:
-                    continue
-
-                # Prefer main files: index.html, output.*, content.*, etc.
-                priority = 0
-                name_lower = file_path.name.lower()
-
-                if is_media_dir:
-                    # Special prioritization for media directories
-                    if any(keyword in name_lower for keyword in ('thumb', 'thumbnail', 'cover', 'poster')):
-                        priority = 200  # Highest priority for thumbnails
-                    elif ext in ('png', 'jpg', 'jpeg', 'webp', 'gif'):
-                        priority = 150  # High priority for any image
-                    elif ext in ('mp4', 'webm', 'mp3', 'opus', 'ogg'):
-                        priority = 100  # Lower priority for actual media files
-                    else:
-                        priority = 50
-                elif 'index' in name_lower:
-                    priority = 100
-                elif name_lower.startswith(('output', 'content', plugin_name)):
-                    priority = 50
-                elif ext in ('html', 'htm', 'pdf'):
-                    priority = 30
-                elif ext in ('png', 'jpg', 'jpeg', 'webp'):
-                    priority = 20
-                else:
-                    priority = 10
-
-                candidates.append((priority, size, file_path))
-
-            if not candidates:
-                return None
-
-            # Sort by priority (desc), then size (desc)
-            candidates.sort(key=lambda x: (x[0], x[1]), reverse=True)
-            best_file = candidates[0][2]
-            return str(best_file.relative_to(Path(self.output_dir)))
-
-        canonical = {
-            'index_path': 'index.html',
-            'google_favicon_path': FAVICON_PROVIDER.format(self.domain),
-            'archive_org_path': f'https://web.archive.org/web/{self.base_url}',
-        }
-
-        # Scan each ArchiveResult's output directory for the best file
-        snap_dir = Path(self.output_dir)
-        for result in self.archiveresult_set.filter(status='succeeded'):
-            if not result.output_files and not result.output_str:
-                continue
-
-            # Try to find the best output file for this plugin
-            plugin_dir = snap_dir / result.plugin
-            best_output = None
-
-            # Check output_files first (new field)
-            if result.output_files:
-                first_file = next(iter(result.output_files.keys()), None)
-                if first_file and (plugin_dir / first_file).exists():
-                    best_output = f'{result.plugin}/{first_file}'
-
-            # Fallback to output_str if it looks like a path
-            if not best_output and result.output_str and (snap_dir / result.output_str).exists():
-                best_output = result.output_str
-
-            if not best_output and plugin_dir.exists():
-                # Intelligently find the best file in the plugin's directory
-                best_output = find_best_output_in_dir(plugin_dir, result.plugin)
-
-            if best_output:
-                canonical[f'{result.plugin}_path'] = best_output
-
-        # Also scan top-level for legacy outputs (backwards compatibility)
-        for file_path in snap_dir.glob('*'):
-            if file_path.is_dir() or file_path.name in ('index.html', 'index.json'):
-                continue
-
-            ext = file_path.suffix.lstrip('.').lower()
-            if ext not in IFRAME_EMBEDDABLE_EXTENSIONS:
-                continue
-
-            try:
-                size = file_path.stat().st_size
-                if size >= MIN_DISPLAY_SIZE:
-                    # Add as generic output with stem as key
-                    key = f'{file_path.stem}_path'
-                    if key not in canonical:
-                        canonical[key] = file_path.name
-            except OSError:
-                continue
-
-        if self.is_static:
-            static_path = f'warc/{self.timestamp}'
-            canonical.update({
-                'title': self.basename,
-                'wget_path': static_path,
-            })
-
-        return canonical
-
-    def latest_outputs(self, status: Optional[str] = None) -> Dict[str, Any]:
-        """Get the latest output that each plugin produced"""
-        from archivebox.hooks import get_plugins
-        from django.db.models import Q
-
-        latest: Dict[str, Any] = {}
-        for plugin in get_plugins():
-            results = self.archiveresult_set.filter(plugin=plugin)
-            if status is not None:
-                results = results.filter(status=status)
-            # Filter for results with output_files or output_str
-            results = results.filter(Q(output_files__isnull=False) | ~Q(output_str='')).order_by('-start_ts')
-            result = results.first()
-            # Return embed_path() for backwards compatibility
-            latest[plugin] = result.embed_path() if result else None
-        return latest
-
-    # =========================================================================
-    # Serialization Methods
-    # =========================================================================
-
-    def to_dict(self, extended: bool = False) -> Dict[str, Any]:
-        """Convert Snapshot to a dictionary (replacement for Link._asdict())"""
-        from archivebox.misc.util import ts_to_date_str
-
-        result = {
-            'TYPE': 'core.models.Snapshot',
-            'id': str(self.id),
-            'url': self.url,
-            'timestamp': self.timestamp,
-            'title': self.title,
-            'tags': self.tags_str(),
-            'downloaded_at': self.downloaded_at.isoformat() if self.downloaded_at else None,
-            'bookmarked_at': self.bookmarked_at.isoformat() if self.bookmarked_at else None,
-            'created_at': self.created_at.isoformat() if self.created_at else None,
-            # Computed properties
-            'domain': self.domain,
-            'scheme': self.scheme,
-            'base_url': self.base_url,
-            'path': self.path,
-            'basename': self.basename,
-            'extension': self.extension,
-            'is_static': self.is_static,
-            'is_archived': self.is_archived,
-            'archive_path': self.archive_path,
-            'output_dir': self.output_dir,
-            'link_dir': self.output_dir,  # backwards compatibility alias
-            'archive_size': self.archive_size,
-            'bookmarked_date': self.bookmarked_date,
-            'downloaded_datestr': self.downloaded_datestr,
-            'num_outputs': self.num_outputs,
-            'num_failures': self.num_failures,
-        }
-        if extended:
-            result['canonical'] = self.canonical_outputs()
-        return result
-
-    def to_json(self, indent: int = 4) -> str:
-        """Convert to JSON string"""
-        return to_json(self.to_dict(extended=True), indent=indent)
-
-    def to_csv(self, cols: Optional[List[str]] = None, separator: str = ',', ljust: int = 0) -> str:
-        """Convert to CSV string"""
-        data = self.to_dict()
-        cols = cols or ['timestamp', 'is_archived', 'url']
-        return separator.join(to_json(data.get(col, ''), indent=None).ljust(ljust) for col in cols)
-
-    def write_json_details(self, out_dir: Optional[str] = None) -> None:
-        """Write JSON index file for this snapshot to its output directory"""
-        out_dir = out_dir or self.output_dir
-        path = Path(out_dir) / CONSTANTS.JSON_INDEX_FILENAME
-        atomic_write(str(path), self.to_dict(extended=True))
-
-    def write_html_details(self, out_dir: Optional[str] = None) -> None:
-        """Write HTML detail page for this snapshot to its output directory"""
-        from django.template.loader import render_to_string
-        from archivebox.config.common import SERVER_CONFIG
-        from archivebox.config.configset import get_config
-        from archivebox.misc.logging_util import printable_filesize
-
-        out_dir = out_dir or self.output_dir
-        config = get_config()
-        SAVE_ARCHIVE_DOT_ORG = config.get('SAVE_ARCHIVE_DOT_ORG', True)
-        TITLE_LOADING_MSG = 'Not yet archived...'
-
-        canonical = self.canonical_outputs()
-        context = {
-            **self.to_dict(extended=True),
-            **{f'{k}_path': v for k, v in canonical.items()},
-            'canonical': {f'{k}_path': v for k, v in canonical.items()},
-            'title': htmlencode(self.title or (self.base_url if self.is_archived else TITLE_LOADING_MSG)),
-            'url_str': htmlencode(urldecode(self.base_url)),
-            'archive_url': urlencode(f'warc/{self.timestamp}' or (self.domain if self.is_archived else '')) or 'about:blank',
-            'extension': self.extension or 'html',
-            'tags': self.tags_str() or 'untagged',
-            'size': printable_filesize(self.archive_size) if self.archive_size else 'pending',
-            'status': 'archived' if self.is_archived else 'not yet archived',
-            'status_color': 'success' if self.is_archived else 'danger',
-            'oldest_archive_date': ts_to_date_str(self.oldest_archive_date),
-            'SAVE_ARCHIVE_DOT_ORG': SAVE_ARCHIVE_DOT_ORG,
-            'PREVIEW_ORIGINALS': SERVER_CONFIG.PREVIEW_ORIGINALS,
-        }
-        rendered_html = render_to_string('snapshot.html', context)
-        atomic_write(str(Path(out_dir) / CONSTANTS.HTML_INDEX_FILENAME), rendered_html)
-
-    # =========================================================================
-    # Helper Methods
-    # =========================================================================
-
-    @staticmethod
-    def _ts_to_date_str(dt: Optional[datetime]) -> Optional[str]:
-        return dt.strftime('%Y-%m-%d %H:%M:%S') if dt else None
-
-
-# =============================================================================
-# Snapshot State Machine
-# =============================================================================
-
-class SnapshotMachine(BaseStateMachine, strict_states=True):
-    """
-    State machine for managing Snapshot lifecycle.
-
-    Hook Lifecycle:
-    ┌─────────────────────────────────────────────────────────────┐
-    │ QUEUED State                                                │
-    │  • Waiting for snapshot to be ready                         │
-    └─────────────────────────────────────────────────────────────┘
-                            ↓ tick() when can_start()
-    ┌─────────────────────────────────────────────────────────────┐
-    │ STARTED State → enter_started()                             │
-    │  1. snapshot.run()                                          │
-    │     • discover_hooks('Snapshot') → finds all plugin hooks   │
-    │     • create_pending_archiveresults() → creates ONE         │
-    │       ArchiveResult per hook (NO execution yet)             │
-    │  2. ArchiveResults process independently with their own     │
-    │     state machines (see ArchiveResultMachine)               │
-    │  3. Advance through steps 0-9 as foreground hooks complete  │
-    └─────────────────────────────────────────────────────────────┘
-                            ↓ tick() when is_finished()
-    ┌─────────────────────────────────────────────────────────────┐
-    │ SEALED State → enter_sealed()                               │
-    │  • cleanup() → kills any background hooks still running     │
-    │  • Set retry_at=None (no more processing)                   │
-    └─────────────────────────────────────────────────────────────┘
-
-    https://github.com/ArchiveBox/ArchiveBox/wiki/ArchiveBox-Architecture-Diagrams
-    """
-
-    model_attr_name = 'snapshot'
-
-    # States
-    queued = State(value=Snapshot.StatusChoices.QUEUED, initial=True)
-    started = State(value=Snapshot.StatusChoices.STARTED)
-    sealed = State(value=Snapshot.StatusChoices.SEALED, final=True)
-
-    # Tick Event
-    tick = (
-        queued.to.itself(unless='can_start') |
-        queued.to(started, cond='can_start') |
-        started.to.itself(unless='is_finished') |
-        started.to(sealed, cond='is_finished')
-    )
-
-    def can_start(self) -> bool:
-        can_start = bool(self.snapshot.url)
-        # Suppressed: queue waiting logs
-        return can_start
-
-    def is_finished(self) -> bool:
-        """Check if snapshot processing is complete - delegates to model method."""
-        return self.snapshot.is_finished_processing()
-
-    @queued.enter
-    def enter_queued(self):
-        # Suppressed: state transition logs
-        self.snapshot.update_and_requeue(
-            retry_at=timezone.now(),
-            status=Snapshot.StatusChoices.QUEUED,
-        )
-
-    @started.enter
-    def enter_started(self):
-        # Suppressed: state transition logs
-        # lock the snapshot while we create the pending archiveresults
-        self.snapshot.update_and_requeue(
-            retry_at=timezone.now() + timedelta(seconds=30),  # if failed, wait 30s before retrying
-        )
-
-        # Run the snapshot - creates pending archiveresults for all enabled plugins
-        self.snapshot.run()
-
-        # unlock the snapshot after we're done + set status = started
-        self.snapshot.update_and_requeue(
-            retry_at=timezone.now() + timedelta(seconds=5),  # check again in 5s
-            status=Snapshot.StatusChoices.STARTED,
-        )
-
-    @sealed.enter
-    def enter_sealed(self):
-        # Clean up background hooks
-        self.snapshot.cleanup()
-
-        # Suppressed: state transition logs
-        self.snapshot.update_and_requeue(
-            retry_at=None,
-            status=Snapshot.StatusChoices.SEALED,
-        )
-
-
-class ArchiveResultManager(models.Manager):
-    def indexable(self, sorted: bool = True):
-        INDEXABLE_METHODS = [r[0] for r in EXTRACTOR_INDEXING_PRECEDENCE]
-        qs = self.get_queryset().filter(plugin__in=INDEXABLE_METHODS, status='succeeded')
-        if sorted:
-            precedence = [When(plugin=method, then=Value(p)) for method, p in EXTRACTOR_INDEXING_PRECEDENCE]
-            qs = qs.annotate(indexing_precedence=Case(*precedence, default=Value(1000), output_field=IntegerField())).order_by('indexing_precedence')
-        return qs
-
-
-class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, ModelWithStateMachine):
-    class StatusChoices(models.TextChoices):
-        QUEUED = 'queued', 'Queued'
-        STARTED = 'started', 'Started'
-        BACKOFF = 'backoff', 'Waiting to retry'
-        SUCCEEDED = 'succeeded', 'Succeeded'
-        FAILED = 'failed', 'Failed'
-        SKIPPED = 'skipped', 'Skipped'
-
-    @classmethod
-    def get_plugin_choices(cls):
-        """Get plugin choices from discovered hooks (for forms/admin)."""
-        plugins = [get_plugin_name(e) for e in get_plugins()]
-        return tuple((e, e) for e in plugins)
-
-    # Keep AutoField for backward compatibility with 0.7.x databases
-    # UUID field is added separately by migration for new records
-    id = models.AutoField(primary_key=True, editable=False)
-    # Note: unique constraint is added by migration 0027 - don't set unique=True here
-    # or SQLite table recreation in earlier migrations will fail
-    uuid = models.UUIDField(default=uuid7, null=True, blank=True, db_index=True)
-    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False, related_name='archiveresult_set', db_index=True)
-    created_at = models.DateTimeField(default=timezone.now, db_index=True)
-    modified_at = models.DateTimeField(auto_now=True)
-
-    snapshot: Snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE)  # type: ignore
-    # No choices= constraint - plugin names come from plugin system and can be any string
-    plugin = models.CharField(max_length=32, blank=False, null=False, db_index=True)
-    hook_name = models.CharField(max_length=255, blank=True, default='', db_index=True, help_text='Full filename of the hook that executed (e.g., on_Snapshot__50_wget.py)')
-    pwd = models.CharField(max_length=256, default=None, null=True, blank=True)
-    cmd = models.JSONField(default=None, null=True, blank=True)
-    cmd_version = models.CharField(max_length=128, default=None, null=True, blank=True)
-
-    # New output fields (replacing old 'output' field)
-    output_str = models.TextField(blank=True, default='', help_text='Human-readable output summary')
-    output_json = models.JSONField(null=True, blank=True, default=None, help_text='Structured metadata (headers, redirects, etc.)')
-    output_files = models.JSONField(default=dict, help_text='Dict of {relative_path: {metadata}}')
-    output_size = models.BigIntegerField(default=0, help_text='Total bytes of all output files')
-    output_mimetypes = models.CharField(max_length=512, blank=True, default='', help_text='CSV of mimetypes sorted by size')
-
-    # Binary FK (optional - set when hook reports cmd)
-    binary = models.ForeignKey(
-        'machine.Binary',
-        on_delete=models.SET_NULL,
-        null=True, blank=True,
-        related_name='archiveresults',
-        help_text='Primary binary used by this hook'
-    )
-
-    start_ts = models.DateTimeField(default=None, null=True, blank=True)
-    end_ts = models.DateTimeField(default=None, null=True, blank=True)
-
-    status = ModelWithStateMachine.StatusField(choices=StatusChoices.choices, default=StatusChoices.QUEUED)
-    retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)
-    notes = models.TextField(blank=True, null=False, default='')
-    output_dir = models.CharField(max_length=256, default=None, null=True, blank=True)
-    iface = models.ForeignKey(NetworkInterface, on_delete=models.SET_NULL, null=True, blank=True)
-
-    state_machine_name = 'core.models.ArchiveResultMachine'
-    retry_at_field_name = 'retry_at'
-    state_field_name = 'status'
-    active_state = StatusChoices.STARTED
-
-    objects = ArchiveResultManager()
-
-    class Meta(TypedModelMeta):
-        verbose_name = 'Archive Result'
-        verbose_name_plural = 'Archive Results Log'
-
-    def __str__(self):
-        return f'[{self.id}] {self.snapshot.url[:64]} -> {self.plugin}'
-
-    def save(self, *args, **kwargs):
-        is_new = self._state.adding
-        # Skip ModelWithOutputDir.save() to avoid creating index.json in plugin directories
-        # Call the Django Model.save() directly instead
-        models.Model.save(self, *args, **kwargs)
-
-        if is_new:
-            from archivebox.misc.logging_util import log_worker_event
-            log_worker_event(
-                worker_type='DB',
-                event='Created ArchiveResult',
-                indent_level=3,
-                plugin=self.plugin,
-                metadata={
-                    'id': str(self.id),
-                    'snapshot_id': str(self.snapshot_id),
-                    'snapshot_url': str(self.snapshot.url)[:64],
-                    'status': self.status,
-                },
-            )
-
-    @cached_property
-    def snapshot_dir(self):
-        return Path(self.snapshot.output_dir)
-
-    @cached_property
-    def url(self):
-        return self.snapshot.url
-
-    @property
-    def api_url(self) -> str:
-        return reverse_lazy('api-1:get_archiveresult', args=[self.id])
-
-    def get_absolute_url(self):
-        return f'/{self.snapshot.archive_path}/{self.plugin}'
-
-    @property
-    def plugin_module(self) -> Any | None:
-        # Hook scripts are now used instead of Python plugin modules
-        # The plugin name maps to hooks in archivebox/plugins/{plugin}/
-        return None
-
-    def output_exists(self) -> bool:
-        return os.path.exists(Path(self.snapshot_dir) / self.plugin)
-
-    def embed_path(self) -> Optional[str]:
-        """
-        Get the relative path to the embeddable output file for this result.
-
-        Returns the first file from output_files if set, otherwise tries to
-        find a reasonable default based on the plugin type.
-        """
-        # Check output_files dict for primary output
-        if self.output_files:
-            # Return first file from output_files (dict preserves insertion order)
-            first_file = next(iter(self.output_files.keys()), None)
-            if first_file:
-                return f'{self.plugin}/{first_file}'
-
-        # Fallback: check output_str if it looks like a file path
-        if self.output_str and ('/' in self.output_str or '.' in self.output_str):
-            return self.output_str
-
-        # Try to find output file based on plugin's canonical output path
-        canonical = self.snapshot.canonical_outputs()
-        plugin_key = f'{self.plugin}_path'
-        if plugin_key in canonical:
-            return canonical[plugin_key]
-
-        # Fallback to plugin directory
-        return f'{self.plugin}/'
-
-    def create_output_dir(self):
-        output_dir = Path(self.snapshot_dir) / self.plugin
-        output_dir.mkdir(parents=True, exist_ok=True)
-        return output_dir
-
-    @property
-    def output_dir_name(self) -> str:
-        return self.plugin
-
-    @property
-    def output_dir_parent(self) -> str:
-        return str(self.snapshot.OUTPUT_DIR.relative_to(CONSTANTS.DATA_DIR))
-
-    def save_search_index(self):
-        pass
-
-    def cascade_health_update(self, success: bool):
-        """Update health stats for self, parent Snapshot, and grandparent Crawl (if present)."""
-        self.increment_health_stats(success)
-        self.snapshot.increment_health_stats(success)
-        if self.snapshot.crawl_id:
-            self.snapshot.crawl.increment_health_stats(success)
-
-    def run(self):
-        """
-        Execute this ArchiveResult's hook and update status.
-
-        If self.hook_name is set, runs only that specific hook.
-        If self.hook_name is empty, discovers and runs all hooks for self.plugin (backwards compat).
-
-        Updates status/output fields, queues discovered URLs, and triggers indexing.
-        """
-        from django.utils import timezone
-        from archivebox.hooks import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR, run_hook, is_background_hook
-        from archivebox.config.configset import get_config
-
-        # Get merged config with proper context
-        config = get_config(
-            crawl=self.snapshot.crawl if self.snapshot.crawl else None,
-            snapshot=self.snapshot,
-        )
-
-        # Determine which hook(s) to run
-        hooks = []
-
-        if self.hook_name:
-            # SPECIFIC HOOK MODE: Find the specific hook by name
-            for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
-                if not base_dir.exists():
-                    continue
-                plugin_dir = base_dir / self.plugin
-                if plugin_dir.exists():
-                    hook_path = plugin_dir / self.hook_name
-                    if hook_path.exists():
-                        hooks.append(hook_path)
-                        break
-        else:
-            # LEGACY MODE: Discover all hooks for this plugin (backwards compatibility)
-            for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
-                if not base_dir.exists():
-                    continue
-                plugin_dir = base_dir / self.plugin
-                if plugin_dir.exists():
-                    matches = list(plugin_dir.glob('on_Snapshot__*.*'))
-                    if matches:
-                        hooks.extend(sorted(matches))
-
-        if not hooks:
-            self.status = self.StatusChoices.FAILED
-            if self.hook_name:
-                self.output_str = f'Hook not found: {self.plugin}/{self.hook_name}'
-            else:
-                self.output_str = f'No hooks found for plugin: {self.plugin}'
-            self.retry_at = None
-            self.save()
-            return
-
-        # Hook output goes into the plugin's subdirectory of the snapshot's output_dir
-        plugin_dir = Path(self.snapshot.output_dir) / self.plugin
-
-        start_ts = timezone.now()
-        is_bg_hook = False
-
-        for hook in hooks:
-            # Check if this is a background hook
-            is_bg_hook = is_background_hook(hook.name)
-
-            result = run_hook(
-                hook,
-                output_dir=plugin_dir,
-                config=config,
-                url=self.snapshot.url,
-                snapshot_id=str(self.snapshot.id),
-                crawl_id=str(self.snapshot.crawl.id) if self.snapshot.crawl else None,
-                depth=self.snapshot.depth,
-            )
-
-            # Background hooks return None
-            if result is None:
-                is_bg_hook = True
-
-        # Update status based on hook execution
-        if is_bg_hook:
-            # BACKGROUND HOOK - still running, return immediately
-            # Status stays STARTED, will be finalized by Snapshot.cleanup()
-            self.status = self.StatusChoices.STARTED
-            self.start_ts = start_ts
-            self.pwd = str(plugin_dir)
-            self.save()
-            return
-
-        # FOREGROUND HOOK - completed, update from filesystem
-        self.start_ts = start_ts
-        self.pwd = str(plugin_dir)
-        self.update_from_output()
-
-        # Clean up empty output directory if no files were created
-        if plugin_dir.exists() and not self.output_files:
-            try:
-                if not any(plugin_dir.iterdir()):
-                    plugin_dir.rmdir()
-            except (OSError, RuntimeError):
-                pass
-
-    def update_from_output(self):
-        """
-        Update this ArchiveResult from filesystem logs and output files.
-
-        Used for:
-        - Foreground hooks that completed (called from ArchiveResult.run())
-        - Background hooks that completed (called from Snapshot.cleanup())
-
-        Updates:
-        - status, output_str, output_json from ArchiveResult JSONL record
-        - output_files, output_size, output_mimetypes by walking filesystem
-        - end_ts, retry_at, cmd, cmd_version, binary FK
-        - Processes side-effect records (Snapshot, Tag, etc.) via process_hook_records()
-        """
-        import json
-        import mimetypes
-        from collections import defaultdict
-        from pathlib import Path
-        from django.utils import timezone
-        from archivebox.hooks import process_hook_records
-
-        plugin_dir = Path(self.pwd) if self.pwd else None
-        if not plugin_dir or not plugin_dir.exists():
-            self.status = self.StatusChoices.FAILED
-            self.output_str = 'Output directory not found'
-            self.end_ts = timezone.now()
-            self.retry_at = None
-            self.save()
-            return
-
-        # Read and parse JSONL output from stdout.log
-        stdout_file = plugin_dir / 'stdout.log'
-        stdout = stdout_file.read_text() if stdout_file.exists() else ''
-
-        records = []
-        for line in stdout.splitlines():
-            if line.strip() and line.strip().startswith('{'):
-                try:
-                    records.append(json.loads(line))
-                except json.JSONDecodeError:
-                    continue
-
-        # Find ArchiveResult record and update status/output from it
-        ar_records = [r for r in records if r.get('type') == 'ArchiveResult']
-        if ar_records:
-            hook_data = ar_records[0]
-
-            # Update status
-            status_map = {
-                'succeeded': self.StatusChoices.SUCCEEDED,
-                'failed': self.StatusChoices.FAILED,
-                'skipped': self.StatusChoices.SKIPPED,
-            }
-            self.status = status_map.get(hook_data.get('status', 'failed'), self.StatusChoices.FAILED)
-
-            # Update output fields
-            self.output_str = hook_data.get('output_str') or hook_data.get('output') or ''
-            self.output_json = hook_data.get('output_json')
-
-            # Update cmd fields
-            if hook_data.get('cmd'):
-                self.cmd = hook_data['cmd']
-                self._set_binary_from_cmd(hook_data['cmd'])
-            if hook_data.get('cmd_version'):
-                self.cmd_version = hook_data['cmd_version'][:128]
-        else:
-            # No ArchiveResult record = failed
-            self.status = self.StatusChoices.FAILED
-            self.output_str = 'Hook did not output ArchiveResult record'
-
-        # Walk filesystem and populate output_files, output_size, output_mimetypes
-        exclude_names = {'stdout.log', 'stderr.log', 'hook.pid', 'listener.pid'}
-        mime_sizes = defaultdict(int)
-        total_size = 0
-        output_files = {}
-
-        for file_path in plugin_dir.rglob('*'):
-            if not file_path.is_file():
-                continue
-            if file_path.name in exclude_names:
-                continue
-
-            try:
-                stat = file_path.stat()
-                mime_type, _ = mimetypes.guess_type(str(file_path))
-                mime_type = mime_type or 'application/octet-stream'
-
-                relative_path = str(file_path.relative_to(plugin_dir))
-                output_files[relative_path] = {}
-                mime_sizes[mime_type] += stat.st_size
-                total_size += stat.st_size
-            except (OSError, IOError):
-                continue
-
-        self.output_files = output_files
-        self.output_size = total_size
-        sorted_mimes = sorted(mime_sizes.items(), key=lambda x: x[1], reverse=True)
-        self.output_mimetypes = ','.join(mime for mime, _ in sorted_mimes)
-
-        # Update timestamps
-        self.end_ts = timezone.now()
-        self.retry_at = None
-
-        self.save()
-
-        # Process side-effect records (filter Snapshots for depth/URL)
-        filtered_records = []
-        for record in records:
-            record_type = record.get('type')
-
-            # Skip ArchiveResult records (already processed above)
-            if record_type == 'ArchiveResult':
-                continue
-
-            # Filter Snapshot records for depth/URL constraints
-            if record_type == 'Snapshot':
-                if not self.snapshot.crawl:
-                    continue
-
-                url = record.get('url')
-                if not url:
-                    continue
-
-                depth = record.get('depth', self.snapshot.depth + 1)
-                if depth > self.snapshot.crawl.max_depth:
-                    continue
-
-                if not self._url_passes_filters(url):
-                    continue
-
-            filtered_records.append(record)
-
-        # Process filtered records with unified dispatcher
-        overrides = {
-            'snapshot': self.snapshot,
-            'crawl': self.snapshot.crawl,
-            'created_by_id': self.snapshot.crawl.created_by_id,
-        }
-        process_hook_records(filtered_records, overrides=overrides)
-
-        # Cleanup PID files and empty logs
-        pid_file = plugin_dir / 'hook.pid'
-        pid_file.unlink(missing_ok=True)
-        stderr_file = plugin_dir / 'stderr.log'
-        if stdout_file.exists() and stdout_file.stat().st_size == 0:
-            stdout_file.unlink()
-        if stderr_file.exists() and stderr_file.stat().st_size == 0:
-            stderr_file.unlink()
-
-    def _set_binary_from_cmd(self, cmd: list) -> None:
-        """
-        Find Binary for command and set binary FK.
-
-        Tries matching by absolute path first, then by binary name.
-        Only matches binaries on the current machine.
-        """
-        if not cmd:
-            return
-
-        from archivebox.machine.models import Machine
-
-        bin_path_or_name = cmd[0] if isinstance(cmd, list) else cmd
-        machine = Machine.current()
-
-        # Try matching by absolute path first
-        binary = Binary.objects.filter(
-            abspath=bin_path_or_name,
-            machine=machine
-        ).first()
-
-        if binary:
-            self.binary = binary
-            return
-
-        # Fallback: match by binary name
-        bin_name = Path(bin_path_or_name).name
-        binary = Binary.objects.filter(
-            name=bin_name,
-            machine=machine
-        ).first()
-
-        if binary:
-            self.binary = binary
-
-    def _url_passes_filters(self, url: str) -> bool:
-        """Check if URL passes URL_ALLOWLIST and URL_DENYLIST config filters.
-
-        Uses proper config hierarchy: defaults -> file -> env -> machine -> user -> crawl -> snapshot
-        """
-        import re
-        from archivebox.config.configset import get_config
-
-        # Get merged config with proper hierarchy
-        config = get_config(
-            user=self.snapshot.crawl.created_by if self.snapshot.crawl else None,
-            crawl=self.snapshot.crawl if self.snapshot.crawl else None,
-            snapshot=self.snapshot,
-        )
-
-        # Get allowlist/denylist (can be string or list)
-        allowlist_raw = config.get('URL_ALLOWLIST', '')
-        denylist_raw = config.get('URL_DENYLIST', '')
-
-        # Normalize to list of patterns
-        def to_pattern_list(value):
-            if isinstance(value, list):
-                return value
-            if isinstance(value, str):
-                return [p.strip() for p in value.split(',') if p.strip()]
-            return []
-
-        allowlist = to_pattern_list(allowlist_raw)
-        denylist = to_pattern_list(denylist_raw)
-
-        # Denylist takes precedence
-        if denylist:
-            for pattern in denylist:
-                try:
-                    if re.search(pattern, url):
-                        return False
-                except re.error:
-                    continue  # Skip invalid regex patterns
-
-        # If allowlist exists, URL must match at least one pattern
-        if allowlist:
-            for pattern in allowlist:
-                try:
-                    if re.search(pattern, url):
-                        return True
-                except re.error:
-                    continue  # Skip invalid regex patterns
-            return False  # No allowlist patterns matched
-
-        return True  # No filters or passed filters
-
-    @property
-    def output_dir(self) -> Path:
-        """Get the output directory for this plugin's results."""
-        return Path(self.snapshot.output_dir) / self.plugin
-
-    def is_background_hook(self) -> bool:
-        """Check if this ArchiveResult is for a background hook."""
-        plugin_dir = Path(self.pwd) if self.pwd else None
-        if not plugin_dir:
-            return False
-        pid_file = plugin_dir / 'hook.pid'
-        return pid_file.exists()
-
-
-# =============================================================================
-# ArchiveResult State Machine
-# =============================================================================
-
-class ArchiveResultMachine(BaseStateMachine, strict_states=True):
-    """
-    State machine for managing ArchiveResult (single plugin execution) lifecycle.
-
-    Hook Lifecycle:
-    ┌─────────────────────────────────────────────────────────────┐
-    │ QUEUED State                                                │
-    │  • Waiting for its turn to run                              │
-    └─────────────────────────────────────────────────────────────┘
-                            ↓ tick() when can_start()
-    ┌─────────────────────────────────────────────────────────────┐
-    │ STARTED State → enter_started()                             │
-    │  1. archiveresult.run()                                     │
-    │     • Find specific hook by hook_name                       │
-    │     • run_hook(script, output_dir, ...) → subprocess        │
-    │                                                              │
-    │  2a. FOREGROUND hook (returns HookResult):                  │
-    │      • update_from_output() immediately                     │
-    │        - Read stdout.log                                    │
-    │        - Parse JSONL records                                │
-    │        - Extract 'ArchiveResult' record → update status     │
-    │        - Walk output_dir → populate output_files            │
-    │        - Call process_hook_records() for side effects       │
-    │                                                              │
-    │  2b. BACKGROUND hook (returns None):                        │
-    │      • Status stays STARTED                                 │
-    │      • Continues running in background                      │
-    │      • Killed by Snapshot.cleanup() when sealed             │
-    └─────────────────────────────────────────────────────────────┘
-                            ↓ tick() checks status
-    ┌─────────────────────────────────────────────────────────────┐
-    │ SUCCEEDED / FAILED / SKIPPED / BACKOFF                      │
-    │  • Set by hook's JSONL output during update_from_output()   │
-    │  • Health stats incremented (num_uses_succeeded/failed)     │
-    │  • Parent Snapshot health stats also updated                │
-    └─────────────────────────────────────────────────────────────┘
-
-    https://github.com/ArchiveBox/ArchiveBox/wiki/ArchiveBox-Architecture-Diagrams
-    """
-
-    model_attr_name = 'archiveresult'
-
-    # States
-    queued = State(value=ArchiveResult.StatusChoices.QUEUED, initial=True)
-    started = State(value=ArchiveResult.StatusChoices.STARTED)
-    backoff = State(value=ArchiveResult.StatusChoices.BACKOFF)
-    succeeded = State(value=ArchiveResult.StatusChoices.SUCCEEDED, final=True)
-    failed = State(value=ArchiveResult.StatusChoices.FAILED, final=True)
-    skipped = State(value=ArchiveResult.StatusChoices.SKIPPED, final=True)
-
-    # Tick Event - transitions based on conditions
-    tick = (
-        queued.to.itself(unless='can_start') |
-        queued.to(started, cond='can_start') |
-        started.to.itself(unless='is_finished') |
-        started.to(succeeded, cond='is_succeeded') |
-        started.to(failed, cond='is_failed') |
-        started.to(skipped, cond='is_skipped') |
-        started.to(backoff, cond='is_backoff') |
-        backoff.to.itself(unless='can_start') |
-        backoff.to(started, cond='can_start') |
-        backoff.to(succeeded, cond='is_succeeded') |
-        backoff.to(failed, cond='is_failed') |
-        backoff.to(skipped, cond='is_skipped')
-    )
-
-    def can_start(self) -> bool:
-        can_start = bool(self.archiveresult.snapshot.url)
-        # Suppressed: queue waiting logs
-        return can_start
-
-    def is_succeeded(self) -> bool:
-        """Check if extractor plugin succeeded (status was set by run())."""
-        return self.archiveresult.status == ArchiveResult.StatusChoices.SUCCEEDED
-
-    def is_failed(self) -> bool:
-        """Check if extractor plugin failed (status was set by run())."""
-        return self.archiveresult.status == ArchiveResult.StatusChoices.FAILED
-
-    def is_skipped(self) -> bool:
-        """Check if extractor plugin was skipped (status was set by run())."""
-        return self.archiveresult.status == ArchiveResult.StatusChoices.SKIPPED
-
-    def is_backoff(self) -> bool:
-        """Check if we should backoff and retry later."""
-        # Backoff if status is still started (plugin didn't complete) and output_str is empty
-        return (
-            self.archiveresult.status == ArchiveResult.StatusChoices.STARTED and
-            not self.archiveresult.output_str
-        )
-
-    def is_finished(self) -> bool:
-        """Check if extraction has completed (success, failure, or skipped)."""
-        return self.archiveresult.status in (
-            ArchiveResult.StatusChoices.SUCCEEDED,
-            ArchiveResult.StatusChoices.FAILED,
-            ArchiveResult.StatusChoices.SKIPPED,
-        )
-
-    @queued.enter
-    def enter_queued(self):
-        # Suppressed: state transition logs
-        self.archiveresult.update_and_requeue(
-            retry_at=timezone.now(),
-            status=ArchiveResult.StatusChoices.QUEUED,
-            start_ts=None,
-        )  # bump retry_at so workers pick up any new changes
-
-    @started.enter
-    def enter_started(self):
-        from archivebox.machine.models import NetworkInterface
-
-        # Suppressed: state transition logs
-        # Lock the object and mark start time
-        self.archiveresult.update_and_requeue(
-            retry_at=timezone.now() + timedelta(seconds=120),  # 2 min timeout for plugin
-            status=ArchiveResult.StatusChoices.STARTED,
-            start_ts=timezone.now(),
-            iface=NetworkInterface.current(),
-        )
-
-        # Run the plugin - this updates status, output, timestamps, etc.
-        self.archiveresult.run()
-
-        # Save the updated result
-        self.archiveresult.save()
-
-        # Suppressed: plugin result logs (already logged by worker)
-
-    @backoff.enter
-    def enter_backoff(self):
-        # Suppressed: state transition logs
-        self.archiveresult.update_and_requeue(
-            retry_at=timezone.now() + timedelta(seconds=60),
-            status=ArchiveResult.StatusChoices.BACKOFF,
-            end_ts=None,
-            # retries=F('retries') + 1,               # F() equivalent to getattr(self.archiveresult, 'retries', 0) + 1,
-        )
-
-    @succeeded.enter
-    def enter_succeeded(self):
-        # Suppressed: state transition logs
-        self.archiveresult.update_and_requeue(
-            retry_at=None,
-            status=ArchiveResult.StatusChoices.SUCCEEDED,
-            end_ts=timezone.now(),
-            # **self.archiveresult.get_output_dict(),     # {output, output_json, stderr, stdout, returncode, errors, cmd_version, pwd, cmd, machine}
-        )
-        self.archiveresult.save()
-
-        # Update health stats for ArchiveResult, Snapshot, and Crawl cascade
-        self.archiveresult.cascade_health_update(success=True)
-
-    @failed.enter
-    def enter_failed(self):
-        # Suppressed: state transition logs
-        self.archiveresult.update_and_requeue(
-            retry_at=None,
-            status=ArchiveResult.StatusChoices.FAILED,
-            end_ts=timezone.now(),
-        )
-
-        # Update health stats for ArchiveResult, Snapshot, and Crawl cascade
-        self.archiveresult.cascade_health_update(success=False)
-
-    @skipped.enter
-    def enter_skipped(self):
-        # Suppressed: state transition logs
-        self.archiveresult.update_and_requeue(
-            retry_at=None,
-            status=ArchiveResult.StatusChoices.SKIPPED,
-            end_ts=timezone.now(),
-        )
-
-    def after_transition(self, event: str, source: State, target: State):
-        # print(f"after '{event}' from '{source.id}' to '{target.id}'")
-        self.archiveresult.snapshot.update_and_requeue()  # bump snapshot retry time so it picks up all the new changes
-
-
-# =============================================================================
-# State Machine Registration
-# =============================================================================
-
-# Manually register state machines with python-statemachine registry
-# (normally auto-discovered from statemachines.py, but we define them here for clarity)
-registry.register(SnapshotMachine)
-registry.register(ArchiveResultMachine)
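
Note: the removed update_from_output() above reads each hook's stdout.log and parses any line starting with '{' as a JSON record. A minimal sketch of what a hook could emit under that protocol; field names are inferred from the parsing code above, so treat this as illustrative rather than an official spec:

    # hypothetical_hook.py (illustrative only, not part of this commit)
    import json
    import sys

    def emit(record: dict) -> None:
        # one JSON object per line; update_from_output() only parses lines starting with '{'
        sys.stdout.write(json.dumps(record) + '\n')

    # main result record: drives status/output/cmd fields on the ArchiveResult
    emit({
        'type': 'ArchiveResult',
        'status': 'succeeded',            # 'succeeded' | 'failed' | 'skipped'
        'output_str': 'saved page to wget/example.com/index.html',
        'output_json': {'content_type': 'text/html'},
        'cmd': ['/usr/bin/wget', '--mirror', 'https://example.com'],
        'cmd_version': '1.21.4',
    })

    # side-effect record: a discovered URL, filtered by depth and allow/denylists before queueing
    emit({'type': 'Snapshot', 'url': 'https://example.com/about', 'depth': 1})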

+ 15 - 3
archivebox/core/templatetags/core_tags.py

@@ -91,7 +91,11 @@ def plugin_thumbnail(context, result) -> str:
             'output_path': output_path,
             'plugin': plugin,
         })
-        return mark_safe(tpl.render(ctx))
+        rendered = tpl.render(ctx)
+        # Only return non-empty content (strip whitespace to check)
+        if rendered.strip():
+            return mark_safe(rendered)
+        return ''
     except Exception:
         return ''
 
@@ -119,7 +123,11 @@ def plugin_embed(context, result) -> str:
             'output_path': output_path,
             'plugin': plugin,
         })
-        return mark_safe(tpl.render(ctx))
+        rendered = tpl.render(ctx)
+        # Only return non-empty content (strip whitespace to check)
+        if rendered.strip():
+            return mark_safe(rendered)
+        return ''
     except Exception:
         return ''
 
@@ -147,7 +155,11 @@ def plugin_fullscreen(context, result) -> str:
             'output_path': output_path,
             'plugin': plugin,
         })
-        return mark_safe(tpl.render(ctx))
+        rendered = tpl.render(ctx)
+        # Only return non-empty content (strip whitespace to check)
+        if rendered.strip():
+            return mark_safe(rendered)
+        return ''
     except Exception:
         return ''
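
All three template tags above repeat the same render-then-check guard. A hedged sketch of how that guard could be pulled into a single helper (the helper name is hypothetical, not part of this commit):

    from django.utils.safestring import mark_safe

    def render_nonempty(tpl, ctx) -> str:
        """Render a template, returning '' for whitespace-only or failed output."""
        try:
            rendered = tpl.render(ctx)
            return mark_safe(rendered) if rendered.strip() else ''
        except Exception:
            return ''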
 

+ 1 - 1
archivebox/core/views.py

@@ -539,7 +539,7 @@ from django.http import JsonResponse
 def live_progress_view(request):
     """Simple JSON endpoint for live progress status - used by admin progress monitor."""
     try:
-        from workers.orchestrator import Orchestrator
+        from archivebox.workers.orchestrator import Orchestrator
         from archivebox.crawls.models import Crawl
         from archivebox.core.models import Snapshot, ArchiveResult
         from django.db.models import Case, When, Value, IntegerField

+ 5 - 0
archivebox/crawls/apps.py

@@ -4,3 +4,8 @@ from django.apps import AppConfig
 class CrawlsConfig(AppConfig):
     default_auto_field = "django.db.models.BigAutoField"
     name = "archivebox.crawls"
+    label = "crawls"
+
+    def ready(self):
+        """Import models to register state machines with the registry"""
+        from archivebox.crawls.models import CrawlMachine  # noqa: F401

+ 55 - 32
archivebox/crawls/migrations/0002_drop_seed_model.py

@@ -17,39 +17,62 @@ class Migration(migrations.Migration):
     ]
 
     operations = [
-        # Remove the seed foreign key from Crawl
-        migrations.RemoveField(
-            model_name='crawl',
-            name='seed',
+        # Remove the seed foreign key from Crawl (no-op if already removed by core/0024_d)
+        migrations.RunPython(
+            code=lambda apps, schema_editor: None,
+            reverse_code=migrations.RunPython.noop,
         ),
-        # Delete the Seed model entirely
-        migrations.DeleteModel(
-            name='Seed',
+        # Delete the Seed model entirely (already done)
+        migrations.RunPython(
+            code=lambda apps, schema_editor: None,
+            reverse_code=migrations.RunPython.noop,
         ),
-        # Update fields to new schema
-        migrations.AlterField(
-            model_name='crawl',
-            name='created_by',
-            field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
-        ),
-        migrations.AlterField(
-            model_name='crawl',
-            name='id',
-            field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
-        ),
-        migrations.AlterField(
-            model_name='crawl',
-            name='urls',
-            field=models.TextField(help_text='Newline-separated list of URLs to crawl'),
-        ),
-        migrations.AlterField(
-            model_name='crawlschedule',
-            name='created_by',
-            field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
-        ),
-        migrations.AlterField(
-            model_name='crawlschedule',
-            name='id',
-            field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
+        # NULL out stale seed_id values and drop the old seed table, then update Django's migration state
+        migrations.SeparateDatabaseAndState(
+            state_operations=[
+                # Update fields to new schema
+                migrations.AlterField(
+                    model_name='crawl',
+                    name='created_by',
+                    field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
+                ),
+                migrations.AlterField(
+                    model_name='crawl',
+                    name='id',
+                    field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
+                ),
+                migrations.AlterField(
+                    model_name='crawl',
+                    name='urls',
+                    field=models.TextField(help_text='Newline-separated list of URLs to crawl'),
+                ),
+                migrations.AlterField(
+                    model_name='crawlschedule',
+                    name='created_by',
+                    field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
+                ),
+                migrations.AlterField(
+                    model_name='crawlschedule',
+                    name='id',
+                    field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
+                ),
+            ],
+            database_operations=[
+                # Drop seed table and NULL out seed_id FK values
+                migrations.RunSQL(
+                    sql="""
+                        PRAGMA foreign_keys=OFF;
+
+                        -- NULL out seed_id values in crawls_crawl
+                        UPDATE crawls_crawl SET seed_id = NULL;
+
+                        -- Drop seed table if it exists
+                        DROP TABLE IF EXISTS crawls_seed;
+
+                        PRAGMA foreign_keys=ON;
+                    """,
+                    reverse_sql=migrations.RunSQL.noop,
+                ),
+            ],
         ),
     ]
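
The RunSQL above assumes crawls_crawl still has a seed_id column when it runs. A defensive variant, sketched here as an alternative and not part of this commit, would probe the SQLite schema first so the same migration also succeeds on databases where the column is already gone:

    def drop_seed_remnants(apps, schema_editor):
        """NULL out seed_id only if the column is still present, then drop the old table."""
        with schema_editor.connection.cursor() as cursor:
            cursor.execute("PRAGMA table_info(crawls_crawl)")
            columns = {row[1] for row in cursor.fetchall()}  # row[1] is the column name
            if 'seed_id' in columns:
                cursor.execute("UPDATE crawls_crawl SET seed_id = NULL")
            cursor.execute("DROP TABLE IF EXISTS crawls_seed")

    # wired up as: migrations.RunPython(drop_seed_remnants, migrations.RunPython.noop)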

+ 13 - 4
archivebox/crawls/migrations/0003_alter_crawl_output_dir.py

@@ -8,12 +8,21 @@ class Migration(migrations.Migration):
 
     dependencies = [
         ('crawls', '0002_drop_seed_model'),
+        ('core', '0024_d_fix_crawls_config'),  # Depends on config fix
     ]
 
     operations = [
-        migrations.AlterField(
-            model_name='crawl',
-            name='output_dir',
-            field=models.FilePathField(blank=True, default='', path=pathlib.PurePosixPath('/private/tmp/test_archivebox_migrations/archive')),
+        # Update Django's state only to avoid table rebuild that would re-apply old constraints
+        migrations.SeparateDatabaseAndState(
+            state_operations=[
+                migrations.AlterField(
+                    model_name='crawl',
+                    name='output_dir',
+                    field=models.FilePathField(blank=True, default='', path=pathlib.PurePosixPath('/private/tmp/test_archivebox_migrations/archive')),
+                ),
+            ],
+            database_operations=[
+                # No database changes - output_dir type change is cosmetic for Django admin
+            ],
         ),
     ]

+ 12 - 4
archivebox/crawls/migrations/0004_alter_crawl_output_dir.py

@@ -11,9 +11,17 @@ class Migration(migrations.Migration):
     ]
 
     operations = [
-        migrations.AlterField(
-            model_name='crawl',
-            name='output_dir',
-            field=models.FilePathField(blank=True, default='', path=pathlib.PurePosixPath('/private/tmp/archivebox-makemigrations/archive')),
+        # Update Django's state only to avoid table rebuild that would re-apply old constraints
+        migrations.SeparateDatabaseAndState(
+            state_operations=[
+                migrations.AlterField(
+                    model_name='crawl',
+                    name='output_dir',
+                    field=models.FilePathField(blank=True, default='', path=pathlib.PurePosixPath('/private/tmp/archivebox-makemigrations/archive')),
+                ),
+            ],
+            database_operations=[
+                # No database changes - output_dir type change is cosmetic for Django admin
+            ],
         ),
     ]
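
Migrations 0003 and 0004 are now state-only. A quick way to confirm that a SeparateDatabaseAndState migration emits no SQL is to render it with sqlmigrate (a sketch; assumes a configured project and the default database):

    from io import StringIO
    from django.core.management import call_command

    out = StringIO()
    call_command('sqlmigrate', 'crawls', '0003_alter_crawl_output_dir', stdout=out)
    # Apart from the transaction wrapper and per-operation comments, a state-only
    # migration should produce no ALTER/CREATE statements here
    print(out.getvalue())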

+ 28 - 0
archivebox/crawls/migrations/0005_drop_seed_id_column.py

@@ -0,0 +1,28 @@
+# Drop seed_id column from Django's state (leave in database to avoid FK issues)
+
+from django.db import migrations
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('crawls', '0004_alter_crawl_output_dir'),
+    ]
+
+    operations = [
+        # Update Django's state only - leave seed_id column in database (unused but harmless)
+        # This avoids FK mismatch errors with crawls_crawlschedule
+        migrations.SeparateDatabaseAndState(
+            state_operations=[
+                # Remove seed field from Django's migration state
+                migrations.RemoveField(
+                    model_name='crawl',
+                    name='seed',
+                ),
+            ],
+            database_operations=[
+                # No database changes - seed_id column remains to avoid FK rebuild issues
+                # (the crawls_seed table itself was already dropped by the RunSQL in 0002)
+            ],
+        ),
+    ]

+ 35 - 0
archivebox/crawls/migrations/0006_alter_crawl_config_alter_crawl_output_dir_and_more.py

@@ -0,0 +1,35 @@
+# Generated by Django 6.0 on 2025-12-29 06:45
+
+import pathlib
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('crawls', '0005_drop_seed_id_column'),
+    ]
+
+    operations = [
+        # Update Django's state only - database already correct
+        migrations.SeparateDatabaseAndState(
+            state_operations=[
+                migrations.AlterField(
+                    model_name='crawl',
+                    name='config',
+                    field=models.JSONField(blank=True, default=dict, null=True),
+                ),
+                migrations.AlterField(
+                    model_name='crawl',
+                    name='output_dir',
+                    field=models.FilePathField(blank=True, default='', path=pathlib.PurePosixPath('/Users/squash/Local/Code/archiveboxes/archivebox-nue/data/archive')),
+                ),
+                migrations.DeleteModel(
+                    name='Seed',
+                ),
+            ],
+            database_operations=[
+                # No database changes - the crawls_seed table was already dropped by 0002
+            ],
+        ),
+    ]

+ 3 - 4
archivebox/crawls/models.py

@@ -65,7 +65,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
     modified_at = models.DateTimeField(auto_now=True)
 
     urls = models.TextField(blank=False, null=False, help_text='Newline-separated list of URLs to crawl')
-    config = models.JSONField(default=dict)
+    config = models.JSONField(default=dict, null=True, blank=True)
     max_depth = models.PositiveSmallIntegerField(default=0, validators=[MinValueValidator(0), MaxValueValidator(4)])
     tags_str = models.CharField(max_length=1024, blank=True, null=False, default='')
     persona_id = models.UUIDField(null=True, blank=True)
@@ -77,7 +77,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
     status = ModelWithStateMachine.StatusField(choices=ModelWithStateMachine.StatusChoices, default=ModelWithStateMachine.StatusChoices.QUEUED)
     retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)
 
-    state_machine_name = 'crawls.models.CrawlMachine'
+    state_machine_name = 'archivebox.crawls.models.CrawlMachine'
     retry_at_field_name = 'retry_at'
     state_field_name = 'status'
     StatusChoices = ModelWithStateMachine.StatusChoices
@@ -190,7 +190,6 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
                 'status': Snapshot.INITIAL_STATE,
                 'retry_at': timezone.now(),
                 'timestamp': str(timezone.now().timestamp()),
-                'created_by_id': self.created_by_id,
                 'depth': 0,
             },
         )
@@ -290,7 +289,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
                     'timestamp': timestamp or str(timezone.now().timestamp()),
                     'status': Snapshot.INITIAL_STATE,
                     'retry_at': timezone.now(),
-                    'created_by_id': self.created_by_id,
+                    # Note: created_by removed in 0.9.0 - Snapshot inherits created_by from its Crawl
                 }
             )
 

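The state_machine_name change here (and the matching one in machine/models.py below) matters because the value is resolved by dotted path at runtime; presumably via something equivalent to Django's import_string, sketched here as an assumption rather than the project's actual lookup code:

    from django.utils.module_loading import import_string

    # 'crawls.models.CrawlMachine' only resolves if 'crawls' happens to be importable as a
    # top-level module; the fully-qualified path works regardless of install layout.
    CrawlMachine = import_string('archivebox.crawls.models.CrawlMachine')
    machine = CrawlMachine(crawl)  # constructor signature assumed: typically takes the model instance
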
+ 5 - 0
archivebox/machine/apps.py

@@ -7,8 +7,13 @@ class MachineConfig(AppConfig):
     default_auto_field = 'django.db.models.BigAutoField'
 
     name = 'archivebox.machine'
+    label = 'machine'  # Explicit label for migrations
     verbose_name = 'Machine Info'
 
+    def ready(self):
+        """Import models to register state machines with the registry"""
+        from archivebox.machine import models  # noqa: F401
+
 
 def register_admin(admin_site):
     from archivebox.machine.admin import register_admin

+ 6 - 0
archivebox/machine/migrations/0001_squashed.py

@@ -85,6 +85,12 @@ class Migration(migrations.Migration):
                 ('version', models.CharField(blank=True, default=None, max_length=32)),
                 ('sha256', models.CharField(blank=True, default=None, max_length=64)),
                 ('machine', models.ForeignKey(blank=True, default=None, on_delete=django.db.models.deletion.CASCADE, to='machine.machine')),
+                # Fields added in migration 0005 (included here for fresh installs)
+                ('binproviders', models.CharField(blank=True, default='env', max_length=127)),
+                ('output_dir', models.CharField(blank=True, default='', max_length=255)),
+                ('overrides', models.JSONField(blank=True, default=dict)),
+                ('retry_at', models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, null=True)),
+                ('status', models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('succeeded', 'Succeeded'), ('failed', 'Failed')], db_index=True, default='queued', max_length=16)),
                 # dependency FK removed - Dependency model deleted
             ],
             options={

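Because 0001_squashed now creates the 0005 fields directly for fresh installs, the squashed migration has to stay in agreement with the later AddField operations. A quick consistency check, sketched here rather than taken from the test suite:

    from django.db import connection
    from archivebox.machine.models import Binary

    with connection.cursor() as cursor:
        table = Binary._meta.db_table
        db_columns = {col.name for col in connection.introspection.get_table_description(cursor, table)}

    model_columns = {field.column for field in Binary._meta.concrete_fields}
    missing = model_columns - db_columns
    assert not missing, f'columns missing from {table}: {missing}'
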
+ 104 - 0
archivebox/machine/migrations/0005_binary_binproviders_binary_output_dir_and_more.py

@@ -0,0 +1,104 @@
+# Generated by Django 6.0 on 2025-12-29 06:45
+
+import django.db.models.deletion
+import django.utils.timezone
+from archivebox.uuid_compat import uuid7
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('machine', '0004_drop_dependency_table'),
+    ]
+
+    operations = [
+        # Update Django's state only - database already has correct schema
+        migrations.SeparateDatabaseAndState(
+            state_operations=[
+                migrations.AddField(
+                    model_name='binary',
+                    name='binproviders',
+                    field=models.CharField(blank=True, default='env', help_text='Comma-separated list of allowed providers: apt,brew,pip,npm,env', max_length=127),
+                ),
+                migrations.AddField(
+                    model_name='binary',
+                    name='output_dir',
+                    field=models.CharField(blank=True, default='', help_text='Directory where installation hook logs are stored', max_length=255),
+                ),
+                migrations.AddField(
+                    model_name='binary',
+                    name='overrides',
+                    field=models.JSONField(blank=True, default=dict, help_text="Provider-specific overrides: {'apt': {'packages': ['pkg']}, ...}"),
+                ),
+                migrations.AddField(
+                    model_name='binary',
+                    name='retry_at',
+                    field=models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, help_text='When to retry this binary installation', null=True),
+                ),
+                migrations.AddField(
+                    model_name='binary',
+                    name='status',
+                    field=models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('succeeded', 'Succeeded'), ('failed', 'Failed')], db_index=True, default='queued', max_length=16),
+                ),
+                migrations.AlterField(
+                    model_name='binary',
+                    name='abspath',
+                    field=models.CharField(blank=True, default='', max_length=255),
+                ),
+                migrations.AlterField(
+                    model_name='binary',
+                    name='binprovider',
+                    field=models.CharField(blank=True, default='', help_text='Provider that successfully installed this binary', max_length=31),
+                ),
+                migrations.AlterField(
+                    model_name='binary',
+                    name='id',
+                    field=models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True),
+                ),
+                migrations.AlterField(
+                    model_name='binary',
+                    name='machine',
+                    field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='machine.machine'),
+                ),
+                migrations.AlterField(
+                    model_name='binary',
+                    name='name',
+                    field=models.CharField(blank=True, db_index=True, default='', max_length=63),
+                ),
+                migrations.AlterField(
+                    model_name='binary',
+                    name='sha256',
+                    field=models.CharField(blank=True, default='', max_length=64),
+                ),
+                migrations.AlterField(
+                    model_name='binary',
+                    name='version',
+                    field=models.CharField(blank=True, default='', max_length=32),
+                ),
+                migrations.AlterField(
+                    model_name='machine',
+                    name='config',
+                    field=models.JSONField(blank=True, default=dict, help_text='Machine-specific config overrides (e.g., resolved binary paths like WGET_BINARY)', null=True),
+                ),
+                migrations.AlterField(
+                    model_name='machine',
+                    name='id',
+                    field=models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True),
+                ),
+                migrations.AlterField(
+                    model_name='machine',
+                    name='stats',
+                    field=models.JSONField(blank=True, default=dict, null=True),
+                ),
+                migrations.AlterField(
+                    model_name='networkinterface',
+                    name='id',
+                    field=models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True),
+                ),
+            ],
+            database_operations=[
+                # No database changes - schema already correct from previous migrations
+            ],
+        ),
+    ]

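Migration 0005 rewrites a large amount of model state without touching the database, so the recorded state has to end up matching models.py exactly. One quick check (a sketch) is makemigrations in check mode, which fails if Django would still generate new migrations:

    from django.core.management import call_command

    # Raises SystemExit(1) if the models and the migration state have drifted apart
    call_command('makemigrations', '--check', '--dry-run')
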
+ 3 - 3
archivebox/machine/models.py

@@ -44,8 +44,8 @@ class Machine(ModelWithHealthStats):
     os_platform = models.CharField(max_length=63, default=None, null=False)
     os_release = models.CharField(max_length=63, default=None, null=False)
     os_kernel = models.CharField(max_length=255, default=None, null=False)
-    stats = models.JSONField(default=dict, null=False)
-    config = models.JSONField(default=dict, null=False, blank=True,
+    stats = models.JSONField(default=dict, null=True, blank=True)
+    config = models.JSONField(default=dict, null=True, blank=True,
         help_text="Machine-specific config overrides (e.g., resolved binary paths like WGET_BINARY)")
     num_uses_failed = models.PositiveIntegerField(default=0)
     num_uses_succeeded = models.PositiveIntegerField(default=0)
@@ -213,7 +213,7 @@ class Binary(ModelWithHealthStats):
     num_uses_failed = models.PositiveIntegerField(default=0)
     num_uses_succeeded = models.PositiveIntegerField(default=0)
 
-    state_machine_name: str = 'machine.models.BinaryMachine'
+    state_machine_name: str = 'archivebox.machine.models.BinaryMachine'
 
     objects: BinaryManager = BinaryManager()
 

+ 1 - 0
archivebox/personas/apps.py

@@ -4,3 +4,4 @@ from django.apps import AppConfig
 class SessionsConfig(AppConfig):
     default_auto_field = "django.db.models.BigAutoField"
     name = "archivebox.personas"
+    label = "personas"

+ 1 - 1
archivebox/personas/models.py

@@ -21,7 +21,7 @@
 #     #    COOKIES_TXT_FILE: '/path/to/cookies.txt',
 #     #    CHROME_USER_DATA_DIR: '/path/to/chrome/user/data/dir',
 #     #    CHECK_SSL_VALIDITY: False,
-#     #    SAVE_ARCHIVE_DOT_ORG: True,
+#     #    SAVE_ARCHIVEDOTORG: True,
 #     #    CHROME_BINARY: 'chromium'
 #     #    ...
 #     # }

+ 3 - 3
archivebox/plugins/search_backend_ripgrep/tests/test_ripgrep_detection.py

@@ -63,7 +63,7 @@ def test_ripgrep_hook_detects_binary_from_path():
 
 def test_ripgrep_hook_skips_when_backend_not_ripgrep():
     """Test that ripgrep hook exits silently when search backend is not ripgrep."""
-    hook_path = Path(__file__).parent.parent / 'on_Crawl__00_validate_ripgrep.py'
+    hook_path = Path(__file__).parent.parent / 'on_Crawl__00_install_ripgrep.py'
 
     env = os.environ.copy()
     env['SEARCH_BACKEND_ENGINE'] = 'sqlite'  # Different backend
@@ -82,7 +82,7 @@ def test_ripgrep_hook_skips_when_backend_not_ripgrep():
 
 def test_ripgrep_hook_handles_absolute_path():
     """Test that ripgrep hook works when RIPGREP_BINARY is an absolute path."""
-    hook_path = Path(__file__).parent.parent / 'on_Crawl__00_validate_ripgrep.py'
+    hook_path = Path(__file__).parent.parent / 'on_Crawl__00_install_ripgrep.py'
 
     rg_path = shutil.which('rg')
     if not rg_path:
@@ -222,7 +222,7 @@ def test_ripgrep_only_detected_when_backend_enabled():
     if not shutil.which('rg'):
         pytest.skip("ripgrep not installed")
 
-    hook_path = Path(__file__).parent.parent / 'on_Crawl__00_validate_ripgrep.py'
+    hook_path = Path(__file__).parent.parent / 'on_Crawl__00_install_ripgrep.py'
 
     # Test 1: With ripgrep backend - should output Binary record
     env1 = os.environ.copy()

+ 5 - 2
archivebox/templates/core/snapshot.html

@@ -360,9 +360,11 @@
                 <div class="row header-bottom-frames">
                     {% for result_info in archiveresults %}
                         {% if result_info.result %}
+                            {% plugin_thumbnail result_info.result as thumbnail_html %}
+                            {% if thumbnail_html %}
                             <div class="col-lg-2">
                                 <div class="card{% if forloop.first %} selected-card{% endif %}">
-                                    {% plugin_thumbnail result_info.result %}
+                                    {{ thumbnail_html }}
                                     <div class="card-body">
                                         <a href="{{ result_info.path }}" title="Open in new tab..." target="_blank" rel="noopener">
                                             <p class="card-text"><code>{{ result_info.path }}</code></p>
@@ -373,6 +375,7 @@
                                     </div>
                                 </div>
                             </div>
+                            {% endif %}
                         {% endif %}
                     {% endfor %}
 
@@ -395,7 +398,7 @@
                 </div>
             </div>
         </header>
-        <iframe sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" class="full-page-iframe" src="{{singlefile_path}}" name="preview"></iframe>
+        <iframe sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" class="full-page-iframe" src="{{best_preview_path}}" name="preview"></iframe>
     
         <script>
             /*! jQuery v3.2.1 -ajax,-ajax/jsonp,-ajax/load,-ajax/parseXML,-ajax/script,-ajax/var/location,-ajax/var/nonce,-ajax/var/rquery,-ajax/xhr,-manipulation/_evalUrl,-event/ajax,-effects,-effects/Tween,-effects/animatedSelector | (c) JS Foundation and other contributors | jquery.org/license */

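The new {% plugin_thumbnail result_info.result as thumbnail_html %} form relies on the tag being registered as a simple_tag, which supports "as var" assignment out of the box. A sketch of the registration side, with the decorator arguments assumed rather than copied from core_tags.py:

    from django import template

    register = template.Library()

    @register.simple_tag(takes_context=True)
    def plugin_thumbnail(context, result) -> str:
        # Returns '' when the plugin produced no thumbnail, so the template can
        # skip the whole card via {% if thumbnail_html %}
        ...
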
+ 0 - 13
archivebox/tests/test_hooks.py

@@ -429,19 +429,6 @@ class TestInstallHookOutput(unittest.TestCase):
         self.assertEqual(data['name'], 'wget')
         self.assertTrue(data['abspath'].startswith('/'))
 
-    def test_install_hook_outputs_dependency(self):
-        """Install hook should output Dependency JSONL when binary not found."""
-        hook_output = json.dumps({
-            'type': 'Dependency',
-            'bin_name': 'wget',
-            'bin_providers': 'apt,brew,env',
-        })
-
-        data = json.loads(hook_output)
-        self.assertEqual(data['type'], 'Dependency')
-        self.assertEqual(data['bin_name'], 'wget')
-        self.assertIn('apt', data['bin_providers'])
-
     def test_install_hook_outputs_machine_config(self):
         """Install hook should output Machine config update JSONL."""
         hook_output = json.dumps({

+ 1 - 1
archivebox/tests/test_migrations_08_to_09.py

@@ -459,7 +459,7 @@ class TestFilesystemMigration08to09(unittest.TestCase):
                     'SAVE_MERCURY': 'True',
                     'SAVE_PDF': 'True',
                     'SAVE_MEDIA': 'True',
-                    'SAVE_ARCHIVE_DOT_ORG': 'True',
+                    'SAVE_ARCHIVEDOTORG': 'True',
                     'SAVE_HEADERS': 'True',
                     'SAVE_HTMLTOTEXT': 'True',
                     'SAVE_GIT': 'True',

+ 18 - 9
archivebox/tests/test_migrations_helpers.py

@@ -949,19 +949,30 @@ def seed_0_8_data(db_path: Path) -> Dict[str, List[Dict]]:
         ('core', '0072_rename_added_snapshot_bookmarked_at_and_more'),
         ('core', '0073_rename_created_archiveresult_created_at_and_more'),
         ('core', '0074_alter_snapshot_downloaded_at'),
-        ('core', '0023_new_schema'),
+        # For 0.8.x: DO NOT record 0023_new_schema - it replaces 0023-0074 for fresh installs
+        # We already recorded 0023-0074 above, so Django already knows the resulting schema state
+        # For 0.8.x: Record original machine migrations (before squashing)
+        # DO NOT record 0001_squashed here - it replaces 0001-0004 for fresh installs
         ('machine', '0001_initial'),
         ('machine', '0002_alter_machine_stats_installedbinary'),
         ('machine', '0003_alter_installedbinary_options_and_more'),
         ('machine', '0004_alter_installedbinary_abspath_and_more'),
-        ('machine', '0001_squashed'),
+        # Then the new migrations after squashing
         ('machine', '0002_rename_custom_cmds_to_overrides'),
         ('machine', '0003_alter_dependency_id_alter_installedbinary_dependency_and_more'),
         ('machine', '0004_drop_dependency_table'),
+        # Crawls must come before core.0024 because 0024_b depends on it
+        ('crawls', '0001_initial'),
+        # Core 0024 migrations chain (in dependency order)
+        ('core', '0024_b_clear_config_fields'),
+        ('core', '0024_c_disable_fk_checks'),
+        ('core', '0024_d_fix_crawls_config'),
         ('core', '0024_snapshot_crawl'),
+        ('core', '0024_f_add_snapshot_config'),
         ('core', '0025_allow_duplicate_urls_per_crawl'),
+        # For 0.8.x: Record original api migration (before squashing)
+        # DO NOT record 0001_squashed here - it replaces 0001 for fresh installs
         ('api', '0001_initial'),
-        ('api', '0001_squashed'),
         ('api', '0002_alter_apitoken_options'),
         ('api', '0003_rename_user_apitoken_created_by_apitoken_abid_and_more'),
         ('api', '0004_alter_apitoken_id_alter_apitoken_uuid'),
@@ -970,11 +981,9 @@ def seed_0_8_data(db_path: Path) -> Dict[str, List[Dict]]:
         ('api', '0007_alter_apitoken_created_by'),
         ('api', '0008_alter_apitoken_created_alter_apitoken_created_by_and_more'),
         ('api', '0009_rename_created_apitoken_created_at_and_more'),
-        ('crawls', '0001_initial'),
-        ('crawls', '0002_drop_seed_model'),
-        ('crawls', '0003_alter_crawl_output_dir'),
-        ('crawls', '0004_alter_crawl_output_dir'),
-        ('core', '0035_snapshot_crawl_non_nullable_remove_created_by'),
+        # Note: crawls.0001_initial moved earlier (before core.0024) due to dependencies
+        # Stop here - 0.8.x ends at core.0025, crawls.0001, and we want to TEST the later migrations
+        # Do NOT record 0026+ as they need to be tested during migration
     ]
 
     for app, name in migrations:
@@ -1000,7 +1009,7 @@ def run_archivebox(data_dir: Path, args: list, timeout: int = 60, env: dict = No
     base_env['USE_COLOR'] = 'False'
     base_env['SHOW_PROGRESS'] = 'False'
     # Disable ALL extractors for faster tests (can be overridden by env parameter)
-    base_env['SAVE_ARCHIVE_DOT_ORG'] = 'False'
+    base_env['SAVE_ARCHIVEDOTORG'] = 'False'
     base_env['SAVE_TITLE'] = 'False'
     base_env['SAVE_FAVICON'] = 'False'
     base_env['SAVE_WGET'] = 'False'

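The (app, name) pairs above get recorded as already-applied so that only the later migrations are actually exercised by the test. Assuming the seed helper goes through Django rather than raw SQL (an assumption; it may well write to django_migrations directly), MigrationRecorder is the standard API for that:

    from django.db import connections
    from django.db.migrations.recorder import MigrationRecorder

    recorder = MigrationRecorder(connections['default'])
    for app_label, migration_name in migrations:
        # Inserts a row into django_migrations without executing the migration itself
        recorder.record_applied(app_label, migration_name)
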
+ 1 - 0
archivebox/workers/apps.py

@@ -4,4 +4,5 @@ from django.apps import AppConfig
 class WorkersConfig(AppConfig):
     default_auto_field = 'django.db.models.BigAutoField'
     name = 'archivebox.workers'
+    label = 'workers'
 

+ 2 - 2
docker-compose.yml

@@ -2,7 +2,7 @@
 #     mkdir -p ~/archivebox/data && cd ~/archivebox
 #     curl -fsSL 'https://docker-compose.archivebox.io' > docker-compose.yml
 #     docker compose run archivebox version
-#     docker compose run archivebox config --set SAVE_ARCHIVE_DOT_ORG=False
+#     docker compose run archivebox config --set SAVE_ARCHIVEDOTORG=False
 #     docker compose run archivebox add --depth=1 'https://news.ycombinator.com'
 #     docker compose run -T archivebox add < bookmarks.txt
 #     docker compose up -d && open 'https://localhost:8000'
@@ -35,7 +35,7 @@ services:
             # - MEDIA_MAX_SIZE=750m             # increase this filesize limit to allow archiving larger audio/video files
             # - TIMEOUT=60                      # increase this number to 120+ seconds if you see many slow downloads timing out
             # - CHECK_SSL_VALIDITY=True         # set to False to disable strict SSL checking (allows saving URLs w/ broken certs)
-            # - SAVE_ARCHIVE_DOT_ORG=True       # set to False to disable submitting all URLs to Archive.org when archiving
+            # - SAVE_ARCHIVEDOTORG=True         # set to False to disable submitting all URLs to Archive.org when archiving
             # - USER_AGENT="..."                # set a custom USER_AGENT to avoid being blocked as a bot
             # ...
             # For more info, see: https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#configuration

+ 2 - 1
pyproject.toml

@@ -85,9 +85,9 @@ dependencies = [
     ### Binary/Package Management
     "abx-pkg>=0.1.0",        # for: detecting, versioning, and installing binaries via apt/brew/pip/npm
     "gallery-dl>=1.31.1",
-
     ### UUID7 backport for Python <3.14
     "uuid7>=0.1.0; python_version < '3.14'",  # for: uuid7 support on Python 3.13 (provides uuid_extensions module)
+    "pytest-django>=4.11.1",
 ]
 
 [project.optional-dependencies]
@@ -183,6 +183,7 @@ ignore = ["E731", "E303", "E266", "E241", "E222"]
 
 [tool.pytest.ini_options]
 testpaths = [ "tests" ]
+DJANGO_SETTINGS_MODULE = "archivebox.core.settings"
 
 [tool.mypy]
 mypy_path = "archivebox,archivebox/typings"

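With pytest-django installed and DJANGO_SETTINGS_MODULE declared under [tool.pytest.ini_options], tests get a configured Django without any manual setup code. A minimal illustration of what that enables (the test itself is a sketch, not from this commit):

    import pytest
    from django.apps import apps

    @pytest.mark.django_db
    def test_archivebox_apps_are_registered():
        # App names/labels match the AppConfig changes in this commit
        assert apps.is_installed('archivebox.crawls')
        assert apps.is_installed('archivebox.workers')
        assert apps.get_app_config('machine').name == 'archivebox.machine'
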
+ 1 - 1
tests/fixtures.py

@@ -24,7 +24,7 @@ def disable_extractors_dict():
         "SAVE_HEADERS": "false",
         "USE_GIT": "false",
         "SAVE_MEDIA": "false",
-        "SAVE_ARCHIVE_DOT_ORG": "false",
+        "SAVE_ARCHIVEDOTORG": "false",
         "SAVE_TITLE": "false",
         "SAVE_FAVICON": "false",
     })

+ 2 - 2
tests/test_recursive_crawl.py

@@ -33,7 +33,7 @@ def test_background_hooks_dont_block_parser_extractors(tmp_path, process):
         "SAVE_HEADERS": "false",
         "USE_GIT": "false",
         "SAVE_MEDIA": "false",
-        "SAVE_ARCHIVE_DOT_ORG": "false",
+        "SAVE_ARCHIVEDOTORG": "false",
         "SAVE_TITLE": "false",
         "SAVE_FAVICON": "false",
         # Enable chrome session (required for background hooks to start)
@@ -133,7 +133,7 @@ def test_parser_extractors_emit_snapshot_jsonl(tmp_path, process):
         "SAVE_HEADERS": "false",
         "USE_GIT": "false",
         "SAVE_MEDIA": "false",
-        "SAVE_ARCHIVE_DOT_ORG": "false",
+        "SAVE_ARCHIVEDOTORG": "false",
         "SAVE_TITLE": "false",
         "SAVE_FAVICON": "false",
         "USE_CHROME": "false",

+ 14 - 0
uv.lock

@@ -88,6 +88,7 @@ dependencies = [
     { name = "py-machineid", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
     { name = "pydantic", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
     { name = "pydantic-settings", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
+    { name = "pytest-django", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
     { name = "python-benedict", extra = ["io", "parse"], marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
     { name = "python-crontab", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
     { name = "python-statemachine", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
@@ -186,6 +187,7 @@ requires-dist = [
     { name = "py-machineid", specifier = ">=0.6.0" },
     { name = "pydantic", specifier = ">=2.8.0" },
     { name = "pydantic-settings", specifier = ">=2.5.2" },
+    { name = "pytest-django", specifier = ">=4.11.1" },
     { name = "python-benedict", extras = ["io", "parse"], specifier = ">=0.33.2" },
     { name = "python-crontab", specifier = ">=3.2.0" },
     { name = "python-ldap", marker = "extra == 'ldap'", specifier = ">=3.4.3" },
@@ -1848,6 +1850,18 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/3b/ab/b3226f0bd7cdcf710fbede2b3548584366da3b19b5021e74f5bde2a8fa3f/pytest-9.0.2-py3-none-any.whl", hash = "sha256:711ffd45bf766d5264d487b917733b453d917afd2b0ad65223959f59089f875b", size = 374801, upload-time = "2025-12-06T21:30:49.154Z" },
 ]
 
+[[package]]
+name = "pytest-django"
+version = "4.11.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "pytest", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/b1/fb/55d580352db26eb3d59ad50c64321ddfe228d3d8ac107db05387a2fadf3a/pytest_django-4.11.1.tar.gz", hash = "sha256:a949141a1ee103cb0e7a20f1451d355f83f5e4a5d07bdd4dcfdd1fd0ff227991", size = 86202, upload-time = "2025-04-03T18:56:09.338Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/be/ac/bd0608d229ec808e51a21044f3f2f27b9a37e7a0ebaca7247882e67876af/pytest_django-4.11.1-py3-none-any.whl", hash = "sha256:1b63773f648aa3d8541000c26929c1ea63934be1cfa674c76436966d73fe6a10", size = 25281, upload-time = "2025-04-03T18:56:07.678Z" },
+]
+
 [[package]]
 name = "python-benedict"
 version = "0.35.0"