use full dotted paths for all archivebox imports, add migrations and more fixes

Nick Sweeting, 1 month ago · commit f4e7820533
61 changed files with 1082 additions and 2985 deletions
  1. + 2 - 2  README.md
  2. + 4 - 4  archivebox/__init__.py
  3. + 1 - 0  archivebox/api/apps.py
  4. + 1 - 1  archivebox/api/v1_workers.py
  5. + 2 - 2  archivebox/base_models/models.py
  6. + 1 - 1  archivebox/cli/archivebox_add.py
  7. + 1 - 1  archivebox/cli/archivebox_crawl.py
  8. + 1 - 1  archivebox/cli/archivebox_extract.py
  9. + 3 - 5  archivebox/cli/archivebox_init.py
  10. + 1 - 1  archivebox/cli/archivebox_install.py
  11. + 1 - 1  archivebox/cli/archivebox_orchestrator.py
  12. + 1 - 1  archivebox/cli/archivebox_server.py
  13. + 1 - 1  archivebox/cli/archivebox_snapshot.py
  14. + 2 - 2  archivebox/cli/archivebox_update.py
  15. + 1 - 1  archivebox/cli/tests.py
  16. + 1 - 1  archivebox/cli/tests_piping.py
  17. + 23 - 0  archivebox/config/configset.py
  18. + 4 - 0  archivebox/core/apps.py
  19. + 57 - 0  archivebox/core/migrations/0024_b_clear_config_fields.py
  20. + 28 - 0  archivebox/core/migrations/0024_c_disable_fk_checks.py
  21. + 93 - 0  archivebox/core/migrations/0024_d_fix_crawls_config.py
  22. + 1 - 3  archivebox/core/migrations/0024_snapshot_crawl.py
  23. + 104 - 73  archivebox/core/migrations/0026_remove_archiveresult_output_dir_and_more.py
  24. + 74 - 63  archivebox/core/migrations/0029_archiveresult_hook_fields.py
  25. + 39 - 20  archivebox/core/migrations/0030_migrate_output_field.py
  26. + 56 - 37  archivebox/core/migrations/0032_alter_archiveresult_binary_and_more.py
  27. + 30 - 15  archivebox/core/migrations/0033_rename_extractor_add_hook_name.py
  28. + 22 - 8  archivebox/core/migrations/0034_snapshot_current_step.py
  29. + 20 - 12  archivebox/core/migrations/0035_snapshot_crawl_non_nullable_remove_created_by.py
  30. + 12 - 4  archivebox/core/migrations/0036_remove_archiveresult_created_by.py
  31. + 44 - 0  archivebox/core/migrations/0037_remove_archiveresult_output_dir_and_more.py
  32. + 84 - 0  archivebox/core/migrations/0038_fix_missing_columns.py
  33. + 30 - 0  archivebox/core/migrations/0039_fix_num_uses_values.py
  34. + 1 - 1  archivebox/core/models.py
  35. + 0 - 2638  archivebox/core/models.py.bak
  36. + 15 - 3  archivebox/core/templatetags/core_tags.py
  37. + 1 - 1  archivebox/core/views.py
  38. + 5 - 0  archivebox/crawls/apps.py
  39. + 55 - 32  archivebox/crawls/migrations/0002_drop_seed_model.py
  40. + 13 - 4  archivebox/crawls/migrations/0003_alter_crawl_output_dir.py
  41. + 12 - 4  archivebox/crawls/migrations/0004_alter_crawl_output_dir.py
  42. + 28 - 0  archivebox/crawls/migrations/0005_drop_seed_id_column.py
  43. + 35 - 0  archivebox/crawls/migrations/0006_alter_crawl_config_alter_crawl_output_dir_and_more.py
  44. + 3 - 4  archivebox/crawls/models.py
  45. + 5 - 0  archivebox/machine/apps.py
  46. + 6 - 0  archivebox/machine/migrations/0001_squashed.py
  47. + 104 - 0  archivebox/machine/migrations/0005_binary_binproviders_binary_output_dir_and_more.py
  48. + 3 - 3  archivebox/machine/models.py
  49. + 1 - 0  archivebox/personas/apps.py
  50. + 1 - 1  archivebox/personas/models.py
  51. + 3 - 3  archivebox/plugins/search_backend_ripgrep/tests/test_ripgrep_detection.py
  52. + 5 - 2  archivebox/templates/core/snapshot.html
  53. + 0 - 13  archivebox/tests/test_hooks.py
  54. + 1 - 1  archivebox/tests/test_migrations_08_to_09.py
  55. + 18 - 9  archivebox/tests/test_migrations_helpers.py
  56. + 1 - 0  archivebox/workers/apps.py
  57. + 2 - 2  docker-compose.yml
  58. + 2 - 1  pyproject.toml
  59. + 1 - 1  tests/fixtures.py
  60. + 2 - 2  tests/test_recursive_crawl.py
  61. + 14 - 0  uv.lock

+ 2 - 2
README.md

@@ -763,7 +763,7 @@ The configuration is documented here: **[Configuration Wiki](https://github.com/
 <br/>
 TIMEOUT=240                # default: 60    add more seconds on slower networks
 CHECK_SSL_VALIDITY=False   # default: True  False = allow saving URLs w/ bad SSL
-SAVE_ARCHIVE_DOT_ORG=False # default: True  False = disable Archive.org saving
+SAVE_ARCHIVEDOTORG=False # default: True  False = disable Archive.org saving
 MAX_MEDIA_SIZE=1500m       # default: 750m  raise/lower youtubedl output size
 <br/>
 PUBLIC_INDEX=True          # default: True  whether anon users can view index
@@ -959,7 +959,7 @@ archivebox add 'https://docs.google.com/document/d/12345somePrivateDocument'
 archivebox add 'https://vimeo.com/somePrivateVideo'
 
 # without first disabling saving to Archive.org:
-archivebox config --set SAVE_ARCHIVE_DOT_ORG=False  # disable saving all URLs in Archive.org
+archivebox config --set SAVE_ARCHIVEDOTORG=False  # disable saving all URLs in Archive.org
 
 # restrict the main index, Snapshot content, and Add Page to authenticated users as-needed:
 archivebox config --set PUBLIC_INDEX=False

+ 4 - 4
archivebox/__init__.py

@@ -26,10 +26,10 @@ ASCII_LOGO = """
 
 PACKAGE_DIR = Path(__file__).resolve().parent
 
-# Add PACKAGE_DIR to sys.path - required for Django migrations to import models
-# Migrations reference models like 'machine.Binary' which need to be importable
-if str(PACKAGE_DIR) not in sys.path:
-    sys.path.append(str(PACKAGE_DIR))
+# # Add PACKAGE_DIR to sys.path - required for Django migrations to import models
+# # Migrations reference models like 'machine.Binary' which need to be importable
+# if str(PACKAGE_DIR) not in sys.path:
+#     sys.path.append(str(PACKAGE_DIR))
 
 os.environ['DJANGO_SETTINGS_MODULE'] = 'archivebox.core.settings'
 os.environ['TZ'] = 'UTC'

+ 1 - 0
archivebox/api/apps.py

@@ -5,6 +5,7 @@ from django.apps import AppConfig
 
 class APIConfig(AppConfig):
     name = 'archivebox.api'
+    label = 'api'
 
 
 def register_admin(admin_site):

+ 1 - 1
archivebox/api/v1_workers.py

@@ -94,7 +94,7 @@ class OrchestratorSchema(Schema):
 @router.get("/orchestrator", response=OrchestratorSchema, url_name="get_orchestrator")
 def get_orchestrator(request):
     """Get the orchestrator status and all worker queues."""
-    from workers.orchestrator import Orchestrator
+    from archivebox.workers.orchestrator import Orchestrator
     from workers.worker import CrawlWorker, SnapshotWorker, ArchiveResultWorker
 
     orchestrator = Orchestrator()

+ 2 - 2
archivebox/base_models/models.py

@@ -73,7 +73,7 @@ class ModelWithUUID(models.Model):
         return f'/api/v1/docs#/{self._meta.app_label.title()}%20Models/api_v1_{self._meta.app_label}_get_{self._meta.db_table}'
 
     def as_json(self, keys: Iterable[str] = ()) -> dict:
-        default_keys = ('id', 'created_at', 'modified_at', 'created_by_id')
+        default_keys = ('id', 'created_at', 'modified_at')
         return {key: getattr(self, key) for key in (keys or default_keys) if hasattr(self, key)}
 
 
@@ -119,7 +119,7 @@ class ModelWithHealthStats(models.Model):
 
 class ModelWithConfig(models.Model):
     """Mixin for models with a JSON config field."""
-    config = models.JSONField(default=dict, null=False, blank=False, editable=True)
+    config = models.JSONField(default=dict, null=True, blank=True, editable=True)
 
     class Meta:
         abstract = True

+ 1 - 1
archivebox/cli/archivebox_add.py

@@ -56,7 +56,7 @@ def add(urls: str | list[str],
     from archivebox.core.models import Snapshot
     from archivebox.crawls.models import Crawl
     from archivebox.base_models.models import get_or_create_system_user_pk
-    from workers.orchestrator import Orchestrator
+    from archivebox.workers.orchestrator import Orchestrator
 
     created_by_id = created_by_id or get_or_create_system_user_pk()
 

+ 1 - 1
archivebox/cli/archivebox_crawl.py

@@ -78,7 +78,7 @@ def discover_outlinks(
     from archivebox.core.models import Snapshot, ArchiveResult
     from archivebox.crawls.models import Crawl
     from archivebox.config import CONSTANTS
-    from workers.orchestrator import Orchestrator
+    from archivebox.workers.orchestrator import Orchestrator
 
     created_by_id = get_or_create_system_user_pk()
     is_tty = sys.stdout.isatty()

+ 1 - 1
archivebox/cli/archivebox_extract.py

@@ -96,7 +96,7 @@ def run_plugins(
         TYPE_SNAPSHOT, TYPE_ARCHIVERESULT
     )
     from archivebox.core.models import Snapshot, ArchiveResult
-    from workers.orchestrator import Orchestrator
+    from archivebox.workers.orchestrator import Orchestrator
 
     is_tty = sys.stdout.isatty()
 

+ 3 - 5
archivebox/cli/archivebox_init.py

@@ -13,11 +13,9 @@ from archivebox.misc.util import docstring, enforce_types
 
 
 @enforce_types
-def init(force: bool=False, quick: bool=False, install: bool=False, setup: bool=False) -> None:
+def init(force: bool=False, quick: bool=False, install: bool=False) -> None:
     """Initialize a new ArchiveBox collection in the current directory"""
     
-    install = install or setup
-    
     from archivebox.config import CONSTANTS, VERSION, DATA_DIR
     from archivebox.config.common import SERVER_CONFIG
     from archivebox.config.collection import write_config_file
@@ -128,7 +126,8 @@ def init(force: bool=False, quick: bool=False, install: bool=False, setup: bool=
                 print(f'    [yellow]√ Added {len(orphaned_data_dir_links)} orphaned links from existing archive directories.[/yellow]')
 
             if pending_links:
-                Snapshot.objects.create_from_dicts(list(pending_links.values()))
+                for link_dict in pending_links.values():
+                    Snapshot.from_jsonl(link_dict)
 
             # Hint for orphaned snapshot directories
             print()
@@ -187,7 +186,6 @@ def init(force: bool=False, quick: bool=False, install: bool=False, setup: bool=
 @click.option('--force', '-f', is_flag=True, help='Ignore unrecognized files in current directory and initialize anyway')
 @click.option('--quick', '-q', is_flag=True, help='Run any updates or migrations without rechecking all snapshot dirs')
 @click.option('--install', '-s', is_flag=True, help='Automatically install dependencies and extras used for archiving')
-@click.option('--setup', '-s', is_flag=True, help='DEPRECATED: equivalent to --install')
 @docstring(init.__doc__)
 def main(**kwargs) -> None:
     init(**kwargs)

+ 1 - 1
archivebox/cli/archivebox_install.py

@@ -85,7 +85,7 @@ def install(dry_run: bool=False) -> None:
     print()
 
     # Run the crawl synchronously (this triggers on_Crawl hooks)
-    from workers.orchestrator import Orchestrator
+    from archivebox.workers.orchestrator import Orchestrator
     orchestrator = Orchestrator(exit_on_idle=True)
     orchestrator.runloop()
 

+ 1 - 1
archivebox/cli/archivebox_orchestrator.py

@@ -37,7 +37,7 @@ def orchestrator(daemon: bool = False, watch: bool = False) -> int:
         0: All work completed successfully
         1: Error occurred
     """
-    from workers.orchestrator import Orchestrator
+    from archivebox.workers.orchestrator import Orchestrator
     
     if Orchestrator.is_running():
         print('[yellow]Orchestrator is already running[/yellow]')

+ 1 - 1
archivebox/cli/archivebox_server.py

@@ -74,7 +74,7 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,),
             tail_multiple_worker_logs,
             is_port_in_use,
         )
-        from workers.orchestrator import Orchestrator
+        from archivebox.workers.orchestrator import Orchestrator
         import sys
 
         # Check if port is already in use

+ 1 - 1
archivebox/cli/archivebox_snapshot.py

@@ -163,7 +163,7 @@ def create_snapshots(
 
     # If --plugins is passed, run the orchestrator for those plugins
     if plugins:
-        from workers.orchestrator import Orchestrator
+        from archivebox.workers.orchestrator import Orchestrator
         rprint(f'[blue]Running plugins: {plugins or "all"}...[/blue]', file=sys.stderr)
         orchestrator = Orchestrator(exit_on_idle=True)
         orchestrator.runloop()

+ 2 - 2
archivebox/cli/archivebox_update.py

@@ -160,7 +160,7 @@ def process_all_db_snapshots(batch_size: int = 100) -> dict:
     total = Snapshot.objects.count()
     print(f'[*] Processing {total} snapshots from database...')
 
-    for snapshot in Snapshot.objects.iterator():
+    for snapshot in Snapshot.objects.iterator(chunk_size=batch_size):
         # Reconcile index.json with DB
         snapshot.reconcile_with_index_json()
 
@@ -209,7 +209,7 @@ def process_filtered_snapshots(
     total = snapshots.count()
     print(f'[*] Found {total} matching snapshots')
 
-    for snapshot in snapshots.iterator():
+    for snapshot in snapshots.iterator(chunk_size=batch_size):
         # Reconcile index.json with DB
         snapshot.reconcile_with_index_json()
 

+ 1 - 1
archivebox/cli/tests.py

@@ -17,7 +17,7 @@ TEST_CONFIG = {
 
     'DATA_DIR': 'data.tests',
     
-    'SAVE_ARCHIVE_DOT_ORG': 'False',
+    'SAVE_ARCHIVEDOTORG': 'False',
     'SAVE_TITLE': 'False',
     
     'USE_CURL': 'False',

+ 1 - 1
archivebox/cli/tests_piping.py

@@ -32,7 +32,7 @@ from unittest.mock import patch, MagicMock
 TEST_CONFIG = {
     'USE_COLOR': 'False',
     'SHOW_PROGRESS': 'False',
-    'SAVE_ARCHIVE_DOT_ORG': 'False',
+    'SAVE_ARCHIVEDOTORG': 'False',
     'SAVE_TITLE': 'True',  # Fast extractor
     'SAVE_FAVICON': 'False',
     'SAVE_WGET': 'False',

+ 23 - 0
archivebox/config/configset.py

@@ -216,6 +216,29 @@ def get_config(
     if snapshot and hasattr(snapshot, "config") and snapshot.config:
         config.update(snapshot.config)
 
+    # Normalize all aliases to canonical names (after all sources merged)
+    # This handles aliases that came from user/crawl/snapshot configs, not just env
+    try:
+        from archivebox.hooks import discover_plugin_configs
+        plugin_configs = discover_plugin_configs()
+        aliases_to_normalize = {}  # {alias_key: canonical_key}
+
+        # Build alias mapping from all plugin schemas
+        for plugin_name, schema in plugin_configs.items():
+            for canonical_key, prop_schema in schema.get('properties', {}).items():
+                for alias in prop_schema.get('x-aliases', []):
+                    aliases_to_normalize[alias] = canonical_key
+
+        # Normalize: copy alias values to canonical keys (aliases take precedence)
+        for alias_key, canonical_key in aliases_to_normalize.items():
+            if alias_key in config:
+                # Alias exists - copy to canonical key (overwriting any default)
+                config[canonical_key] = config[alias_key]
+                # Remove alias from config to keep it clean
+                del config[alias_key]
+    except ImportError:
+        pass
+
     return config
 
 

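The alias pass added above only depends on the merged config dict and the `x-aliases` lists in the plugin JSON schemas. A minimal standalone sketch of the same normalization, using a hypothetical plugin schema and config values rather than the real ArchiveBox schemas:

    # Hypothetical schema/config values, illustrating the normalization pass above.
    plugin_configs = {
        'archivedotorg': {
            'properties': {
                'SAVE_ARCHIVEDOTORG': {'type': 'boolean', 'x-aliases': ['SAVE_ARCHIVE_DOT_ORG']},
            },
        },
    }
    config = {'SAVE_ARCHIVE_DOT_ORG': False, 'TIMEOUT': 60}

    # Build {alias: canonical} from every plugin schema, then fold aliases into canonical keys.
    aliases_to_normalize = {
        alias: canonical
        for schema in plugin_configs.values()
        for canonical, prop in schema.get('properties', {}).items()
        for alias in prop.get('x-aliases', [])
    }
    for alias, canonical in aliases_to_normalize.items():
        if alias in config:
            config[canonical] = config.pop(alias)   # alias value overrides any default

    assert config == {'TIMEOUT': 60, 'SAVE_ARCHIVEDOTORG': False}
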
+ 4 - 0
archivebox/core/apps.py

@@ -5,8 +5,12 @@ from django.apps import AppConfig
 
 class CoreConfig(AppConfig):
     name = 'archivebox.core'
+    label = 'core'
 
     def ready(self):
         """Register the archivebox.core.admin_site as the main django admin site"""
         from archivebox.core.admin_site import register_admin_site
         register_admin_site()
+
+        # Import models to register state machines with the registry
+        from archivebox.core import models  # noqa: F401

+ 57 - 0
archivebox/core/migrations/0024_b_clear_config_fields.py

@@ -0,0 +1,57 @@
+# Data migration to clear config fields that may contain invalid JSON
+# This runs before 0025 to prevent CHECK constraint failures
+
+from django.db import migrations
+
+
+def clear_config_fields(apps, schema_editor):
+    """Clear all config fields in related tables to avoid JSON validation errors."""
+    db_alias = schema_editor.connection.alias
+
+    # Disable foreign key checks temporarily to allow updates
+    with schema_editor.connection.cursor() as cursor:
+        cursor.execute("PRAGMA foreign_keys=OFF")
+
+    tables_to_clear = [
+        ('crawls_seed', 'config'),
+        ('crawls_crawl', 'config'),
+        ('crawls_crawlschedule', 'config'),
+        ('machine_machine', 'stats'),
+        ('machine_machine', 'config'),
+    ]
+
+    for table_info in tables_to_clear:
+        if table_info is None:
+            continue
+        table_name, field_name = table_info
+
+        try:
+            with schema_editor.connection.cursor() as cursor:
+                # Check if table exists first
+                cursor.execute(f"SELECT name FROM sqlite_master WHERE type='table' AND name='{table_name}'")
+                if not cursor.fetchone():
+                    print(f"  Skipping {table_name}.{field_name}: table does not exist")
+                    continue
+
+                # Set all to empty JSON object
+                cursor.execute(f"UPDATE {table_name} SET {field_name} = '{{}}' WHERE {field_name} IS NOT NULL")
+                print(f"  Cleared {field_name} in {table_name}: {cursor.rowcount} rows")
+        except Exception as e:
+            print(f"  Skipping {table_name}.{field_name}: {e}")
+
+    # Re-enable foreign key checks
+    with schema_editor.connection.cursor() as cursor:
+        cursor.execute("PRAGMA foreign_keys=ON")
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('core', '0023_new_schema'),
+        ('crawls', '0001_initial'),
+        ('machine', '0001_squashed'),
+    ]
+
+    operations = [
+        migrations.RunPython(clear_config_fields, reverse_code=migrations.RunPython.noop),
+    ]

+ 28 - 0
archivebox/core/migrations/0024_c_disable_fk_checks.py

@@ -0,0 +1,28 @@
+# Disable foreign key checks before 0025 to prevent CHECK constraint validation errors
+
+from django.db import migrations
+
+
+def disable_fk_checks(apps, schema_editor):
+    """Temporarily disable foreign key checks."""
+    with schema_editor.connection.cursor() as cursor:
+        cursor.execute("PRAGMA foreign_keys=OFF")
+        print("  Disabled foreign key checks")
+
+
+def enable_fk_checks(apps, schema_editor):
+    """Re-enable foreign key checks."""
+    with schema_editor.connection.cursor() as cursor:
+        cursor.execute("PRAGMA foreign_keys=ON")
+        print("  Enabled foreign key checks")
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('core', '0024_b_clear_config_fields'),
+    ]
+
+    operations = [
+        migrations.RunPython(disable_fk_checks, reverse_code=enable_fk_checks),
+    ]

+ 93 - 0
archivebox/core/migrations/0024_d_fix_crawls_config.py

@@ -0,0 +1,93 @@
+# Fix crawls_crawl config field to avoid CHECK constraint errors during table rebuilds
+
+from django.db import migrations
+
+
+def fix_crawls_config(apps, schema_editor):
+    """
+    Rebuild crawls_crawl table to fix CHECK constraints and make seed_id nullable.
+    Only runs for UPGRADES from 0.8.x (when crawls.0001_initial didn't exist yet).
+    For fresh installs, crawls.0001_initial creates the correct schema.
+    """
+    with schema_editor.connection.cursor() as cursor:
+        # Check if this is an upgrade from old 0.8.x or a fresh install
+        # In fresh installs, crawls.0001_initial was applied, creating seed FK
+        # In upgrades, the table was created by old migrations before 0001_initial existed
+        cursor.execute("""
+            SELECT COUNT(*) FROM django_migrations
+            WHERE app='crawls' AND name='0001_initial'
+        """)
+        has_crawls_0001 = cursor.fetchone()[0] > 0
+
+        if has_crawls_0001:
+            # Fresh install - crawls.0001_initial already created the correct schema
+            # Just clear config to avoid CHECK constraint issues
+            print("  Fresh install detected - clearing config field only")
+            try:
+                cursor.execute('UPDATE "crawls_crawl" SET "config" = NULL')
+            except Exception as e:
+                print(f"  Skipping config clear: {e}")
+            return
+
+        # Upgrade from 0.8.x - rebuild table to make seed_id nullable and remove CHECK constraint
+        print("  Upgrading from 0.8.x - rebuilding crawls_crawl table")
+        cursor.execute("PRAGMA foreign_keys=OFF")
+
+        # Backup
+        cursor.execute("CREATE TABLE crawls_crawl_backup AS SELECT * FROM crawls_crawl")
+
+        # Recreate without config CHECK constraint, with nullable seed_id
+        cursor.execute("DROP TABLE crawls_crawl")
+        cursor.execute("""
+            CREATE TABLE "crawls_crawl" (
+                "num_uses_failed" integer unsigned NOT NULL CHECK ("num_uses_failed" >= 0),
+                "num_uses_succeeded" integer unsigned NOT NULL CHECK ("num_uses_succeeded" >= 0),
+                "id" char(32) NOT NULL PRIMARY KEY,
+                "created_at" datetime NOT NULL,
+                "modified_at" datetime NOT NULL,
+                "urls" text NOT NULL,
+                "config" text,
+                "max_depth" smallint unsigned NOT NULL CHECK ("max_depth" >= 0),
+                "tags_str" varchar(1024) NOT NULL,
+                "persona_id" char(32) NULL,
+                "label" varchar(64) NOT NULL,
+                "notes" text NOT NULL,
+                "output_dir" varchar(512) NOT NULL,
+                "status" varchar(15) NOT NULL,
+                "retry_at" datetime NULL,
+                "created_by_id" integer NOT NULL REFERENCES "auth_user" ("id") DEFERRABLE INITIALLY DEFERRED,
+                "seed_id" char(32) NULL DEFAULT NULL,
+                "schedule_id" char(32) NULL REFERENCES "crawls_crawlschedule" ("id") DEFERRABLE INITIALLY DEFERRED
+            )
+        """)
+
+        # Restore data
+        cursor.execute("""
+            INSERT INTO "crawls_crawl" (
+                "num_uses_failed", "num_uses_succeeded", "id", "created_at", "modified_at",
+                "urls", "config", "max_depth", "tags_str", "persona_id", "label", "notes",
+                "output_dir", "status", "retry_at", "created_by_id", "seed_id", "schedule_id"
+            )
+            SELECT
+                "num_uses_failed", "num_uses_succeeded", "id", "created_at", "modified_at",
+                "urls", "config", "max_depth", "tags_str", "persona_id", "label", "notes",
+                "output_dir", "status", "retry_at", "created_by_id", "seed_id", "schedule_id"
+            FROM crawls_crawl_backup
+        """)
+
+        cursor.execute("DROP TABLE crawls_crawl_backup")
+
+        # NULL out config to avoid any invalid JSON
+        cursor.execute('UPDATE "crawls_crawl" SET "config" = NULL')
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('core', '0024_c_disable_fk_checks'),
+        ('crawls', '0001_initial'),
+    ]
+
+    operations = [
+        migrations.RunPython(fix_crawls_config, reverse_code=migrations.RunPython.noop),
+    ]

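The upgrade-vs-fresh-install check above boils down to a single query against Django's migration ledger. A standalone sketch of the same check, assuming a plain sqlite3 connection to an existing index database (the path is illustrative):

    import sqlite3

    def is_fresh_install(db_path: str = 'index.sqlite3') -> bool:
        """True if crawls.0001_initial was already applied (fresh 0.9.x install),
        False if crawls_crawl was created by the older 0.8.x core migrations."""
        with sqlite3.connect(db_path) as conn:
            (count,) = conn.execute(
                "SELECT COUNT(*) FROM django_migrations "
                "WHERE app='crawls' AND name='0001_initial'"
            ).fetchone()
        return count > 0
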
+ 1 - 3
archivebox/core/migrations/0024_snapshot_crawl.py

@@ -8,9 +8,7 @@ import django.db.models.deletion
 class Migration(migrations.Migration):
 
     dependencies = [
-        ('core', '0023_new_schema'),
-        ('crawls', '0001_initial'),
-        ('machine', '0001_squashed'),
+        ('core', '0024_d_fix_crawls_config'),
     ]
 
     operations = [

+ 104 - 73
archivebox/core/migrations/0026_remove_archiveresult_output_dir_and_more.py

@@ -10,6 +10,13 @@ from django.db import migrations, models
 
 def populate_archiveresult_uuids(apps, schema_editor):
     """Generate unique UUIDs for ArchiveResults that don't have one."""
+    # Check if uuid column exists before trying to populate it
+    with schema_editor.connection.cursor() as cursor:
+        cursor.execute("PRAGMA table_info(core_archiveresult)")
+        columns = [row[1] for row in cursor.fetchall()]
+        if 'uuid' not in columns:
+            return  # uuid column doesn't exist, skip this data migration
+
     ArchiveResult = apps.get_model('core', 'ArchiveResult')
     for result in ArchiveResult.objects.filter(uuid__isnull=True):
         result.uuid = uuid_compat.uuid7()
@@ -21,6 +28,22 @@ def reverse_populate_uuids(apps, schema_editor):
     pass
 
 
+def remove_output_dir_if_exists(apps, schema_editor):
+    """Remove output_dir columns if they exist."""
+    with schema_editor.connection.cursor() as cursor:
+        # Check and remove from core_archiveresult
+        cursor.execute("PRAGMA table_info(core_archiveresult)")
+        columns = [row[1] for row in cursor.fetchall()]
+        if 'output_dir' in columns:
+            cursor.execute("ALTER TABLE core_archiveresult DROP COLUMN output_dir")
+
+        # Check and remove from core_snapshot
+        cursor.execute("PRAGMA table_info(core_snapshot)")
+        columns = [row[1] for row in cursor.fetchall()]
+        if 'output_dir' in columns:
+            cursor.execute("ALTER TABLE core_snapshot DROP COLUMN output_dir")
+
+
 class Migration(migrations.Migration):
 
     dependencies = [
@@ -33,82 +56,90 @@ class Migration(migrations.Migration):
         migrations.RunPython(populate_archiveresult_uuids, reverse_populate_uuids),
 
         # Remove output_dir fields (not needed, computed from snapshot)
-        migrations.RemoveField(
-            model_name='archiveresult',
-            name='output_dir',
-        ),
-        migrations.RemoveField(
-            model_name='snapshot',
-            name='output_dir',
-        ),
+        migrations.RunPython(remove_output_dir_if_exists, reverse_code=migrations.RunPython.noop),
 
-        # Archiveresult field alterations
-        migrations.AlterField(
-            model_name='archiveresult',
-            name='created_at',
-            field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
-        ),
-        migrations.AlterField(
-            model_name='archiveresult',
-            name='created_by',
-            field=models.ForeignKey(default=None, on_delete=django.db.models.deletion.CASCADE, related_name='archiveresult_set', to=settings.AUTH_USER_MODEL),
-        ),
-        migrations.AlterField(
-            model_name='archiveresult',
-            name='extractor',
-            field=models.CharField(db_index=True, max_length=32),
-        ),
-        migrations.AlterField(
-            model_name='archiveresult',
-            name='id',
-            field=models.AutoField(editable=False, primary_key=True, serialize=False),
-        ),
-        migrations.AlterField(
-            model_name='archiveresult',
-            name='status',
-            field=models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('backoff', 'Waiting to retry'), ('succeeded', 'Succeeded'), ('failed', 'Failed'), ('skipped', 'Skipped')], db_index=True, default='queued', max_length=15),
-        ),
+        # Update Django's migration state to match 0.9.x schema
+        # Database already has correct types from 0.8.x, just update state
+        migrations.SeparateDatabaseAndState(
+            state_operations=[
+                # Archiveresult field alterations
+                migrations.AlterField(
+                    model_name='archiveresult',
+                    name='created_at',
+                    field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
+                ),
+                migrations.AlterField(
+                    model_name='archiveresult',
+                    name='created_by',
+                    field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, related_name='archiveresult_set', to=settings.AUTH_USER_MODEL),
+                ),
+                migrations.AlterField(
+                    model_name='archiveresult',
+                    name='extractor',
+                    field=models.CharField(db_index=True, max_length=32),
+                ),
+                # Convert id from AutoField to UUIDField (database already has UUID CHAR(32))
+                migrations.AlterField(
+                    model_name='archiveresult',
+                    name='id',
+                    field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
+                ),
+                migrations.AlterField(
+                    model_name='archiveresult',
+                    name='status',
+                    field=models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('backoff', 'Waiting to retry'), ('succeeded', 'Succeeded'), ('failed', 'Failed'), ('skipped', 'Skipped')], db_index=True, default='queued', max_length=15),
+                ),
 
-        # Snapshot field alterations
-        migrations.AlterField(
-            model_name='snapshot',
-            name='bookmarked_at',
-            field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
-        ),
-        migrations.AlterField(
-            model_name='snapshot',
-            name='created_at',
-            field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
-        ),
-        migrations.AlterField(
-            model_name='snapshot',
-            name='created_by',
-            field=models.ForeignKey(default=None, on_delete=django.db.models.deletion.CASCADE, related_name='snapshot_set', to=settings.AUTH_USER_MODEL),
-        ),
-        migrations.AlterField(
-            model_name='snapshot',
-            name='downloaded_at',
-            field=models.DateTimeField(blank=True, db_index=True, default=None, editable=False, null=True),
-        ),
-        migrations.AlterField(
-            model_name='snapshot',
-            name='id',
-            field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
+                # Snapshot field alterations
+                migrations.AlterField(
+                    model_name='snapshot',
+                    name='bookmarked_at',
+                    field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
+                ),
+                migrations.AlterField(
+                    model_name='snapshot',
+                    name='created_at',
+                    field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
+                ),
+                migrations.AlterField(
+                    model_name='snapshot',
+                    name='created_by',
+                    field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, related_name='snapshot_set', to=settings.AUTH_USER_MODEL),
+                ),
+                migrations.AlterField(
+                    model_name='snapshot',
+                    name='downloaded_at',
+                    field=models.DateTimeField(blank=True, db_index=True, default=None, editable=False, null=True),
+                ),
+                migrations.AlterField(
+                    model_name='snapshot',
+                    name='id',
+                    field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
+                ),
+            ],
+            database_operations=[
+                # No actual database changes needed - schema is already correct from 0.8.x
+            ],
         ),
 
-        # SnapshotTag and Tag alterations
-        migrations.AlterField(
-            model_name='snapshottag',
-            name='id',
-            field=models.AutoField(primary_key=True, serialize=False),
-        ),
-        migrations.AlterField(
-            model_name='tag',
-            name='created_by',
-            field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, related_name='tag_set', to=settings.AUTH_USER_MODEL),
-        ),
-        migrations.AlterUniqueTogether(
-            name='snapshottag',
-            unique_together={('snapshot', 'tag')},
+        # SnapshotTag and Tag alterations - state only, DB already correct
+        migrations.SeparateDatabaseAndState(
+            state_operations=[
+                migrations.AlterField(
+                    model_name='snapshottag',
+                    name='id',
+                    field=models.AutoField(primary_key=True, serialize=False),
+                ),
+                migrations.AlterField(
+                    model_name='tag',
+                    name='created_by',
+                    field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, related_name='tag_set', to=settings.AUTH_USER_MODEL),
+                ),
+                migrations.AlterUniqueTogether(
+                    name='snapshottag',
+                    unique_together={('snapshot', 'tag')},
+                ),
+            ],
+            database_operations=[],
         ),
     ]

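This migration and most of the later ones lean on `migrations.SeparateDatabaseAndState`: the `state_operations` update the model state Django tracks for subsequent migrations, while only the `database_operations` (often empty, or hand-written SQL) actually run against SQLite. That way Django never schedules the full table rebuild that AlterField/RemoveField would otherwise trigger, which is what re-applies the old CHECK and NOT NULL constraints this commit is working around. A minimal sketch of the shape, using a hypothetical field name rather than one of the real operations above:

    from django.db import migrations, models

    class Migration(migrations.Migration):
        dependencies = [('core', '0026_remove_archiveresult_output_dir_and_more')]  # illustrative anchor

        operations = [
            migrations.SeparateDatabaseAndState(
                # Recorded in Django's migration state so later migrations and the ORM
                # see the new field definition...
                state_operations=[
                    migrations.AddField(
                        model_name='archiveresult',
                        name='example_note',          # hypothetical field, for illustration only
                        field=models.TextField(blank=True, default=''),
                    ),
                ],
                # ...while the only SQL actually executed is an additive ALTER TABLE,
                # which SQLite applies without copying the table.
                database_operations=[
                    migrations.RunSQL(
                        sql="ALTER TABLE core_archiveresult ADD COLUMN example_note TEXT DEFAULT '';",
                        reverse_sql="ALTER TABLE core_archiveresult DROP COLUMN example_note;",
                    ),
                ],
            ),
        ]
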
+ 74 - 63
archivebox/core/migrations/0029_archiveresult_hook_fields.py

@@ -13,68 +13,79 @@ class Migration(migrations.Migration):
     ]
 
     operations = [
-        # Add new output fields (keep old 'output' temporarily for migration)
-        migrations.AddField(
-            model_name='archiveresult',
-            name='output_str',
-            field=models.TextField(
-                blank=True,
-                default='',
-                help_text='Human-readable output summary (e.g., "Downloaded 5 files")'
-            ),
-        ),
-
-        migrations.AddField(
-            model_name='archiveresult',
-            name='output_json',
-            field=models.JSONField(
-                null=True,
-                blank=True,
-                default=None,
-                help_text='Structured metadata (headers, redirects, etc.) - should NOT duplicate ArchiveResult fields'
-            ),
-        ),
-
-        migrations.AddField(
-            model_name='archiveresult',
-            name='output_files',
-            field=models.JSONField(
-                default=dict,
-                help_text='Dict of {relative_path: {metadata}} - values are empty dicts for now, extensible for future metadata'
-            ),
-        ),
-
-        migrations.AddField(
-            model_name='archiveresult',
-            name='output_size',
-            field=models.BigIntegerField(
-                default=0,
-                help_text='Total recursive size in bytes of all output files'
-            ),
-        ),
-
-        migrations.AddField(
-            model_name='archiveresult',
-            name='output_mimetypes',
-            field=models.CharField(
-                max_length=512,
-                blank=True,
-                default='',
-                help_text='CSV of mimetypes sorted by size descending'
-            ),
-        ),
-
-        # Add binary FK (optional)
-        migrations.AddField(
-            model_name='archiveresult',
-            name='binary',
-            field=models.ForeignKey(
-                'machine.Binary',
-                on_delete=models.SET_NULL,
-                null=True,
-                blank=True,
-                related_name='archiveresults',
-                help_text='Primary binary used by this hook (optional)'
-            ),
+        # Add new output fields using SeparateDatabaseAndState to avoid table rebuilds
+        migrations.SeparateDatabaseAndState(
+            state_operations=[
+                migrations.AddField(
+                    model_name='archiveresult',
+                    name='output_str',
+                    field=models.TextField(
+                        blank=True,
+                        default='',
+                        help_text='Human-readable output summary (e.g., "Downloaded 5 files")'
+                    ),
+                ),
+                migrations.AddField(
+                    model_name='archiveresult',
+                    name='output_json',
+                    field=models.JSONField(
+                        null=True,
+                        blank=True,
+                        default=None,
+                        help_text='Structured metadata (headers, redirects, etc.) - should NOT duplicate ArchiveResult fields'
+                    ),
+                ),
+                migrations.AddField(
+                    model_name='archiveresult',
+                    name='output_files',
+                    field=models.JSONField(
+                        default=dict,
+                        help_text='Dict of {relative_path: {metadata}} - values are empty dicts for now, extensible for future metadata'
+                    ),
+                ),
+                migrations.AddField(
+                    model_name='archiveresult',
+                    name='output_size',
+                    field=models.BigIntegerField(
+                        default=0,
+                        help_text='Total recursive size in bytes of all output files'
+                    ),
+                ),
+                migrations.AddField(
+                    model_name='archiveresult',
+                    name='output_mimetypes',
+                    field=models.CharField(
+                        max_length=512,
+                        blank=True,
+                        default='',
+                        help_text='CSV of mimetypes sorted by size descending'
+                    ),
+                ),
+                migrations.AddField(
+                    model_name='archiveresult',
+                    name='binary',
+                    field=models.ForeignKey(
+                        'machine.Binary',
+                        on_delete=models.SET_NULL,
+                        null=True,
+                        blank=True,
+                        related_name='archiveresults',
+                        help_text='Primary binary used by this hook (optional)'
+                    ),
+                ),
+            ],
+            database_operations=[
+                migrations.RunSQL(
+                    sql="""
+                        ALTER TABLE core_archiveresult ADD COLUMN output_str TEXT DEFAULT '';
+                        ALTER TABLE core_archiveresult ADD COLUMN output_json TEXT;
+                        ALTER TABLE core_archiveresult ADD COLUMN output_files TEXT DEFAULT '{}';
+                        ALTER TABLE core_archiveresult ADD COLUMN output_size BIGINT DEFAULT 0;
+                        ALTER TABLE core_archiveresult ADD COLUMN output_mimetypes VARCHAR(512) DEFAULT '';
+                        ALTER TABLE core_archiveresult ADD COLUMN binary_id CHAR(32) REFERENCES machine_binary(id);
+                    """,
+                    reverse_sql=migrations.RunSQL.noop,
+                ),
+            ],
         ),
     ]

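On SQLite, ADD COLUMN is a metadata-only change, so the six hand-written ALTER TABLE statements above avoid the table copy that Django's default AddField path would schedule. A quick way to sanity-check the result after migrating, using the same PRAGMA that migrations 0026 and 0038 rely on (the database path here is an assumption, not the canonical location):

    import sqlite3

    with sqlite3.connect('data/index.sqlite3') as conn:   # path is illustrative
        columns = {row[1] for row in conn.execute('PRAGMA table_info(core_archiveresult)')}

    expected = {'output_str', 'output_json', 'output_files',
                'output_size', 'output_mimetypes', 'binary_id'}
    print('missing columns:', expected - columns or 'none')
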
+ 39 - 20
archivebox/core/migrations/0030_migrate_output_field.py

@@ -12,27 +12,46 @@ def migrate_output_field(apps, schema_editor):
     Logic:
     - If output contains JSON {...}, move to output_json
     - Otherwise, move to output_str
-    """
-    ArchiveResult = apps.get_model('core', 'ArchiveResult')
-
-    for ar in ArchiveResult.objects.all().iterator():
-        old_output = ar.output or ''
 
-        # Case 1: JSON output
-        if old_output.strip().startswith('{'):
-            try:
-                parsed = json.loads(old_output)
-                ar.output_json = parsed
-                ar.output_str = ''
-            except json.JSONDecodeError:
-                # Not valid JSON, treat as string
-                ar.output_str = old_output
-
-        # Case 2: File path or plain string
-        else:
-            ar.output_str = old_output
-
-        ar.save(update_fields=['output_str', 'output_json'])
+    Use raw SQL to avoid CHECK constraint issues during migration.
+    """
+    # Use raw SQL to migrate data without triggering CHECK constraints
+    with schema_editor.connection.cursor() as cursor:
+        # Get all archive results
+        cursor.execute("""
+            SELECT id, output FROM core_archiveresult
+        """)
+
+        for row in cursor.fetchall():
+            ar_id, old_output = row
+            old_output = old_output or ''
+
+            # Case 1: JSON output
+            if old_output.strip().startswith('{'):
+                try:
+                    # Validate it's actual JSON
+                    parsed = json.loads(old_output)
+                    # Update with JSON - cast to JSON to satisfy CHECK constraint
+                    json_str = json.dumps(parsed)
+                    cursor.execute("""
+                        UPDATE core_archiveresult
+                        SET output_str = '', output_json = json(?)
+                        WHERE id = ?
+                    """, (json_str, ar_id))
+                except json.JSONDecodeError:
+                    # Not valid JSON, treat as string
+                    cursor.execute("""
+                        UPDATE core_archiveresult
+                        SET output_str = ?, output_json = NULL
+                        WHERE id = ?
+                    """, (old_output, ar_id))
+            # Case 2: File path or plain string
+            else:
+                cursor.execute("""
+                    UPDATE core_archiveresult
+                    SET output_str = ?, output_json = NULL
+                    WHERE id = ?
+                """, (old_output, ar_id))
 
 
 def reverse_migrate(apps, schema_editor):

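The branching above is easier to see on concrete values. A pure-Python rendering of the same classification, for illustration only and not part of the migration itself:

    import json

    def split_legacy_output(old_output):
        """Return (output_str, output_json) the way 0030 partitions the old 'output' field."""
        old_output = old_output or ''
        if old_output.strip().startswith('{'):
            try:
                return '', json.loads(old_output)       # valid JSON -> output_json
            except json.JSONDecodeError:
                return old_output, None                 # JSON-looking but invalid -> output_str
        return old_output, None                         # file path or plain text -> output_str

    assert split_legacy_output('{"status_code": 200}') == ('', {'status_code': 200})
    assert split_legacy_output('warc/archive.warc.gz') == ('warc/archive.warc.gz', None)
    assert split_legacy_output(None) == ('', None)
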
+ 56 - 37
archivebox/core/migrations/0032_alter_archiveresult_binary_and_more.py

@@ -16,43 +16,62 @@ class Migration(migrations.Migration):
     ]
 
     operations = [
-        migrations.AlterField(
-            model_name='archiveresult',
-            name='binary',
-            field=models.ForeignKey(blank=True, help_text='Primary binary used by this hook', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='archiveresults', to='machine.binary'),
+        # Update Django's state only - database already has correct schema from 0029
+        migrations.SeparateDatabaseAndState(
+            state_operations=[
+                migrations.AlterField(
+                    model_name='archiveresult',
+                    name='binary',
+                    field=models.ForeignKey(blank=True, help_text='Primary binary used by this hook', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='archiveresults', to='machine.binary'),
+                ),
+                migrations.AlterField(
+                    model_name='archiveresult',
+                    name='output_files',
+                    field=models.JSONField(default=dict, help_text='Dict of {relative_path: {metadata}}'),
+                ),
+                migrations.AlterField(
+                    model_name='archiveresult',
+                    name='output_json',
+                    field=models.JSONField(blank=True, default=None, help_text='Structured metadata (headers, redirects, etc.)', null=True),
+                ),
+                migrations.AlterField(
+                    model_name='archiveresult',
+                    name='output_mimetypes',
+                    field=models.CharField(blank=True, default='', help_text='CSV of mimetypes sorted by size', max_length=512),
+                ),
+                migrations.AlterField(
+                    model_name='archiveresult',
+                    name='output_size',
+                    field=models.BigIntegerField(default=0, help_text='Total bytes of all output files'),
+                ),
+                migrations.AlterField(
+                    model_name='archiveresult',
+                    name='output_str',
+                    field=models.TextField(blank=True, default='', help_text='Human-readable output summary'),
+                ),
+                migrations.AlterField(
+                    model_name='archiveresult',
+                    name='uuid',
+                    field=models.UUIDField(blank=True, db_index=True, default=uuid_compat.uuid7, null=True),
+                ),
+            ],
+            database_operations=[
+                # No database changes needed - columns already exist with correct types
+            ],
         ),
-        migrations.AlterField(
-            model_name='archiveresult',
-            name='output_files',
-            field=models.JSONField(default=dict, help_text='Dict of {relative_path: {metadata}}'),
-        ),
-        migrations.AlterField(
-            model_name='archiveresult',
-            name='output_json',
-            field=models.JSONField(blank=True, default=None, help_text='Structured metadata (headers, redirects, etc.)', null=True),
-        ),
-        migrations.AlterField(
-            model_name='archiveresult',
-            name='output_mimetypes',
-            field=models.CharField(blank=True, default='', help_text='CSV of mimetypes sorted by size', max_length=512),
-        ),
-        migrations.AlterField(
-            model_name='archiveresult',
-            name='output_size',
-            field=models.BigIntegerField(default=0, help_text='Total bytes of all output files'),
-        ),
-        migrations.AlterField(
-            model_name='archiveresult',
-            name='output_str',
-            field=models.TextField(blank=True, default='', help_text='Human-readable output summary'),
-        ),
-        migrations.AlterField(
-            model_name='archiveresult',
-            name='uuid',
-            field=models.UUIDField(blank=True, db_index=True, default=uuid_compat.uuid7, null=True),
-        ),
-        migrations.AddConstraint(
-            model_name='snapshot',
-            constraint=models.UniqueConstraint(fields=('timestamp',), name='unique_timestamp'),
+        # Add unique constraint without table rebuild
+        migrations.SeparateDatabaseAndState(
+            state_operations=[
+                migrations.AddConstraint(
+                    model_name='snapshot',
+                    constraint=models.UniqueConstraint(fields=('timestamp',), name='unique_timestamp'),
+                ),
+            ],
+            database_operations=[
+                migrations.RunSQL(
+                    sql="CREATE UNIQUE INDEX IF NOT EXISTS unique_timestamp ON core_snapshot (timestamp);",
+                    reverse_sql="DROP INDEX IF EXISTS unique_timestamp;",
+                ),
+            ],
         ),
     ]

+ 30 - 15
archivebox/core/migrations/0033_rename_extractor_add_hook_name.py

@@ -10,20 +10,35 @@ class Migration(migrations.Migration):
     ]
 
     operations = [
-        migrations.RenameField(
-            model_name='archiveresult',
-            old_name='extractor',
-            new_name='plugin',
-        ),
-        migrations.AddField(
-            model_name='archiveresult',
-            name='hook_name',
-            field=models.CharField(
-                blank=True,
-                default='',
-                max_length=255,
-                db_index=True,
-                help_text='Full filename of the hook that executed (e.g., on_Snapshot__50_wget.py)'
-            ),
+        # Use SeparateDatabaseAndState to avoid table rebuilds that would re-add CHECK constraints
+        migrations.SeparateDatabaseAndState(
+            state_operations=[
+                migrations.RenameField(
+                    model_name='archiveresult',
+                    old_name='extractor',
+                    new_name='plugin',
+                ),
+                migrations.AddField(
+                    model_name='archiveresult',
+                    name='hook_name',
+                    field=models.CharField(
+                        blank=True,
+                        default='',
+                        max_length=255,
+                        db_index=True,
+                        help_text='Full filename of the hook that executed (e.g., on_Snapshot__50_wget.py)'
+                    ),
+                ),
+            ],
+            database_operations=[
+                migrations.RunSQL(
+                    sql="""
+                        ALTER TABLE core_archiveresult RENAME COLUMN extractor TO plugin;
+                        ALTER TABLE core_archiveresult ADD COLUMN hook_name VARCHAR(255) DEFAULT '' NOT NULL;
+                        CREATE INDEX IF NOT EXISTS core_archiveresult_hook_name_idx ON core_archiveresult (hook_name);
+                    """,
+                    reverse_sql=migrations.RunSQL.noop,
+                ),
+            ],
         ),
     ]

+ 22 - 8
archivebox/core/migrations/0034_snapshot_current_step.py

@@ -11,13 +11,27 @@ class Migration(migrations.Migration):
     ]
 
     operations = [
-        migrations.AddField(
-            model_name='snapshot',
-            name='current_step',
-            field=models.PositiveSmallIntegerField(
-                default=0,
-                db_index=True,
-                help_text='Current hook step being executed (0-9). Used for sequential hook execution.'
-            ),
+        # Use SeparateDatabaseAndState to avoid table rebuild that would fail on config NOT NULL constraint
+        migrations.SeparateDatabaseAndState(
+            state_operations=[
+                migrations.AddField(
+                    model_name='snapshot',
+                    name='current_step',
+                    field=models.PositiveSmallIntegerField(
+                        default=0,
+                        db_index=True,
+                        help_text='Current hook step being executed (0-9). Used for sequential hook execution.'
+                    ),
+                ),
+            ],
+            database_operations=[
+                migrations.RunSQL(
+                    sql="""
+                        ALTER TABLE core_snapshot ADD COLUMN current_step SMALLINT UNSIGNED DEFAULT 0 NOT NULL;
+                        CREATE INDEX IF NOT EXISTS core_snapshot_current_step_idx ON core_snapshot (current_step);
+                    """,
+                    reverse_sql=migrations.RunSQL.noop,
+                ),
+            ],
         ),
     ]

+ 20 - 12
archivebox/core/migrations/0035_snapshot_crawl_non_nullable_remove_created_by.py

@@ -54,7 +54,7 @@ class Migration(migrations.Migration):
 
     dependencies = [
         ('core', '0034_snapshot_current_step'),
-        ('crawls', '0004_alter_crawl_output_dir'),
+        ('crawls', '0005_drop_seed_id_column'),
     ]
 
     operations = [
@@ -64,16 +64,24 @@ class Migration(migrations.Migration):
             reverse_code=migrations.RunPython.noop,
         ),
 
-        # Step 2: Make crawl non-nullable
-        migrations.AlterField(
-            model_name='snapshot',
-            name='crawl',
-            field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='snapshot_set', to='crawls.crawl'),
-        ),
-
-        # Step 3: Remove created_by field
-        migrations.RemoveField(
-            model_name='snapshot',
-            name='created_by',
+        # Step 2 & 3: Update Django's state only - leave created_by_id column in database (unused but harmless)
+        migrations.SeparateDatabaseAndState(
+            state_operations=[
+                # Make crawl non-nullable
+                migrations.AlterField(
+                    model_name='snapshot',
+                    name='crawl',
+                    field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='snapshot_set', to='crawls.crawl'),
+                ),
+                # Remove created_by field from Django's state
+                migrations.RemoveField(
+                    model_name='snapshot',
+                    name='created_by',
+                ),
+            ],
+            database_operations=[
+                # No database changes - crawl_id already exists and NOT NULL constraint will be enforced by model
+                # created_by_id column remains in database but is unused
+            ],
         ),
     ]

+ 12 - 4
archivebox/core/migrations/0036_remove_archiveresult_created_by.py

@@ -10,10 +10,18 @@ class Migration(migrations.Migration):
     ]
 
     operations = [
-        # Remove created_by field from ArchiveResult
+        # Remove created_by field from ArchiveResult (state only)
         # No data migration needed - created_by can be accessed via snapshot.crawl.created_by
-        migrations.RemoveField(
-            model_name='archiveresult',
-            name='created_by',
+        # Leave created_by_id column in database (unused but harmless, avoids table rebuild)
+        migrations.SeparateDatabaseAndState(
+            state_operations=[
+                migrations.RemoveField(
+                    model_name='archiveresult',
+                    name='created_by',
+                ),
+            ],
+            database_operations=[
+                # No database changes - leave created_by_id column in place to avoid table rebuild
+            ],
         ),
     ]

+ 44 - 0
archivebox/core/migrations/0037_remove_archiveresult_output_dir_and_more.py

@@ -0,0 +1,44 @@
+# Generated by Django 6.0 on 2025-12-29 06:45
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('core', '0036_remove_archiveresult_created_by'),
+    ]
+
+    operations = [
+        # Update Django's state only - database columns remain for backwards compat
+        migrations.SeparateDatabaseAndState(
+            state_operations=[
+                migrations.RemoveField(
+                    model_name='archiveresult',
+                    name='output_dir',
+                ),
+                migrations.RemoveField(
+                    model_name='snapshot',
+                    name='output_dir',
+                ),
+                migrations.AlterField(
+                    model_name='archiveresult',
+                    name='config',
+                    field=models.JSONField(blank=True, default=dict, null=True),
+                ),
+                migrations.AlterField(
+                    model_name='snapshot',
+                    name='config',
+                    field=models.JSONField(blank=True, default=dict, null=True),
+                ),
+                migrations.AlterField(
+                    model_name='snapshot',
+                    name='tags',
+                    field=models.ManyToManyField(blank=True, related_name='snapshot_set', through='core.SnapshotTag', through_fields=('snapshot', 'tag'), to='core.tag'),
+                ),
+            ],
+            database_operations=[
+                # No database changes - columns remain in place to avoid table rebuilds
+            ],
+        ),
+    ]

+ 84 - 0
archivebox/core/migrations/0038_fix_missing_columns.py

@@ -0,0 +1,84 @@
+# Add missing columns to ArchiveResult and remove created_by_id from Snapshot
+
+from django.db import migrations, models, connection
+import django.utils.timezone
+
+
+def add_columns_if_not_exist(apps, schema_editor):
+    """Add columns to ArchiveResult only if they don't already exist."""
+    with connection.cursor() as cursor:
+        # Get existing columns
+        cursor.execute("PRAGMA table_info(core_archiveresult)")
+        existing_columns = {row[1] for row in cursor.fetchall()}
+
+        # Add num_uses_failed if it doesn't exist
+        if 'num_uses_failed' not in existing_columns:
+            cursor.execute("ALTER TABLE core_archiveresult ADD COLUMN num_uses_failed integer unsigned NOT NULL DEFAULT 0 CHECK (num_uses_failed >= 0)")
+
+        # Add num_uses_succeeded if it doesn't exist
+        if 'num_uses_succeeded' not in existing_columns:
+            cursor.execute("ALTER TABLE core_archiveresult ADD COLUMN num_uses_succeeded integer unsigned NOT NULL DEFAULT 0 CHECK (num_uses_succeeded >= 0)")
+
+        # Add config if it doesn't exist
+        if 'config' not in existing_columns:
+            cursor.execute("ALTER TABLE core_archiveresult ADD COLUMN config text NULL")
+
+        # Add retry_at if it doesn't exist
+        if 'retry_at' not in existing_columns:
+            cursor.execute("ALTER TABLE core_archiveresult ADD COLUMN retry_at datetime NULL")
+            cursor.execute("CREATE INDEX IF NOT EXISTS core_archiveresult_retry_at_idx ON core_archiveresult(retry_at)")
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('core', '0037_remove_archiveresult_output_dir_and_more'),
+    ]
+
+    operations = [
+        # Add missing columns to ArchiveResult
+        migrations.SeparateDatabaseAndState(
+            state_operations=[
+                migrations.AddField(
+                    model_name='archiveresult',
+                    name='num_uses_failed',
+                    field=models.PositiveIntegerField(default=0),
+                ),
+                migrations.AddField(
+                    model_name='archiveresult',
+                    name='num_uses_succeeded',
+                    field=models.PositiveIntegerField(default=0),
+                ),
+                migrations.AddField(
+                    model_name='archiveresult',
+                    name='config',
+                    field=models.JSONField(blank=True, default=dict, null=True),
+                ),
+                migrations.AddField(
+                    model_name='archiveresult',
+                    name='retry_at',
+                    field=models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, null=True),
+                ),
+            ],
+            database_operations=[
+                migrations.RunPython(add_columns_if_not_exist, reverse_code=migrations.RunPython.noop),
+            ],
+        ),
+
+        # Drop created_by_id from Snapshot (database only, already removed from model in 0035)
+        migrations.SeparateDatabaseAndState(
+            state_operations=[
+                # No state changes - field already removed in 0035
+            ],
+            database_operations=[
+                migrations.RunSQL(
+                    sql="""
+                        -- Drop index first, then column
+                        DROP INDEX IF EXISTS core_snapshot_created_by_id_6dbd6149;
+                        ALTER TABLE core_snapshot DROP COLUMN created_by_id;
+                    """,
+                    reverse_sql=migrations.RunSQL.noop,
+                ),
+            ],
+        ),
+    ]

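Two details worth noting about 0038: the PRAGMA table_info guard makes the column additions idempotent regardless of which earlier migrations a given database has already seen, and the raw ALTER TABLE ... DROP COLUMN at the end is only supported by SQLite 3.35.0 and newer. A hedged sketch of a version-checked RunPython variant of that drop; the helper name is made up and the snippet is illustrative, not part of the commit:

    def drop_created_by_if_supported(apps, schema_editor):
        """Only attempt DROP COLUMN on SQLite builds that support it (>= 3.35)."""
        with schema_editor.connection.cursor() as cursor:
            cursor.execute("SELECT sqlite_version()")
            version = tuple(int(p) for p in cursor.fetchone()[0].split('.')[:2])
            if version >= (3, 35):
                cursor.execute("DROP INDEX IF EXISTS core_snapshot_created_by_id_6dbd6149")
                cursor.execute("ALTER TABLE core_snapshot DROP COLUMN created_by_id")
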
+ 30 - 0
archivebox/core/migrations/0039_fix_num_uses_values.py

@@ -0,0 +1,30 @@
+# Fix num_uses_failed, num_uses_succeeded, and depth values that were stored as strings instead of integers
+
+from django.db import migrations
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('core', '0038_fix_missing_columns'),
+    ]
+
+    operations = [
+        # Fix values that were inserted as literal strings (e.g. the column name) instead of integers
+        migrations.RunSQL(
+            sql="""
+                UPDATE core_snapshot
+                SET num_uses_failed = 0
+                WHERE typeof(num_uses_failed) = 'text' OR num_uses_failed = 'num_uses_failed';
+
+                UPDATE core_snapshot
+                SET num_uses_succeeded = 0
+                WHERE typeof(num_uses_succeeded) = 'text' OR num_uses_succeeded = 'num_uses_succeeded';
+
+                UPDATE core_snapshot
+                SET depth = 0
+                WHERE typeof(depth) = 'text' OR depth = 'depth';
+            """,
+            reverse_sql=migrations.RunSQL.noop,
+        ),
+    ]

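After 0039 runs, reusing the same typeof() predicate as a count is a quick sanity check that no text values remain; this assumes a Django shell opened inside the collection (e.g. via archivebox shell):

    from django.db import connection

    with connection.cursor() as cursor:
        cursor.execute(
            "SELECT COUNT(*) FROM core_snapshot "
            "WHERE typeof(num_uses_failed) = 'text' "
            "   OR typeof(num_uses_succeeded) = 'text' "
            "   OR typeof(depth) = 'text'"
        )
        print('rows still holding text values:', cursor.fetchone()[0])
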
+ 1 - 1
archivebox/core/models.py

@@ -911,7 +911,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
         )
 
         merged = 0
-        for dup in duplicates.iterator():
+        for dup in duplicates.iterator(chunk_size=500):
             snapshots = list(
                 cls.objects
                 .filter(url=dup['url'], timestamp=dup['timestamp'])

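For context on the one-line change above: .iterator() streams rows without populating the queryset cache, and chunk_size (2000 by default) controls how many rows are fetched per batch, so pinning it to 500 keeps memory bounded when a collection has many duplicate url/timestamp pairs. A standalone sketch of the same pattern, assuming a Django shell inside an ArchiveBox data dir:

    from django.db.models import Count
    from archivebox.core.models import Snapshot

    duplicates = (
        Snapshot.objects
        .values('url', 'timestamp')
        .annotate(count=Count('id'))
        .filter(count__gt=1)
    )
    # fetch 500 rows per batch instead of the default 2000
    for dup in duplicates.iterator(chunk_size=500):
        print(dup['url'], dup['count'])
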
+ 0 - 2638
archivebox/core/models.py.bak

@@ -1,2638 +0,0 @@
-__package__ = 'archivebox.core'
-
-from typing import Optional, Dict, Iterable, Any, List, TYPE_CHECKING
-from archivebox.uuid_compat import uuid7
-from datetime import datetime, timedelta
-from django_stubs_ext.db.models import TypedModelMeta
-
-import os
-import json
-from pathlib import Path
-
-from statemachine import State, registry
-
-from django.db import models
-from django.db.models import QuerySet, Value, Case, When, IntegerField
-from django.utils.functional import cached_property
-from django.utils.text import slugify
-from django.utils import timezone
-from django.core.cache import cache
-from django.urls import reverse, reverse_lazy
-from django.contrib import admin
-from django.conf import settings
-
-from archivebox.config import CONSTANTS
-from archivebox.misc.system import get_dir_size, atomic_write
-from archivebox.misc.util import parse_date, base_url, domain as url_domain, to_json, ts_to_date_str, urlencode, htmlencode, urldecode
-from archivebox.misc.hashing import get_dir_info
-from archivebox.hooks import (
-    EXTRACTOR_INDEXING_PRECEDENCE,
-    get_plugins, get_plugin_name, get_plugin_icon,
-    DEFAULT_PLUGIN_ICONS,
-)
-from archivebox.base_models.models import (
-    ModelWithUUID, ModelWithSerializers, ModelWithOutputDir,
-    ModelWithConfig, ModelWithNotes, ModelWithHealthStats,
-    get_or_create_system_user_pk,
-)
-from workers.models import ModelWithStateMachine, BaseStateMachine
-from workers.tasks import bg_archive_snapshot
-from archivebox.crawls.models import Crawl
-from archivebox.machine.models import NetworkInterface, Binary
-
-
-
-class Tag(ModelWithSerializers):
-    # Keep AutoField for compatibility with main branch migrations
-    # Don't use UUIDField here - requires complex FK transformation
-    id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID')
-    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False, related_name='tag_set')
-    created_at = models.DateTimeField(default=timezone.now, db_index=True, null=True)
-    modified_at = models.DateTimeField(auto_now=True)
-    name = models.CharField(unique=True, blank=False, max_length=100)
-    slug = models.SlugField(unique=True, blank=False, max_length=100, editable=False)
-
-    snapshot_set: models.Manager['Snapshot']
-
-    class Meta(TypedModelMeta):
-        verbose_name = "Tag"
-        verbose_name_plural = "Tags"
-
-    def __str__(self):
-        return self.name
-
-    def save(self, *args, **kwargs):
-        is_new = self._state.adding
-        if is_new:
-            self.slug = slugify(self.name)
-            existing = set(Tag.objects.filter(slug__startswith=self.slug).values_list("slug", flat=True))
-            i = None
-            while True:
-                slug = f"{slugify(self.name)}_{i}" if i else slugify(self.name)
-                if slug not in existing:
-                    self.slug = slug
-                    break
-                i = (i or 0) + 1
-        super().save(*args, **kwargs)
-
-        if is_new:
-            from archivebox.misc.logging_util import log_worker_event
-            log_worker_event(
-                worker_type='DB',
-                event='Created Tag',
-                indent_level=0,
-                metadata={
-                    'id': self.id,
-                    'name': self.name,
-                    'slug': self.slug,
-                },
-            )
-
-    @property
-    def api_url(self) -> str:
-        return reverse_lazy('api-1:get_tag', args=[self.id])
-
-    @staticmethod
-    def from_jsonl(record: Dict[str, Any], overrides: Dict[str, Any] = None):
-        """
-        Create/update Tag from JSONL record.
-
-        Args:
-            record: JSONL record with 'name' field
-            overrides: Optional dict with 'snapshot' to auto-attach tag
-
-        Returns:
-            Tag instance or None
-        """
-        from archivebox.misc.jsonl import get_or_create_tag
-
-        try:
-            tag = get_or_create_tag(record)
-
-            # Auto-attach to snapshot if in overrides
-            if overrides and 'snapshot' in overrides and tag:
-                overrides['snapshot'].tags.add(tag)
-
-            return tag
-        except ValueError:
-            return None
-
-
-class SnapshotTag(models.Model):
-    id = models.AutoField(primary_key=True)
-    snapshot = models.ForeignKey('Snapshot', db_column='snapshot_id', on_delete=models.CASCADE, to_field='id')
-    tag = models.ForeignKey(Tag, db_column='tag_id', on_delete=models.CASCADE, to_field='id')
-
-    class Meta:
-        db_table = 'core_snapshot_tags'
-        unique_together = [('snapshot', 'tag')]
-
-
-class SnapshotQuerySet(models.QuerySet):
-    """Custom QuerySet for Snapshot model with export methods that persist through .filter() etc."""
-
-    # =========================================================================
-    # Filtering Methods
-    # =========================================================================
-
-    FILTER_TYPES = {
-        'exact': lambda pattern: models.Q(url=pattern),
-        'substring': lambda pattern: models.Q(url__icontains=pattern),
-        'regex': lambda pattern: models.Q(url__iregex=pattern),
-        'domain': lambda pattern: models.Q(url__istartswith=f"http://{pattern}") | models.Q(url__istartswith=f"https://{pattern}") | models.Q(url__istartswith=f"ftp://{pattern}"),
-        'tag': lambda pattern: models.Q(tags__name=pattern),
-        'timestamp': lambda pattern: models.Q(timestamp=pattern),
-    }
-
-    def filter_by_patterns(self, patterns: List[str], filter_type: str = 'exact') -> 'SnapshotQuerySet':
-        """Filter snapshots by URL patterns using specified filter type"""
-        from archivebox.misc.logging import stderr
-
-        q_filter = models.Q()
-        for pattern in patterns:
-            try:
-                q_filter = q_filter | self.FILTER_TYPES[filter_type](pattern)
-            except KeyError:
-                stderr()
-                stderr(f'[X] Got invalid pattern for --filter-type={filter_type}:', color='red')
-                stderr(f'    {pattern}')
-                raise SystemExit(2)
-        return self.filter(q_filter)
-
-    def search(self, patterns: List[str]) -> 'SnapshotQuerySet':
-        """Search snapshots using the configured search backend"""
-        from archivebox.config.common import SEARCH_BACKEND_CONFIG
-        from archivebox.search import query_search_index
-        from archivebox.misc.logging import stderr
-
-        if not SEARCH_BACKEND_CONFIG.USE_SEARCHING_BACKEND:
-            stderr()
-            stderr('[X] The search backend is not enabled, set config.USE_SEARCHING_BACKEND = True', color='red')
-            raise SystemExit(2)
-
-        qsearch = self.none()
-        for pattern in patterns:
-            try:
-                qsearch |= query_search_index(pattern)
-            except:
-                raise SystemExit(2)
-        return self.all() & qsearch
-
-    # =========================================================================
-    # Export Methods
-    # =========================================================================
-
-    def to_json(self, with_headers: bool = False) -> str:
-        """Generate JSON index from snapshots"""
-        import sys
-        from datetime import datetime, timezone as tz
-        from archivebox.config import VERSION
-        from archivebox.config.common import SERVER_CONFIG
-
-        MAIN_INDEX_HEADER = {
-            'info': 'This is an index of site data archived by ArchiveBox: The self-hosted web archive.',
-            'schema': 'archivebox.index.json',
-            'copyright_info': SERVER_CONFIG.FOOTER_INFO,
-            'meta': {
-                'project': 'ArchiveBox',
-                'version': VERSION,
-                'git_sha': VERSION,
-                'website': 'https://ArchiveBox.io',
-                'docs': 'https://github.com/ArchiveBox/ArchiveBox/wiki',
-                'source': 'https://github.com/ArchiveBox/ArchiveBox',
-                'issues': 'https://github.com/ArchiveBox/ArchiveBox/issues',
-                'dependencies': {},
-            },
-        } if with_headers else {}
-
-        snapshot_dicts = [s.to_dict(extended=True) for s in self.iterator(chunk_size=500)]
-
-        if with_headers:
-            output = {
-                **MAIN_INDEX_HEADER,
-                'num_links': len(snapshot_dicts),
-                'updated': datetime.now(tz.utc),
-                'last_run_cmd': sys.argv,
-                'links': snapshot_dicts,
-            }
-        else:
-            output = snapshot_dicts
-        return to_json(output, indent=4, sort_keys=True)
-
-    def to_csv(self, cols: Optional[List[str]] = None, header: bool = True, separator: str = ',', ljust: int = 0) -> str:
-        """Generate CSV output from snapshots"""
-        cols = cols or ['timestamp', 'is_archived', 'url']
-        header_str = separator.join(col.ljust(ljust) for col in cols) if header else ''
-        row_strs = (s.to_csv(cols=cols, ljust=ljust, separator=separator) for s in self.iterator(chunk_size=500))
-        return '\n'.join((header_str, *row_strs))
-
-    def to_html(self, with_headers: bool = True) -> str:
-        """Generate main index HTML from snapshots"""
-        from datetime import datetime, timezone as tz
-        from django.template.loader import render_to_string
-        from archivebox.config import VERSION
-        from archivebox.config.common import SERVER_CONFIG
-        from archivebox.config.version import get_COMMIT_HASH
-
-        template = 'static_index.html' if with_headers else 'minimal_index.html'
-        snapshot_list = list(self.iterator(chunk_size=500))
-
-        return render_to_string(template, {
-            'version': VERSION,
-            'git_sha': get_COMMIT_HASH() or VERSION,
-            'num_links': str(len(snapshot_list)),
-            'date_updated': datetime.now(tz.utc).strftime('%Y-%m-%d'),
-            'time_updated': datetime.now(tz.utc).strftime('%Y-%m-%d %H:%M'),
-            'links': snapshot_list,
-            'FOOTER_INFO': SERVER_CONFIG.FOOTER_INFO,
-        })
-
-
-class SnapshotManager(models.Manager.from_queryset(SnapshotQuerySet)):
-    """Manager for Snapshot model - uses SnapshotQuerySet for chainable methods"""
-
-    def filter(self, *args, **kwargs):
-        domain = kwargs.pop('domain', None)
-        qs = super().filter(*args, **kwargs)
-        if domain:
-            qs = qs.filter(url__icontains=f'://{domain}')
-        return qs
-
-    def get_queryset(self):
-        return super().get_queryset().prefetch_related('tags', 'archiveresult_set')
-
-    # =========================================================================
-    # Import Methods
-    # =========================================================================
-
-    def remove(self, atomic: bool = False) -> tuple:
-        """Remove snapshots from the database"""
-        from django.db import transaction
-        if atomic:
-            with transaction.atomic():
-                return self.delete()
-        return self.delete()
-
-
-class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, ModelWithStateMachine):
-    id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
-    created_at = models.DateTimeField(default=timezone.now, db_index=True)
-    modified_at = models.DateTimeField(auto_now=True)
-
-    url = models.URLField(unique=False, db_index=True)  # URLs can appear in multiple crawls
-    timestamp = models.CharField(max_length=32, unique=True, db_index=True, editable=False)
-    bookmarked_at = models.DateTimeField(default=timezone.now, db_index=True)
-    crawl: Crawl = models.ForeignKey(Crawl, on_delete=models.CASCADE, null=False, related_name='snapshot_set', db_index=True)  # type: ignore[assignment]
-    parent_snapshot = models.ForeignKey('self', on_delete=models.SET_NULL, null=True, blank=True, related_name='child_snapshots', db_index=True, help_text='Parent snapshot that discovered this URL (for recursive crawling)')
-
-    title = models.CharField(max_length=512, null=True, blank=True, db_index=True)
-    downloaded_at = models.DateTimeField(default=None, null=True, editable=False, db_index=True, blank=True)
-    depth = models.PositiveSmallIntegerField(default=0, db_index=True)  # 0 for root snapshot, 1+ for discovered URLs
-    fs_version = models.CharField(max_length=10, default='0.9.0', help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). Used to trigger lazy migration on save().')
-    current_step = models.PositiveSmallIntegerField(default=0, db_index=True, help_text='Current hook step being executed (0-9). Used for sequential hook execution.')
-
-    retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)
-    status = ModelWithStateMachine.StatusField(choices=ModelWithStateMachine.StatusChoices, default=ModelWithStateMachine.StatusChoices.QUEUED)
-    config = models.JSONField(default=dict, null=False, blank=False, editable=True)
-    notes = models.TextField(blank=True, null=False, default='')
-    output_dir = models.FilePathField(path=CONSTANTS.ARCHIVE_DIR, recursive=True, match='.*', default=None, null=True, blank=True, editable=True)
-
-    tags = models.ManyToManyField(Tag, blank=True, through=SnapshotTag, related_name='snapshot_set', through_fields=('snapshot', 'tag'))
-
-    state_machine_name = 'core.models.SnapshotMachine'
-    state_field_name = 'status'
-    retry_at_field_name = 'retry_at'
-    StatusChoices = ModelWithStateMachine.StatusChoices
-    active_state = StatusChoices.STARTED
-
-    objects = SnapshotManager()
-    archiveresult_set: models.Manager['ArchiveResult']
-
-    class Meta(TypedModelMeta):
-        verbose_name = "Snapshot"
-        verbose_name_plural = "Snapshots"
-        constraints = [
-            # Allow same URL in different crawls, but not duplicates within same crawl
-            models.UniqueConstraint(fields=['url', 'crawl'], name='unique_url_per_crawl'),
-            # Global timestamp uniqueness for 1:1 symlink mapping
-            models.UniqueConstraint(fields=['timestamp'], name='unique_timestamp'),
-        ]
-
-    def __str__(self):
-        return f'[{self.id}] {self.url[:64]}'
-
-    def save(self, *args, **kwargs):
-        is_new = self._state.adding
-        if not self.bookmarked_at:
-            self.bookmarked_at = self.created_at or timezone.now()
-        if not self.timestamp:
-            self.timestamp = str(self.bookmarked_at.timestamp())
-
-        # Migrate filesystem if needed (happens automatically on save)
-        if self.pk and self.fs_migration_needed:
-            from django.db import transaction
-            with transaction.atomic():
-                # Walk through migration chain automatically
-                current = self.fs_version
-                target = self._fs_current_version()
-
-                while current != target:
-                    next_ver = self._fs_next_version(current)
-                    method = f'_fs_migrate_from_{current.replace(".", "_")}_to_{next_ver.replace(".", "_")}'
-
-                    # Only run if method exists (most are no-ops)
-                    if hasattr(self, method):
-                        getattr(self, method)()
-
-                    current = next_ver
-
-                # Update version (still in transaction)
-                self.fs_version = target
-
-        super().save(*args, **kwargs)
-        if self.crawl and self.url not in self.crawl.urls:
-            self.crawl.urls += f'\n{self.url}'
-            self.crawl.save()
-
-        if is_new:
-            from archivebox.misc.logging_util import log_worker_event
-            log_worker_event(
-                worker_type='DB',
-                event='Created Snapshot',
-                indent_level=2,
-                url=self.url,
-                metadata={
-                    'id': str(self.id),
-                    'crawl_id': str(self.crawl_id) if self.crawl_id else None,
-                    'depth': self.depth,
-                    'status': self.status,
-                },
-            )
-
-    # =========================================================================
-    # Filesystem Migration Methods
-    # =========================================================================
-
-    @staticmethod
-    def _fs_current_version() -> str:
-        """Get current ArchiveBox filesystem version (normalized to x.x.0 format)"""
-        from archivebox.config import VERSION
-        # Normalize version to x.x.0 format (e.g., "0.9.0rc1" -> "0.9.0")
-        parts = VERSION.split('.')
-        if len(parts) >= 2:
-            major, minor = parts[0], parts[1]
-            # Strip any non-numeric suffix from minor version
-            minor = ''.join(c for c in minor if c.isdigit())
-            return f'{major}.{minor}.0'
-        return '0.9.0'  # Fallback if version parsing fails
-
-    @property
-    def fs_migration_needed(self) -> bool:
-        """Check if snapshot needs filesystem migration"""
-        return self.fs_version != self._fs_current_version()
-
-    def _fs_next_version(self, version: str) -> str:
-        """Get next version in migration chain"""
-        chain = ['0.7.0', '0.8.0', '0.9.0']
-        try:
-            idx = chain.index(version)
-            return chain[idx + 1] if idx + 1 < len(chain) else self._fs_current_version()
-        except ValueError:
-            # Unknown version - skip to current
-            return self._fs_current_version()
-
-    def _fs_migrate_from_0_7_0_to_0_8_0(self):
-        """Migration from 0.7.0 to 0.8.0 layout (no-op)"""
-        # 0.7 and 0.8 both used archive/<timestamp>
-        # Nothing to do!
-        pass
-
-    def _fs_migrate_from_0_8_0_to_0_9_0(self):
-        """
-        Migrate from flat to nested structure.
-
-        0.8.x: archive/{timestamp}/
-        0.9.x: users/{user}/snapshots/YYYYMMDD/{domain}/{uuid}/
-
-        Transaction handling:
-        1. Copy files INSIDE transaction
-        2. Create symlink INSIDE transaction
-        3. Update fs_version INSIDE transaction (done by save())
-        4. Exit transaction (DB commit)
-        5. Delete old files OUTSIDE transaction (after commit)
-        """
-        import shutil
-        from django.db import transaction
-
-        old_dir = self.get_storage_path_for_version('0.8.0')
-        new_dir = self.get_storage_path_for_version('0.9.0')
-
-        if not old_dir.exists() or old_dir == new_dir or new_dir.exists():
-            return
-
-        new_dir.mkdir(parents=True, exist_ok=True)
-
-        # Copy all files (idempotent)
-        for old_file in old_dir.rglob('*'):
-            if not old_file.is_file():
-                continue
-
-            rel_path = old_file.relative_to(old_dir)
-            new_file = new_dir / rel_path
-
-            # Skip if already copied
-            if new_file.exists() and new_file.stat().st_size == old_file.stat().st_size:
-                continue
-
-            new_file.parent.mkdir(parents=True, exist_ok=True)
-            shutil.copy2(old_file, new_file)
-
-        # Verify all copied
-        old_files = {f.relative_to(old_dir): f.stat().st_size
-                     for f in old_dir.rglob('*') if f.is_file()}
-        new_files = {f.relative_to(new_dir): f.stat().st_size
-                     for f in new_dir.rglob('*') if f.is_file()}
-
-        if old_files.keys() != new_files.keys():
-            missing = old_files.keys() - new_files.keys()
-            raise Exception(f"Migration incomplete: missing {missing}")
-
-        # Create backwards-compat symlink (INSIDE transaction)
-        symlink_path = CONSTANTS.ARCHIVE_DIR / self.timestamp
-        if symlink_path.is_symlink():
-            symlink_path.unlink()
-
-        if not symlink_path.exists() or symlink_path == old_dir:
-            symlink_path.symlink_to(new_dir, target_is_directory=True)
-
-        # Schedule old directory deletion AFTER transaction commits
-        transaction.on_commit(lambda: self._cleanup_old_migration_dir(old_dir))
-
-    def _cleanup_old_migration_dir(self, old_dir: Path):
-        """
-        Delete old directory after successful migration.
-        Called via transaction.on_commit() after DB commit succeeds.
-        """
-        import shutil
-        import logging
-
-        if old_dir.exists() and not old_dir.is_symlink():
-            try:
-                shutil.rmtree(old_dir)
-            except Exception as e:
-                # Log but don't raise - migration succeeded, this is just cleanup
-                logging.getLogger('archivebox.migration').warning(
-                    f"Could not remove old migration directory {old_dir}: {e}"
-                )
-
-    # =========================================================================
-    # Path Calculation and Migration Helpers
-    # =========================================================================
-
-    @staticmethod
-    def extract_domain_from_url(url: str) -> str:
-        """
-        Extract domain from URL for 0.9.x path structure.
-        Uses full hostname with sanitized special chars.
-
-        Examples:
-            https://example.com:8080 → example.com_8080
-            https://sub.example.com → sub.example.com
-            file:///path → localhost
-            data:text/html → data
-        """
-        from urllib.parse import urlparse
-
-        try:
-            parsed = urlparse(url)
-
-            if parsed.scheme in ('http', 'https'):
-                if parsed.port:
-                    return f"{parsed.hostname}_{parsed.port}".replace(':', '_')
-                return parsed.hostname or 'unknown'
-            elif parsed.scheme == 'file':
-                return 'localhost'
-            elif parsed.scheme:
-                return parsed.scheme
-            else:
-                return 'unknown'
-        except Exception:
-            return 'unknown'
-
-    def get_storage_path_for_version(self, version: str) -> Path:
-        """
-        Calculate storage path for specific filesystem version.
-        Centralizes path logic so it's reusable.
-
-        0.7.x/0.8.x: archive/{timestamp}
-        0.9.x: users/{username}/snapshots/YYYYMMDD/{domain}/{uuid}/
-        """
-        from datetime import datetime
-
-        if version in ('0.7.0', '0.8.0'):
-            return CONSTANTS.ARCHIVE_DIR / self.timestamp
-
-        elif version in ('0.9.0', '1.0.0'):
-            username = self.crawl.created_by.username
-
-            # Use created_at for date grouping (fallback to timestamp)
-            if self.created_at:
-                date_str = self.created_at.strftime('%Y%m%d')
-            else:
-                date_str = datetime.fromtimestamp(float(self.timestamp)).strftime('%Y%m%d')
-
-            domain = self.extract_domain_from_url(self.url)
-
-            return (
-                CONSTANTS.DATA_DIR / 'users' / username / 'snapshots' /
-                date_str / domain / str(self.id)
-            )
-        else:
-            # Unknown version - use current
-            return self.get_storage_path_for_version(self._fs_current_version())
-
-    # =========================================================================
-    # Loading and Creation from Filesystem (Used by archivebox update ONLY)
-    # =========================================================================
-
-    @classmethod
-    def load_from_directory(cls, snapshot_dir: Path) -> Optional['Snapshot']:
-        """
-        Load existing Snapshot from DB by reading index.json.
-
-        Reads index.json, extracts url+timestamp, queries DB.
-        Returns existing Snapshot or None if not found/invalid.
-        Does NOT create new snapshots.
-
-        ONLY used by: archivebox update (for orphan detection)
-        """
-        import json
-
-        index_path = snapshot_dir / 'index.json'
-        if not index_path.exists():
-            return None
-
-        try:
-            with open(index_path) as f:
-                data = json.load(f)
-        except:
-            return None
-
-        url = data.get('url')
-        if not url:
-            return None
-
-        # Get timestamp - prefer index.json, fallback to folder name
-        timestamp = cls._select_best_timestamp(
-            index_timestamp=data.get('timestamp'),
-            folder_name=snapshot_dir.name
-        )
-
-        if not timestamp:
-            return None
-
-        # Look up existing
-        try:
-            return cls.objects.get(url=url, timestamp=timestamp)
-        except cls.DoesNotExist:
-            return None
-        except cls.MultipleObjectsReturned:
-            # Should not happen with unique constraint
-            return cls.objects.filter(url=url, timestamp=timestamp).first()
-
-    @classmethod
-    def create_from_directory(cls, snapshot_dir: Path) -> Optional['Snapshot']:
-        """
-        Create new Snapshot from orphaned directory.
-
-        Validates timestamp, ensures uniqueness.
-        Returns new UNSAVED Snapshot or None if invalid.
-
-        ONLY used by: archivebox update (for orphan import)
-        """
-        import json
-
-        index_path = snapshot_dir / 'index.json'
-        if not index_path.exists():
-            return None
-
-        try:
-            with open(index_path) as f:
-                data = json.load(f)
-        except:
-            return None
-
-        url = data.get('url')
-        if not url:
-            return None
-
-        # Get and validate timestamp
-        timestamp = cls._select_best_timestamp(
-            index_timestamp=data.get('timestamp'),
-            folder_name=snapshot_dir.name
-        )
-
-        if not timestamp:
-            return None
-
-        # Ensure uniqueness (reuses existing logic from create_or_update_from_dict)
-        timestamp = cls._ensure_unique_timestamp(url, timestamp)
-
-        # Detect version
-        fs_version = cls._detect_fs_version_from_index(data)
-
-        return cls(
-            url=url,
-            timestamp=timestamp,
-            title=data.get('title', ''),
-            fs_version=fs_version,
-            created_by_id=get_or_create_system_user_pk(),
-        )
-
-    @staticmethod
-    def _select_best_timestamp(index_timestamp: str, folder_name: str) -> Optional[str]:
-        """
-        Select best timestamp from index.json vs folder name.
-
-        Validates range (1995-2035).
-        Prefers index.json if valid.
-        """
-        def is_valid_timestamp(ts):
-            try:
-                ts_int = int(float(ts))
-                # 1995-01-01 to 2035-12-31
-                return 788918400 <= ts_int <= 2082758400
-            except:
-                return False
-
-        index_valid = is_valid_timestamp(index_timestamp) if index_timestamp else False
-        folder_valid = is_valid_timestamp(folder_name)
-
-        if index_valid:
-            return str(int(float(index_timestamp)))
-        elif folder_valid:
-            return str(int(float(folder_name)))
-        else:
-            return None
-
-    @classmethod
-    def _ensure_unique_timestamp(cls, url: str, timestamp: str) -> str:
-        """
-        Ensure timestamp is globally unique.
-        If collision with different URL, increment by 1 until unique.
-
-        NOTE: Logic already exists in create_or_update_from_dict (line 266-267)
-        This is just an extracted, reusable version.
-        """
-        while cls.objects.filter(timestamp=timestamp).exclude(url=url).exists():
-            timestamp = str(int(float(timestamp)) + 1)
-        return timestamp
-
-    @staticmethod
-    def _detect_fs_version_from_index(data: dict) -> str:
-        """
-        Detect fs_version from index.json structure.
-
-        - Has fs_version field: use it
-        - Has history dict: 0.7.0
-        - Has archive_results list: 0.8.0
-        - Default: 0.7.0
-        """
-        if 'fs_version' in data:
-            return data['fs_version']
-        if 'history' in data and 'archive_results' not in data:
-            return '0.7.0'
-        if 'archive_results' in data:
-            return '0.8.0'
-        return '0.7.0'
-
-    # =========================================================================
-    # Index.json Reconciliation
-    # =========================================================================
-
-    def reconcile_with_index_json(self):
-        """
-        Merge index.json with DB. DB is source of truth.
-
-        - Title: longest non-URL
-        - Tags: union
-        - ArchiveResults: keep both (by plugin+start_ts)
-
-        Writes back in 0.9.x format.
-
-        Used by: archivebox update (to sync index.json with DB)
-        """
-        import json
-
-        index_path = Path(self.output_dir) / 'index.json'
-
-        index_data = {}
-        if index_path.exists():
-            try:
-                with open(index_path) as f:
-                    index_data = json.load(f)
-            except:
-                pass
-
-        # Merge title
-        self._merge_title_from_index(index_data)
-
-        # Merge tags
-        self._merge_tags_from_index(index_data)
-
-        # Merge ArchiveResults
-        self._merge_archive_results_from_index(index_data)
-
-        # Write back
-        self.write_index_json()
-
-    def _merge_title_from_index(self, index_data: dict):
-        """Merge title - prefer longest non-URL title."""
-        index_title = index_data.get('title', '').strip()
-        db_title = self.title or ''
-
-        candidates = [t for t in [index_title, db_title] if t and t != self.url]
-        if candidates:
-            best_title = max(candidates, key=len)
-            if self.title != best_title:
-                self.title = best_title
-
-    def _merge_tags_from_index(self, index_data: dict):
-        """Merge tags - union of both sources."""
-        from django.db import transaction
-
-        index_tags = set(index_data.get('tags', '').split(',')) if index_data.get('tags') else set()
-        index_tags = {t.strip() for t in index_tags if t.strip()}
-
-        db_tags = set(self.tags.values_list('name', flat=True))
-
-        new_tags = index_tags - db_tags
-        if new_tags:
-            with transaction.atomic():
-                for tag_name in new_tags:
-                    tag, _ = Tag.objects.get_or_create(name=tag_name)
-                    self.tags.add(tag)
-
-    def _merge_archive_results_from_index(self, index_data: dict):
-        """Merge ArchiveResults - keep both (by plugin+start_ts)."""
-        existing = {
-            (ar.plugin, ar.start_ts): ar
-            for ar in ArchiveResult.objects.filter(snapshot=self)
-        }
-
-        # Handle 0.8.x format (archive_results list)
-        for result_data in index_data.get('archive_results', []):
-            self._create_archive_result_if_missing(result_data, existing)
-
-        # Handle 0.7.x format (history dict)
-        if 'history' in index_data and isinstance(index_data['history'], dict):
-            for plugin, result_list in index_data['history'].items():
-                if isinstance(result_list, list):
-                    for result_data in result_list:
-                        # Support both old 'extractor' and new 'plugin' keys for backwards compat
-                        result_data['plugin'] = result_data.get('plugin') or result_data.get('extractor') or plugin
-                        self._create_archive_result_if_missing(result_data, existing)
-
-    def _create_archive_result_if_missing(self, result_data: dict, existing: dict):
-        """Create ArchiveResult if not already in DB."""
-        from dateutil import parser
-
-        # Support both old 'extractor' and new 'plugin' keys for backwards compat
-        plugin = result_data.get('plugin') or result_data.get('extractor', '')
-        if not plugin:
-            return
-
-        start_ts = None
-        if result_data.get('start_ts'):
-            try:
-                start_ts = parser.parse(result_data['start_ts'])
-            except:
-                pass
-
-        if (plugin, start_ts) in existing:
-            return
-
-        try:
-            end_ts = None
-            if result_data.get('end_ts'):
-                try:
-                    end_ts = parser.parse(result_data['end_ts'])
-                except:
-                    pass
-
-            ArchiveResult.objects.create(
-                snapshot=self,
-                plugin=plugin,
-                hook_name=result_data.get('hook_name', ''),
-                status=result_data.get('status', 'failed'),
-                output_str=result_data.get('output', ''),
-                cmd=result_data.get('cmd', []),
-                pwd=result_data.get('pwd', str(self.output_dir)),
-                start_ts=start_ts,
-                end_ts=end_ts,
-                created_by=self.crawl.created_by,
-            )
-        except:
-            pass
-
-    def write_index_json(self):
-        """Write index.json in 0.9.x format."""
-        import json
-
-        index_path = Path(self.output_dir) / 'index.json'
-
-        data = {
-            'url': self.url,
-            'timestamp': self.timestamp,
-            'title': self.title or '',
-            'tags': ','.join(sorted(self.tags.values_list('name', flat=True))),
-            'fs_version': self.fs_version,
-            'bookmarked_at': self.bookmarked_at.isoformat() if self.bookmarked_at else None,
-            'created_at': self.created_at.isoformat() if self.created_at else None,
-            'archive_results': [
-                {
-                    'plugin': ar.plugin,
-                    'status': ar.status,
-                    'start_ts': ar.start_ts.isoformat() if ar.start_ts else None,
-                    'end_ts': ar.end_ts.isoformat() if ar.end_ts else None,
-                    'output': ar.output_str or '',
-                    'cmd': ar.cmd if isinstance(ar.cmd, list) else [],
-                    'pwd': ar.pwd,
-                }
-                for ar in ArchiveResult.objects.filter(snapshot=self).order_by('start_ts')
-            ],
-        }
-
-        index_path.parent.mkdir(parents=True, exist_ok=True)
-        with open(index_path, 'w') as f:
-            json.dump(data, f, indent=2, sort_keys=True)
-
-    # =========================================================================
-    # Snapshot Utilities
-    # =========================================================================
-
-    @staticmethod
-    def move_directory_to_invalid(snapshot_dir: Path):
-        """
-        Move invalid directory to data/invalid/YYYYMMDD/.
-
-        Used by: archivebox update (when encountering invalid directories)
-        """
-        from datetime import datetime
-        import shutil
-
-        invalid_dir = CONSTANTS.DATA_DIR / 'invalid' / datetime.now().strftime('%Y%m%d')
-        invalid_dir.mkdir(parents=True, exist_ok=True)
-
-        dest = invalid_dir / snapshot_dir.name
-        counter = 1
-        while dest.exists():
-            dest = invalid_dir / f"{snapshot_dir.name}_{counter}"
-            counter += 1
-
-        try:
-            shutil.move(str(snapshot_dir), str(dest))
-        except:
-            pass
-
-    @classmethod
-    def find_and_merge_duplicates(cls) -> int:
-        """
-        Find and merge snapshots with same url:timestamp.
-        Returns count of duplicate sets merged.
-
-        Used by: archivebox update (Phase 3: deduplication)
-        """
-        from django.db.models import Count
-
-        duplicates = (
-            cls.objects
-            .values('url', 'timestamp')
-            .annotate(count=Count('id'))
-            .filter(count__gt=1)
-        )
-
-        merged = 0
-        for dup in duplicates.iterator():
-            snapshots = list(
-                cls.objects
-                .filter(url=dup['url'], timestamp=dup['timestamp'])
-                .order_by('created_at')  # Keep oldest
-            )
-
-            if len(snapshots) > 1:
-                try:
-                    cls._merge_snapshots(snapshots)
-                    merged += 1
-                except:
-                    pass
-
-        return merged
-
-    @classmethod
-    def _merge_snapshots(cls, snapshots: list['Snapshot']):
-        """
-        Merge exact duplicates.
-        Keep oldest, union files + ArchiveResults.
-        """
-        import shutil
-
-        keeper = snapshots[0]
-        duplicates = snapshots[1:]
-
-        keeper_dir = Path(keeper.output_dir)
-
-        for dup in duplicates:
-            dup_dir = Path(dup.output_dir)
-
-            # Merge files
-            if dup_dir.exists() and dup_dir != keeper_dir:
-                for dup_file in dup_dir.rglob('*'):
-                    if not dup_file.is_file():
-                        continue
-
-                    rel = dup_file.relative_to(dup_dir)
-                    keeper_file = keeper_dir / rel
-
-                    if not keeper_file.exists():
-                        keeper_file.parent.mkdir(parents=True, exist_ok=True)
-                        shutil.copy2(dup_file, keeper_file)
-
-                try:
-                    shutil.rmtree(dup_dir)
-                except:
-                    pass
-
-            # Merge tags
-            for tag in dup.tags.all():
-                keeper.tags.add(tag)
-
-            # Move ArchiveResults
-            ArchiveResult.objects.filter(snapshot=dup).update(snapshot=keeper)
-
-            # Delete
-            dup.delete()
-
-    # =========================================================================
-    # Output Directory Properties
-    # =========================================================================
-
-    @property
-    def output_dir_parent(self) -> str:
-        return 'archive'
-
-    @property
-    def output_dir_name(self) -> str:
-        return str(self.timestamp)
-
-    def archive(self, overwrite=False, methods=None):
-        return bg_archive_snapshot(self, overwrite=overwrite, methods=methods)
-
-    @admin.display(description='Tags')
-    def tags_str(self, nocache=True) -> str | None:
-        calc_tags_str = lambda: ','.join(sorted(tag.name for tag in self.tags.all()))
-        if hasattr(self, '_prefetched_objects_cache') and 'tags' in self._prefetched_objects_cache:
-            return calc_tags_str()
-        cache_key = f'{self.pk}-tags'
-        return cache.get_or_set(cache_key, calc_tags_str) if not nocache else calc_tags_str()
-
-    def icons(self) -> str:
-        """Generate HTML icons showing which extractor plugins have succeeded for this snapshot"""
-        from django.utils.html import format_html, mark_safe
-
-        cache_key = f'result_icons:{self.pk}:{(self.downloaded_at or self.modified_at or self.created_at or self.bookmarked_at).timestamp()}'
-
-        def calc_icons():
-            if hasattr(self, '_prefetched_objects_cache') and 'archiveresult_set' in self._prefetched_objects_cache:
-                archive_results = {r.plugin: r for r in self.archiveresult_set.all() if r.status == "succeeded" and (r.output_files or r.output_str)}
-            else:
-                # Filter for results that have either output_files or output_str
-                from django.db.models import Q
-                archive_results = {r.plugin: r for r in self.archiveresult_set.filter(
-                    Q(status="succeeded") & (Q(output_files__isnull=False) | ~Q(output_str=''))
-                )}
-
-            path = self.archive_path
-            canon = self.canonical_outputs()
-            output = ""
-            output_template = '<a href="/{}/{}" class="exists-{}" title="{}">{}</a> &nbsp;'
-
-            # Get all plugins from hooks system (sorted by numeric prefix)
-            all_plugins = [get_plugin_name(e) for e in get_plugins()]
-
-            for plugin in all_plugins:
-                result = archive_results.get(plugin)
-                existing = result and result.status == 'succeeded' and (result.output_files or result.output_str)
-                icon = get_plugin_icon(plugin)
-                output += format_html(
-                    output_template,
-                    path,
-                    canon.get(plugin, plugin + '/'),
-                    str(bool(existing)),
-                    plugin,
-                    icon
-                )
-
-            return format_html('<span class="files-icons" style="font-size: 1.1em; opacity: 0.8; min-width: 240px; display: inline-block">{}</span>', mark_safe(output))
-
-        cache_result = cache.get(cache_key)
-        if cache_result:
-            return cache_result
-
-        fresh_result = calc_icons()
-        cache.set(cache_key, fresh_result, timeout=60 * 60 * 24)
-        return fresh_result
-
-    @property
-    def api_url(self) -> str:
-        return reverse_lazy('api-1:get_snapshot', args=[self.id])
-
-    def get_absolute_url(self):
-        return f'/{self.archive_path}'
-
-    @cached_property
-    def domain(self) -> str:
-        return url_domain(self.url)
-
-    @cached_property
-    def output_dir(self):
-        """The filesystem path to the snapshot's output directory."""
-        import os
-
-        current_path = self.get_storage_path_for_version(self.fs_version)
-
-        if current_path.exists():
-            return str(current_path)
-
-        # Check for backwards-compat symlink
-        old_path = CONSTANTS.ARCHIVE_DIR / self.timestamp
-        if old_path.is_symlink():
-            return str(Path(os.readlink(old_path)).resolve())
-        elif old_path.exists():
-            return str(old_path)
-
-        return str(current_path)
-
-    @cached_property
-    def archive_path(self):
-        return f'{CONSTANTS.ARCHIVE_DIR_NAME}/{self.timestamp}'
-
-    @cached_property
-    def archive_size(self):
-        try:
-            return get_dir_size(self.output_dir)[0]
-        except Exception:
-            return 0
-
-    def save_tags(self, tags: Iterable[str] = ()) -> None:
-        tags_id = [Tag.objects.get_or_create(name=tag)[0].pk for tag in tags if tag.strip()]
-        self.tags.clear()
-        self.tags.add(*tags_id)
-
-    def pending_archiveresults(self) -> QuerySet['ArchiveResult']:
-        return self.archiveresult_set.exclude(status__in=ArchiveResult.FINAL_OR_ACTIVE_STATES)
-
-    def run(self) -> list['ArchiveResult']:
-        """
-        Execute snapshot by creating pending ArchiveResults for all enabled hooks.
-
-        Called by: SnapshotMachine.enter_started()
-
-        Hook Lifecycle:
-            1. discover_hooks('Snapshot') → finds all plugin hooks
-            2. For each hook:
-               - Create ArchiveResult with status=QUEUED
-               - Store hook_name (e.g., 'on_Snapshot__50_wget.py')
-            3. ArchiveResults execute independently via ArchiveResultMachine
-            4. Hook execution happens in ArchiveResult.run(), NOT here
-
-        Returns:
-            list[ArchiveResult]: Newly created pending results
-        """
-        return self.create_pending_archiveresults()
-
-    def cleanup(self):
-        """
-        Clean up background ArchiveResult hooks.
-
-        Called by the state machine when entering the 'sealed' state.
-        Kills any background hooks and finalizes their ArchiveResults.
-        """
-        from archivebox.hooks import kill_process
-
-        # Kill any background ArchiveResult hooks
-        if not self.OUTPUT_DIR.exists():
-            return
-
-        # Find all .pid files in this snapshot's output directory
-        for pid_file in self.OUTPUT_DIR.glob('**/*.pid'):
-            kill_process(pid_file, validate=True)
-
-        # Update all STARTED ArchiveResults from filesystem
-        results = self.archiveresult_set.filter(status=ArchiveResult.StatusChoices.STARTED)
-        for ar in results:
-            ar.update_from_output()
-
-    def has_running_background_hooks(self) -> bool:
-        """
-        Check if any ArchiveResult background hooks are still running.
-
-        Used by state machine to determine if snapshot is finished.
-        """
-        from archivebox.hooks import process_is_alive
-
-        if not self.OUTPUT_DIR.exists():
-            return False
-
-        for plugin_dir in self.OUTPUT_DIR.iterdir():
-            if not plugin_dir.is_dir():
-                continue
-            pid_file = plugin_dir / 'hook.pid'
-            if process_is_alive(pid_file):
-                return True
-
-        return False
-
-    @staticmethod
-    def from_jsonl(record: Dict[str, Any], overrides: Dict[str, Any] = None, queue_for_extraction: bool = True):
-        """
-        Create/update Snapshot from JSONL record or dict.
-
-        Unified method that handles:
-        - ID-based patching: {"id": "...", "title": "new title"}
-        - URL-based create/update: {"url": "...", "title": "...", "tags": "..."}
-        - Auto-creates Crawl if not provided
-        - Optionally queues for extraction
-
-        Args:
-            record: Dict with 'url' (for create) or 'id' (for patch), plus other fields
-            overrides: Dict with 'crawl', 'snapshot' (parent), 'created_by_id'
-            queue_for_extraction: If True, sets status=QUEUED and retry_at (default: True)
-
-        Returns:
-            Snapshot instance or None
-        """
-        import re
-        from django.utils import timezone
-        from archivebox.misc.util import parse_date
-        from archivebox.base_models.models import get_or_create_system_user_pk
-        from archivebox.config.common import GENERAL_CONFIG
-
-        overrides = overrides or {}
-
-        # If 'id' is provided, lookup and patch that specific snapshot
-        snapshot_id = record.get('id')
-        if snapshot_id:
-            try:
-                snapshot = Snapshot.objects.get(id=snapshot_id)
-
-                # Generically update all fields present in record
-                update_fields = []
-                for field_name, value in record.items():
-                    # Skip internal fields
-                    if field_name in ('id', 'type'):
-                        continue
-
-                    # Skip if field doesn't exist on model
-                    if not hasattr(snapshot, field_name):
-                        continue
-
-                    # Special parsing for date fields
-                    if field_name in ('bookmarked_at', 'retry_at', 'created_at', 'modified_at'):
-                        if value and isinstance(value, str):
-                            value = parse_date(value)
-
-                    # Update field if value is provided and different
-                    if value is not None and getattr(snapshot, field_name) != value:
-                        setattr(snapshot, field_name, value)
-                        update_fields.append(field_name)
-
-                if update_fields:
-                    snapshot.save(update_fields=update_fields + ['modified_at'])
-
-                return snapshot
-            except Snapshot.DoesNotExist:
-                # ID not found, fall through to create-by-URL logic
-                pass
-
-        url = record.get('url')
-        if not url:
-            return None
-
-        # Determine or create crawl (every snapshot must have a crawl)
-        crawl = overrides.get('crawl')
-        parent_snapshot = overrides.get('snapshot')  # Parent snapshot
-        created_by_id = overrides.get('created_by_id') or (parent_snapshot.crawl.created_by_id if parent_snapshot else None) or get_or_create_system_user_pk()
-
-        # If no crawl provided, inherit from parent or auto-create one
-        if not crawl:
-            if parent_snapshot:
-                # Inherit crawl from parent snapshot
-                crawl = parent_snapshot.crawl
-            else:
-                # Auto-create a single-URL crawl
-                from archivebox.crawls.models import Crawl
-                from archivebox.config import CONSTANTS
-
-                timestamp_str = timezone.now().strftime("%Y-%m-%d__%H-%M-%S")
-                sources_file = CONSTANTS.SOURCES_DIR / f'{timestamp_str}__auto_crawl.txt'
-                sources_file.parent.mkdir(parents=True, exist_ok=True)
-                sources_file.write_text(url)
-
-                crawl = Crawl.objects.create(
-                    urls=url,
-                    max_depth=0,
-                    label=f'auto-created for {url[:50]}',
-                    created_by_id=created_by_id,
-                )
-
-        # Parse tags
-        tags_str = record.get('tags', '')
-        tag_list = []
-        if tags_str:
-            tag_list = list(dict.fromkeys(
-                tag.strip() for tag in re.split(GENERAL_CONFIG.TAG_SEPARATOR_PATTERN, tags_str)
-                if tag.strip()
-            ))
-
-        # Get most recent snapshot with this URL (URLs can exist in multiple crawls)
-        snapshot = Snapshot.objects.filter(url=url).order_by('-created_at').first()
-
-        title = record.get('title')
-        timestamp = record.get('timestamp')
-
-        if snapshot:
-            # Update existing snapshot
-            if title and (not snapshot.title or len(title) > len(snapshot.title or '')):
-                snapshot.title = title
-                snapshot.save(update_fields=['title', 'modified_at'])
-        else:
-            # Create new snapshot
-            if timestamp:
-                while Snapshot.objects.filter(timestamp=timestamp).exists():
-                    timestamp = str(float(timestamp) + 1.0)
-
-            snapshot = Snapshot.objects.create(
-                url=url,
-                timestamp=timestamp,
-                title=title,
-                crawl=crawl,
-            )
-
-        # Update tags
-        if tag_list:
-            existing_tags = set(snapshot.tags.values_list('name', flat=True))
-            new_tags = set(tag_list) | existing_tags
-            snapshot.save_tags(new_tags)
-
-        # Queue for extraction and update additional fields
-        update_fields = []
-
-        if queue_for_extraction:
-            snapshot.status = Snapshot.StatusChoices.QUEUED
-            snapshot.retry_at = timezone.now()
-            update_fields.extend(['status', 'retry_at'])
-
-        # Update additional fields if provided
-        for field_name in ('depth', 'parent_snapshot_id', 'crawl_id', 'bookmarked_at'):
-            value = record.get(field_name)
-            if value is not None and getattr(snapshot, field_name) != value:
-                setattr(snapshot, field_name, value)
-                update_fields.append(field_name)
-
-        if update_fields:
-            snapshot.save(update_fields=update_fields + ['modified_at'])
-
-        return snapshot
-
-    def create_pending_archiveresults(self) -> list['ArchiveResult']:
-        """
-        Create ArchiveResult records for all enabled hooks.
-
-        Uses the hooks system to discover available hooks from:
-        - archivebox/plugins/*/on_Snapshot__*.{py,sh,js}
-        - data/plugins/*/on_Snapshot__*.{py,sh,js}
-
-        Creates one ArchiveResult per hook (not per plugin), with hook_name set.
-        This enables step-based execution where all hooks in a step can run in parallel.
-        """
-        from archivebox.hooks import discover_hooks
-
-        hooks = discover_hooks('Snapshot')
-        archiveresults = []
-
-        for hook_path in hooks:
-            hook_name = hook_path.name  # e.g., 'on_Snapshot__50_wget.py'
-            plugin = hook_path.parent.name  # e.g., 'wget'
-
-            # Check if AR already exists for this specific hook
-            if ArchiveResult.objects.filter(snapshot=self, hook_name=hook_name).exists():
-                continue
-
-            archiveresult, created = ArchiveResult.objects.get_or_create(
-                snapshot=self,
-                hook_name=hook_name,
-                defaults={
-                    'plugin': plugin,
-                    'status': ArchiveResult.INITIAL_STATE,
-                    'retry_at': timezone.now(),
-                    'created_by_id': self.crawl.created_by_id,
-                },
-            )
-            if archiveresult.status == ArchiveResult.INITIAL_STATE:
-                archiveresults.append(archiveresult)
-
-        return archiveresults
-
-    def advance_step_if_ready(self) -> bool:
-        """
-        Advance current_step if all foreground hooks in current step are finished.
-
-        Called by the state machine to check if step can advance.
-        Background hooks (.bg) don't block step advancement.
-
-        Step advancement rules:
-        - All foreground ARs in current step must be finished (SUCCEEDED/FAILED/SKIPPED)
-        - Background ARs (hook_name contains '.bg.') are ignored for advancement
-        - When ready, increments current_step by 1 (up to 9)
-
-        Returns:
-            True if step was advanced, False if not ready or already at step 9.
-        """
-        from archivebox.hooks import extract_step, is_background_hook
-
-        if self.current_step >= 9:
-            return False  # Already at final step
-
-        # Get all ARs for current step that are foreground
-        current_step_ars = self.archiveresult_set.filter(
-            hook_name__isnull=False
-        ).exclude(hook_name='')
-
-        # Check each AR in current step
-        for ar in current_step_ars:
-            ar_step = extract_step(ar.hook_name)
-            if ar_step != self.current_step:
-                continue  # Not in current step
-
-            if is_background_hook(ar.hook_name):
-                continue  # Background hooks don't block
-
-            # Foreground hook in current step - check if finished
-            if ar.status not in ArchiveResult.FINAL_OR_ACTIVE_STATES:
-                # Still pending/queued - can't advance
-                return False
-
-            if ar.status == ArchiveResult.StatusChoices.STARTED:
-                # Still running - can't advance
-                return False
-
-        # All foreground hooks in current step are finished - advance!
-        self.current_step += 1
-        self.save(update_fields=['current_step', 'modified_at'])
-        return True
-
-    def is_finished_processing(self) -> bool:
-        """
-        Check if this snapshot has finished processing.
-
-        Used by SnapshotMachine.is_finished() to determine if snapshot is complete.
-
-        Returns:
-            True if all archiveresults are finished (or no work to do), False otherwise.
-        """
-        # if no archiveresults exist yet, it's not finished
-        if not self.archiveresult_set.exists():
-            return False
-
-        # Try to advance step if ready (handles step-based hook execution)
-        # This will increment current_step when all foreground hooks in current step are done
-        while self.advance_step_if_ready():
-            pass  # Keep advancing until we can't anymore
-
-        # if archiveresults exist but are still pending, it's not finished
-        if self.pending_archiveresults().exists():
-            return False
-
-        # Don't wait for background hooks - they'll be cleaned up on entering sealed state
-        # Background hooks in STARTED state are excluded by pending_archiveresults()
-        # (STARTED is in FINAL_OR_ACTIVE_STATES) so once all results are FINAL or ACTIVE,
-        # we can transition to sealed and cleanup() will kill the background hooks
-
-        # otherwise archiveresults exist and are all finished, so it's finished
-        return True
-
-    def retry_failed_archiveresults(self, retry_at: Optional['timezone.datetime'] = None) -> int:
-        """
-        Reset failed/skipped ArchiveResults to queued for retry.
-
-        This enables seamless retry of the entire extraction pipeline:
-        - Resets FAILED and SKIPPED results to QUEUED
-        - Sets retry_at so workers pick them up
-        - Plugins run in order (numeric prefix)
-        - Each plugin checks its dependencies at runtime
-
-        Dependency handling (e.g., chrome_session → screenshot):
-        - Plugins check if required outputs exist before running
-        - If dependency output missing → plugin returns 'skipped'
-        - On retry, if dependency now succeeds → dependent can run
-
-        Returns count of ArchiveResults reset.
-        """
-        retry_at = retry_at or timezone.now()
-
-        count = self.archiveresult_set.filter(
-            status__in=[
-                ArchiveResult.StatusChoices.FAILED,
-                ArchiveResult.StatusChoices.SKIPPED,
-            ]
-        ).update(
-            status=ArchiveResult.StatusChoices.QUEUED,
-            retry_at=retry_at,
-            output=None,
-            start_ts=None,
-            end_ts=None,
-        )
-
-        # Also reset the snapshot and current_step so it gets re-checked from the beginning
-        if count > 0:
-            self.status = self.StatusChoices.STARTED
-            self.retry_at = retry_at
-            self.current_step = 0  # Reset to step 0 for retry
-            self.save(update_fields=['status', 'retry_at', 'current_step', 'modified_at'])
-
-        return count
-
-    # =========================================================================
-    # URL Helper Properties (migrated from Link schema)
-    # =========================================================================
-
-    @cached_property
-    def url_hash(self) -> str:
-        from hashlib import sha256
-        return sha256(self.url.encode()).hexdigest()[:8]
-
-    @cached_property
-    def scheme(self) -> str:
-        return self.url.split('://')[0]
-
-    @cached_property
-    def path(self) -> str:
-        parts = self.url.split('://', 1)
-        return '/' + parts[1].split('/', 1)[1] if len(parts) > 1 and '/' in parts[1] else '/'
-
-    @cached_property
-    def basename(self) -> str:
-        return self.path.split('/')[-1]
-
-    @cached_property
-    def extension(self) -> str:
-        basename = self.basename
-        return basename.split('.')[-1] if '.' in basename else ''
-
-    @cached_property
-    def base_url(self) -> str:
-        return f'{self.scheme}://{self.domain}'
-
-    @cached_property
-    def is_static(self) -> bool:
-        static_extensions = {'.pdf', '.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg', '.mp4', '.mp3', '.wav', '.webm'}
-        return any(self.url.lower().endswith(ext) for ext in static_extensions)
-
-    @cached_property
-    def is_archived(self) -> bool:
-        output_paths = (
-            self.domain,
-            'output.html',
-            'output.pdf',
-            'screenshot.png',
-            'singlefile.html',
-            'readability/content.html',
-            'mercury/content.html',
-            'htmltotext.txt',
-            'media',
-            'git',
-        )
-        return any((Path(self.output_dir) / path).exists() for path in output_paths)
-
-    # =========================================================================
-    # Date/Time Properties (migrated from Link schema)
-    # =========================================================================
-
-    @cached_property
-    def bookmarked_date(self) -> Optional[str]:
-        max_ts = (timezone.now() + timedelta(days=30)).timestamp()
-        if self.timestamp and self.timestamp.replace('.', '').isdigit():
-            if 0 < float(self.timestamp) < max_ts:
-                return self._ts_to_date_str(datetime.fromtimestamp(float(self.timestamp)))
-            return str(self.timestamp)
-        return None
-
-    @cached_property
-    def downloaded_datestr(self) -> Optional[str]:
-        return self._ts_to_date_str(self.downloaded_at) if self.downloaded_at else None
-
-    @cached_property
-    def archive_dates(self) -> List[datetime]:
-        return [
-            result.start_ts
-            for result in self.archiveresult_set.all()
-            if result.start_ts
-        ]
-
-    @cached_property
-    def oldest_archive_date(self) -> Optional[datetime]:
-        dates = self.archive_dates
-        return min(dates) if dates else None
-
-    @cached_property
-    def newest_archive_date(self) -> Optional[datetime]:
-        dates = self.archive_dates
-        return max(dates) if dates else None
-
-    @cached_property
-    def num_outputs(self) -> int:
-        return self.archiveresult_set.filter(status='succeeded').count()
-
-    @cached_property
-    def num_failures(self) -> int:
-        return self.archiveresult_set.filter(status='failed').count()
-
-    # =========================================================================
-    # Output Path Methods (migrated from Link schema)
-    # =========================================================================
-
-    def canonical_outputs(self) -> Dict[str, Optional[str]]:
-        """
-        Intelligently discover the best output file for each plugin.
-        Uses actual ArchiveResult data and filesystem scanning with smart heuristics.
-        """
-        FAVICON_PROVIDER = 'https://www.google.com/s2/favicons?domain={}'
-
-        # Mimetypes that can be embedded/previewed in an iframe
-        IFRAME_EMBEDDABLE_EXTENSIONS = {
-            'html', 'htm', 'pdf', 'txt', 'md', 'json', 'jsonl',
-            'png', 'jpg', 'jpeg', 'gif', 'webp', 'svg', 'ico',
-            'mp4', 'webm', 'mp3', 'opus', 'ogg', 'wav',
-        }
-
-        MIN_DISPLAY_SIZE = 15_000  # 15KB - filter out tiny files
-        MAX_SCAN_FILES = 50  # Don't scan massive directories
-
-        def find_best_output_in_dir(dir_path: Path, plugin_name: str) -> Optional[str]:
-            """Find the best representative file in a plugin's output directory"""
-            if not dir_path.exists() or not dir_path.is_dir():
-                return None
-
-            candidates = []
-            file_count = 0
-
-            # Special handling for media plugin - look for thumbnails
-            is_media_dir = plugin_name == 'media'
-
-            # Scan for suitable files
-            for file_path in dir_path.rglob('*'):
-                file_count += 1
-                if file_count > MAX_SCAN_FILES:
-                    break
-
-                if file_path.is_dir() or file_path.name.startswith('.'):
-                    continue
-
-                ext = file_path.suffix.lstrip('.').lower()
-                if ext not in IFRAME_EMBEDDABLE_EXTENSIONS:
-                    continue
-
-                try:
-                    size = file_path.stat().st_size
-                except OSError:
-                    continue
-
-                # For media dir, allow smaller image files (thumbnails are often < 15KB)
-                min_size = 5_000 if (is_media_dir and ext in ('png', 'jpg', 'jpeg', 'webp', 'gif')) else MIN_DISPLAY_SIZE
-                if size < min_size:
-                    continue
-
-                # Prefer main files: index.html, output.*, content.*, etc.
-                priority = 0
-                name_lower = file_path.name.lower()
-
-                if is_media_dir:
-                    # Special prioritization for media directories
-                    if any(keyword in name_lower for keyword in ('thumb', 'thumbnail', 'cover', 'poster')):
-                        priority = 200  # Highest priority for thumbnails
-                    elif ext in ('png', 'jpg', 'jpeg', 'webp', 'gif'):
-                        priority = 150  # High priority for any image
-                    elif ext in ('mp4', 'webm', 'mp3', 'opus', 'ogg'):
-                        priority = 100  # Lower priority for actual media files
-                    else:
-                        priority = 50
-                elif 'index' in name_lower:
-                    priority = 100
-                elif name_lower.startswith(('output', 'content', plugin_name)):
-                    priority = 50
-                elif ext in ('html', 'htm', 'pdf'):
-                    priority = 30
-                elif ext in ('png', 'jpg', 'jpeg', 'webp'):
-                    priority = 20
-                else:
-                    priority = 10
-
-                candidates.append((priority, size, file_path))
-
-            if not candidates:
-                return None
-
-            # Sort by priority (desc), then size (desc)
-            candidates.sort(key=lambda x: (x[0], x[1]), reverse=True)
-            best_file = candidates[0][2]
-            return str(best_file.relative_to(Path(self.output_dir)))
-
-        canonical = {
-            'index_path': 'index.html',
-            'google_favicon_path': FAVICON_PROVIDER.format(self.domain),
-            'archive_org_path': f'https://web.archive.org/web/{self.base_url}',
-        }
-
-        # Scan each ArchiveResult's output directory for the best file
-        snap_dir = Path(self.output_dir)
-        for result in self.archiveresult_set.filter(status='succeeded'):
-            if not result.output_files and not result.output_str:
-                continue
-
-            # Try to find the best output file for this plugin
-            plugin_dir = snap_dir / result.plugin
-            best_output = None
-
-            # Check output_files first (new field)
-            if result.output_files:
-                first_file = next(iter(result.output_files.keys()), None)
-                if first_file and (plugin_dir / first_file).exists():
-                    best_output = f'{result.plugin}/{first_file}'
-
-            # Fallback to output_str if it looks like a path
-            if not best_output and result.output_str and (snap_dir / result.output_str).exists():
-                best_output = result.output_str
-
-            if not best_output and plugin_dir.exists():
-                # Intelligently find the best file in the plugin's directory
-                best_output = find_best_output_in_dir(plugin_dir, result.plugin)
-
-            if best_output:
-                canonical[f'{result.plugin}_path'] = best_output
-
-        # Also scan top-level for legacy outputs (backwards compatibility)
-        for file_path in snap_dir.glob('*'):
-            if file_path.is_dir() or file_path.name in ('index.html', 'index.json'):
-                continue
-
-            ext = file_path.suffix.lstrip('.').lower()
-            if ext not in IFRAME_EMBEDDABLE_EXTENSIONS:
-                continue
-
-            try:
-                size = file_path.stat().st_size
-                if size >= MIN_DISPLAY_SIZE:
-                    # Add as generic output with stem as key
-                    key = f'{file_path.stem}_path'
-                    if key not in canonical:
-                        canonical[key] = file_path.name
-            except OSError:
-                continue
-
-        if self.is_static:
-            static_path = f'warc/{self.timestamp}'
-            canonical.update({
-                'title': self.basename,
-                'wget_path': static_path,
-            })
-
-        return canonical
-
-    def latest_outputs(self, status: Optional[str] = None) -> Dict[str, Any]:
-        """Get the latest output that each plugin produced"""
-        from archivebox.hooks import get_plugins
-        from django.db.models import Q
-
-        latest: Dict[str, Any] = {}
-        for plugin in get_plugins():
-            results = self.archiveresult_set.filter(plugin=plugin)
-            if status is not None:
-                results = results.filter(status=status)
-            # Filter for results with output_files or output_str
-            results = results.filter(Q(output_files__isnull=False) | ~Q(output_str='')).order_by('-start_ts')
-            result = results.first()
-            # Return embed_path() for backwards compatibility
-            latest[plugin] = result.embed_path() if result else None
-        return latest
-
-    # =========================================================================
-    # Serialization Methods
-    # =========================================================================
-
-    def to_dict(self, extended: bool = False) -> Dict[str, Any]:
-        """Convert Snapshot to a dictionary (replacement for Link._asdict())"""
-        from archivebox.misc.util import ts_to_date_str
-
-        result = {
-            'TYPE': 'core.models.Snapshot',
-            'id': str(self.id),
-            'url': self.url,
-            'timestamp': self.timestamp,
-            'title': self.title,
-            'tags': self.tags_str(),
-            'downloaded_at': self.downloaded_at.isoformat() if self.downloaded_at else None,
-            'bookmarked_at': self.bookmarked_at.isoformat() if self.bookmarked_at else None,
-            'created_at': self.created_at.isoformat() if self.created_at else None,
-            # Computed properties
-            'domain': self.domain,
-            'scheme': self.scheme,
-            'base_url': self.base_url,
-            'path': self.path,
-            'basename': self.basename,
-            'extension': self.extension,
-            'is_static': self.is_static,
-            'is_archived': self.is_archived,
-            'archive_path': self.archive_path,
-            'output_dir': self.output_dir,
-            'link_dir': self.output_dir,  # backwards compatibility alias
-            'archive_size': self.archive_size,
-            'bookmarked_date': self.bookmarked_date,
-            'downloaded_datestr': self.downloaded_datestr,
-            'num_outputs': self.num_outputs,
-            'num_failures': self.num_failures,
-        }
-        if extended:
-            result['canonical'] = self.canonical_outputs()
-        return result
-
-    def to_json(self, indent: int = 4) -> str:
-        """Convert to JSON string"""
-        return to_json(self.to_dict(extended=True), indent=indent)
-
-    def to_csv(self, cols: Optional[List[str]] = None, separator: str = ',', ljust: int = 0) -> str:
-        """Convert to CSV string"""
-        data = self.to_dict()
-        cols = cols or ['timestamp', 'is_archived', 'url']
-        return separator.join(to_json(data.get(col, ''), indent=None).ljust(ljust) for col in cols)
-
-    def write_json_details(self, out_dir: Optional[str] = None) -> None:
-        """Write JSON index file for this snapshot to its output directory"""
-        out_dir = out_dir or self.output_dir
-        path = Path(out_dir) / CONSTANTS.JSON_INDEX_FILENAME
-        atomic_write(str(path), self.to_dict(extended=True))
-
-    def write_html_details(self, out_dir: Optional[str] = None) -> None:
-        """Write HTML detail page for this snapshot to its output directory"""
-        from django.template.loader import render_to_string
-        from archivebox.config.common import SERVER_CONFIG
-        from archivebox.config.configset import get_config
-        from archivebox.misc.logging_util import printable_filesize
-
-        out_dir = out_dir or self.output_dir
-        config = get_config()
-        SAVE_ARCHIVE_DOT_ORG = config.get('SAVE_ARCHIVE_DOT_ORG', True)
-        TITLE_LOADING_MSG = 'Not yet archived...'
-
-        canonical = self.canonical_outputs()
-        context = {
-            **self.to_dict(extended=True),
-            **{f'{k}_path': v for k, v in canonical.items()},
-            'canonical': {f'{k}_path': v for k, v in canonical.items()},
-            'title': htmlencode(self.title or (self.base_url if self.is_archived else TITLE_LOADING_MSG)),
-            'url_str': htmlencode(urldecode(self.base_url)),
-            'archive_url': urlencode(f'warc/{self.timestamp}' or (self.domain if self.is_archived else '')) or 'about:blank',
-            'extension': self.extension or 'html',
-            'tags': self.tags_str() or 'untagged',
-            'size': printable_filesize(self.archive_size) if self.archive_size else 'pending',
-            'status': 'archived' if self.is_archived else 'not yet archived',
-            'status_color': 'success' if self.is_archived else 'danger',
-            'oldest_archive_date': ts_to_date_str(self.oldest_archive_date),
-            'SAVE_ARCHIVE_DOT_ORG': SAVE_ARCHIVE_DOT_ORG,
-            'PREVIEW_ORIGINALS': SERVER_CONFIG.PREVIEW_ORIGINALS,
-        }
-        rendered_html = render_to_string('snapshot.html', context)
-        atomic_write(str(Path(out_dir) / CONSTANTS.HTML_INDEX_FILENAME), rendered_html)
-
-    # =========================================================================
-    # Helper Methods
-    # =========================================================================
-
-    @staticmethod
-    def _ts_to_date_str(dt: Optional[datetime]) -> Optional[str]:
-        return dt.strftime('%Y-%m-%d %H:%M:%S') if dt else None
-
-
-# =============================================================================
-# Snapshot State Machine
-# =============================================================================
-
-class SnapshotMachine(BaseStateMachine, strict_states=True):
-    """
-    State machine for managing Snapshot lifecycle.
-
-    Hook Lifecycle:
-    ┌─────────────────────────────────────────────────────────────┐
-    │ QUEUED State                                                │
-    │  • Waiting for snapshot to be ready                         │
-    └─────────────────────────────────────────────────────────────┘
-                            ↓ tick() when can_start()
-    ┌─────────────────────────────────────────────────────────────┐
-    │ STARTED State → enter_started()                             │
-    │  1. snapshot.run()                                          │
-    │     • discover_hooks('Snapshot') → finds all plugin hooks   │
-    │     • create_pending_archiveresults() → creates ONE         │
-    │       ArchiveResult per hook (NO execution yet)             │
-    │  2. ArchiveResults process independently with their own     │
-    │     state machines (see ArchiveResultMachine)               │
-    │  3. Advance through steps 0-9 as foreground hooks complete  │
-    └─────────────────────────────────────────────────────────────┘
-                            ↓ tick() when is_finished()
-    ┌─────────────────────────────────────────────────────────────┐
-    │ SEALED State → enter_sealed()                               │
-    │  • cleanup() → kills any background hooks still running     │
-    │  • Set retry_at=None (no more processing)                   │
-    └─────────────────────────────────────────────────────────────┘
-
-    https://github.com/ArchiveBox/ArchiveBox/wiki/ArchiveBox-Architecture-Diagrams
-    """
-
-    model_attr_name = 'snapshot'
-
-    # States
-    queued = State(value=Snapshot.StatusChoices.QUEUED, initial=True)
-    started = State(value=Snapshot.StatusChoices.STARTED)
-    sealed = State(value=Snapshot.StatusChoices.SEALED, final=True)
-
-    # Tick Event
-    tick = (
-        queued.to.itself(unless='can_start') |
-        queued.to(started, cond='can_start') |
-        started.to.itself(unless='is_finished') |
-        started.to(sealed, cond='is_finished')
-    )
-
-    def can_start(self) -> bool:
-        can_start = bool(self.snapshot.url)
-        # Suppressed: queue waiting logs
-        return can_start
-
-    def is_finished(self) -> bool:
-        """Check if snapshot processing is complete - delegates to model method."""
-        return self.snapshot.is_finished_processing()
-
-    @queued.enter
-    def enter_queued(self):
-        # Suppressed: state transition logs
-        self.snapshot.update_and_requeue(
-            retry_at=timezone.now(),
-            status=Snapshot.StatusChoices.QUEUED,
-        )
-
-    @started.enter
-    def enter_started(self):
-        # Suppressed: state transition logs
-        # lock the snapshot while we create the pending archiveresults
-        self.snapshot.update_and_requeue(
-            retry_at=timezone.now() + timedelta(seconds=30),  # if failed, wait 30s before retrying
-        )
-
-        # Run the snapshot - creates pending archiveresults for all enabled plugins
-        self.snapshot.run()
-
-        # unlock the snapshot after we're done + set status = started
-        self.snapshot.update_and_requeue(
-            retry_at=timezone.now() + timedelta(seconds=5),  # check again in 5s
-            status=Snapshot.StatusChoices.STARTED,
-        )
-
-    @sealed.enter
-    def enter_sealed(self):
-        # Clean up background hooks
-        self.snapshot.cleanup()
-
-        # Suppressed: state transition logs
-        self.snapshot.update_and_requeue(
-            retry_at=None,
-            status=Snapshot.StatusChoices.SEALED,
-        )
-
-
-class ArchiveResultManager(models.Manager):
-    def indexable(self, sorted: bool = True):
-        INDEXABLE_METHODS = [r[0] for r in EXTRACTOR_INDEXING_PRECEDENCE]
-        qs = self.get_queryset().filter(plugin__in=INDEXABLE_METHODS, status='succeeded')
-        if sorted:
-            precedence = [When(plugin=method, then=Value(p)) for method, p in EXTRACTOR_INDEXING_PRECEDENCE]
-            qs = qs.annotate(indexing_precedence=Case(*precedence, default=Value(1000), output_field=IntegerField())).order_by('indexing_precedence')
-        return qs
-
-
-class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, ModelWithStateMachine):
-    class StatusChoices(models.TextChoices):
-        QUEUED = 'queued', 'Queued'
-        STARTED = 'started', 'Started'
-        BACKOFF = 'backoff', 'Waiting to retry'
-        SUCCEEDED = 'succeeded', 'Succeeded'
-        FAILED = 'failed', 'Failed'
-        SKIPPED = 'skipped', 'Skipped'
-
-    @classmethod
-    def get_plugin_choices(cls):
-        """Get plugin choices from discovered hooks (for forms/admin)."""
-        plugins = [get_plugin_name(e) for e in get_plugins()]
-        return tuple((e, e) for e in plugins)
-
-    # Keep AutoField for backward compatibility with 0.7.x databases
-    # UUID field is added separately by migration for new records
-    id = models.AutoField(primary_key=True, editable=False)
-    # Note: unique constraint is added by migration 0027 - don't set unique=True here
-    # or SQLite table recreation in earlier migrations will fail
-    uuid = models.UUIDField(default=uuid7, null=True, blank=True, db_index=True)
-    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False, related_name='archiveresult_set', db_index=True)
-    created_at = models.DateTimeField(default=timezone.now, db_index=True)
-    modified_at = models.DateTimeField(auto_now=True)
-
-    snapshot: Snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE)  # type: ignore
-    # No choices= constraint - plugin names come from plugin system and can be any string
-    plugin = models.CharField(max_length=32, blank=False, null=False, db_index=True)
-    hook_name = models.CharField(max_length=255, blank=True, default='', db_index=True, help_text='Full filename of the hook that executed (e.g., on_Snapshot__50_wget.py)')
-    pwd = models.CharField(max_length=256, default=None, null=True, blank=True)
-    cmd = models.JSONField(default=None, null=True, blank=True)
-    cmd_version = models.CharField(max_length=128, default=None, null=True, blank=True)
-
-    # New output fields (replacing old 'output' field)
-    output_str = models.TextField(blank=True, default='', help_text='Human-readable output summary')
-    output_json = models.JSONField(null=True, blank=True, default=None, help_text='Structured metadata (headers, redirects, etc.)')
-    output_files = models.JSONField(default=dict, help_text='Dict of {relative_path: {metadata}}')
-    output_size = models.BigIntegerField(default=0, help_text='Total bytes of all output files')
-    output_mimetypes = models.CharField(max_length=512, blank=True, default='', help_text='CSV of mimetypes sorted by size')
-
-    # Binary FK (optional - set when hook reports cmd)
-    binary = models.ForeignKey(
-        'machine.Binary',
-        on_delete=models.SET_NULL,
-        null=True, blank=True,
-        related_name='archiveresults',
-        help_text='Primary binary used by this hook'
-    )
-
-    start_ts = models.DateTimeField(default=None, null=True, blank=True)
-    end_ts = models.DateTimeField(default=None, null=True, blank=True)
-
-    status = ModelWithStateMachine.StatusField(choices=StatusChoices.choices, default=StatusChoices.QUEUED)
-    retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)
-    notes = models.TextField(blank=True, null=False, default='')
-    output_dir = models.CharField(max_length=256, default=None, null=True, blank=True)
-    iface = models.ForeignKey(NetworkInterface, on_delete=models.SET_NULL, null=True, blank=True)
-
-    state_machine_name = 'core.models.ArchiveResultMachine'
-    retry_at_field_name = 'retry_at'
-    state_field_name = 'status'
-    active_state = StatusChoices.STARTED
-
-    objects = ArchiveResultManager()
-
-    class Meta(TypedModelMeta):
-        verbose_name = 'Archive Result'
-        verbose_name_plural = 'Archive Results Log'
-
-    def __str__(self):
-        return f'[{self.id}] {self.snapshot.url[:64]} -> {self.plugin}'
-
-    def save(self, *args, **kwargs):
-        is_new = self._state.adding
-        # Skip ModelWithOutputDir.save() to avoid creating index.json in plugin directories
-        # Call the Django Model.save() directly instead
-        models.Model.save(self, *args, **kwargs)
-
-        if is_new:
-            from archivebox.misc.logging_util import log_worker_event
-            log_worker_event(
-                worker_type='DB',
-                event='Created ArchiveResult',
-                indent_level=3,
-                plugin=self.plugin,
-                metadata={
-                    'id': str(self.id),
-                    'snapshot_id': str(self.snapshot_id),
-                    'snapshot_url': str(self.snapshot.url)[:64],
-                    'status': self.status,
-                },
-            )
-
-    @cached_property
-    def snapshot_dir(self):
-        return Path(self.snapshot.output_dir)
-
-    @cached_property
-    def url(self):
-        return self.snapshot.url
-
-    @property
-    def api_url(self) -> str:
-        return reverse_lazy('api-1:get_archiveresult', args=[self.id])
-
-    def get_absolute_url(self):
-        return f'/{self.snapshot.archive_path}/{self.plugin}'
-
-    @property
-    def plugin_module(self) -> Any | None:
-        # Hook scripts are now used instead of Python plugin modules
-        # The plugin name maps to hooks in archivebox/plugins/{plugin}/
-        return None
-
-    def output_exists(self) -> bool:
-        return os.path.exists(Path(self.snapshot_dir) / self.plugin)
-
-    def embed_path(self) -> Optional[str]:
-        """
-        Get the relative path to the embeddable output file for this result.
-
-        Returns the first file from output_files if set, otherwise tries to
-        find a reasonable default based on the plugin type.
-        """
-        # Check output_files dict for primary output
-        if self.output_files:
-            # Return first file from output_files (dict preserves insertion order)
-            first_file = next(iter(self.output_files.keys()), None)
-            if first_file:
-                return f'{self.plugin}/{first_file}'
-
-        # Fallback: check output_str if it looks like a file path
-        if self.output_str and ('/' in self.output_str or '.' in self.output_str):
-            return self.output_str
-
-        # Try to find output file based on plugin's canonical output path
-        canonical = self.snapshot.canonical_outputs()
-        plugin_key = f'{self.plugin}_path'
-        if plugin_key in canonical:
-            return canonical[plugin_key]
-
-        # Fallback to plugin directory
-        return f'{self.plugin}/'
-
-    def create_output_dir(self):
-        output_dir = Path(self.snapshot_dir) / self.plugin
-        output_dir.mkdir(parents=True, exist_ok=True)
-        return output_dir
-
-    @property
-    def output_dir_name(self) -> str:
-        return self.plugin
-
-    @property
-    def output_dir_parent(self) -> str:
-        return str(self.snapshot.OUTPUT_DIR.relative_to(CONSTANTS.DATA_DIR))
-
-    def save_search_index(self):
-        pass
-
-    def cascade_health_update(self, success: bool):
-        """Update health stats for self, parent Snapshot, and grandparent Crawl (if present)."""
-        self.increment_health_stats(success)
-        self.snapshot.increment_health_stats(success)
-        if self.snapshot.crawl_id:
-            self.snapshot.crawl.increment_health_stats(success)
-
-    def run(self):
-        """
-        Execute this ArchiveResult's hook and update status.
-
-        If self.hook_name is set, runs only that specific hook.
-        If self.hook_name is empty, discovers and runs all hooks for self.plugin (backwards compat).
-
-        Updates status/output fields, queues discovered URLs, and triggers indexing.
-        """
-        from django.utils import timezone
-        from archivebox.hooks import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR, run_hook, is_background_hook
-        from archivebox.config.configset import get_config
-
-        # Get merged config with proper context
-        config = get_config(
-            crawl=self.snapshot.crawl if self.snapshot.crawl else None,
-            snapshot=self.snapshot,
-        )
-
-        # Determine which hook(s) to run
-        hooks = []
-
-        if self.hook_name:
-            # SPECIFIC HOOK MODE: Find the specific hook by name
-            for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
-                if not base_dir.exists():
-                    continue
-                plugin_dir = base_dir / self.plugin
-                if plugin_dir.exists():
-                    hook_path = plugin_dir / self.hook_name
-                    if hook_path.exists():
-                        hooks.append(hook_path)
-                        break
-        else:
-            # LEGACY MODE: Discover all hooks for this plugin (backwards compatibility)
-            for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
-                if not base_dir.exists():
-                    continue
-                plugin_dir = base_dir / self.plugin
-                if plugin_dir.exists():
-                    matches = list(plugin_dir.glob('on_Snapshot__*.*'))
-                    if matches:
-                        hooks.extend(sorted(matches))
-
-        if not hooks:
-            self.status = self.StatusChoices.FAILED
-            if self.hook_name:
-                self.output_str = f'Hook not found: {self.plugin}/{self.hook_name}'
-            else:
-                self.output_str = f'No hooks found for plugin: {self.plugin}'
-            self.retry_at = None
-            self.save()
-            return
-
-        # Hook output goes into the plugin's subdirectory of the snapshot's output_dir
-        plugin_dir = Path(self.snapshot.output_dir) / self.plugin
-
-        start_ts = timezone.now()
-        is_bg_hook = False
-
-        for hook in hooks:
-            # Check if this is a background hook
-            is_bg_hook = is_background_hook(hook.name)
-
-            result = run_hook(
-                hook,
-                output_dir=plugin_dir,
-                config=config,
-                url=self.snapshot.url,
-                snapshot_id=str(self.snapshot.id),
-                crawl_id=str(self.snapshot.crawl.id) if self.snapshot.crawl else None,
-                depth=self.snapshot.depth,
-            )
-
-            # Background hooks return None
-            if result is None:
-                is_bg_hook = True
-
-        # Update status based on hook execution
-        if is_bg_hook:
-            # BACKGROUND HOOK - still running, return immediately
-            # Status stays STARTED, will be finalized by Snapshot.cleanup()
-            self.status = self.StatusChoices.STARTED
-            self.start_ts = start_ts
-            self.pwd = str(plugin_dir)
-            self.save()
-            return
-
-        # FOREGROUND HOOK - completed, update from filesystem
-        self.start_ts = start_ts
-        self.pwd = str(plugin_dir)
-        self.update_from_output()
-
-        # Clean up empty output directory if no files were created
-        if plugin_dir.exists() and not self.output_files:
-            try:
-                if not any(plugin_dir.iterdir()):
-                    plugin_dir.rmdir()
-            except (OSError, RuntimeError):
-                pass
-
-    def update_from_output(self):
-        """
-        Update this ArchiveResult from filesystem logs and output files.
-
-        Used for:
-        - Foreground hooks that completed (called from ArchiveResult.run())
-        - Background hooks that completed (called from Snapshot.cleanup())
-
-        Updates:
-        - status, output_str, output_json from ArchiveResult JSONL record
-        - output_files, output_size, output_mimetypes by walking filesystem
-        - end_ts, retry_at, cmd, cmd_version, binary FK
-        - Processes side-effect records (Snapshot, Tag, etc.) via process_hook_records()
-        """
-        import json
-        import mimetypes
-        from collections import defaultdict
-        from pathlib import Path
-        from django.utils import timezone
-        from archivebox.hooks import process_hook_records
-
-        plugin_dir = Path(self.pwd) if self.pwd else None
-        if not plugin_dir or not plugin_dir.exists():
-            self.status = self.StatusChoices.FAILED
-            self.output_str = 'Output directory not found'
-            self.end_ts = timezone.now()
-            self.retry_at = None
-            self.save()
-            return
-
-        # Read and parse JSONL output from stdout.log
-        stdout_file = plugin_dir / 'stdout.log'
-        stdout = stdout_file.read_text() if stdout_file.exists() else ''
-
-        records = []
-        for line in stdout.splitlines():
-            if line.strip() and line.strip().startswith('{'):
-                try:
-                    records.append(json.loads(line))
-                except json.JSONDecodeError:
-                    continue
-
-        # Find ArchiveResult record and update status/output from it
-        ar_records = [r for r in records if r.get('type') == 'ArchiveResult']
-        if ar_records:
-            hook_data = ar_records[0]
-
-            # Update status
-            status_map = {
-                'succeeded': self.StatusChoices.SUCCEEDED,
-                'failed': self.StatusChoices.FAILED,
-                'skipped': self.StatusChoices.SKIPPED,
-            }
-            self.status = status_map.get(hook_data.get('status', 'failed'), self.StatusChoices.FAILED)
-
-            # Update output fields
-            self.output_str = hook_data.get('output_str') or hook_data.get('output') or ''
-            self.output_json = hook_data.get('output_json')
-
-            # Update cmd fields
-            if hook_data.get('cmd'):
-                self.cmd = hook_data['cmd']
-                self._set_binary_from_cmd(hook_data['cmd'])
-            if hook_data.get('cmd_version'):
-                self.cmd_version = hook_data['cmd_version'][:128]
-        else:
-            # No ArchiveResult record = failed
-            self.status = self.StatusChoices.FAILED
-            self.output_str = 'Hook did not output ArchiveResult record'
-
-        # Walk filesystem and populate output_files, output_size, output_mimetypes
-        exclude_names = {'stdout.log', 'stderr.log', 'hook.pid', 'listener.pid'}
-        mime_sizes = defaultdict(int)
-        total_size = 0
-        output_files = {}
-
-        for file_path in plugin_dir.rglob('*'):
-            if not file_path.is_file():
-                continue
-            if file_path.name in exclude_names:
-                continue
-
-            try:
-                stat = file_path.stat()
-                mime_type, _ = mimetypes.guess_type(str(file_path))
-                mime_type = mime_type or 'application/octet-stream'
-
-                relative_path = str(file_path.relative_to(plugin_dir))
-                output_files[relative_path] = {}
-                mime_sizes[mime_type] += stat.st_size
-                total_size += stat.st_size
-            except (OSError, IOError):
-                continue
-
-        self.output_files = output_files
-        self.output_size = total_size
-        sorted_mimes = sorted(mime_sizes.items(), key=lambda x: x[1], reverse=True)
-        self.output_mimetypes = ','.join(mime for mime, _ in sorted_mimes)
-
-        # Update timestamps
-        self.end_ts = timezone.now()
-        self.retry_at = None
-
-        self.save()
-
-        # Process side-effect records (filter Snapshots for depth/URL)
-        filtered_records = []
-        for record in records:
-            record_type = record.get('type')
-
-            # Skip ArchiveResult records (already processed above)
-            if record_type == 'ArchiveResult':
-                continue
-
-            # Filter Snapshot records for depth/URL constraints
-            if record_type == 'Snapshot':
-                if not self.snapshot.crawl:
-                    continue
-
-                url = record.get('url')
-                if not url:
-                    continue
-
-                depth = record.get('depth', self.snapshot.depth + 1)
-                if depth > self.snapshot.crawl.max_depth:
-                    continue
-
-                if not self._url_passes_filters(url):
-                    continue
-
-            filtered_records.append(record)
-
-        # Process filtered records with unified dispatcher
-        overrides = {
-            'snapshot': self.snapshot,
-            'crawl': self.snapshot.crawl,
-            'created_by_id': self.snapshot.crawl.created_by_id,
-        }
-        process_hook_records(filtered_records, overrides=overrides)
-
-        # Cleanup PID files and empty logs
-        pid_file = plugin_dir / 'hook.pid'
-        pid_file.unlink(missing_ok=True)
-        stderr_file = plugin_dir / 'stderr.log'
-        if stdout_file.exists() and stdout_file.stat().st_size == 0:
-            stdout_file.unlink()
-        if stderr_file.exists() and stderr_file.stat().st_size == 0:
-            stderr_file.unlink()
-
-    def _set_binary_from_cmd(self, cmd: list) -> None:
-        """
-        Find Binary for command and set binary FK.
-
-        Tries matching by absolute path first, then by binary name.
-        Only matches binaries on the current machine.
-        """
-        if not cmd:
-            return
-
-        from archivebox.machine.models import Machine
-
-        bin_path_or_name = cmd[0] if isinstance(cmd, list) else cmd
-        machine = Machine.current()
-
-        # Try matching by absolute path first
-        binary = Binary.objects.filter(
-            abspath=bin_path_or_name,
-            machine=machine
-        ).first()
-
-        if binary:
-            self.binary = binary
-            return
-
-        # Fallback: match by binary name
-        bin_name = Path(bin_path_or_name).name
-        binary = Binary.objects.filter(
-            name=bin_name,
-            machine=machine
-        ).first()
-
-        if binary:
-            self.binary = binary
-
-    def _url_passes_filters(self, url: str) -> bool:
-        """Check if URL passes URL_ALLOWLIST and URL_DENYLIST config filters.
-
-        Uses proper config hierarchy: defaults -> file -> env -> machine -> user -> crawl -> snapshot
-        """
-        import re
-        from archivebox.config.configset import get_config
-
-        # Get merged config with proper hierarchy
-        config = get_config(
-            user=self.snapshot.crawl.created_by if self.snapshot.crawl else None,
-            crawl=self.snapshot.crawl if self.snapshot.crawl else None,
-            snapshot=self.snapshot,
-        )
-
-        # Get allowlist/denylist (can be string or list)
-        allowlist_raw = config.get('URL_ALLOWLIST', '')
-        denylist_raw = config.get('URL_DENYLIST', '')
-
-        # Normalize to list of patterns
-        def to_pattern_list(value):
-            if isinstance(value, list):
-                return value
-            if isinstance(value, str):
-                return [p.strip() for p in value.split(',') if p.strip()]
-            return []
-
-        allowlist = to_pattern_list(allowlist_raw)
-        denylist = to_pattern_list(denylist_raw)
-
-        # Denylist takes precedence
-        if denylist:
-            for pattern in denylist:
-                try:
-                    if re.search(pattern, url):
-                        return False
-                except re.error:
-                    continue  # Skip invalid regex patterns
-
-        # If allowlist exists, URL must match at least one pattern
-        if allowlist:
-            for pattern in allowlist:
-                try:
-                    if re.search(pattern, url):
-                        return True
-                except re.error:
-                    continue  # Skip invalid regex patterns
-            return False  # No allowlist patterns matched
-
-        return True  # No filters or passed filters
-
-    @property
-    def output_dir(self) -> Path:
-        """Get the output directory for this plugin's results."""
-        return Path(self.snapshot.output_dir) / self.plugin
-
-    def is_background_hook(self) -> bool:
-        """Check if this ArchiveResult is for a background hook."""
-        plugin_dir = Path(self.pwd) if self.pwd else None
-        if not plugin_dir:
-            return False
-        pid_file = plugin_dir / 'hook.pid'
-        return pid_file.exists()
-
-
-# =============================================================================
-# ArchiveResult State Machine
-# =============================================================================
-
-class ArchiveResultMachine(BaseStateMachine, strict_states=True):
-    """
-    State machine for managing ArchiveResult (single plugin execution) lifecycle.
-
-    Hook Lifecycle:
-    ┌─────────────────────────────────────────────────────────────┐
-    │ QUEUED State                                                │
-    │  • Waiting for its turn to run                              │
-    └─────────────────────────────────────────────────────────────┘
-                            ↓ tick() when can_start()
-    ┌─────────────────────────────────────────────────────────────┐
-    │ STARTED State → enter_started()                             │
-    │  1. archiveresult.run()                                     │
-    │     • Find specific hook by hook_name                       │
-    │     • run_hook(script, output_dir, ...) → subprocess        │
-    │                                                              │
-    │  2a. FOREGROUND hook (returns HookResult):                  │
-    │      • update_from_output() immediately                     │
-    │        - Read stdout.log                                    │
-    │        - Parse JSONL records                                │
-    │        - Extract 'ArchiveResult' record → update status     │
-    │        - Walk output_dir → populate output_files            │
-    │        - Call process_hook_records() for side effects       │
-    │                                                              │
-    │  2b. BACKGROUND hook (returns None):                        │
-    │      • Status stays STARTED                                 │
-    │      • Continues running in background                      │
-    │      • Killed by Snapshot.cleanup() when sealed             │
-    └─────────────────────────────────────────────────────────────┘
-                            ↓ tick() checks status
-    ┌─────────────────────────────────────────────────────────────┐
-    │ SUCCEEDED / FAILED / SKIPPED / BACKOFF                      │
-    │  • Set by hook's JSONL output during update_from_output()   │
-    │  • Health stats incremented (num_uses_succeeded/failed)     │
-    │  • Parent Snapshot health stats also updated                │
-    └─────────────────────────────────────────────────────────────┘
-
-    https://github.com/ArchiveBox/ArchiveBox/wiki/ArchiveBox-Architecture-Diagrams
-    """
-
-    model_attr_name = 'archiveresult'
-
-    # States
-    queued = State(value=ArchiveResult.StatusChoices.QUEUED, initial=True)
-    started = State(value=ArchiveResult.StatusChoices.STARTED)
-    backoff = State(value=ArchiveResult.StatusChoices.BACKOFF)
-    succeeded = State(value=ArchiveResult.StatusChoices.SUCCEEDED, final=True)
-    failed = State(value=ArchiveResult.StatusChoices.FAILED, final=True)
-    skipped = State(value=ArchiveResult.StatusChoices.SKIPPED, final=True)
-
-    # Tick Event - transitions based on conditions
-    tick = (
-        queued.to.itself(unless='can_start') |
-        queued.to(started, cond='can_start') |
-        started.to.itself(unless='is_finished') |
-        started.to(succeeded, cond='is_succeeded') |
-        started.to(failed, cond='is_failed') |
-        started.to(skipped, cond='is_skipped') |
-        started.to(backoff, cond='is_backoff') |
-        backoff.to.itself(unless='can_start') |
-        backoff.to(started, cond='can_start') |
-        backoff.to(succeeded, cond='is_succeeded') |
-        backoff.to(failed, cond='is_failed') |
-        backoff.to(skipped, cond='is_skipped')
-    )
-
-    def can_start(self) -> bool:
-        can_start = bool(self.archiveresult.snapshot.url)
-        # Suppressed: queue waiting logs
-        return can_start
-
-    def is_succeeded(self) -> bool:
-        """Check if extractor plugin succeeded (status was set by run())."""
-        return self.archiveresult.status == ArchiveResult.StatusChoices.SUCCEEDED
-
-    def is_failed(self) -> bool:
-        """Check if extractor plugin failed (status was set by run())."""
-        return self.archiveresult.status == ArchiveResult.StatusChoices.FAILED
-
-    def is_skipped(self) -> bool:
-        """Check if extractor plugin was skipped (status was set by run())."""
-        return self.archiveresult.status == ArchiveResult.StatusChoices.SKIPPED
-
-    def is_backoff(self) -> bool:
-        """Check if we should backoff and retry later."""
-        # Backoff if status is still started (plugin didn't complete) and output_str is empty
-        return (
-            self.archiveresult.status == ArchiveResult.StatusChoices.STARTED and
-            not self.archiveresult.output_str
-        )
-
-    def is_finished(self) -> bool:
-        """Check if extraction has completed (success, failure, or skipped)."""
-        return self.archiveresult.status in (
-            ArchiveResult.StatusChoices.SUCCEEDED,
-            ArchiveResult.StatusChoices.FAILED,
-            ArchiveResult.StatusChoices.SKIPPED,
-        )
-
-    @queued.enter
-    def enter_queued(self):
-        # Suppressed: state transition logs
-        self.archiveresult.update_and_requeue(
-            retry_at=timezone.now(),
-            status=ArchiveResult.StatusChoices.QUEUED,
-            start_ts=None,
-        )  # bump retry_at so workers pick up any new changes
-
-    @started.enter
-    def enter_started(self):
-        from archivebox.machine.models import NetworkInterface
-
-        # Suppressed: state transition logs
-        # Lock the object and mark start time
-        self.archiveresult.update_and_requeue(
-            retry_at=timezone.now() + timedelta(seconds=120),  # 2 min timeout for plugin
-            status=ArchiveResult.StatusChoices.STARTED,
-            start_ts=timezone.now(),
-            iface=NetworkInterface.current(),
-        )
-
-        # Run the plugin - this updates status, output, timestamps, etc.
-        self.archiveresult.run()
-
-        # Save the updated result
-        self.archiveresult.save()
-
-        # Suppressed: plugin result logs (already logged by worker)
-
-    @backoff.enter
-    def enter_backoff(self):
-        # Suppressed: state transition logs
-        self.archiveresult.update_and_requeue(
-            retry_at=timezone.now() + timedelta(seconds=60),
-            status=ArchiveResult.StatusChoices.BACKOFF,
-            end_ts=None,
-            # retries=F('retries') + 1,               # F() equivalent to getattr(self.archiveresult, 'retries', 0) + 1,
-        )
-
-    @succeeded.enter
-    def enter_succeeded(self):
-        # Suppressed: state transition logs
-        self.archiveresult.update_and_requeue(
-            retry_at=None,
-            status=ArchiveResult.StatusChoices.SUCCEEDED,
-            end_ts=timezone.now(),
-            # **self.archiveresult.get_output_dict(),     # {output, output_json, stderr, stdout, returncode, errors, cmd_version, pwd, cmd, machine}
-        )
-        self.archiveresult.save()
-
-        # Update health stats for ArchiveResult, Snapshot, and Crawl cascade
-        self.archiveresult.cascade_health_update(success=True)
-
-    @failed.enter
-    def enter_failed(self):
-        # Suppressed: state transition logs
-        self.archiveresult.update_and_requeue(
-            retry_at=None,
-            status=ArchiveResult.StatusChoices.FAILED,
-            end_ts=timezone.now(),
-        )
-
-        # Update health stats for ArchiveResult, Snapshot, and Crawl cascade
-        self.archiveresult.cascade_health_update(success=False)
-
-    @skipped.enter
-    def enter_skipped(self):
-        # Suppressed: state transition logs
-        self.archiveresult.update_and_requeue(
-            retry_at=None,
-            status=ArchiveResult.StatusChoices.SKIPPED,
-            end_ts=timezone.now(),
-        )
-
-    def after_transition(self, event: str, source: State, target: State):
-        # print(f"after '{event}' from '{source.id}' to '{target.id}'")
-        self.archiveresult.snapshot.update_and_requeue()  # bump snapshot retry time so it picks up all the new changes
-
-
-# =============================================================================
-# State Machine Registration
-# =============================================================================
-
-# Manually register state machines with python-statemachine registry
-# (normally auto-discovered from statemachines.py, but we define them here for clarity)
-registry.register(SnapshotMachine)
-registry.register(ArchiveResultMachine)
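
Note: the removed update_from_output() above reads each hook's stdout.log and parses any line starting with '{' as a JSON record. A minimal sketch of what a hook could emit under that protocol; field names are inferred from the parsing code above, so treat this as illustrative rather than an official spec:

    # hypothetical_hook.py (illustrative only, not part of this commit)
    import json
    import sys

    def emit(record: dict) -> None:
        # one JSON object per line; update_from_output() only parses lines starting with '{'
        sys.stdout.write(json.dumps(record) + '\n')

    # main result record: drives status/output/cmd fields on the ArchiveResult
    emit({
        'type': 'ArchiveResult',
        'status': 'succeeded',            # 'succeeded' | 'failed' | 'skipped'
        'output_str': 'saved page to wget/example.com/index.html',
        'output_json': {'content_type': 'text/html'},
        'cmd': ['/usr/bin/wget', '--mirror', 'https://example.com'],
        'cmd_version': '1.21.4',
    })

    # side-effect record: a discovered URL, filtered by depth and allow/denylists before queueing
    emit({'type': 'Snapshot', 'url': 'https://example.com/about', 'depth': 1})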

+ 15 - 3
archivebox/core/templatetags/core_tags.py

@@ -91,7 +91,11 @@ def plugin_thumbnail(context, result) -> str:
             'output_path': output_path,
             'plugin': plugin,
         })
-        return mark_safe(tpl.render(ctx))
+        rendered = tpl.render(ctx)
+        # Only return non-empty content (strip whitespace to check)
+        if rendered.strip():
+            return mark_safe(rendered)
+        return ''
     except Exception:
         return ''
 
@@ -119,7 +123,11 @@ def plugin_embed(context, result) -> str:
             'output_path': output_path,
             'plugin': plugin,
         })
-        return mark_safe(tpl.render(ctx))
+        rendered = tpl.render(ctx)
+        # Only return non-empty content (strip whitespace to check)
+        if rendered.strip():
+            return mark_safe(rendered)
+        return ''
     except Exception:
         return ''
 
@@ -147,7 +155,11 @@ def plugin_fullscreen(context, result) -> str:
             'output_path': output_path,
             'plugin': plugin,
         })
-        return mark_safe(tpl.render(ctx))
+        rendered = tpl.render(ctx)
+        # Only return non-empty content (strip whitespace to check)
+        if rendered.strip():
+            return mark_safe(rendered)
+        return ''
     except Exception:
         return ''
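
All three template tags above repeat the same render-then-check guard. A hedged sketch of how that guard could be pulled into a single helper (the helper name is hypothetical, not part of this commit):

    from django.utils.safestring import mark_safe

    def render_nonempty(tpl, ctx) -> str:
        """Render a template, returning '' for whitespace-only or failed output."""
        try:
            rendered = tpl.render(ctx)
            return mark_safe(rendered) if rendered.strip() else ''
        except Exception:
            return ''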
 

+ 1 - 1
archivebox/core/views.py

@@ -539,7 +539,7 @@ from django.http import JsonResponse
 def live_progress_view(request):
     """Simple JSON endpoint for live progress status - used by admin progress monitor."""
     try:
-        from workers.orchestrator import Orchestrator
+        from archivebox.workers.orchestrator import Orchestrator
         from archivebox.crawls.models import Crawl
         from archivebox.core.models import Snapshot, ArchiveResult
         from django.db.models import Case, When, Value, IntegerField

+ 5 - 0
archivebox/crawls/apps.py

@@ -4,3 +4,8 @@ from django.apps import AppConfig
 class CrawlsConfig(AppConfig):
     default_auto_field = "django.db.models.BigAutoField"
     name = "archivebox.crawls"
+    label = "crawls"
+
+    def ready(self):
+        """Import models to register state machines with the registry"""
+        from archivebox.crawls.models import CrawlMachine  # noqa: F401

+ 55 - 32
archivebox/crawls/migrations/0002_drop_seed_model.py

@@ -17,39 +17,62 @@ class Migration(migrations.Migration):
     ]
 
     operations = [
-        # Remove the seed foreign key from Crawl
-        migrations.RemoveField(
-            model_name='crawl',
-            name='seed',
+        # Remove the seed foreign key from Crawl (no-op if already removed by core/0024_d)
+        migrations.RunPython(
+            code=lambda apps, schema_editor: None,
+            reverse_code=migrations.RunPython.noop,
         ),
-        # Delete the Seed model entirely
-        migrations.DeleteModel(
-            name='Seed',
+        # Delete the Seed model entirely (already done)
+        migrations.RunPython(
+            code=lambda apps, schema_editor: None,
+            reverse_code=migrations.RunPython.noop,
         ),
-        # Update fields to new schema
-        migrations.AlterField(
-            model_name='crawl',
-            name='created_by',
-            field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
-        ),
-        migrations.AlterField(
-            model_name='crawl',
-            name='id',
-            field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
-        ),
-        migrations.AlterField(
-            model_name='crawl',
-            name='urls',
-            field=models.TextField(help_text='Newline-separated list of URLs to crawl'),
-        ),
-        migrations.AlterField(
-            model_name='crawlschedule',
-            name='created_by',
-            field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
-        ),
-        migrations.AlterField(
-            model_name='crawlschedule',
-            name='id',
-            field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
+        # NULL out stale seed_id values and drop the old seed table, then update Django's migration state
+        migrations.SeparateDatabaseAndState(
+            state_operations=[
+                # Update fields to new schema
+                migrations.AlterField(
+                    model_name='crawl',
+                    name='created_by',
+                    field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
+                ),
+                migrations.AlterField(
+                    model_name='crawl',
+                    name='id',
+                    field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
+                ),
+                migrations.AlterField(
+                    model_name='crawl',
+                    name='urls',
+                    field=models.TextField(help_text='Newline-separated list of URLs to crawl'),
+                ),
+                migrations.AlterField(
+                    model_name='crawlschedule',
+                    name='created_by',
+                    field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
+                ),
+                migrations.AlterField(
+                    model_name='crawlschedule',
+                    name='id',
+                    field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
+                ),
+            ],
+            database_operations=[
+                # Drop seed table and NULL out seed_id FK values
+                migrations.RunSQL(
+                    sql="""
+                        PRAGMA foreign_keys=OFF;
+
+                        -- NULL out seed_id values in crawls_crawl
+                        UPDATE crawls_crawl SET seed_id = NULL;
+
+                        -- Drop seed table if it exists
+                        DROP TABLE IF EXISTS crawls_seed;
+
+                        PRAGMA foreign_keys=ON;
+                    """,
+                    reverse_sql=migrations.RunSQL.noop,
+                ),
+            ],
         ),
     ]
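
The RunSQL above assumes crawls_crawl still has a seed_id column when it runs. A defensive variant, sketched here as an alternative and not part of this commit, would probe the SQLite schema first so the same migration also succeeds on databases where the column is already gone:

    def drop_seed_remnants(apps, schema_editor):
        """NULL out seed_id only if the column is still present, then drop the old table."""
        with schema_editor.connection.cursor() as cursor:
            cursor.execute("PRAGMA table_info(crawls_crawl)")
            columns = {row[1] for row in cursor.fetchall()}  # row[1] is the column name
            if 'seed_id' in columns:
                cursor.execute("UPDATE crawls_crawl SET seed_id = NULL")
            cursor.execute("DROP TABLE IF EXISTS crawls_seed")

    # wired up as: migrations.RunPython(drop_seed_remnants, migrations.RunPython.noop)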

+ 13 - 4
archivebox/crawls/migrations/0003_alter_crawl_output_dir.py

@@ -8,12 +8,21 @@ class Migration(migrations.Migration):
 
     dependencies = [
         ('crawls', '0002_drop_seed_model'),
+        ('core', '0024_d_fix_crawls_config'),  # Depends on config fix
     ]
 
     operations = [
-        migrations.AlterField(
-            model_name='crawl',
-            name='output_dir',
-            field=models.FilePathField(blank=True, default='', path=pathlib.PurePosixPath('/private/tmp/test_archivebox_migrations/archive')),
+        # Update Django's state only to avoid table rebuild that would re-apply old constraints
+        migrations.SeparateDatabaseAndState(
+            state_operations=[
+                migrations.AlterField(
+                    model_name='crawl',
+                    name='output_dir',
+                    field=models.FilePathField(blank=True, default='', path=pathlib.PurePosixPath('/private/tmp/test_archivebox_migrations/archive')),
+                ),
+            ],
+            database_operations=[
+                # No database changes - output_dir type change is cosmetic for Django admin
+            ],
         ),
     ]

+ 12 - 4
archivebox/crawls/migrations/0004_alter_crawl_output_dir.py

@@ -11,9 +11,17 @@ class Migration(migrations.Migration):
     ]
 
     operations = [
-        migrations.AlterField(
-            model_name='crawl',
-            name='output_dir',
-            field=models.FilePathField(blank=True, default='', path=pathlib.PurePosixPath('/private/tmp/archivebox-makemigrations/archive')),
+        # Update Django's state only to avoid table rebuild that would re-apply old constraints
+        migrations.SeparateDatabaseAndState(
+            state_operations=[
+                migrations.AlterField(
+                    model_name='crawl',
+                    name='output_dir',
+                    field=models.FilePathField(blank=True, default='', path=pathlib.PurePosixPath('/private/tmp/archivebox-makemigrations/archive')),
+                ),
+            ],
+            database_operations=[
+                # No database changes - output_dir type change is cosmetic for Django admin
+            ],
         ),
     ]
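
Migrations 0003 and 0004 are now state-only. A quick way to confirm that a SeparateDatabaseAndState migration emits no SQL is to render it with sqlmigrate (a sketch; assumes a configured project and the default database):

    from io import StringIO
    from django.core.management import call_command

    out = StringIO()
    call_command('sqlmigrate', 'crawls', '0003_alter_crawl_output_dir', stdout=out)
    # Apart from the transaction wrapper and per-operation comments, a state-only
    # migration should produce no ALTER/CREATE statements here
    print(out.getvalue())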

+ 28 - 0
archivebox/crawls/migrations/0005_drop_seed_id_column.py

@@ -0,0 +1,28 @@
+# Drop seed_id column from Django's state (leave in database to avoid FK issues)
+
+from django.db import migrations
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('crawls', '0004_alter_crawl_output_dir'),
+    ]
+
+    operations = [
+        # Update Django's state only - leave seed_id column in database (unused but harmless)
+        # This avoids FK mismatch errors with crawls_crawlschedule
+        migrations.SeparateDatabaseAndState(
+            state_operations=[
+                # Remove seed field from Django's migration state
+                migrations.RemoveField(
+                    model_name='crawl',
+                    name='seed',
+                ),
+            ],
+            database_operations=[
+                # No database changes - seed_id column remains to avoid FK rebuild issues
+                # (the crawls_seed table itself was already dropped by the RunSQL in 0002)
+            ],
+        ),
+    ]

+ 35 - 0
archivebox/crawls/migrations/0006_alter_crawl_config_alter_crawl_output_dir_and_more.py

@@ -0,0 +1,35 @@
+# Generated by Django 6.0 on 2025-12-29 06:45
+
+import pathlib
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('crawls', '0005_drop_seed_id_column'),
+    ]
+
+    operations = [
+        # Update Django's state only - database already correct
+        migrations.SeparateDatabaseAndState(
+            state_operations=[
+                migrations.AlterField(
+                    model_name='crawl',
+                    name='config',
+                    field=models.JSONField(blank=True, default=dict, null=True),
+                ),
+                migrations.AlterField(
+                    model_name='crawl',
+                    name='output_dir',
+                    field=models.FilePathField(blank=True, default='', path=pathlib.PurePosixPath('/Users/squash/Local/Code/archiveboxes/archivebox-nue/data/archive')),
+                ),
+                migrations.DeleteModel(
+                    name='Seed',
+                ),
+            ],
+            database_operations=[
+                # No database changes - the crawls_seed table was already dropped by 0002
+            ],
+        ),
+    ]

+ 3 - 4
archivebox/crawls/models.py

@@ -65,7 +65,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
     modified_at = models.DateTimeField(auto_now=True)
 
     urls = models.TextField(blank=False, null=False, help_text='Newline-separated list of URLs to crawl')
-    config = models.JSONField(default=dict)
+    config = models.JSONField(default=dict, null=True, blank=True)
     max_depth = models.PositiveSmallIntegerField(default=0, validators=[MinValueValidator(0), MaxValueValidator(4)])
     tags_str = models.CharField(max_length=1024, blank=True, null=False, default='')
     persona_id = models.UUIDField(null=True, blank=True)
@@ -77,7 +77,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
     status = ModelWithStateMachine.StatusField(choices=ModelWithStateMachine.StatusChoices, default=ModelWithStateMachine.StatusChoices.QUEUED)
     retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)
 
-    state_machine_name = 'crawls.models.CrawlMachine'
+    state_machine_name = 'archivebox.crawls.models.CrawlMachine'
     retry_at_field_name = 'retry_at'
     state_field_name = 'status'
     StatusChoices = ModelWithStateMachine.StatusChoices
@@ -190,7 +190,6 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
                 'status': Snapshot.INITIAL_STATE,
                 'retry_at': timezone.now(),
                 'timestamp': str(timezone.now().timestamp()),
-                'created_by_id': self.created_by_id,
                 'depth': 0,
             },
         )
@@ -290,7 +289,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
                     'timestamp': timestamp or str(timezone.now().timestamp()),
                     'status': Snapshot.INITIAL_STATE,
                     'retry_at': timezone.now(),
-                    'created_by_id': self.created_by_id,
+                    # Note: created_by removed in 0.9.0 - Snapshot inherits created_by from its Crawl
                 }
             )
 

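The state_machine_name change here (and the matching one in machine/models.py below) matters because the value is resolved by dotted path at runtime; presumably via something equivalent to Django's import_string, sketched here as an assumption rather than the project's actual lookup code:

    from django.utils.module_loading import import_string

    # 'crawls.models.CrawlMachine' only resolves if 'crawls' happens to be importable as a
    # top-level module; the fully-qualified path works regardless of install layout.
    CrawlMachine = import_string('archivebox.crawls.models.CrawlMachine')
    machine = CrawlMachine(crawl)  # constructor signature assumed: typically takes the model instance
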
+ 5 - 0
archivebox/machine/apps.py

@@ -7,8 +7,13 @@ class MachineConfig(AppConfig):
     default_auto_field = 'django.db.models.BigAutoField'
 
     name = 'archivebox.machine'
+    label = 'machine'  # Explicit label for migrations
     verbose_name = 'Machine Info'
 
+    def ready(self):
+        """Import models to register state machines with the registry"""
+        from archivebox.machine import models  # noqa: F401
+
 
 def register_admin(admin_site):
     from archivebox.machine.admin import register_admin

+ 6 - 0
archivebox/machine/migrations/0001_squashed.py

@@ -85,6 +85,12 @@ class Migration(migrations.Migration):
                 ('version', models.CharField(blank=True, default=None, max_length=32)),
                 ('sha256', models.CharField(blank=True, default=None, max_length=64)),
                 ('machine', models.ForeignKey(blank=True, default=None, on_delete=django.db.models.deletion.CASCADE, to='machine.machine')),
+                # Fields added in migration 0005 (included here for fresh installs)
+                ('binproviders', models.CharField(blank=True, default='env', max_length=127)),
+                ('output_dir', models.CharField(blank=True, default='', max_length=255)),
+                ('overrides', models.JSONField(blank=True, default=dict)),
+                ('retry_at', models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, null=True)),
+                ('status', models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('succeeded', 'Succeeded'), ('failed', 'Failed')], db_index=True, default='queued', max_length=16)),
                 # dependency FK removed - Dependency model deleted
             ],
             options={

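Because 0001_squashed now creates the 0005 fields directly for fresh installs, the squashed migration has to stay in agreement with the later AddField operations. A quick consistency check, sketched here rather than taken from the test suite:

    from django.db import connection
    from archivebox.machine.models import Binary

    with connection.cursor() as cursor:
        table = Binary._meta.db_table
        db_columns = {col.name for col in connection.introspection.get_table_description(cursor, table)}

    model_columns = {field.column for field in Binary._meta.concrete_fields}
    missing = model_columns - db_columns
    assert not missing, f'columns missing from {table}: {missing}'
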
+ 104 - 0
archivebox/machine/migrations/0005_binary_binproviders_binary_output_dir_and_more.py

@@ -0,0 +1,104 @@
+# Generated by Django 6.0 on 2025-12-29 06:45
+
+import django.db.models.deletion
+import django.utils.timezone
+from archivebox.uuid_compat import uuid7
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('machine', '0004_drop_dependency_table'),
+    ]
+
+    operations = [
+        # Update Django's state only - database already has correct schema
+        migrations.SeparateDatabaseAndState(
+            state_operations=[
+                migrations.AddField(
+                    model_name='binary',
+                    name='binproviders',
+                    field=models.CharField(blank=True, default='env', help_text='Comma-separated list of allowed providers: apt,brew,pip,npm,env', max_length=127),
+                ),
+                migrations.AddField(
+                    model_name='binary',
+                    name='output_dir',
+                    field=models.CharField(blank=True, default='', help_text='Directory where installation hook logs are stored', max_length=255),
+                ),
+                migrations.AddField(
+                    model_name='binary',
+                    name='overrides',
+                    field=models.JSONField(blank=True, default=dict, help_text="Provider-specific overrides: {'apt': {'packages': ['pkg']}, ...}"),
+                ),
+                migrations.AddField(
+                    model_name='binary',
+                    name='retry_at',
+                    field=models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, help_text='When to retry this binary installation', null=True),
+                ),
+                migrations.AddField(
+                    model_name='binary',
+                    name='status',
+                    field=models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('succeeded', 'Succeeded'), ('failed', 'Failed')], db_index=True, default='queued', max_length=16),
+                ),
+                migrations.AlterField(
+                    model_name='binary',
+                    name='abspath',
+                    field=models.CharField(blank=True, default='', max_length=255),
+                ),
+                migrations.AlterField(
+                    model_name='binary',
+                    name='binprovider',
+                    field=models.CharField(blank=True, default='', help_text='Provider that successfully installed this binary', max_length=31),
+                ),
+                migrations.AlterField(
+                    model_name='binary',
+                    name='id',
+                    field=models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True),
+                ),
+                migrations.AlterField(
+                    model_name='binary',
+                    name='machine',
+                    field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='machine.machine'),
+                ),
+                migrations.AlterField(
+                    model_name='binary',
+                    name='name',
+                    field=models.CharField(blank=True, db_index=True, default='', max_length=63),
+                ),
+                migrations.AlterField(
+                    model_name='binary',
+                    name='sha256',
+                    field=models.CharField(blank=True, default='', max_length=64),
+                ),
+                migrations.AlterField(
+                    model_name='binary',
+                    name='version',
+                    field=models.CharField(blank=True, default='', max_length=32),
+                ),
+                migrations.AlterField(
+                    model_name='machine',
+                    name='config',
+                    field=models.JSONField(blank=True, default=dict, help_text='Machine-specific config overrides (e.g., resolved binary paths like WGET_BINARY)', null=True),
+                ),
+                migrations.AlterField(
+                    model_name='machine',
+                    name='id',
+                    field=models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True),
+                ),
+                migrations.AlterField(
+                    model_name='machine',
+                    name='stats',
+                    field=models.JSONField(blank=True, default=dict, null=True),
+                ),
+                migrations.AlterField(
+                    model_name='networkinterface',
+                    name='id',
+                    field=models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True),
+                ),
+            ],
+            database_operations=[
+                # No database changes - schema already correct from previous migrations
+            ],
+        ),
+    ]

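Migration 0005 rewrites a large amount of model state without touching the database, so the recorded state has to end up matching models.py exactly. One quick check (a sketch) is makemigrations in check mode, which fails if Django would still generate new migrations:

    from django.core.management import call_command

    # Raises SystemExit(1) if the models and the migration state have drifted apart
    call_command('makemigrations', '--check', '--dry-run')
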
+ 3 - 3
archivebox/machine/models.py

@@ -44,8 +44,8 @@ class Machine(ModelWithHealthStats):
     os_platform = models.CharField(max_length=63, default=None, null=False)
     os_release = models.CharField(max_length=63, default=None, null=False)
     os_kernel = models.CharField(max_length=255, default=None, null=False)
-    stats = models.JSONField(default=dict, null=False)
-    config = models.JSONField(default=dict, null=False, blank=True,
+    stats = models.JSONField(default=dict, null=True, blank=True)
+    config = models.JSONField(default=dict, null=True, blank=True,
         help_text="Machine-specific config overrides (e.g., resolved binary paths like WGET_BINARY)")
     num_uses_failed = models.PositiveIntegerField(default=0)
     num_uses_succeeded = models.PositiveIntegerField(default=0)
@@ -213,7 +213,7 @@ class Binary(ModelWithHealthStats):
     num_uses_failed = models.PositiveIntegerField(default=0)
     num_uses_succeeded = models.PositiveIntegerField(default=0)
 
-    state_machine_name: str = 'machine.models.BinaryMachine'
+    state_machine_name: str = 'archivebox.machine.models.BinaryMachine'
 
     objects: BinaryManager = BinaryManager()
 

+ 1 - 0
archivebox/personas/apps.py

@@ -4,3 +4,4 @@ from django.apps import AppConfig
 class SessionsConfig(AppConfig):
     default_auto_field = "django.db.models.BigAutoField"
     name = "archivebox.personas"
+    label = "personas"

+ 1 - 1
archivebox/personas/models.py

@@ -21,7 +21,7 @@
 #     #    COOKIES_TXT_FILE: '/path/to/cookies.txt',
 #     #    CHROME_USER_DATA_DIR: '/path/to/chrome/user/data/dir',
 #     #    CHECK_SSL_VALIDITY: False,
-#     #    SAVE_ARCHIVE_DOT_ORG: True,
+#     #    SAVE_ARCHIVEDOTORG: True,
 #     #    CHROME_BINARY: 'chromium'
 #     #    ...
 #     # }

+ 3 - 3
archivebox/plugins/search_backend_ripgrep/tests/test_ripgrep_detection.py

@@ -63,7 +63,7 @@ def test_ripgrep_hook_detects_binary_from_path():
 
 def test_ripgrep_hook_skips_when_backend_not_ripgrep():
     """Test that ripgrep hook exits silently when search backend is not ripgrep."""
-    hook_path = Path(__file__).parent.parent / 'on_Crawl__00_validate_ripgrep.py'
+    hook_path = Path(__file__).parent.parent / 'on_Crawl__00_install_ripgrep.py'
 
     env = os.environ.copy()
     env['SEARCH_BACKEND_ENGINE'] = 'sqlite'  # Different backend
@@ -82,7 +82,7 @@ def test_ripgrep_hook_skips_when_backend_not_ripgrep():
 
 def test_ripgrep_hook_handles_absolute_path():
     """Test that ripgrep hook works when RIPGREP_BINARY is an absolute path."""
-    hook_path = Path(__file__).parent.parent / 'on_Crawl__00_validate_ripgrep.py'
+    hook_path = Path(__file__).parent.parent / 'on_Crawl__00_install_ripgrep.py'
 
     rg_path = shutil.which('rg')
     if not rg_path:
@@ -222,7 +222,7 @@ def test_ripgrep_only_detected_when_backend_enabled():
     if not shutil.which('rg'):
         pytest.skip("ripgrep not installed")
 
-    hook_path = Path(__file__).parent.parent / 'on_Crawl__00_validate_ripgrep.py'
+    hook_path = Path(__file__).parent.parent / 'on_Crawl__00_install_ripgrep.py'
 
     # Test 1: With ripgrep backend - should output Binary record
     env1 = os.environ.copy()

+ 5 - 2
archivebox/templates/core/snapshot.html

@@ -360,9 +360,11 @@
                 <div class="row header-bottom-frames">
                     {% for result_info in archiveresults %}
                         {% if result_info.result %}
+                            {% plugin_thumbnail result_info.result as thumbnail_html %}
+                            {% if thumbnail_html %}
                             <div class="col-lg-2">
                                 <div class="card{% if forloop.first %} selected-card{% endif %}">
-                                    {% plugin_thumbnail result_info.result %}
+                                    {{ thumbnail_html }}
                                     <div class="card-body">
                                         <a href="{{ result_info.path }}" title="Open in new tab..." target="_blank" rel="noopener">
                                             <p class="card-text"><code>{{ result_info.path }}</code></p>
@@ -373,6 +375,7 @@
                                     </div>
                                 </div>
                             </div>
+                            {% endif %}
                         {% endif %}
                     {% endfor %}
 
@@ -395,7 +398,7 @@
                 </div>
             </div>
         </header>
-        <iframe sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" class="full-page-iframe" src="{{singlefile_path}}" name="preview"></iframe>
+        <iframe sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" class="full-page-iframe" src="{{best_preview_path}}" name="preview"></iframe>
     
         <script>
             /*! jQuery v3.2.1 -ajax,-ajax/jsonp,-ajax/load,-ajax/parseXML,-ajax/script,-ajax/var/location,-ajax/var/nonce,-ajax/var/rquery,-ajax/xhr,-manipulation/_evalUrl,-event/ajax,-effects,-effects/Tween,-effects/animatedSelector | (c) JS Foundation and other contributors | jquery.org/license */

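The new {% plugin_thumbnail result_info.result as thumbnail_html %} form relies on the tag being registered as a simple_tag, which supports "as var" assignment out of the box. A sketch of the registration side, with the decorator arguments assumed rather than copied from core_tags.py:

    from django import template

    register = template.Library()

    @register.simple_tag(takes_context=True)
    def plugin_thumbnail(context, result) -> str:
        # Returns '' when the plugin produced no thumbnail, so the template can
        # skip the whole card via {% if thumbnail_html %}
        ...
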
+ 0 - 13
archivebox/tests/test_hooks.py

@@ -429,19 +429,6 @@ class TestInstallHookOutput(unittest.TestCase):
         self.assertEqual(data['name'], 'wget')
         self.assertTrue(data['abspath'].startswith('/'))
 
-    def test_install_hook_outputs_dependency(self):
-        """Install hook should output Dependency JSONL when binary not found."""
-        hook_output = json.dumps({
-            'type': 'Dependency',
-            'bin_name': 'wget',
-            'bin_providers': 'apt,brew,env',
-        })
-
-        data = json.loads(hook_output)
-        self.assertEqual(data['type'], 'Dependency')
-        self.assertEqual(data['bin_name'], 'wget')
-        self.assertIn('apt', data['bin_providers'])
-
     def test_install_hook_outputs_machine_config(self):
         """Install hook should output Machine config update JSONL."""
         hook_output = json.dumps({

+ 1 - 1
archivebox/tests/test_migrations_08_to_09.py

@@ -459,7 +459,7 @@ class TestFilesystemMigration08to09(unittest.TestCase):
                     'SAVE_MERCURY': 'True',
                     'SAVE_PDF': 'True',
                     'SAVE_MEDIA': 'True',
-                    'SAVE_ARCHIVE_DOT_ORG': 'True',
+                    'SAVE_ARCHIVEDOTORG': 'True',
                     'SAVE_HEADERS': 'True',
                     'SAVE_HTMLTOTEXT': 'True',
                     'SAVE_GIT': 'True',

+ 18 - 9
archivebox/tests/test_migrations_helpers.py

@@ -949,19 +949,30 @@ def seed_0_8_data(db_path: Path) -> Dict[str, List[Dict]]:
         ('core', '0072_rename_added_snapshot_bookmarked_at_and_more'),
         ('core', '0073_rename_created_archiveresult_created_at_and_more'),
         ('core', '0074_alter_snapshot_downloaded_at'),
-        ('core', '0023_new_schema'),
+        # For 0.8.x: DO NOT record 0023_new_schema - it replaces 0023-0074 for fresh installs
+        # We already recorded 0023-0074 above, so Django already knows the resulting schema state
+        # For 0.8.x: Record original machine migrations (before squashing)
+        # DO NOT record 0001_squashed here - it replaces 0001-0004 for fresh installs
         ('machine', '0001_initial'),
         ('machine', '0002_alter_machine_stats_installedbinary'),
         ('machine', '0003_alter_installedbinary_options_and_more'),
         ('machine', '0004_alter_installedbinary_abspath_and_more'),
-        ('machine', '0001_squashed'),
+        # Then the new migrations after squashing
         ('machine', '0002_rename_custom_cmds_to_overrides'),
         ('machine', '0003_alter_dependency_id_alter_installedbinary_dependency_and_more'),
         ('machine', '0004_drop_dependency_table'),
+        # Crawls must come before core.0024 because 0024_b depends on it
+        ('crawls', '0001_initial'),
+        # Core 0024 migrations chain (in dependency order)
+        ('core', '0024_b_clear_config_fields'),
+        ('core', '0024_c_disable_fk_checks'),
+        ('core', '0024_d_fix_crawls_config'),
         ('core', '0024_snapshot_crawl'),
+        ('core', '0024_f_add_snapshot_config'),
         ('core', '0025_allow_duplicate_urls_per_crawl'),
+        # For 0.8.x: Record original api migration (before squashing)
+        # DO NOT record 0001_squashed here - it replaces 0001 for fresh installs
         ('api', '0001_initial'),
-        ('api', '0001_squashed'),
         ('api', '0002_alter_apitoken_options'),
         ('api', '0003_rename_user_apitoken_created_by_apitoken_abid_and_more'),
         ('api', '0004_alter_apitoken_id_alter_apitoken_uuid'),
@@ -970,11 +981,9 @@ def seed_0_8_data(db_path: Path) -> Dict[str, List[Dict]]:
         ('api', '0007_alter_apitoken_created_by'),
         ('api', '0008_alter_apitoken_created_alter_apitoken_created_by_and_more'),
         ('api', '0009_rename_created_apitoken_created_at_and_more'),
-        ('crawls', '0001_initial'),
-        ('crawls', '0002_drop_seed_model'),
-        ('crawls', '0003_alter_crawl_output_dir'),
-        ('crawls', '0004_alter_crawl_output_dir'),
-        ('core', '0035_snapshot_crawl_non_nullable_remove_created_by'),
+        # Note: crawls.0001_initial moved earlier (before core.0024) due to dependencies
+        # Stop here - 0.8.x ends at core.0025, crawls.0001, and we want to TEST the later migrations
+        # Do NOT record 0026+ as they need to be tested during migration
     ]
 
     for app, name in migrations:
@@ -1000,7 +1009,7 @@ def run_archivebox(data_dir: Path, args: list, timeout: int = 60, env: dict = No
     base_env['USE_COLOR'] = 'False'
     base_env['SHOW_PROGRESS'] = 'False'
     # Disable ALL extractors for faster tests (can be overridden by env parameter)
-    base_env['SAVE_ARCHIVE_DOT_ORG'] = 'False'
+    base_env['SAVE_ARCHIVEDOTORG'] = 'False'
     base_env['SAVE_TITLE'] = 'False'
     base_env['SAVE_FAVICON'] = 'False'
     base_env['SAVE_WGET'] = 'False'

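The (app, name) pairs above get recorded as already-applied so that only the later migrations are actually exercised by the test. Assuming the seed helper goes through Django rather than raw SQL (an assumption; it may well write to django_migrations directly), MigrationRecorder is the standard API for that:

    from django.db import connections
    from django.db.migrations.recorder import MigrationRecorder

    recorder = MigrationRecorder(connections['default'])
    for app_label, migration_name in migrations:
        # Inserts a row into django_migrations without executing the migration itself
        recorder.record_applied(app_label, migration_name)
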
+ 1 - 0
archivebox/workers/apps.py

@@ -4,4 +4,5 @@ from django.apps import AppConfig
 class WorkersConfig(AppConfig):
     default_auto_field = 'django.db.models.BigAutoField'
     name = 'archivebox.workers'
+    label = 'workers'
 

+ 2 - 2
docker-compose.yml

@@ -2,7 +2,7 @@
 #     mkdir -p ~/archivebox/data && cd ~/archivebox
 #     curl -fsSL 'https://docker-compose.archivebox.io' > docker-compose.yml
 #     docker compose run archivebox version
-#     docker compose run archivebox config --set SAVE_ARCHIVE_DOT_ORG=False
+#     docker compose run archivebox config --set SAVE_ARCHIVEDOTORG=False
 #     docker compose run archivebox add --depth=1 'https://news.ycombinator.com'
 #     docker compose run -T archivebox add < bookmarks.txt
 #     docker compose up -d && open 'https://localhost:8000'
@@ -35,7 +35,7 @@ services:
             # - MEDIA_MAX_SIZE=750m             # increase this filesize limit to allow archiving larger audio/video files
             # - TIMEOUT=60                      # increase this number to 120+ seconds if you see many slow downloads timing out
             # - CHECK_SSL_VALIDITY=True         # set to False to disable strict SSL checking (allows saving URLs w/ broken certs)
-            # - SAVE_ARCHIVE_DOT_ORG=True       # set to False to disable submitting all URLs to Archive.org when archiving
+            # - SAVE_ARCHIVEDOTORG=True         # set to False to disable submitting all URLs to Archive.org when archiving
             # - USER_AGENT="..."                # set a custom USER_AGENT to avoid being blocked as a bot
             # ...
             # For more info, see: https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#configuration

+ 2 - 1
pyproject.toml

@@ -85,9 +85,9 @@ dependencies = [
     ### Binary/Package Management
     "abx-pkg>=0.1.0",        # for: detecting, versioning, and installing binaries via apt/brew/pip/npm
     "gallery-dl>=1.31.1",
-
     ### UUID7 backport for Python <3.14
     "uuid7>=0.1.0; python_version < '3.14'",  # for: uuid7 support on Python 3.13 (provides uuid_extensions module)
+    "pytest-django>=4.11.1",
 ]
 
 [project.optional-dependencies]
@@ -183,6 +183,7 @@ ignore = ["E731", "E303", "E266", "E241", "E222"]
 
 [tool.pytest.ini_options]
 testpaths = [ "tests" ]
+DJANGO_SETTINGS_MODULE = "archivebox.core.settings"
 
 [tool.mypy]
 mypy_path = "archivebox,archivebox/typings"

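With pytest-django installed and DJANGO_SETTINGS_MODULE declared under [tool.pytest.ini_options], tests get a configured Django without any manual setup code. A minimal illustration of what that enables (the test itself is a sketch, not from this commit):

    import pytest
    from django.apps import apps

    @pytest.mark.django_db
    def test_archivebox_apps_are_registered():
        # App names/labels match the AppConfig changes in this commit
        assert apps.is_installed('archivebox.crawls')
        assert apps.is_installed('archivebox.workers')
        assert apps.get_app_config('machine').name == 'archivebox.machine'
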
+ 1 - 1
tests/fixtures.py

@@ -24,7 +24,7 @@ def disable_extractors_dict():
         "SAVE_HEADERS": "false",
         "USE_GIT": "false",
         "SAVE_MEDIA": "false",
-        "SAVE_ARCHIVE_DOT_ORG": "false",
+        "SAVE_ARCHIVEDOTORG": "false",
         "SAVE_TITLE": "false",
         "SAVE_FAVICON": "false",
     })

+ 2 - 2
tests/test_recursive_crawl.py

@@ -33,7 +33,7 @@ def test_background_hooks_dont_block_parser_extractors(tmp_path, process):
         "SAVE_HEADERS": "false",
         "USE_GIT": "false",
         "SAVE_MEDIA": "false",
-        "SAVE_ARCHIVE_DOT_ORG": "false",
+        "SAVE_ARCHIVEDOTORG": "false",
         "SAVE_TITLE": "false",
         "SAVE_FAVICON": "false",
         # Enable chrome session (required for background hooks to start)
@@ -133,7 +133,7 @@ def test_parser_extractors_emit_snapshot_jsonl(tmp_path, process):
         "SAVE_HEADERS": "false",
         "USE_GIT": "false",
         "SAVE_MEDIA": "false",
-        "SAVE_ARCHIVE_DOT_ORG": "false",
+        "SAVE_ARCHIVEDOTORG": "false",
         "SAVE_TITLE": "false",
         "SAVE_FAVICON": "false",
         "USE_CHROME": "false",

+ 14 - 0
uv.lock

@@ -88,6 +88,7 @@ dependencies = [
     { name = "py-machineid", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
     { name = "pydantic", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
     { name = "pydantic-settings", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
+    { name = "pytest-django", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
     { name = "python-benedict", extra = ["io", "parse"], marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
     { name = "python-crontab", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
     { name = "python-statemachine", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
@@ -186,6 +187,7 @@ requires-dist = [
     { name = "py-machineid", specifier = ">=0.6.0" },
     { name = "pydantic", specifier = ">=2.8.0" },
     { name = "pydantic-settings", specifier = ">=2.5.2" },
+    { name = "pytest-django", specifier = ">=4.11.1" },
     { name = "python-benedict", extras = ["io", "parse"], specifier = ">=0.33.2" },
     { name = "python-crontab", specifier = ">=3.2.0" },
     { name = "python-ldap", marker = "extra == 'ldap'", specifier = ">=3.4.3" },
@@ -1848,6 +1850,18 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/3b/ab/b3226f0bd7cdcf710fbede2b3548584366da3b19b5021e74f5bde2a8fa3f/pytest-9.0.2-py3-none-any.whl", hash = "sha256:711ffd45bf766d5264d487b917733b453d917afd2b0ad65223959f59089f875b", size = 374801, upload-time = "2025-12-06T21:30:49.154Z" },
 ]
 
+[[package]]
+name = "pytest-django"
+version = "4.11.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "pytest", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/b1/fb/55d580352db26eb3d59ad50c64321ddfe228d3d8ac107db05387a2fadf3a/pytest_django-4.11.1.tar.gz", hash = "sha256:a949141a1ee103cb0e7a20f1451d355f83f5e4a5d07bdd4dcfdd1fd0ff227991", size = 86202, upload-time = "2025-04-03T18:56:09.338Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/be/ac/bd0608d229ec808e51a21044f3f2f27b9a37e7a0ebaca7247882e67876af/pytest_django-4.11.1-py3-none-any.whl", hash = "sha256:1b63773f648aa3d8541000c26929c1ea63934be1cfa674c76436966d73fe6a10", size = 25281, upload-time = "2025-04-03T18:56:07.678Z" },
+]
+
 [[package]]
 name = "python-benedict"
 version = "0.35.0"