1 month ago · f0aa19fa7d
--- a/.claude/settings.local.json
+++ b/.claude/settings.local.json
@@ -23,7 +23,9 @@
 
				       "Bash(source .venv/bin/activate)",
			
 
				       "Bash(mv:*)",
			
 
				       "Bash(echo:*)",
			
 
				-      "Bash(grep:*)"
			
 
				+      "Bash(grep:*)",
			
 
				+      "WebFetch(domain:python-statemachine.readthedocs.io)",
			
 
				+      "Bash(./bin/run_plugin_tests.sh:*)"
			
 
				     ]
			
 
				   }
			
 
				 }
			
--- a/archivebox/__init__.py
+++ b/archivebox/__init__.py
@@ -24,12 +24,14 @@ ASCII_LOGO = """
 
				 ╚═╝  ╚═╝╚═╝  ╚═╝ ╚═════╝╚═╝  ╚═╝╚═╝  ╚═══╝  ╚══════╝ ╚═════╝  ╚═════╝ ╚═╝  ╚═╝
			
 
				 """
			
 
				 
			
 
				-# make sure PACKAGE_DIR is in sys.path so we can import all subfolders
			
 
				-# without necessarily waiting for django to load them thorugh INSTALLED_APPS
			
 
				 PACKAGE_DIR = Path(__file__).resolve().parent
			
 
				+
			
 
				+# Add PACKAGE_DIR to sys.path - required for Django migrations to import models
			
 
				+# Migrations reference models like 'machine.Binary' which need to be importable
			
 
				 if str(PACKAGE_DIR) not in sys.path:
			
 
				     sys.path.append(str(PACKAGE_DIR))
			
 
				-os.environ['DJANGO_SETTINGS_MODULE'] = 'core.settings'
			
 
				+
			
 
				+os.environ['DJANGO_SETTINGS_MODULE'] = 'archivebox.core.settings'
			
 
				 os.environ['TZ'] = 'UTC'
			
 
				 
			
 
				 # detect ArchiveBox user's UID/GID based on data dir ownership
			
--- a/archivebox/api/admin.py
+++ b/archivebox/api/admin.py
@@ -5,7 +5,7 @@ from signal_webhooks.utils import get_webhook_model
 
				 
			
 
				 from archivebox.base_models.admin import BaseModelAdmin
			
 
				 
			
 
				-from api.models import APIToken
			
 
				+from archivebox.api.models import APIToken
			
 
				 
			
 
				 
			
 
				 class APITokenAdmin(BaseModelAdmin):
			
--- a/archivebox/api/apps.py
+++ b/archivebox/api/apps.py
@@ -4,9 +4,9 @@ from django.apps import AppConfig
 
				 
			
 
				 
			
 
				 class APIConfig(AppConfig):
			
 
				-    name = 'api'
			
 
				+    name = 'archivebox.api'
			
 
				 
			
 
				 
			
 
				 def register_admin(admin_site):
			
 
				-    from api.admin import register_admin
			
 
				+    from archivebox.api.admin import register_admin
			
 
				     register_admin(admin_site)
			
--- a/archivebox/api/migrations/0001_squashed.py
+++ b/archivebox/api/migrations/0001_squashed.py
@@ -7,7 +7,7 @@ from django.conf import settings
 
				 from django.db import migrations, models
			
 
				 import django.db.models.deletion
			
 
				 
			
 
				-import api.models
			
 
				+import archivebox.api.models
			
 
				 
			
 
				 
			
 
				 class Migration(migrations.Migration):
			
@@ -38,7 +38,7 @@ class Migration(migrations.Migration):
 
				                 ('created_by', models.ForeignKey(default=None, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)),
			
 
				                 ('created_at', models.DateTimeField(auto_now_add=True, db_index=True)),
			
 
				                 ('modified_at', models.DateTimeField(auto_now=True)),
			
 
				-                ('token', models.CharField(default=api.models.generate_secret_token, max_length=32, unique=True)),
			
 
				+                ('token', models.CharField(default=archivebox.api.models.generate_secret_token, max_length=32, unique=True)),
			
 
				                 ('expires', models.DateTimeField(blank=True, null=True)),
			
 
				             ],
			
 
				             options={
			
--- a/archivebox/api/migrations/0003_alter_apitoken_created_by_and_more.py
+++ b/archivebox/api/migrations/0003_alter_apitoken_created_by_and_more.py
@@ -1,6 +1,6 @@
 
				 # Generated by Django 6.0 on 2025-12-27 01:40
			
 
				 
			
 
				-import base_models.models
			
 
				+import archivebox.core.models
			
 
				 import django.db.models.deletion
			
 
				 from django.conf import settings
			
 
				 from django.db import migrations, models
			
@@ -17,11 +17,11 @@ class Migration(migrations.Migration):
 
				         migrations.AlterField(
			
 
				             model_name='apitoken',
			
 
				             name='created_by',
			
 
				-            field=models.ForeignKey(default=base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
			
 
				+            field=models.ForeignKey(default=archivebox.core.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
			
 
				         ),
			
 
				         migrations.AlterField(
			
 
				             model_name='outboundwebhook',
			
 
				             name='created_by',
			
 
				-            field=models.ForeignKey(default=base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
			
 
				+            field=models.ForeignKey(default=archivebox.core.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
			
 
				         ),
			
 
				     ]
			
--- a/archivebox/api/models.py
+++ b/archivebox/api/models.py
@@ -10,7 +10,7 @@ from django.utils import timezone
 
				 from django_stubs_ext.db.models import TypedModelMeta
			
 
				 from signal_webhooks.models import WebhookBase
			
 
				 
			
 
				-from base_models.models import get_or_create_system_user_pk
			
 
				+from archivebox.base_models.models import get_or_create_system_user_pk
			
 
				 
			
 
				 
			
 
				 def generate_secret_token() -> str:
			
@@ -26,6 +26,7 @@ class APIToken(models.Model):
 
				     expires = models.DateTimeField(null=True, blank=True)
			
 
				 
			
 
				     class Meta(TypedModelMeta):
			
 
				+        app_label = 'api'
			
 
				         verbose_name = "API Key"
			
 
				         verbose_name_plural = "API Keys"
			
 
				 
			
@@ -47,6 +48,7 @@ class OutboundWebhook(WebhookBase):
 
				     modified_at = models.DateTimeField(auto_now=True)
			
 
				 
			
 
				     class Meta(WebhookBase.Meta):
			
 
				+        app_label = 'api'
			
 
				         verbose_name = 'API Outbound Webhook'
			
 
				 
			
 
				     def __str__(self) -> str:
			
--- a/archivebox/api/v1_api.py
+++ b/archivebox/api/v1_api.py
@@ -15,7 +15,7 @@ from ninja import NinjaAPI, Swagger
 
				 from archivebox.config import VERSION
			
 
				 from archivebox.config.version import get_COMMIT_HASH
			
 
				 
			
 
				-from api.auth import API_AUTH_METHODS
			
 
				+from archivebox.api.auth import API_AUTH_METHODS
			
 
				 
			
 
				 
			
 
				 COMMIT_HASH = get_COMMIT_HASH() or 'unknown'
			
--- a/archivebox/api/v1_auth.py
+++ b/archivebox/api/v1_auth.py
@@ -6,8 +6,8 @@ from ninja import Router, Schema
 
				 from django.utils import timezone
			
 
				 from datetime import timedelta
			
 
				 
			
 
				-from api.models import APIToken
			
 
				-from api.auth import auth_using_token, auth_using_password, get_or_create_api_token
			
 
				+from archivebox.api.models import APIToken
			
 
				+from archivebox.api.auth import auth_using_token, auth_using_password, get_or_create_api_token
			
 
				 
			
 
				 
			
 
				 router = Router(tags=['Authentication'], auth=None)
			
--- a/archivebox/api/v1_cli.py
+++ b/archivebox/api/v1_cli.py
@@ -118,6 +118,7 @@ def cli_add(request, args: AddCommandSchema):
 
				         plugins=args.plugins,
			
 
				         parser=args.parser,
			
 
				         bg=True,  # Always run in background for API calls
			
 
				+        created_by_id=request.user.pk,
			
 
				     )
			
 
				 
			
 
				     return {
			
--- a/archivebox/api/v1_core.py
+++ b/archivebox/api/v1_core.py
@@ -14,8 +14,8 @@ from ninja import Router, Schema, FilterSchema, Field, Query
 
				 from ninja.pagination import paginate, PaginationBase
			
 
				 from ninja.errors import HttpError
			
 
				 
			
 
				-from core.models import Snapshot, ArchiveResult, Tag
			
 
				-from api.v1_crawls import CrawlSchema
			
 
				+from archivebox.core.models import Snapshot, ArchiveResult, Tag
			
 
				+from archivebox.api.v1_crawls import CrawlSchema
			
 
				 
			
 
				 
			
 
				 router = Router(tags=['Core Models'])
			
@@ -80,12 +80,11 @@ class MinimalArchiveResultSchema(Schema):
 
				 
			
 
				     @staticmethod
			
 
				     def resolve_created_by_id(obj):
			
 
				-        return str(obj.created_by_id)
			
 
				+        return str(obj.created_by.pk)
			
 
				 
			
 
				     @staticmethod
			
 
				     def resolve_created_by_username(obj) -> str:
			
 
				-        User = get_user_model()
			
 
				-        return User.objects.filter(pk=obj.created_by_id).values_list('username', flat=True)[0]
			
 
				+        return obj.created_by.username
			
 
				 
			
 
				 
			
 
				 class ArchiveResultSchema(MinimalArchiveResultSchema):
			
@@ -166,12 +165,11 @@ class SnapshotSchema(Schema):
 
				 
			
 
				     @staticmethod
			
 
				     def resolve_created_by_id(obj):
			
 
				-        return str(obj.created_by_id)
			
 
				+        return str(obj.created_by.pk)
			
 
				 
			
 
				     @staticmethod
			
 
				     def resolve_created_by_username(obj):
			
 
				-        User = get_user_model()
			
 
				-        return User.objects.get(id=obj.created_by_id).username
			
 
				+        return obj.created_by.username
			
 
				 
			
 
				     @staticmethod
			
 
				     def resolve_tags(obj):
			
@@ -190,8 +188,8 @@ class SnapshotSchema(Schema):
 
				 
			
 
				 class SnapshotFilterSchema(FilterSchema):
			
 
				     id: Optional[str] = Field(None, q=['id__icontains', 'timestamp__startswith'])
			
 
				-    created_by_id: str = Field(None, q='created_by_id')
			
 
				-    created_by_username: str = Field(None, q='created_by__username__icontains')
			
 
				+    created_by_id: str = Field(None, q='crawl__created_by_id')
			
 
				+    created_by_username: str = Field(None, q='crawl__created_by__username__icontains')
			
 
				     created_at__gte: datetime = Field(None, q='created_at__gte')
			
 
				     created_at__lt: datetime = Field(None, q='created_at__lt')
			
 
				     created_at: datetime = Field(None, q='created_at')
			
--- a/archivebox/api/v1_crawls.py
+++ b/archivebox/api/v1_crawls.py
@@ -9,8 +9,8 @@ from django.contrib.auth import get_user_model
 
				 
			
 
				 from ninja import Router, Schema
			
 
				 
			
 
				-from core.models import Snapshot
			
 
				-from crawls.models import Crawl
			
 
				+from archivebox.core.models import Snapshot
			
 
				+from archivebox.crawls.models import Crawl
			
 
				 
			
 
				 from .auth import API_AUTH_METHODS
			
 
				 
			
--- a/archivebox/api/v1_machine.py
+++ b/archivebox/api/v1_machine.py
@@ -7,7 +7,7 @@ from datetime import datetime
 
				 from ninja import Router, Schema, FilterSchema, Field, Query
			
 
				 from ninja.pagination import paginate
			
 
				 
			
 
				-from api.v1_core import CustomPagination
			
 
				+from archivebox.api.v1_core import CustomPagination
			
 
				 
			
 
				 
			
 
				 router = Router(tags=['Machine and Dependencies'])
			
@@ -102,14 +102,14 @@ class BinaryFilterSchema(FilterSchema):
 
				 @paginate(CustomPagination)
			
 
				 def get_machines(request, filters: MachineFilterSchema = Query(...)):
			
 
				     """List all machines."""
			
 
				-    from machine.models import Machine
			
 
				+    from archivebox.machine.models import Machine
			
 
				     return filters.filter(Machine.objects.all()).distinct()
			
 
				 
			
 
				 
			
 
				 @router.get("/machine/{machine_id}", response=MachineSchema, url_name="get_machine")
			
 
				 def get_machine(request, machine_id: str):
			
 
				     """Get a specific machine by ID."""
			
 
				-    from machine.models import Machine
			
 
				+    from archivebox.machine.models import Machine
			
 
				     from django.db.models import Q
			
 
				+    # machine_id may be an ID prefix or a hostname (hostname is matched case-insensitively).
			
 
				+    # NOTE(review): .get() raises unless exactly one row matches -- confirm callers expect that.
			
 
				     return Machine.objects.get(Q(id__startswith=machine_id) | Q(hostname__iexact=machine_id))
			
 
				 
			
@@ -117,7 +117,7 @@ def get_machine(request, machine_id: str):
 
				 @router.get("/machine/current", response=MachineSchema, url_name="get_current_machine")
			
 
				 def get_current_machine(request):
			
 
				     """Get the current machine."""
			
 
				-    from machine.models import Machine
			
 
				+    from archivebox.machine.models import Machine
			
 
				+    # NOTE(review): "/machine/{machine_id}" is registered earlier in this module; confirm that
			
 
				+    # GET /machine/current reaches this view rather than being captured as machine_id="current".
			
 
				     return Machine.current()
			
 
				 
			
 
				 
			
@@ -132,19 +132,19 @@ def get_current_machine(request):
 
				 @paginate(CustomPagination)
			
 
				 def get_binaries(request, filters: BinaryFilterSchema = Query(...)):
			
 
				     """List all binaries."""
			
 
				-    from machine.models import Binary
			
 
				+    from archivebox.machine.models import Binary
			
 
				     return filters.filter(Binary.objects.all().select_related('machine', 'dependency')).distinct()
			
 
				 
			
 
				 
			
 
				 @router.get("/binary/{binary_id}", response=BinarySchema, url_name="get_binary")
			
 
				 def get_binary(request, binary_id: str):
			
 
				     """Get a specific binary by ID."""
			
 
				-    from machine.models import Binary
			
 
				+    from archivebox.machine.models import Binary
			
 
				     return Binary.objects.select_related('machine', 'dependency').get(id__startswith=binary_id)
			
 
				 
			
 
				 
			
 
				 @router.get("/binary/by-name/{name}", response=List[BinarySchema], url_name="get_binaries_by_name")
			
 
				 def get_binaries_by_name(request, name: str):
			
 
				     """Get all binaries with the given name."""
			
 
				-    from machine.models import Binary
			
 
				+    from archivebox.machine.models import Binary
			
 
				     return list(Binary.objects.filter(name__iexact=name).select_related('machine', 'dependency'))
			
--- a/archivebox/base_models/models.py
+++ b/archivebox/base_models/models.py
@@ -12,6 +12,7 @@ from pathlib import Path
 
				 
			
 
				 from django.contrib import admin
			
 
				 from django.db import models
			
 
				+from django.db.models import F
			
 
				 from django.utils import timezone
			
 
				 from django.contrib.auth import get_user_model
			
 
				 from django.urls import reverse_lazy
			
@@ -110,6 +111,11 @@ class ModelWithHealthStats(models.Model):
 
				         total = max(self.num_uses_failed + self.num_uses_succeeded, 1)
			
 
				         return round((self.num_uses_succeeded / total) * 100)
			
 
				 
			
 
				+    def increment_health_stats(self, success: bool):
			
 
				+        """Atomically increment this row's success or failure counter.
			
 
				+
			
 
				+        The increment is issued as a single queryset ``update()`` with an ``F()``
			
 
				+        expression, so the database computes ``counter + 1`` itself instead of
			
 
				+        writing back a value that was read earlier in Python.
			
 
				+
			
 
				+        Args:
			
 
				+            success: True bumps ``num_uses_succeeded``; False bumps ``num_uses_failed``.
			
 
				+
			
 
				+        Note:
			
 
				+            Only the database row is written; this method does not assign to the
			
 
				+            counter attributes on ``self``.
			
 
				+        """
			
 
				+        # Pick which counter column to bump based on the outcome.
			
 
				+        field = 'num_uses_succeeded' if success else 'num_uses_failed'
			
 
				+        # Filtering on pk restricts the UPDATE to this object's own row.
			
 
				+        type(self).objects.filter(pk=self.pk).update(**{field: F(field) + 1})
			
 
				+
			
 
				 
			
 
				 class ModelWithConfig(models.Model):
			
 
				     """Mixin for models with a JSON config field."""
			
--- a/archivebox/cli/archivebox_add.py
+++ b/archivebox/cli/archivebox_add.py
@@ -19,7 +19,7 @@ from archivebox.config.permissions import USER, HOSTNAME
 
				 
			
 
				 
			
 
				 if TYPE_CHECKING:
			
 
				-    from core.models import Snapshot
			
 
				+    from archivebox.core.models import Snapshot
			
 
				 
			
 
				 
			
 
				 @enforce_types
			
@@ -53,8 +53,8 @@ def add(urls: str | list[str],
 
				     assert depth in (0, 1, 2, 3, 4), 'Depth must be 0-4'
			
 
				 
			
 
				     # import models once django is set up
			
 
				-    from core.models import Snapshot
			
 
				-    from crawls.models import Crawl
			
 
				+    from archivebox.core.models import Snapshot
			
 
				+    from archivebox.crawls.models import Crawl
			
 
				     from archivebox.base_models.models import get_or_create_system_user_pk
			
 
				     from workers.orchestrator import Orchestrator
			
 
				 
			
--- a/archivebox/cli/archivebox_config.py
+++ b/archivebox/cli/archivebox_config.py
@@ -66,18 +66,38 @@ def config(*keys,
 
				                 raise SystemExit(1)
			
 
				         else:
			
 
				             matching_config = FLAT_CONFIG
			
 
				-        
			
 
				+
			
 
				+        # Display core config sections
			
 
				         for config_section in CONFIGS.values():
			
 
				             if hasattr(config_section, 'toml_section_header'):
			
 
				                 print(f'[grey53]\\[{config_section.toml_section_header}][/grey53]')
			
 
				             else:
			
 
				                 print('[grey53]\\[CONSTANTS]                                        # (read-only)[/grey53]')
			
 
				-            
			
 
				+
			
 
				             kv_in_section = {key: val for key, val in dict(config_section).items() if key in matching_config}
			
 
				             print(benedict(kv_in_section).to_toml(encoder=CustomTOMLEncoder()).strip().replace('\n\n', '\n'))
			
 
				             print('[grey53]################################################################[/grey53]')
			
 
				-            
			
 
				-        
			
 
				+
			
 
				+        # Display plugin config section
			
 
				+        from archivebox.hooks import discover_plugin_configs
			
 
				+
			
 
				+        plugin_configs = discover_plugin_configs()
			
 
				+        plugin_keys = {}
			
 
				+
			
 
				+        # Collect all plugin config keys
			
 
				+        for plugin_name, schema in plugin_configs.items():
			
 
				+            if 'properties' not in schema:
			
 
				+                continue
			
 
				+            for key in schema['properties'].keys():
			
 
				+                if key in matching_config:
			
 
				+                    plugin_keys[key] = matching_config[key]
			
 
				+
			
 
				+        # Display all plugin config in single [PLUGINS] section
			
 
				+        if plugin_keys:
			
 
				+            print(f'[grey53]\\[PLUGINS][/grey53]')
			
 
				+            print(benedict(plugin_keys).to_toml(encoder=CustomTOMLEncoder()).strip().replace('\n\n', '\n'))
			
 
				+            print('[grey53]################################################################[/grey53]')
			
 
				+
			
 
				         raise SystemExit(not matching_config)
			
 
				 
			
 
				     elif set:
			
--- a/archivebox/cli/archivebox_crawl.py
+++ b/archivebox/cli/archivebox_crawl.py
@@ -72,11 +72,11 @@ def discover_outlinks(
 
				 
			
 
				     from archivebox.misc.jsonl import (
			
 
				         read_args_or_stdin, write_record,
			
 
				-        TYPE_SNAPSHOT, get_or_create_snapshot
			
 
				+        TYPE_SNAPSHOT
			
 
				     )
			
 
				     from archivebox.base_models.models import get_or_create_system_user_pk
			
 
				-    from core.models import Snapshot, ArchiveResult
			
 
				-    from crawls.models import Crawl
			
 
				+    from archivebox.core.models import Snapshot, ArchiveResult
			
 
				+    from archivebox.crawls.models import Crawl
			
 
				     from archivebox.config import CONSTANTS
			
 
				     from workers.orchestrator import Orchestrator
			
 
				 
			
@@ -130,8 +130,10 @@ def discover_outlinks(
 
				                 record['crawl_id'] = str(crawl.id)
			
 
				                 record['depth'] = record.get('depth', 0)
			
 
				 
			
 
				-                snapshot = get_or_create_snapshot(record, created_by_id=created_by_id)
			
 
				-                snapshot_ids.append(str(snapshot.id))
			
 
				+                overrides = {'created_by_id': created_by_id}
			
 
				+                snapshot = Snapshot.from_jsonl(record, overrides=overrides)
			
 
				+                if snapshot:
			
 
				+                    snapshot_ids.append(str(snapshot.id))
			
 
				 
			
 
				             except Exception as e:
			
 
				                 rprint(f'[red]Error creating snapshot: {e}[/red]', file=sys.stderr)
			
@@ -162,7 +164,6 @@ def discover_outlinks(
 
				                     defaults={
			
 
				                         'status': ArchiveResult.StatusChoices.QUEUED,
			
 
				                         'retry_at': timezone.now(),
			
 
				-                        'created_by_id': snapshot.created_by_id,
			
 
				                     }
			
 
				                 )
			
 
				             else:
			
@@ -229,7 +230,7 @@ def process_crawl_by_id(crawl_id: str) -> int:
 
				     - Transition from started -> sealed (when all snapshots done)
			
 
				     """
			
 
				     from rich import print as rprint
			
 
				-    from crawls.models import Crawl
			
 
				+    from archivebox.crawls.models import Crawl
			
 
				 
			
 
				     try:
			
 
				         crawl = Crawl.objects.get(id=crawl_id)
			
@@ -256,7 +257,7 @@ def is_crawl_id(value: str) -> bool:
 
				     if not uuid_pattern.match(value):
			
 
				         return False
			
 
				     # Verify it's actually a Crawl (not a Snapshot or other object)
			
 
				-    from crawls.models import Crawl
			
 
				+    from archivebox.crawls.models import Crawl
			
 
				     return Crawl.objects.filter(id=value).exists()
			
 
				 
			
 
				 
			
--- a/archivebox/cli/archivebox_extract.py
+++ b/archivebox/cli/archivebox_extract.py
@@ -43,7 +43,7 @@ def process_archiveresult_by_id(archiveresult_id: str) -> int:
 
				     Triggers the ArchiveResult's state machine tick() to run the extractor plugin.
			
 
				     """
			
 
				     from rich import print as rprint
			
 
				-    from core.models import ArchiveResult
			
 
				+    from archivebox.core.models import ArchiveResult
			
 
				 
			
 
				     try:
			
 
				         archiveresult = ArchiveResult.objects.get(id=archiveresult_id)
			
@@ -95,7 +95,7 @@ def run_plugins(
 
				         read_args_or_stdin, write_record, archiveresult_to_jsonl,
			
 
				         TYPE_SNAPSHOT, TYPE_ARCHIVERESULT
			
 
				     )
			
 
				-    from core.models import Snapshot, ArchiveResult
			
 
				+    from archivebox.core.models import Snapshot, ArchiveResult
			
 
				     from workers.orchestrator import Orchestrator
			
 
				 
			
 
				     is_tty = sys.stdout.isatty()
			
@@ -155,7 +155,6 @@ def run_plugins(
 
				                 defaults={
			
 
				                     'status': ArchiveResult.StatusChoices.QUEUED,
			
 
				                     'retry_at': timezone.now(),
			
 
				-                    'created_by_id': snapshot.created_by_id,
			
 
				                 }
			
 
				             )
			
 
				             if not created and result.status in [ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED]:
			
@@ -218,7 +217,7 @@ def is_archiveresult_id(value: str) -> bool:
 
				     if not uuid_pattern.match(value):
			
 
				         return False
			
 
				     # Verify it's actually an ArchiveResult (not a Snapshot or other object)
			
 
				-    from core.models import ArchiveResult
			
 
				+    from archivebox.core.models import ArchiveResult
			
 
				     return ArchiveResult.objects.filter(id=value).exists()
			
 
				 
			
 
				 
			
--- a/archivebox/cli/archivebox_init.py
+++ b/archivebox/cli/archivebox_init.py
@@ -95,7 +95,7 @@ def init(force: bool=False, quick: bool=False, install: bool=False, setup: bool=
 
				     print()
			
 
				     print('[dodger_blue3][*] Checking links from indexes and archive folders (safe to Ctrl+C)...[/dodger_blue3]')
			
 
				 
			
 
				-    from core.models import Snapshot
			
 
				+    from archivebox.core.models import Snapshot
			
 
				 
			
 
				     all_links = Snapshot.objects.none()
			
 
				     pending_links: dict[str, SnapshotDict] = {}
			
--- a/archivebox/cli/archivebox_install.py
+++ b/archivebox/cli/archivebox_install.py
@@ -42,7 +42,7 @@ def install(dry_run: bool=False) -> None:
 
				     setup_django()
			
 
				 
			
 
				     from django.utils import timezone
			
 
				-    from crawls.models import Crawl
			
 
				+    from archivebox.crawls.models import Crawl
			
 
				     from archivebox.base_models.models import get_or_create_system_user_pk
			
 
				 
			
 
				     # Create a crawl for dependency detection
			
@@ -70,7 +70,7 @@ def install(dry_run: bool=False) -> None:
 
				     print(f'[+] Crawl status: {crawl.status}, retry_at: {crawl.retry_at}')
			
 
				 
			
 
				     # Verify the crawl is in the queue
			
 
				-    from crawls.models import Crawl as CrawlModel
			
 
				+    from archivebox.crawls.models import Crawl as CrawlModel
			
 
				     queued_crawls = CrawlModel.objects.filter(
			
 
				         retry_at__lte=timezone.now()
			
 
				     ).exclude(
			
--- a/archivebox/cli/archivebox_remove.py
+++ b/archivebox/cli/archivebox_remove.py
@@ -71,7 +71,7 @@ def remove(filter_patterns: Iterable[str]=(),
 
				     to_remove = snapshots.count()
			
 
				 
			
 
				     from archivebox.search import flush_search_index
			
 
				-    from core.models import Snapshot
			
 
				+    from archivebox.core.models import Snapshot
			
 
				 
			
 
				     flush_search_index(snapshots=snapshots)
			
 
				     snapshots.delete()
			
--- a/archivebox/cli/archivebox_search.py
+++ b/archivebox/cli/archivebox_search.py
@@ -36,7 +36,7 @@ def get_snapshots(snapshots: Optional[QuerySet]=None,
 
				                   before: Optional[float]=None,
			
 
				                   out_dir: Path=DATA_DIR) -> QuerySet:
			
 
				     """Filter and return Snapshots matching the given criteria."""
			
 
				-    from core.models import Snapshot
			
 
				+    from archivebox.core.models import Snapshot
			
 
				 
			
 
				     if snapshots:
			
 
				         result = snapshots
			
@@ -68,7 +68,7 @@ def search(filter_patterns: list[str] | None=None,
 
				            csv: str | None=None,
			
 
				            with_headers: bool=False):
			
 
				     """List, filter, and export information about archive entries"""
			
 
				-    from core.models import Snapshot
			
 
				+    from archivebox.core.models import Snapshot
			
 
				 
			
 
				     if with_headers and not (json or html or csv):
			
 
				         stderr('[X] --with-headers requires --json, --html or --csv\n', color='red')
			
--- a/archivebox/cli/archivebox_snapshot.py
+++ b/archivebox/cli/archivebox_snapshot.py
@@ -46,7 +46,7 @@ def process_snapshot_by_id(snapshot_id: str) -> int:
 
				     - Transition from started -> sealed (when all ArchiveResults done)
			
 
				     """
			
 
				     from rich import print as rprint
			
 
				-    from core.models import Snapshot
			
 
				+    from archivebox.core.models import Snapshot
			
 
				 
			
 
				     try:
			
 
				         snapshot = Snapshot.objects.get(id=snapshot_id)
			
@@ -88,11 +88,11 @@ def create_snapshots(
 
				 
			
 
				     from archivebox.misc.jsonl import (
			
 
				         read_args_or_stdin, write_record, snapshot_to_jsonl,
			
 
				-        TYPE_SNAPSHOT, TYPE_TAG, get_or_create_snapshot
			
 
				+        TYPE_SNAPSHOT, TYPE_TAG
			
 
				     )
			
 
				     from archivebox.base_models.models import get_or_create_system_user_pk
			
 
				-    from core.models import Snapshot
			
 
				-    from crawls.models import Crawl
			
 
				+    from archivebox.core.models import Snapshot
			
 
				+    from archivebox.crawls.models import Crawl
			
 
				     from archivebox.config import CONSTANTS
			
 
				 
			
 
				     created_by_id = created_by_id or get_or_create_system_user_pk()
			
@@ -137,8 +137,10 @@ def create_snapshots(
 
				                 record['tags'] = tag
			
 
				 
			
 
				             # Get or create the snapshot
			
 
				-            snapshot = get_or_create_snapshot(record, created_by_id=created_by_id)
			
 
				-            created_snapshots.append(snapshot)
			
 
				+            overrides = {'created_by_id': created_by_id}
			
 
				+            snapshot = Snapshot.from_jsonl(record, overrides=overrides)
			
 
				+            if snapshot:
			
 
				+                created_snapshots.append(snapshot)
			
 
				 
			
 
				             # Output JSONL record (only when piped)
			
 
				             if not is_tty:
			
--- a/archivebox/cli/archivebox_status.py
+++ b/archivebox/cli/archivebox_status.py
@@ -21,7 +21,7 @@ def status(out_dir: Path=DATA_DIR) -> None:
 
				 
			
 
				     from django.contrib.auth import get_user_model
			
 
				     from archivebox.misc.db import get_admins
			
 
				-    from core.models import Snapshot
			
 
				+    from archivebox.core.models import Snapshot
			
 
				     User = get_user_model()
			
 
				 
			
 
				     print('[green]\\[*] Scanning archive main index...[/green]')
			
--- a/archivebox/cli/archivebox_update.py
+++ b/archivebox/cli/archivebox_update.py
@@ -36,7 +36,7 @@ def update(filter_patterns: Iterable[str] = (),
 
				     from archivebox.config.django import setup_django
			
 
				     setup_django()
			
 
				 
			
 
				-    from core.models import Snapshot
			
 
				+    from archivebox.core.models import Snapshot
			
 
				     from django.utils import timezone
			
 
				 
			
 
				     while True:
			
@@ -83,7 +83,7 @@ def import_orphans_from_archive(resume_from: str = None, batch_size: int = 100)
 
				     Skip symlinks (already migrated).
			
 
				     Create DB records and trigger migration on save().
			
 
				     """
			
 
				-    from core.models import Snapshot
			
 
				+    from archivebox.core.models import Snapshot
			
 
				     from archivebox.config import CONSTANTS
			
 
				     from django.db import transaction
			
 
				 
			
@@ -151,7 +151,7 @@ def process_all_db_snapshots(batch_size: int = 100) -> dict:
 
				     Process all snapshots in DB.
			
 
				     Reconcile index.json and queue for archiving.
			
 
				     """
			
 
				-    from core.models import Snapshot
			
 
				+    from archivebox.core.models import Snapshot
			
 
				     from django.db import transaction
			
 
				     from django.utils import timezone
			
 
				 
			
@@ -189,7 +189,7 @@ def process_filtered_snapshots(
 
				     batch_size: int
			
 
				 ) -> dict:
			
 
				     """Process snapshots matching filters (DB query only)."""
			
 
				-    from core.models import Snapshot
			
 
				+    from archivebox.core.models import Snapshot
			
 
				     from django.db import transaction
			
 
				     from django.utils import timezone
			
 
				     from datetime import datetime
			
--- a/archivebox/cli/archivebox_version.py
+++ b/archivebox/cli/archivebox_version.py
@@ -107,7 +107,7 @@ def version(quiet: bool=False,
 
				     from archivebox.config.django import setup_django
			
 
				     setup_django()
			
 
				 
			
 
				-    from machine.models import Machine, Binary
			
 
				+    from archivebox.machine.models import Machine, Binary
			
 
				 
			
 
				     machine = Machine.current()
			
 
				 
			
--- a/archivebox/cli/tests_piping.py
+++ b/archivebox/cli/tests_piping.py
@@ -542,10 +542,10 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
 
				         Test: archivebox snapshot URL
			
 
				         Should create a Snapshot and output JSONL when piped.
			
 
				         """
			
 
				-        from core.models import Snapshot
			
 
				+        from archivebox.core.models import Snapshot
			
 
				         from archivebox.misc.jsonl import (
			
 
				             read_args_or_stdin, write_record, snapshot_to_jsonl,
			
 
				-            TYPE_SNAPSHOT, get_or_create_snapshot
			
 
				+            TYPE_SNAPSHOT
			
 
				         )
			
 
				         from archivebox.base_models.models import get_or_create_system_user_pk
			
 
				 
			
@@ -559,7 +559,8 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
 
				         self.assertEqual(records[0]['url'], url)
			
 
				 
			
 
				         # Create snapshot
			
 
				-        snapshot = get_or_create_snapshot(records[0], created_by_id=created_by_id)
			
 
				+        overrides = {'created_by_id': created_by_id}
			
 
				+        snapshot = Snapshot.from_jsonl(records[0], overrides=overrides)
			
 
				 
			
 
				         self.assertIsNotNone(snapshot.id)
			
 
				         self.assertEqual(snapshot.url, url)
			
@@ -575,9 +576,9 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
 
				         Test: archivebox snapshot URL | archivebox extract
			
 
				         Extract should accept JSONL output from snapshot command.
			
 
				         """
			
 
				-        from core.models import Snapshot, ArchiveResult
			
 
				+        from archivebox.core.models import Snapshot, ArchiveResult
			
 
				         from archivebox.misc.jsonl import (
			
 
				-            snapshot_to_jsonl, read_args_or_stdin, get_or_create_snapshot,
			
 
				+            snapshot_to_jsonl, read_args_or_stdin,
			
 
				             TYPE_SNAPSHOT
			
 
				         )
			
 
				         from archivebox.base_models.models import get_or_create_system_user_pk
			
@@ -586,7 +587,8 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
 
				 
			
 
				         # Step 1: Create snapshot (simulating 'archivebox snapshot')
			
 
				         url = 'https://test-extract-1.example.com'
			
 
				-        snapshot = get_or_create_snapshot({'url': url}, created_by_id=created_by_id)
			
 
				+        overrides = {'created_by_id': created_by_id}
			
 
				+        snapshot = Snapshot.from_jsonl({'url': url}, overrides=overrides)
			
 
				         snapshot_output = snapshot_to_jsonl(snapshot)
			
 
				 
			
 
				         # Step 2: Parse snapshot output as extract input
			
@@ -648,7 +650,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
 
				 
			
 
				         This is equivalent to: archivebox add URL
			
 
				         """
			
 
				-        from core.models import Snapshot
			
 
				+        from archivebox.core.models import Snapshot
			
 
				         from archivebox.misc.jsonl import (
			
 
				             get_or_create_snapshot, snapshot_to_jsonl, read_args_or_stdin,
			
 
				             TYPE_SNAPSHOT
			
@@ -682,7 +684,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
 
				 
			
 
				         This is equivalent to: archivebox add --depth=1 URL
			
 
				         """
			
 
				-        from core.models import Snapshot
			
 
				+        from archivebox.core.models import Snapshot
			
 
				         from archivebox.misc.jsonl import (
			
 
				             get_or_create_snapshot, snapshot_to_jsonl, read_args_or_stdin,
			
 
				             TYPE_SNAPSHOT
			
@@ -772,7 +774,7 @@ class TestDepthWorkflows(unittest.TestCase):
 
				 
			
 
				         Depth 0: Only archive the specified URL, no crawling.
			
 
				         """
			
 
				-        from core.models import Snapshot
			
 
				+        from archivebox.core.models import Snapshot
			
 
				         from archivebox.misc.jsonl import get_or_create_snapshot
			
 
				         from archivebox.base_models.models import get_or_create_system_user_pk
			
 
				 
			
--- a/archivebox/config/__init__.py
+++ b/archivebox/config/__init__.py
@@ -35,177 +35,41 @@ def _get_config():
 
				 # These are recalculated each time the module attribute is accessed
			
 
				 
			
 
				 def __getattr__(name: str):
			
 
				-    """Module-level __getattr__ for lazy config loading."""
			
 
				-    
			
 
				-    # Timeout settings
			
 
				+    """
			
 
				+    Module-level __getattr__ for lazy config loading.
			
 
				+
			
 
				+    Only provides backwards compatibility for GENERIC/SHARED config.
			
 
				+    Plugin-specific config (binaries, args, toggles) should come from plugin config.json files.
			
 
				+    """
			
 
				+
			
 
				+    # Generic timeout settings (used by multiple plugins)
			
 
				     if name == 'TIMEOUT':
			
 
				         cfg, _ = _get_config()
			
 
				         return cfg.TIMEOUT
			
 
				-    if name == 'MEDIA_TIMEOUT':
			
 
				-        cfg, _ = _get_config()
			
 
				-        return cfg.MEDIA_TIMEOUT
			
 
				-    
			
 
				-    # SSL/Security settings
			
 
				+
			
 
				+    # Generic SSL/Security settings (used by multiple plugins)
			
 
				     if name == 'CHECK_SSL_VALIDITY':
			
 
				         cfg, _ = _get_config()
			
 
				         return cfg.CHECK_SSL_VALIDITY
			
 
				-    
			
 
				-    # Storage settings  
			
 
				+
			
 
				+    # Generic storage settings (used by multiple plugins)
			
 
				     if name == 'RESTRICT_FILE_NAMES':
			
 
				         _, storage = _get_config()
			
 
				         return storage.RESTRICT_FILE_NAMES
			
 
				-    
			
 
				-    # User agent / cookies
			
 
				+
			
 
				+    # Generic user agent / cookies (used by multiple plugins)
			
 
				     if name == 'COOKIES_FILE':
			
 
				         cfg, _ = _get_config()
			
 
				         return cfg.COOKIES_FILE
			
 
				     if name == 'USER_AGENT':
			
 
				         cfg, _ = _get_config()
			
 
				         return cfg.USER_AGENT
			
 
				-    if name == 'CURL_USER_AGENT':
			
 
				-        cfg, _ = _get_config()
			
 
				-        return cfg.USER_AGENT
			
 
				-    if name == 'WGET_USER_AGENT':
			
 
				-        cfg, _ = _get_config()
			
 
				-        return cfg.USER_AGENT
			
 
				-    if name == 'CHROME_USER_AGENT':
			
 
				-        cfg, _ = _get_config()
			
 
				-        return cfg.USER_AGENT
			
 
				-    
			
 
				-    # Archive method toggles (SAVE_*)
			
 
				-    if name == 'SAVE_TITLE':
			
 
				-        return True
			
 
				-    if name == 'SAVE_FAVICON':
			
 
				-        return True
			
 
				-    if name == 'SAVE_WGET':
			
 
				-        return True
			
 
				-    if name == 'SAVE_WARC':
			
 
				-        return True
			
 
				-    if name == 'SAVE_WGET_REQUISITES':
			
 
				-        return True
			
 
				-    if name == 'SAVE_SINGLEFILE':
			
 
				-        return True
			
 
				-    if name == 'SAVE_READABILITY':
			
 
				-        return True
			
 
				-    if name == 'SAVE_MERCURY':
			
 
				-        return True
			
 
				-    if name == 'SAVE_HTMLTOTEXT':
			
 
				-        return True
			
 
				-    if name == 'SAVE_PDF':
			
 
				-        return True
			
 
				-    if name == 'SAVE_SCREENSHOT':
			
 
				-        return True
			
 
				-    if name == 'SAVE_DOM':
			
 
				-        return True
			
 
				-    if name == 'SAVE_HEADERS':
			
 
				-        return True
			
 
				-    if name == 'SAVE_GIT':
			
 
				-        return True
			
 
				-    if name == 'SAVE_MEDIA':
			
 
				-        return True
			
 
				-    if name == 'SAVE_ARCHIVE_DOT_ORG':
			
 
				-        return True
			
 
				-    
			
 
				-    # Extractor-specific settings
			
 
				+
			
 
				+    # Generic resolution settings (used by multiple plugins)
			
 
				     if name == 'RESOLUTION':
			
 
				         cfg, _ = _get_config()
			
 
				         return cfg.RESOLUTION
			
 
				-    if name == 'GIT_DOMAINS':
			
 
				-        return 'github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht'
			
 
				-    if name == 'MEDIA_MAX_SIZE':
			
 
				-        cfg, _ = _get_config()
			
 
				-        return cfg.MEDIA_MAX_SIZE
			
 
				-    if name == 'FAVICON_PROVIDER':
			
 
				-        return 'https://www.google.com/s2/favicons?domain={}'
			
 
				-    
			
 
				-    # Binary paths (use shutil.which for detection)
			
 
				-    if name == 'CURL_BINARY':
			
 
				-        return shutil.which('curl') or 'curl'
			
 
				-    if name == 'WGET_BINARY':
			
 
				-        return shutil.which('wget') or 'wget'
			
 
				-    if name == 'GIT_BINARY':
			
 
				-        return shutil.which('git') or 'git'
			
 
				-    if name == 'YOUTUBEDL_BINARY':
			
 
				-        return shutil.which('yt-dlp') or shutil.which('youtube-dl') or 'yt-dlp'
			
 
				-    if name == 'CHROME_BINARY':
			
 
				-        for chrome in ['chromium', 'chromium-browser', 'google-chrome', 'google-chrome-stable', 'chrome']:
			
 
				-            path = shutil.which(chrome)
			
 
				-            if path:
			
 
				-                return path
			
 
				-        return 'chromium'
			
 
				-    if name == 'NODE_BINARY':
			
 
				-        return shutil.which('node') or 'node'
			
 
				-    if name == 'SINGLEFILE_BINARY':
			
 
				-        return shutil.which('single-file') or shutil.which('singlefile') or 'single-file'
			
 
				-    if name == 'READABILITY_BINARY':
			
 
				-        return shutil.which('readability-extractor') or 'readability-extractor'
			
 
				-    if name == 'MERCURY_BINARY':
			
 
				-        return shutil.which('mercury-parser') or shutil.which('postlight-parser') or 'mercury-parser'
			
 
				-    
			
 
				-    # Binary versions (return placeholder, actual version detection happens elsewhere)
			
 
				-    if name == 'CURL_VERSION':
			
 
				-        return 'curl'
			
 
				-    if name == 'WGET_VERSION':
			
 
				-        return 'wget'
			
 
				-    if name == 'GIT_VERSION':
			
 
				-        return 'git'
			
 
				-    if name == 'YOUTUBEDL_VERSION':
			
 
				-        return 'yt-dlp'
			
 
				-    if name == 'CHROME_VERSION':
			
 
				-        return 'chromium'
			
 
				-    if name == 'SINGLEFILE_VERSION':
			
 
				-        return 'singlefile'
			
 
				-    if name == 'READABILITY_VERSION':
			
 
				-        return 'readability'
			
 
				-    if name == 'MERCURY_VERSION':
			
 
				-        return 'mercury'
			
 
				-    
			
 
				-    # Binary arguments
			
 
				-    if name == 'CURL_ARGS':
			
 
				-        return ['--silent', '--location', '--compressed']
			
 
				-    if name == 'WGET_ARGS':
			
 
				-        return [
			
 
				-            '--no-verbose',
			
 
				-            '--adjust-extension',
			
 
				-            '--convert-links',
			
 
				-            '--force-directories',
			
 
				-            '--backup-converted',
			
 
				-            '--span-hosts',
			
 
				-            '--no-parent',
			
 
				-            '-e', 'robots=off',
			
 
				-        ]
			
 
				-    if name == 'GIT_ARGS':
			
 
				-        return ['--recursive']
			
 
				-    if name == 'YOUTUBEDL_ARGS':
			
 
				-        cfg, _ = _get_config()
			
 
				-        return [
			
 
				-            '--write-description',
			
 
				-            '--write-info-json',
			
 
				-            '--write-annotations',
			
 
				-            '--write-thumbnail',
			
 
				-            '--no-call-home',
			
 
				-            '--write-sub',
			
 
				-            '--write-auto-subs',
			
 
				-            '--convert-subs=srt',
			
 
				-            '--yes-playlist',
			
 
				-            '--continue',
			
 
				-            '--no-abort-on-error',
			
 
				-            '--ignore-errors',
			
 
				-            '--geo-bypass',
			
 
				-            '--add-metadata',
			
 
				-            f'--format=(bv*+ba/b)[filesize<={cfg.MEDIA_MAX_SIZE}][filesize_approx<=?{cfg.MEDIA_MAX_SIZE}]/(bv*+ba/b)',
			
 
				-        ]
			
 
				-    if name == 'SINGLEFILE_ARGS':
			
 
				-        return None  # Uses defaults
			
 
				-    if name == 'CHROME_ARGS':
			
 
				-        return []
			
 
				-    
			
 
				-    # Other settings
			
 
				-    if name == 'WGET_AUTO_COMPRESSION':
			
 
				-        return True
			
 
				-    if name == 'DEPENDENCIES':
			
 
				-        return {}  # Legacy, not used anymore
			
 
				-    
			
 
				+
			
 
				     # Allowlist/Denylist patterns (compiled regexes)
			
 
				     if name == 'SAVE_ALLOWLIST_PTN':
			
 
				         cfg, _ = _get_config()
			
@@ -213,7 +77,7 @@ def __getattr__(name: str):
 
				     if name == 'SAVE_DENYLIST_PTN':
			
 
				         cfg, _ = _get_config()
			
 
				         return cfg.SAVE_DENYLIST_PTNS
			
 
				-    
			
 
				+
			
 
				     raise AttributeError(f"module 'archivebox.config' has no attribute '{name}'")
			
 
				 
			
 
				 
			
--- a/archivebox/config/collection.py
+++ b/archivebox/config/collection.py
@@ -111,6 +111,24 @@ def load_config_file() -> Optional[benedict]:
 
				     return None
			
 
				 
			
 
				 
			
 
				+class PluginConfigSection:
			
 
				+    """Pseudo-section for all plugin config keys written to [PLUGINS] section in ArchiveBox.conf"""
			
 
				+    toml_section_header = "PLUGINS"
			
 
				+
			
 
				+    def __init__(self, key: str):
			
 
				+        self._key = key
			
 
				+
			
 
				+    def __getattr__(self, name: str) -> Any:
			
 
				+        # Allow hasattr checks to pass for the key
			
 
				+        if name == self._key:
			
 
				+            return None
			
 
				+        raise AttributeError(f"PluginConfigSection has no attribute '{name}'")
			
 
				+
			
 
				+    def update_in_place(self, warn: bool = True, persist: bool = False, **kwargs):
			
 
				+        """No-op update since plugins read config dynamically via get_config()."""
			
 
				+        pass
			
 
				+
			
 
				+
			
 
				 def section_for_key(key: str) -> Any:
			
 
				     """Find the config section containing a given key."""
			
 
				     from archivebox.config.common import (
			
@@ -121,11 +139,22 @@ def section_for_key(key: str) -> Any:
 
				         ARCHIVING_CONFIG,
			
 
				         SEARCH_BACKEND_CONFIG,
			
 
				     )
			
 
				-    
			
 
				-    for section in [SHELL_CONFIG, STORAGE_CONFIG, GENERAL_CONFIG, 
			
 
				+
			
 
				+    # First check core config sections
			
 
				+    for section in [SHELL_CONFIG, STORAGE_CONFIG, GENERAL_CONFIG,
			
 
				                     SERVER_CONFIG, ARCHIVING_CONFIG, SEARCH_BACKEND_CONFIG]:
			
 
				         if hasattr(section, key):
			
 
				             return section
			
 
				+
			
 
				+    # Check if this is a plugin config key
			
 
				+    from archivebox.hooks import discover_plugin_configs
			
 
				+
			
 
				+    plugin_configs = discover_plugin_configs()
			
 
				+    for plugin_name, schema in plugin_configs.items():
			
 
				+        if 'properties' in schema and key in schema['properties']:
			
 
				+            # All plugin config goes to [PLUGINS] section
			
 
				+            return PluginConfigSection(key)
			
 
				+
			
 
				     raise ValueError(f'No config section found for key: {key}')
			
 
				 
			
 
				 
			
--- a/archivebox/config/common.py
+++ b/archivebox/config/common.py
@@ -123,9 +123,7 @@ class ArchivingConfig(BaseConfigSet):
 
				     OVERWRITE: bool = Field(default=False)
			
 
				 
			
 
				     TIMEOUT: int = Field(default=60)
			
 
				-    MEDIA_TIMEOUT: int = Field(default=3600)
			
 
				 
			
 
				-    MEDIA_MAX_SIZE: str = Field(default="750m")
			
 
				     RESOLUTION: str = Field(default="1440,2000")
			
 
				     CHECK_SSL_VALIDITY: bool = Field(default=True)
			
 
				     USER_AGENT: str = Field(
			
@@ -141,15 +139,6 @@ class ArchivingConfig(BaseConfigSet):
 
				 
			
 
				     DEFAULT_PERSONA: str = Field(default="Default")
			
 
				 
			
 
				-    # GIT_DOMAINS: str                    = Field(default='github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht')
			
 
				-    # WGET_USER_AGENT: str                = Field(default=lambda c: c['USER_AGENT'] + ' wget/{WGET_VERSION}')
			
 
				-    # CURL_USER_AGENT: str                = Field(default=lambda c: c['USER_AGENT'] + ' curl/{CURL_VERSION}')
			
 
				-    # CHROME_USER_AGENT: str              = Field(default=lambda c: c['USER_AGENT'])
			
 
				-    # CHROME_USER_DATA_DIR: str | None    = Field(default=None)
			
 
				-    # CHROME_TIMEOUT: int                 = Field(default=0)
			
 
				-    # CHROME_HEADLESS: bool               = Field(default=True)
			
 
				-    # CHROME_SANDBOX: bool                = Field(default=lambda: not SHELL_CONFIG.IN_DOCKER)
			
 
				-
			
 
				     def validate(self):
			
 
				         if int(self.TIMEOUT) < 5:
			
 
				             print(f"[red][!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={self.TIMEOUT} seconds)[/red]", file=sys.stderr)
			
@@ -215,7 +204,6 @@ class SearchBackendConfig(BaseConfigSet):
 
				 
			
 
				     SEARCH_BACKEND_ENGINE: str = Field(default="ripgrep")
			
 
				     SEARCH_PROCESS_HTML: bool = Field(default=True)
			
 
				-    SEARCH_BACKEND_TIMEOUT: int = Field(default=10)
			
 
				 
			
 
				 
			
 
				 SEARCH_BACKEND_CONFIG = SearchBackendConfig()
			
--- a/archivebox/config/configset.py
+++ b/archivebox/config/configset.py
@@ -174,7 +174,7 @@ def get_config(
 
				     config.update(dict(ARCHIVING_CONFIG))
			
 
				     config.update(dict(SEARCH_BACKEND_CONFIG))
			
 
				 
			
 
				-    # Load from config file
			
 
				+    # Load from archivebox.config.file
			
 
				     config_file = CONSTANTS.CONFIG_FILE
			
 
				     if config_file.exists():
			
 
				         file_config = BaseConfigSet.load_from_file(config_file)
			
--- a/archivebox/config/views.py
+++ b/archivebox/config/views.py
@@ -17,7 +17,7 @@ from admin_data_views.utils import render_with_table_view, render_with_item_view
 
				 from archivebox.config import CONSTANTS
			
 
				 from archivebox.misc.util import parse_date
			
 
				 
			
 
				-from machine.models import Binary
			
 
				+from archivebox.machine.models import Binary
			
 
				 
			
 
				 
			
 
				 # Common binaries to check for
			
--- a/archivebox/core/__init__.py
+++ b/archivebox/core/__init__.py
@@ -4,7 +4,7 @@ __order__ = 100
 
				 
			
 
				 def register_admin(admin_site):
			
 
				     """Register the core.models views (Snapshot, ArchiveResult, Tag, etc.) with the admin site"""
			
 
				-    from core.admin import register_admin as do_register
			
 
				+    from archivebox.core.admin import register_admin as do_register
			
 
				     do_register(admin_site)
			
 
				 
			
 
				 
			
--- a/archivebox/core/admin.py
+++ b/archivebox/core/admin.py
@@ -3,11 +3,11 @@ __package__ = 'archivebox.core'
 
				 from django.contrib.auth import get_user_model
			
 
				 
			
 
				 
			
 
				-from core.models import Snapshot, ArchiveResult, Tag
			
 
				-from core.admin_tags import TagAdmin
			
 
				-from core.admin_snapshots import SnapshotAdmin
			
 
				-from core.admin_archiveresults import ArchiveResultAdmin
			
 
				-from core.admin_users import UserAdmin
			
 
				+from archivebox.core.models import Snapshot, ArchiveResult, Tag
			
 
				+from archivebox.core.admin_tags import TagAdmin
			
 
				+from archivebox.core.admin_snapshots import SnapshotAdmin
			
 
				+from archivebox.core.admin_archiveresults import ArchiveResultAdmin
			
 
				+from archivebox.core.admin_users import UserAdmin
			
 
				 
			
 
				 
			
 
				 def register_admin(admin_site):
			
--- a/archivebox/core/admin_archiveresults.py
+++ b/archivebox/core/admin_archiveresults.py
@@ -16,7 +16,7 @@ from archivebox.base_models.admin import BaseModelAdmin
 
				 from archivebox.hooks import get_plugin_icon
			
 
				 
			
 
				 
			
 
				-from core.models import ArchiveResult, Snapshot
			
 
				+from archivebox.core.models import ArchiveResult, Snapshot
			
 
				 
			
 
				 
			
 
				 def render_archiveresults_list(archiveresults_qs, limit=50):
			
@@ -187,7 +187,7 @@ class ArchiveResultInline(admin.TabularInline):
 
				     extra = 0
			
 
				     sort_fields = ('end_ts', 'plugin', 'output_str', 'status', 'cmd_version')
			
 
				     readonly_fields = ('id', 'result_id', 'completed', 'command', 'version')
			
 
				-    fields = ('start_ts', 'end_ts', *readonly_fields, 'plugin', 'cmd', 'cmd_version', 'pwd', 'created_by', 'status', 'retry_at', 'output_str')
			
 
				+    fields = ('start_ts', 'end_ts', *readonly_fields, 'plugin', 'cmd', 'cmd_version', 'pwd', 'status', 'retry_at', 'output_str')
			
 
				     # exclude = ('id',)
			
 
				     ordering = ('end_ts',)
			
 
				     show_change_link = True
			
@@ -229,17 +229,15 @@ class ArchiveResultInline(admin.TabularInline):
 
				         formset.form.base_fields['end_ts'].initial = timezone.now()
			
 
				         formset.form.base_fields['cmd_version'].initial = '-'
			
 
				         formset.form.base_fields['pwd'].initial = str(snapshot.output_dir)
			
 
				-        formset.form.base_fields['created_by'].initial = request.user
			
 
				         formset.form.base_fields['cmd'].initial = '["-"]'
			
 
				         formset.form.base_fields['output_str'].initial = 'Manually recorded cmd output...'
			
 
				-        
			
 
				+
			
 
				         if obj is not None:
			
 
				             # hidden values for existing entries and new entries
			
 
				             formset.form.base_fields['start_ts'].widget = formset.form.base_fields['start_ts'].hidden_widget()
			
 
				             formset.form.base_fields['end_ts'].widget = formset.form.base_fields['end_ts'].hidden_widget()
			
 
				             formset.form.base_fields['cmd'].widget = formset.form.base_fields['cmd'].hidden_widget()
			
 
				             formset.form.base_fields['pwd'].widget = formset.form.base_fields['pwd'].hidden_widget()
			
 
				-            formset.form.base_fields['created_by'].widget = formset.form.base_fields['created_by'].hidden_widget()
			
 
				             formset.form.base_fields['cmd_version'].widget = formset.form.base_fields['cmd_version'].hidden_widget()
			
 
				         return formset
			
 
				     
			
@@ -252,8 +250,8 @@ class ArchiveResultInline(admin.TabularInline):
 
				 
			
 
				 
			
 
				 class ArchiveResultAdmin(BaseModelAdmin):
			
 
				-    list_display = ('id', 'created_by', 'created_at', 'snapshot_info', 'tags_str', 'status', 'plugin_with_icon', 'cmd_str', 'output_str')
			
 
				-    sort_fields = ('id', 'created_by', 'created_at', 'plugin', 'status')
			
 
				+    list_display = ('id', 'created_at', 'snapshot_info', 'tags_str', 'status', 'plugin_with_icon', 'cmd_str', 'output_str')
			
 
				+    sort_fields = ('id', 'created_at', 'plugin', 'status')
			
 
				     readonly_fields = ('cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'plugin_with_icon', 'iface')
			
 
				     search_fields = ('id', 'snapshot__url', 'plugin', 'output_str', 'cmd_version', 'cmd', 'snapshot__timestamp')
			
 
				     autocomplete_fields = ['snapshot']
			
@@ -279,10 +277,6 @@ class ArchiveResultAdmin(BaseModelAdmin):
 
				             'fields': ('output_str', 'output_json', 'output_files', 'output_size', 'output_mimetypes', 'output_summary'),
			
 
				             'classes': ('card', 'wide'),
			
 
				         }),
			
 
				-        ('Metadata', {
			
 
				-            'fields': ('created_by',),
			
 
				-            'classes': ('card',),
			
 
				-        }),
			
 
				     )
			
 
				 
			
 
				     list_filter = ('status', 'plugin', 'start_ts', 'cmd_version')
			
--- a/archivebox/core/admin_site.py
+++ b/archivebox/core/admin_site.py
@@ -38,11 +38,11 @@ def register_admin_site():
 
				 
			
 
				     # Register admin views for each app
			
 
				     # (Previously handled by ABX plugin system, now called directly)
			
 
				-    from core.admin import register_admin as register_core_admin
			
 
				-    from crawls.admin import register_admin as register_crawls_admin
			
 
				-    from api.admin import register_admin as register_api_admin
			
 
				-    from machine.admin import register_admin as register_machine_admin
			
 
				-    from workers.admin import register_admin as register_workers_admin
			
 
				+    from archivebox.core.admin import register_admin as register_core_admin
			
 
				+    from archivebox.crawls.admin import register_admin as register_crawls_admin
			
 
				+    from archivebox.api.admin import register_admin as register_api_admin
			
 
				+    from archivebox.machine.admin import register_admin as register_machine_admin
			
 
				+    from archivebox.workers.admin import register_admin as register_workers_admin
			
 
				 
			
 
				     register_core_admin(archivebox_admin)
			
 
				     register_crawls_admin(archivebox_admin)
			
--- a/archivebox/core/admin_snapshots.py
+++ b/archivebox/core/admin_snapshots.py
@@ -23,9 +23,9 @@ from archivebox.search.admin import SearchResultsAdminMixin
 
				 from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin
			
 
				 from archivebox.workers.tasks import bg_archive_snapshots, bg_add
			
 
				 
			
 
				-from core.models import Tag, Snapshot
			
 
				-from core.admin_tags import TagInline
			
 
				-from core.admin_archiveresults import ArchiveResultInline, render_archiveresults_list
			
 
				+from archivebox.core.models import Tag, Snapshot
			
 
				+from archivebox.core.admin_tags import TagInline
			
 
				+from archivebox.core.admin_archiveresults import ArchiveResultInline, render_archiveresults_list
			
 
				 
			
 
				 
			
 
				 # GLOBAL_CONTEXT = {'VERSION': VERSION, 'VERSIONS_AVAILABLE': [], 'CAN_UPGRADE': False}
			
@@ -59,7 +59,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
 
				     sort_fields = ('title_str', 'url_str', 'created_at', 'status', 'crawl')
			
 
				     readonly_fields = ('admin_actions', 'status_info', 'tags_str', 'imported_timestamp', 'created_at', 'modified_at', 'downloaded_at', 'output_dir', 'archiveresults_list')
			
 
				     search_fields = ('id', 'url', 'timestamp', 'title', 'tags__name')
			
 
				-    list_filter = ('created_at', 'downloaded_at', 'archiveresult__status', 'created_by', 'tags__name')
			
 
				+    list_filter = ('created_at', 'downloaded_at', 'archiveresult__status', 'crawl__created_by', 'tags__name')
			
 
				 
			
 
				     fieldsets = (
			
 
				         ('URL', {
			
@@ -75,7 +75,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
 
				             'classes': ('card',),
			
 
				         }),
			
 
				         ('Relations', {
			
 
				-            'fields': ('crawl', 'created_by', 'tags_str'),
			
 
				+            'fields': ('crawl', 'tags_str'),
			
 
				             'classes': ('card',),
			
 
				         }),
			
 
				         ('Config', {
			
--- a/archivebox/core/admin_tags.py
+++ b/archivebox/core/admin_tags.py
@@ -6,7 +6,7 @@ from django.utils.html import format_html, mark_safe
 
				 from archivebox.misc.paginators import AccelleratedPaginator
			
 
				 from archivebox.base_models.admin import BaseModelAdmin
			
 
				 
			
 
				-from core.models import Tag
			
 
				+from archivebox.core.models import Tag
			
 
				 
			
 
				 
			
 
				 class TagInline(admin.TabularInline):
			
--- a/archivebox/core/apps.py
+++ b/archivebox/core/apps.py
@@ -4,9 +4,9 @@ from django.apps import AppConfig
 
				 
			
 
				 
			
 
				 class CoreConfig(AppConfig):
			
 
				-    name = 'core'
			
 
				+    name = 'archivebox.core'
			
 
				 
			
 
				     def ready(self):
			
 
				         """Register the archivebox.core.admin_site as the main django admin site"""
			
 
				-        from core.admin_site import register_admin_site
			
 
				+        from archivebox.core.admin_site import register_admin_site
			
 
				         register_admin_site()
			
--- a/archivebox/core/asgi.py
+++ b/archivebox/core/asgi.py
@@ -20,7 +20,7 @@ application = get_asgi_application()
 
				 # from channels.routing import ProtocolTypeRouter, URLRouter
			
 
				 # from channels.auth import AuthMiddlewareStack
			
 
				 # from channels.security.websocket import AllowedHostsOriginValidator
			
 
				-# from core.routing import websocket_urlpatterns
			
 
				+# from archivebox.core.routing import websocket_urlpatterns
			
 
				 #
			
 
				 # application = ProtocolTypeRouter({
			
 
				 #     "http": get_asgi_application(),
			
--- a/archivebox/core/forms.py
+++ b/archivebox/core/forms.py
@@ -4,10 +4,14 @@ from django import forms
 
				 
			
 
				 from archivebox.misc.util import URL_REGEX
			
 
				 from taggit.utils import edit_string_for_tags, parse_tags
			
 
				+from archivebox.base_models.admin import KeyValueWidget
			
 
				 
			
 
				 DEPTH_CHOICES = (
			
 
				     ('0', 'depth = 0 (archive just these URLs)'),
			
 
				-    ('1', 'depth = 1 (archive these URLs and all URLs one hop away)'),
			
 
				+    ('1', 'depth = 1 (+ URLs one hop away)'),
			
 
				+    ('2', 'depth = 2 (+ URLs two hops away)'),
			
 
				+    ('3', 'depth = 3 (+ URLs three hops away)'),
			
 
				+    ('4', 'depth = 4 (+ URLs four hops away)'),
			
 
				 )
			
 
				 
			
 
				 from archivebox.hooks import get_plugins
			
@@ -18,39 +22,180 @@ def get_plugin_choices():
 
				 
			
 
				 
			
 
				 class AddLinkForm(forms.Form):
			
 
				-    url = forms.RegexField(label="URLs (one per line)", regex=URL_REGEX, min_length='6', strip=True, widget=forms.Textarea, required=True)
			
 
				-    tag = forms.CharField(label="Tags (comma separated tag1,tag2,tag3)", strip=True, required=False)
			
 
				-    depth = forms.ChoiceField(label="Archive depth", choices=DEPTH_CHOICES, initial='0', widget=forms.RadioSelect(attrs={"class": "depth-selection"}))
			
 
				-    plugins = forms.MultipleChoiceField(
			
 
				-        label="Plugins (select at least 1, otherwise all will be used by default)",
			
 
				+    # Basic fields
			
 
				+    url = forms.RegexField(
			
 
				+        label="URLs (one per line)",
			
 
				+        regex=URL_REGEX,
			
 
				+        min_length='6',
			
 
				+        strip=True,
			
 
				+        widget=forms.Textarea,
			
 
				+        required=True
			
 
				+    )
			
 
				+    tag = forms.CharField(
			
 
				+        label="Tags (comma separated tag1,tag2,tag3)",
			
 
				+        strip=True,
			
 
				+        required=False,
			
 
				+        widget=forms.TextInput(attrs={
			
 
				+            'list': 'tag-datalist',
			
 
				+            'autocomplete': 'off',
			
 
				+        })
			
 
				+    )
			
 
				+    depth = forms.ChoiceField(
			
 
				+        label="Archive depth",
			
 
				+        choices=DEPTH_CHOICES,
			
 
				+        initial='0',
			
 
				+        widget=forms.RadioSelect(attrs={"class": "depth-selection"})
			
 
				+    )
			
 
				+    notes = forms.CharField(
			
 
				+        label="Notes",
			
 
				+        strip=True,
			
 
				+        required=False,
			
 
				+        widget=forms.Textarea(attrs={
			
 
				+            'rows': 3,
			
 
				+            'placeholder': 'Optional notes about this crawl (e.g., purpose, project name, context...)',
			
 
				+        })
			
 
				+    )
			
 
				+
			
 
				+    # Plugin groups
			
 
				+    chrome_plugins = forms.MultipleChoiceField(
			
 
				+        label="Chrome-dependent plugins",
			
 
				+        required=False,
			
 
				+        widget=forms.CheckboxSelectMultiple,
			
 
				+        choices=[],  # populated in __init__
			
 
				+    )
			
 
				+    archiving_plugins = forms.MultipleChoiceField(
			
 
				+        label="Archiving",
			
 
				+        required=False,
			
 
				+        widget=forms.CheckboxSelectMultiple,
			
 
				+        choices=[],
			
 
				+    )
			
 
				+    parsing_plugins = forms.MultipleChoiceField(
			
 
				+        label="Parsing",
			
 
				+        required=False,
			
 
				+        widget=forms.CheckboxSelectMultiple,
			
 
				+        choices=[],
			
 
				+    )
			
 
				+    search_plugins = forms.MultipleChoiceField(
			
 
				+        label="Search",
			
 
				+        required=False,
			
 
				+        widget=forms.CheckboxSelectMultiple,
			
 
				+        choices=[],
			
 
				+    )
			
 
				+    binary_plugins = forms.MultipleChoiceField(
			
 
				+        label="Binary providers",
			
 
				+        required=False,
			
 
				+        widget=forms.CheckboxSelectMultiple,
			
 
				+        choices=[],
			
 
				+    )
			
 
				+    extension_plugins = forms.MultipleChoiceField(
			
 
				+        label="Browser extensions",
			
 
				+        required=False,
			
 
				+        widget=forms.CheckboxSelectMultiple,
			
 
				+        choices=[],
			
 
				+    )
			
 
				+
			
 
				+    # Advanced options
			
 
				+    schedule = forms.CharField(
			
 
				+        label="Repeat schedule",
			
 
				+        max_length=64,
			
 
				+        required=False,
			
 
				+        widget=forms.TextInput(attrs={
			
 
				+            'placeholder': 'e.g., daily, weekly, 0 */6 * * * (every 6 hours)',
			
 
				+        })
			
 
				+    )
			
 
				+    persona = forms.CharField(
			
 
				+        label="Persona (authentication profile)",
			
 
				+        max_length=100,
			
 
				+        initial='Default',
			
 
				+        required=False,
			
 
				+    )
			
 
				+    overwrite = forms.BooleanField(
			
 
				+        label="Overwrite existing snapshots",
			
 
				+        initial=False,
			
 
				+        required=False,
			
 
				+    )
			
 
				+    update = forms.BooleanField(
			
 
				+        label="Update/retry previously failed URLs",
			
 
				+        initial=False,
			
 
				+        required=False,
			
 
				+    )
			
 
				+    index_only = forms.BooleanField(
			
 
				+        label="Index only (don't archive yet)",
			
 
				+        initial=False,
			
 
				+        required=False,
			
 
				+    )
			
 
				+    config = forms.JSONField(
			
 
				+        label="Custom config overrides",
			
 
				+        widget=KeyValueWidget(),
			
 
				+        initial=dict,
			
 
				         required=False,
			
 
				-        widget=forms.SelectMultiple,
			
 
				-        choices=[],  # populated dynamically in __init__
			
 
				     )
			
 
				 
			
 
				     def __init__(self, *args, **kwargs):
			
 
				         super().__init__(*args, **kwargs)
			
 
				-        self.fields['plugins'].choices = get_plugin_choices()
			
 
				-    # TODO: hook these up to the view and put them 
			
 
				-    # in a collapsible UI section labeled "Advanced"
			
 
				-    #
			
 
				-    # exclude_patterns = forms.CharField(
			
 
				-    #     label="Exclude patterns",
			
 
				-    #     min_length='1',
			
 
				-    #     required=False,
			
 
				-    #     initial=URL_DENYLIST,
			
 
				-    # )
			
 
				-    # timeout = forms.IntegerField(
			
 
				-    #     initial=TIMEOUT,
			
 
				-    # )
			
 
				-    # overwrite = forms.BooleanField(
			
 
				-    #     label="Overwrite any existing Snapshots",
			
 
				-    #     initial=False,
			
 
				-    # )
			
 
				-    # index_only = forms.BooleanField(
			
 
				-    #     label="Add URLs to index without Snapshotting",
			
 
				-    #     initial=False,
			
 
				-    # )
			
 
				+
			
 
				+        # Import at runtime to avoid circular imports
			
 
				+        from archivebox.config.common import ARCHIVING_CONFIG
			
 
				+
			
 
				+        # Get all plugins
			
 
				+        all_plugins = get_plugins()
			
 
				+
			
 
				+        # Define plugin groups
			
 
				+        chrome_dependent = {
			
 
				+            'accessibility', 'chrome', 'consolelog', 'dom', 'headers',
			
 
				+            'parse_dom_outlinks', 'pdf', 'redirects', 'responses',
			
 
				+            'screenshot', 'seo', 'singlefile', 'ssl', 'staticfile', 'title'
			
 
				+        }
			
 
				+        archiving = {
			
 
				+            'archive_org', 'favicon', 'forumdl', 'gallerydl', 'git',
			
 
				+            'htmltotext', 'media', 'mercury', 'papersdl', 'readability', 'wget'
			
 
				+        }
			
 
				+        parsing = {
			
 
				+            'parse_html_urls', 'parse_jsonl_urls',
			
 
				+            'parse_netscape_urls', 'parse_rss_urls', 'parse_txt_urls'
			
 
				+        }
			
 
				+        search = {
			
 
				+            'search_backend_ripgrep', 'search_backend_sonic', 'search_backend_sqlite'
			
 
				+        }
			
 
				+        binary = {'apt', 'brew', 'custom', 'env', 'npm', 'pip'}
			
 
				+        extensions = {'captcha2', 'istilldontcareaboutcookies', 'ublock'}
			
 
				+
			
 
				+        # Populate plugin field choices
			
 
				+        self.fields['chrome_plugins'].choices = [
			
 
				+            (p, p) for p in sorted(all_plugins) if p in chrome_dependent
			
 
				+        ]
			
 
				+        self.fields['archiving_plugins'].choices = [
			
 
				+            (p, p) for p in sorted(all_plugins) if p in archiving
			
 
				+        ]
			
 
				+        self.fields['parsing_plugins'].choices = [
			
 
				+            (p, p) for p in sorted(all_plugins) if p in parsing
			
 
				+        ]
			
 
				+        self.fields['search_plugins'].choices = [
			
 
				+            (p, p) for p in sorted(all_plugins) if p in search
			
 
				+        ]
			
 
				+        self.fields['binary_plugins'].choices = [
			
 
				+            (p, p) for p in sorted(all_plugins) if p in binary
			
 
				+        ]
			
 
				+        self.fields['extension_plugins'].choices = [
			
 
				+            (p, p) for p in sorted(all_plugins) if p in extensions
			
 
				+        ]
			
 
				+
			
 
				+        # Set update default from config
			
 
				+        self.fields['update'].initial = not ARCHIVING_CONFIG.ONLY_NEW
			
 
				+
			
 
				+    def clean(self):
			
 
				+        cleaned_data = super().clean()
			
 
				+
			
 
				+        # Combine all plugin groups into single list
			
 
				+        all_selected_plugins = []
			
 
				+        for field in ['chrome_plugins', 'archiving_plugins', 'parsing_plugins',
			
 
				+                      'search_plugins', 'binary_plugins', 'extension_plugins']:
			
 
				+            all_selected_plugins.extend(cleaned_data.get(field, []))
			
 
				+
			
 
				+        # Store combined list for easy access
			
 
				+        cleaned_data['plugins'] = all_selected_plugins
			
 
				+
			
 
				+        return cleaned_data
			
 
				 
			
 
				 class TagWidgetMixin:
			
 
				     def format_value(self, value):
			
--- a/archivebox/core/migrations/0007_archiveresult.py
+++ b/archivebox/core/migrations/0007_archiveresult.py
@@ -12,7 +12,7 @@ try:
 
				     ARCHIVE_DIR = CONSTANTS.ARCHIVE_DIR
			
 
				 except ImportError:
			
 
				     try:
			
 
				-        from config import CONFIG
			
 
				+        from archivebox.config import CONFIG
			
 
				         ARCHIVE_DIR = Path(CONFIG.get('ARCHIVE_DIR', './archive'))
			
 
				     except ImportError:
			
 
				         ARCHIVE_DIR = Path('./archive')
			
--- a/archivebox/core/migrations/0032_alter_archiveresult_binary_and_more.py
+++ b/archivebox/core/migrations/0032_alter_archiveresult_binary_and_more.py
@@ -11,7 +11,7 @@ class Migration(migrations.Migration):
 
				     dependencies = [
			
 
				         ('core', '0031_snapshot_parent_snapshot'),
			
 
				         ('crawls', '0004_alter_crawl_output_dir'),
			
 
				-        ('machine', '0003_alter_dependency_id_alter_installedbinary_dependency_and_more'),
			
 
				+        ('machine', '0004_drop_dependency_table'),  # Changed from 0003 - wait until Dependency is dropped
			
 
				         migrations.swappable_dependency(settings.AUTH_USER_MODEL),
			
 
				     ]
			
 
				 
			
--- a/archivebox/core/migrations/0035_snapshot_crawl_non_nullable_remove_created_by.py
+++ b/archivebox/core/migrations/0035_snapshot_crawl_non_nullable_remove_created_by.py
@@ -0,0 +1,79 @@
 
				+# Generated migration
			
 
				+
			
 
				+from django.conf import settings
			
 
				+from django.db import migrations, models
			
 
				+import django.db.models.deletion
			
 
				+
			
 
				+
			
 
				+def create_catchall_crawls_and_assign_snapshots(apps, schema_editor):
			
 
				+    """
			
 
				+    Create one catchall Crawl per user for all snapshots without a crawl.
			
 
				+    Assign those snapshots to their user's catchall crawl.
			
 
				+    """
			
 
				+    Snapshot = apps.get_model('core', 'Snapshot')
			
 
				+    Crawl = apps.get_model('crawls', 'Crawl')
			
 
				+    User = apps.get_model(settings.AUTH_USER_MODEL)
			
 
				+
			
 
				+    # Get all snapshots without a crawl
			
 
				+    snapshots_without_crawl = Snapshot.objects.filter(crawl__isnull=True)
			
 
				+
			
 
				+    if not snapshots_without_crawl.exists():
			
 
				+        return
			
 
				+
			
 
				+    # Group by created_by_id
			
 
				+    snapshots_by_user = {}
			
 
				+    for snapshot in snapshots_without_crawl:
			
 
				+        user_id = snapshot.created_by_id
			
 
				+        if user_id not in snapshots_by_user:
			
 
				+            snapshots_by_user[user_id] = []
			
 
				+        snapshots_by_user[user_id].append(snapshot)
			
 
				+
			
 
				+    # Create one catchall crawl per user and assign snapshots
			
 
				+    for user_id, snapshots in snapshots_by_user.items():
			
 
				+        try:
			
 
				+            user = User.objects.get(pk=user_id)
			
 
				+            username = user.username
			
 
				+        except User.DoesNotExist:
			
 
				+            username = 'unknown'
			
 
				+
			
 
				+        # Create catchall crawl for this user
			
 
				+        crawl = Crawl.objects.create(
			
 
				+            urls=f'# Catchall crawl for {len(snapshots)} snapshots without a crawl',
			
 
				+            max_depth=0,
			
 
				+            label=f'[migration] catchall for user {username}',
			
 
				+            created_by_id=user_id,
			
 
				+        )
			
 
				+
			
 
				+        # Assign all snapshots to this crawl
			
 
				+        for snapshot in snapshots:
			
 
				+            snapshot.crawl = crawl
			
 
				+            snapshot.save(update_fields=['crawl'])
			
 
				+
			
 
				+
			
 
				+class Migration(migrations.Migration):
			
 
				+
			
 
				+    dependencies = [
			
 
				+        ('core', '0034_snapshot_current_step'),
			
 
				+        ('crawls', '0004_alter_crawl_output_dir'),
			
 
				+    ]
			
 
				+
			
 
				+    operations = [
			
 
				+        # Step 1: Assign all snapshots without a crawl to catchall crawls
			
 
				+        migrations.RunPython(
			
 
				+            create_catchall_crawls_and_assign_snapshots,
			
 
				+            reverse_code=migrations.RunPython.noop,
			
 
				+        ),
			
 
				+
			
 
				+        # Step 2: Make crawl non-nullable
			
 
				+        migrations.AlterField(
			
 
				+            model_name='snapshot',
			
 
				+            name='crawl',
			
 
				+            field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='snapshot_set', to='crawls.crawl'),
			
 
				+        ),
			
 
				+
			
 
				+        # Step 3: Remove created_by field
			
 
				+        migrations.RemoveField(
			
 
				+            model_name='snapshot',
			
 
				+            name='created_by',
			
 
				+        ),
			
 
				+    ]
			
--- a/archivebox/core/migrations/0036_remove_archiveresult_created_by.py
+++ b/archivebox/core/migrations/0036_remove_archiveresult_created_by.py
@@ -0,0 +1,19 @@
 
				+# Generated migration
			
 
				+
			
 
				+from django.db import migrations
			
 
				+
			
 
				+
			
 
				+class Migration(migrations.Migration):
			
 
				+
			
 
				+    dependencies = [
			
 
				+        ('core', '0035_snapshot_crawl_non_nullable_remove_created_by'),
			
 
				+    ]
			
 
				+
			
 
				+    operations = [
			
 
				+        # Remove created_by field from ArchiveResult
			
 
				+        # No data migration needed - created_by can be accessed via snapshot.crawl.created_by
			
 
				+        migrations.RemoveField(
			
 
				+            model_name='archiveresult',
			
 
				+            name='created_by',
			
 
				+        ),
			
 
				+    ]
			
--- a/archivebox/core/models.py
+++ b/archivebox/core/models.py
@@ -9,6 +9,8 @@ import os
 
				 import json
			
 
				 from pathlib import Path
			
 
				 
			
 
				+from statemachine import State, registry
			
 
				+
			
 
				 from django.db import models
			
 
				 from django.db.models import QuerySet, Value, Case, When, IntegerField
			
 
				 from django.utils.functional import cached_property
			
@@ -33,10 +35,10 @@ from archivebox.base_models.models import (
 
				     ModelWithConfig, ModelWithNotes, ModelWithHealthStats,
			
 
				     get_or_create_system_user_pk,
			
 
				 )
			
 
				-from workers.models import ModelWithStateMachine
			
 
				-from workers.tasks import bg_archive_snapshot
			
 
				-from crawls.models import Crawl
			
 
				-from machine.models import NetworkInterface, Binary
			
 
				+from archivebox.workers.models import ModelWithStateMachine, BaseStateMachine
			
 
				+from archivebox.workers.tasks import bg_archive_snapshot
			
 
				+from archivebox.crawls.models import Crawl
			
 
				+from archivebox.machine.models import NetworkInterface, Binary
			
 
				 
			
 
				 
			
 
				 
			
@@ -53,6 +55,7 @@ class Tag(ModelWithSerializers):
 
				     snapshot_set: models.Manager['Snapshot']
			
 
				 
			
 
				     class Meta(TypedModelMeta):
			
 
				+        app_label = 'core'
			
 
				         verbose_name = "Tag"
			
 
				         verbose_name_plural = "Tags"
			
 
				 
			
@@ -122,6 +125,7 @@ class SnapshotTag(models.Model):
 
				     tag = models.ForeignKey(Tag, db_column='tag_id', on_delete=models.CASCADE, to_field='id')
			
 
				 
			
 
				     class Meta:
			
 
				+        app_label = 'core'
			
 
				         db_table = 'core_snapshot_tags'
			
 
				         unique_together = [('snapshot', 'tag')]
			
 
				 
			
@@ -263,52 +267,6 @@ class SnapshotManager(models.Manager.from_queryset(SnapshotQuerySet)):
 
				     # Import Methods
			
 
				     # =========================================================================
			
 
				 
			
 
				-    def create_or_update_from_dict(self, link_dict: Dict[str, Any], created_by_id: Optional[int] = None) -> 'Snapshot':
			
 
				-        """Create or update a Snapshot from a SnapshotDict (parser output)"""
			
 
				-        import re
			
 
				-        from archivebox.config.common import GENERAL_CONFIG
			
 
				-
			
 
				-        url = link_dict['url']
			
 
				-        timestamp = link_dict.get('timestamp')
			
 
				-        title = link_dict.get('title')
			
 
				-        tags_str = link_dict.get('tags')
			
 
				-
			
 
				-        tag_list = []
			
 
				-        if tags_str:
			
 
				-            tag_list = list(dict.fromkeys(
			
 
				-                tag.strip() for tag in re.split(GENERAL_CONFIG.TAG_SEPARATOR_PATTERN, tags_str)
			
 
				-                if tag.strip()
			
 
				-            ))
			
 
				-
			
 
				-        # Get most recent snapshot with this URL (URLs can exist in multiple crawls)
			
 
				-        snapshot = self.filter(url=url).order_by('-created_at').first()
			
 
				-        if snapshot:
			
 
				-            if title and (not snapshot.title or len(title) > len(snapshot.title or '')):
			
 
				-                snapshot.title = title
			
 
				-                snapshot.save(update_fields=['title', 'modified_at'])
			
 
				-        else:
			
 
				-            if timestamp:
			
 
				-                while self.filter(timestamp=timestamp).exists():
			
 
				-                    timestamp = str(float(timestamp) + 1.0)
			
 
				-
			
 
				-            snapshot = self.create(
			
 
				-                url=url,
			
 
				-                timestamp=timestamp,
			
 
				-                title=title,
			
 
				-                created_by_id=created_by_id or get_or_create_system_user_pk(),
			
 
				-            )
			
 
				-
			
 
				-        if tag_list:
			
 
				-            existing_tags = set(snapshot.tags.values_list('name', flat=True))
			
 
				-            new_tags = set(tag_list) | existing_tags
			
 
				-            snapshot.save_tags(new_tags)
			
 
				-
			
 
				-        return snapshot
			
 
				-
			
 
				-    def create_from_dicts(self, link_dicts: List[Dict[str, Any]], created_by_id: Optional[int] = None) -> List['Snapshot']:
			
 
				-        """Create or update multiple Snapshots from a list of SnapshotDicts"""
			
 
				-        return [self.create_or_update_from_dict(d, created_by_id=created_by_id) for d in link_dicts]
			
 
				-
			
 
				     def remove(self, atomic: bool = False) -> tuple:
			
 
				         """Remove snapshots from the database"""
			
 
				         from django.db import transaction
			
@@ -320,14 +278,13 @@ class SnapshotManager(models.Manager.from_queryset(SnapshotQuerySet)):
 
				 
			
 
				 class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, ModelWithStateMachine):
			
 
				     id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
			
 
				-    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False, related_name='snapshot_set', db_index=True)
			
 
				     created_at = models.DateTimeField(default=timezone.now, db_index=True)
			
 
				     modified_at = models.DateTimeField(auto_now=True)
			
 
				 
			
 
				     url = models.URLField(unique=False, db_index=True)  # URLs can appear in multiple crawls
			
 
				     timestamp = models.CharField(max_length=32, unique=True, db_index=True, editable=False)
			
 
				     bookmarked_at = models.DateTimeField(default=timezone.now, db_index=True)
			
 
				-    crawl: Crawl = models.ForeignKey(Crawl, on_delete=models.CASCADE, default=None, null=True, blank=True, related_name='snapshot_set', db_index=True)  # type: ignore
			
 
				+    crawl: Crawl = models.ForeignKey(Crawl, on_delete=models.CASCADE, null=False, related_name='snapshot_set', db_index=True)  # type: ignore[assignment]
			
 
				     parent_snapshot = models.ForeignKey('self', on_delete=models.SET_NULL, null=True, blank=True, related_name='child_snapshots', db_index=True, help_text='Parent snapshot that discovered this URL (for recursive crawling)')
			
 
				 
			
 
				     title = models.CharField(max_length=512, null=True, blank=True, db_index=True)
			
@@ -344,7 +301,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
 
				 
			
 
				     tags = models.ManyToManyField(Tag, blank=True, through=SnapshotTag, related_name='snapshot_set', through_fields=('snapshot', 'tag'))
			
 
				 
			
 
				-    state_machine_name = 'core.statemachines.SnapshotMachine'
			
 
				+    state_machine_name = 'core.models.SnapshotMachine'
			
 
				     state_field_name = 'status'
			
 
				     retry_at_field_name = 'retry_at'
			
 
				     StatusChoices = ModelWithStateMachine.StatusChoices
			
@@ -354,6 +311,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
 
				     archiveresult_set: models.Manager['ArchiveResult']
			
 
				 
			
 
				     class Meta(TypedModelMeta):
			
 
				+        app_label = 'core'
			
 
				         verbose_name = "Snapshot"
			
 
				         verbose_name_plural = "Snapshots"
			
 
				         constraints = [
			
@@ -366,6 +324,11 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
 
				     def __str__(self):
			
 
				         return f'[{self.id}] {self.url[:64]}'
			
 
				 
			
 
				+    @property
			
 
				+    def created_by(self):
			
 
				+        """Convenience property to access the user who created this snapshot via its crawl."""
			
 
				+        return self.crawl.created_by
			
 
				+
			
 
				     def save(self, *args, **kwargs):
			
 
				         is_new = self._state.adding
			
 
				         if not self.bookmarked_at:
			
@@ -395,7 +358,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
 
				                 self.fs_version = target
			
 
				 
			
 
				         super().save(*args, **kwargs)
			
 
				-        if self.crawl and self.url not in self.crawl.urls:
			
 
				+        if self.url not in self.crawl.urls:
			
 
				             self.crawl.urls += f'\n{self.url}'
			
 
				             self.crawl.save()
			
 
				 
			
@@ -408,7 +371,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
 
				                 url=self.url,
			
 
				                 metadata={
			
 
				                     'id': str(self.id),
			
 
				-                    'crawl_id': str(self.crawl_id) if self.crawl_id else None,
			
 
				+                    'crawl_id': str(self.crawl_id),
			
 
				                     'depth': self.depth,
			
 
				                     'status': self.status,
			
 
				                 },
			
@@ -437,20 +400,11 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
 
				         return self.fs_version != self._fs_current_version()
			
 
				 
			
 
				     def _fs_next_version(self, version: str) -> str:
			
 
				-        """Get next version in migration chain"""
			
 
				-        chain = ['0.7.0', '0.8.0', '0.9.0']
			
 
				-        try:
			
 
				-            idx = chain.index(version)
			
 
				-            return chain[idx + 1] if idx + 1 < len(chain) else self._fs_current_version()
			
 
				-        except ValueError:
			
 
				-            # Unknown version - skip to current
			
 
				-            return self._fs_current_version()
			
 
				-
			
 
				-    def _fs_migrate_from_0_7_0_to_0_8_0(self):
			
 
				-        """Migration from 0.7.0 to 0.8.0 layout (no-op)"""
			
 
				-        # 0.7 and 0.8 both used archive/<timestamp>
			
 
				-        # Nothing to do!
			
 
				-        pass
			
 
				+        """Get next version in migration chain (0.7/0.8 had same layout, only 0.8→0.9 migration needed)"""
			
 
				+        # Treat 0.7.0 and 0.8.0 as equivalent (both used archive/{timestamp})
			
 
				+        if version in ('0.7.0', '0.8.0'):
			
 
				+            return '0.9.0'
			
 
				+        return self._fs_current_version()
			
 
				 
			
 
				     def _fs_migrate_from_0_8_0_to_0_9_0(self):
			
 
				         """
			
@@ -578,7 +532,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
 
				             return CONSTANTS.ARCHIVE_DIR / self.timestamp
			
 
				 
			
 
				         elif version in ('0.9.0', '1.0.0'):
			
 
				-            username = self.created_by.username if self.created_by else 'unknown'
			
 
				+            username = self.created_by.username
			
 
				 
			
 
				             # Use created_at for date grouping (fallback to timestamp)
			
 
				             if self.created_at:
			
@@ -875,7 +829,6 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
 
				                 pwd=result_data.get('pwd', str(self.output_dir)),
			
 
				                 start_ts=start_ts,
			
 
				                 end_ts=end_ts,
			
 
				-                created_by=self.created_by,
			
 
				             )
			
 
				         except:
			
 
				             pass
			
@@ -1069,6 +1022,12 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
 
				                 result = archive_results.get(plugin)
			
 
				                 existing = result and result.status == 'succeeded' and (result.output_files or result.output_str)
			
 
				                 icon = get_plugin_icon(plugin)
			
 
				+
			
 
				+                # Skip plugins with empty icons that have no output
			
 
				+                # (e.g., staticfile only shows when there's actual output)
			
 
				+                if not icon.strip() and not existing:
			
 
				+                    continue
			
 
				+
			
 
				                 output += format_html(
			
 
				                     output_template,
			
 
				                     path,
			
@@ -1139,9 +1098,20 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
 
				 
			
 
				     def run(self) -> list['ArchiveResult']:
			
 
				         """
			
 
				-        Execute this Snapshot by creating ArchiveResults for all enabled extractors.
			
 
				+        Execute snapshot by creating pending ArchiveResults for all enabled hooks.
			
 
				+
			
 
				+        Called by: SnapshotMachine.enter_started()
			
 
				 
			
 
				-        Called by the state machine when entering the 'started' state.
			
 
				+        Hook Lifecycle:
			
 
				+            1. discover_hooks('Snapshot') → finds all plugin hooks
			
 
				+            2. For each hook:
			
 
				+               - Create ArchiveResult with status=QUEUED
			
 
				+               - Store hook_name (e.g., 'on_Snapshot__50_wget.py')
			
 
				+            3. ArchiveResults execute independently via ArchiveResultMachine
			
 
				+            4. Hook execution happens in ArchiveResult.run(), NOT here
			
 
				+
			
 
				+        Returns:
			
 
				+            list[ArchiveResult]: Newly created pending results
			
 
				         """
			
 
				         return self.create_pending_archiveresults()
			
 
				 
			
@@ -1152,28 +1122,20 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
 
				         Called by the state machine when entering the 'sealed' state.
			
 
				         Kills any background hooks and finalizes their ArchiveResults.
			
 
				         """
			
 
				-        from pathlib import Path
			
 
				         from archivebox.hooks import kill_process
			
 
				 
			
 
				         # Kill any background ArchiveResult hooks
			
 
				         if not self.OUTPUT_DIR.exists():
			
 
				             return
			
 
				 
			
 
				-        for plugin_dir in self.OUTPUT_DIR.iterdir():
			
 
				-            if not plugin_dir.is_dir():
			
 
				-                continue
			
 
				-            pid_file = plugin_dir / 'hook.pid'
			
 
				-            if pid_file.exists():
			
 
				-                kill_process(pid_file, validate=True)  # Use validation
			
 
				-
			
 
				-                # Update the ArchiveResult from filesystem
			
 
				-                plugin_name = plugin_dir.name
			
 
				-                results = self.archiveresult_set.filter(
			
 
				-                    status=ArchiveResult.StatusChoices.STARTED,
			
 
				-                    pwd__contains=plugin_name
			
 
				-                )
			
 
				-                for ar in results:
			
 
				-                    ar.update_from_output()
			
 
				+        # Find all .pid files in this snapshot's output directory
			
 
				+        for pid_file in self.OUTPUT_DIR.glob('**/*.pid'):
			
 
				+            kill_process(pid_file, validate=True)
			
 
				+
			
 
				+        # Update all STARTED ArchiveResults from filesystem
			
 
				+        results = self.archiveresult_set.filter(status=ArchiveResult.StatusChoices.STARTED)
			
 
				+        for ar in results:
			
 
				+            ar.update_from_output()
			
 
				 
			
 
				     def has_running_background_hooks(self) -> bool:
			
 
				         """
			
@@ -1196,51 +1158,156 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
 
				         return False
			
 
				 
			
 
				     @staticmethod
			
 
				-    def from_jsonl(record: Dict[str, Any], overrides: Dict[str, Any] = None):
			
 
				+    def from_jsonl(record: Dict[str, Any], overrides: Dict[str, Any] = None, queue_for_extraction: bool = True):
			
 
				         """
			
 
				-        Create/update Snapshot from JSONL record.
			
 
				+        Create/update Snapshot from JSONL record or dict.
			
 
				+
			
 
				+        Unified method that handles:
			
 
				+        - ID-based patching: {"id": "...", "title": "new title"}
			
 
				+        - URL-based create/update: {"url": "...", "title": "...", "tags": "..."}
			
 
				+        - Auto-creates Crawl if not provided
			
 
				+        - Optionally queues for extraction
			
 
				 
			
 
				         Args:
			
 
				-            record: JSONL record with 'url' field and optional metadata
			
 
				+            record: Dict with 'url' (for create) or 'id' (for patch), plus other fields
			
 
				             overrides: Dict with 'crawl', 'snapshot' (parent), 'created_by_id'
			
 
				+            queue_for_extraction: If True, sets status=QUEUED and retry_at (default: True)
			
 
				 
			
 
				         Returns:
			
 
				             Snapshot instance or None
			
 
				-
			
 
				-        Note:
			
 
				-            Filtering (depth, URL allowlist/denylist) should be done by caller
			
 
				-            BEFORE calling this method. This method just creates the snapshot.
			
 
				         """
			
 
				-        from archivebox.misc.jsonl import get_or_create_snapshot
			
 
				+        import re
			
 
				         from django.utils import timezone
			
 
				+        from archivebox.misc.util import parse_date
			
 
				+        from archivebox.base_models.models import get_or_create_system_user_pk
			
 
				+        from archivebox.config.common import GENERAL_CONFIG
			
 
				 
			
 
				         overrides = overrides or {}
			
 
				+
			
 
				+        # If 'id' is provided, lookup and patch that specific snapshot
			
 
				+        snapshot_id = record.get('id')
			
 
				+        if snapshot_id:
			
 
				+            try:
			
 
				+                snapshot = Snapshot.objects.get(id=snapshot_id)
			
 
				+
			
 
				+                # Generically update all fields present in record
			
 
				+                update_fields = []
			
 
				+                for field_name, value in record.items():
			
 
				+                    # Skip internal fields
			
 
				+                    if field_name in ('id', 'type'):
			
 
				+                        continue
			
 
				+
			
 
				+                    # Skip if field doesn't exist on model
			
 
				+                    if not hasattr(snapshot, field_name):
			
 
				+                        continue
			
 
				+
			
 
				+                    # Special parsing for date fields
			
 
				+                    if field_name in ('bookmarked_at', 'retry_at', 'created_at', 'modified_at'):
			
 
				+                        if value and isinstance(value, str):
			
 
				+                            value = parse_date(value)
			
 
				+
			
 
				+                    # Update field if value is provided and different
			
 
				+                    if value is not None and getattr(snapshot, field_name) != value:
			
 
				+                        setattr(snapshot, field_name, value)
			
 
				+                        update_fields.append(field_name)
			
 
				+
			
 
				+                if update_fields:
			
 
				+                    snapshot.save(update_fields=update_fields + ['modified_at'])
			
 
				+
			
 
				+                return snapshot
			
 
				+            except Snapshot.DoesNotExist:
			
 
				+                # ID not found, fall through to create-by-URL logic
			
 
				+                pass
			
 
				+
			
 
				         url = record.get('url')
			
 
				         if not url:
			
 
				             return None
			
 
				 
			
 
				-        # Apply crawl context metadata
			
 
				+        # Determine or create crawl (every snapshot must have a crawl)
			
 
				         crawl = overrides.get('crawl')
			
 
				-        snapshot = overrides.get('snapshot')  # Parent snapshot
			
 
				+        parent_snapshot = overrides.get('snapshot')  # Parent snapshot
			
 
				+        created_by_id = overrides.get('created_by_id') or (parent_snapshot.created_by.pk if parent_snapshot else get_or_create_system_user_pk())
			
 
				+
			
 
				+        # If no crawl provided, inherit from parent or auto-create one
			
 
				+        if not crawl:
			
 
				+            if parent_snapshot:
			
 
				+                # Inherit crawl from parent snapshot
			
 
				+                crawl = parent_snapshot.crawl
			
 
				+            else:
			
 
				+                # Auto-create a single-URL crawl
			
 
				+                from archivebox.crawls.models import Crawl
			
 
				+                from archivebox.config import CONSTANTS
			
 
				+
			
 
				+                timestamp_str = timezone.now().strftime("%Y-%m-%d__%H-%M-%S")
			
 
				+                sources_file = CONSTANTS.SOURCES_DIR / f'{timestamp_str}__auto_crawl.txt'
			
 
				+                sources_file.parent.mkdir(parents=True, exist_ok=True)
			
 
				+                sources_file.write_text(url)
			
 
				+
			
 
				+                crawl = Crawl.objects.create(
			
 
				+                    urls=url,
			
 
				+                    max_depth=0,
			
 
				+                    label=f'auto-created for {url[:50]}',
			
 
				+                    created_by_id=created_by_id,
			
 
				+                )
			
 
				 
			
 
				-        if crawl:
			
 
				-            record.setdefault('crawl_id', str(crawl.id))
			
 
				-            record.setdefault('depth', (snapshot.depth + 1 if snapshot else 1))
			
 
				-            if snapshot:
			
 
				-                record.setdefault('parent_snapshot_id', str(snapshot.id))
			
 
				+        # Parse tags
			
 
				+        tags_str = record.get('tags', '')
			
 
				+        tag_list = []
			
 
				+        if tags_str:
			
 
				+            tag_list = list(dict.fromkeys(
			
 
				+                tag.strip() for tag in re.split(GENERAL_CONFIG.TAG_SEPARATOR_PATTERN, tags_str)
			
 
				+                if tag.strip()
			
 
				+            ))
			
 
				 
			
 
				-        try:
			
 
				-            created_by_id = overrides.get('created_by_id') or (snapshot.created_by_id if snapshot else None)
			
 
				-            new_snapshot = get_or_create_snapshot(record, created_by_id=created_by_id)
			
 
				+        # Get most recent snapshot with this URL (URLs can exist in multiple crawls)
			
 
				+        snapshot = Snapshot.objects.filter(url=url).order_by('-created_at').first()
			
 
				 
			
 
				-            # Queue for extraction
			
 
				-            new_snapshot.status = Snapshot.StatusChoices.QUEUED
			
 
				-            new_snapshot.retry_at = timezone.now()
			
 
				-            new_snapshot.save()
			
 
				+        title = record.get('title')
			
 
				+        timestamp = record.get('timestamp')
			
 
				 
			
 
				-            return new_snapshot
			
 
				-        except ValueError:
			
 
				-            return None
			
 
				+        if snapshot:
			
 
				+            # Update existing snapshot
			
 
				+            if title and (not snapshot.title or len(title) > len(snapshot.title or '')):
			
 
				+                snapshot.title = title
			
 
				+                snapshot.save(update_fields=['title', 'modified_at'])
			
 
				+        else:
			
 
				+            # Create new snapshot
			
 
				+            if timestamp:
			
 
				+                while Snapshot.objects.filter(timestamp=timestamp).exists():
			
 
				+                    timestamp = str(float(timestamp) + 1.0)
			
 
				+
			
 
				+            snapshot = Snapshot.objects.create(
			
 
				+                url=url,
			
 
				+                timestamp=timestamp,
			
 
				+                title=title,
			
 
				+                crawl=crawl,
			
 
				+            )
			
 
				+
			
 
				+        # Update tags
			
 
				+        if tag_list:
			
 
				+            existing_tags = set(snapshot.tags.values_list('name', flat=True))
			
 
				+            new_tags = set(tag_list) | existing_tags
			
 
				+            snapshot.save_tags(new_tags)
			
 
				+
			
 
				+        # Queue for extraction and update additional fields
			
 
				+        update_fields = []
			
 
				+
			
 
				+        if queue_for_extraction:
			
 
				+            snapshot.status = Snapshot.StatusChoices.QUEUED
			
 
				+            snapshot.retry_at = timezone.now()
			
 
				+            update_fields.extend(['status', 'retry_at'])
			
 
				+
			
 
				+        # Update additional fields if provided
			
 
				+        for field_name in ('depth', 'parent_snapshot_id', 'crawl_id', 'bookmarked_at'):
			
 
				+            value = record.get(field_name)
			
 
				+            if value is not None and getattr(snapshot, field_name) != value:
			
 
				+                setattr(snapshot, field_name, value)
			
 
				+                update_fields.append(field_name)
			
 
				+
			
 
				+        if update_fields:
			
 
				+            snapshot.save(update_fields=update_fields + ['modified_at'])
			
 
				+
			
 
				+        return snapshot
			
 
				 
			
 
				     def create_pending_archiveresults(self) -> list['ArchiveResult']:
			
 
				         """
			
@@ -1273,7 +1340,6 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
 
				                     'plugin': plugin,
			
 
				                     'status': ArchiveResult.INITIAL_STATE,
			
 
				                     'retry_at': timezone.now(),
			
 
				-                    'created_by_id': self.created_by_id,
			
 
				                 },
			
 
				             )
			
 
				             if archiveresult.status == ArchiveResult.INITIAL_STATE:
			
@@ -1329,6 +1395,36 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
 
				         self.save(update_fields=['current_step', 'modified_at'])
			
 
				         return True
			
 
				 
			
 
				+    def is_finished_processing(self) -> bool:
			
 
				+        """
			
 
				+        Check if this snapshot has finished processing.
			
 
				+
			
 
				+        Used by SnapshotMachine.is_finished() to determine if snapshot is complete.
			
 
				+
			
 
				+        Returns:
			
 
				+            True if all archiveresults are finished (or no work to do), False otherwise.
			
 
				+        """
			
 
				+        # if no archiveresults exist yet, it's not finished
			
 
				+        if not self.archiveresult_set.exists():
			
 
				+            return False
			
 
				+
			
 
				+        # Try to advance step if ready (handles step-based hook execution)
			
 
				+        # This will increment current_step when all foreground hooks in current step are done
			
 
				+        while self.advance_step_if_ready():
			
 
				+            pass  # Keep advancing until we can't anymore
			
 
				+
			
 
				+        # if archiveresults exist but are still pending, it's not finished
			
 
				+        if self.pending_archiveresults().exists():
			
 
				+            return False
			
 
				+
			
 
				+        # Don't wait for background hooks - they'll be cleaned up on entering sealed state
			
 
				+        # Background hooks in STARTED state are excluded by pending_archiveresults()
			
 
				+        # (STARTED is in FINAL_OR_ACTIVE_STATES) so once all results are FINAL or ACTIVE,
			
 
				+        # we can transition to sealed and cleanup() will kill the background hooks
			
 
				+
			
 
				+        # otherwise archiveresults exist and are all finished, so it's finished
			
 
				+        return True
			
 
				+
			
 
				     def retry_failed_archiveresults(self, retry_at: Optional['timezone.datetime'] = None) -> int:
			
 
				         """
			
 
				         Reset failed/skipped ArchiveResults to queued for retry.
			
@@ -1730,6 +1826,97 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
 
				         return dt.strftime('%Y-%m-%d %H:%M:%S') if dt else None
			
 
				 
			
 
				 
			
 
				+# =============================================================================
			
 
				+# Snapshot State Machine
			
 
				+# =============================================================================
			
 
				+
			
 
				+class SnapshotMachine(BaseStateMachine, strict_states=True):
			
 
				+    """
			
 
				+    State machine for managing Snapshot lifecycle.
			
 
				+
			
 
				+    Hook Lifecycle:
			
 
				+    ┌─────────────────────────────────────────────────────────────┐
			
 
				+    │ QUEUED State                                                │
			
 
				+    │  • Waiting for snapshot to be ready                         │
			
 
				+    └─────────────────────────────────────────────────────────────┘
			
 
				+                            ↓ tick() when can_start()
			
 
				+    ┌─────────────────────────────────────────────────────────────┐
			
 
				+    │ STARTED State → enter_started()                             │
			
 
				+    │  1. snapshot.run()                                          │
			
 
				+    │     • discover_hooks('Snapshot') → finds all plugin hooks   │
			
 
				+    │     • create_pending_archiveresults() → creates ONE         │
			
 
				+    │       ArchiveResult per hook (NO execution yet)             │
			
 
				+    │  2. ArchiveResults process independently with their own     │
			
 
				+    │     state machines (see ArchiveResultMachine)               │
			
 
				+    │  3. Advance through steps 0-9 as foreground hooks complete  │
			
 
				+    └─────────────────────────────────────────────────────────────┘
			
 
				+                            ↓ tick() when is_finished()
			
 
				+    ┌─────────────────────────────────────────────────────────────┐
			
 
				+    │ SEALED State → enter_sealed()                               │
			
 
				+    │  • cleanup() → kills any background hooks still running     │
			
 
				+    │  • Set retry_at=None (no more processing)                   │
			
 
				+    └─────────────────────────────────────────────────────────────┘
			
 
				+
			
 
				+    https://github.com/ArchiveBox/ArchiveBox/wiki/ArchiveBox-Architecture-Diagrams
			
 
				+    """
			
 
				+
			
 
				+    model_attr_name = 'snapshot'
			
 
				+
			
 
				+    # States
			
 
				+    queued = State(value=Snapshot.StatusChoices.QUEUED, initial=True)
			
 
				+    started = State(value=Snapshot.StatusChoices.STARTED)
			
 
				+    sealed = State(value=Snapshot.StatusChoices.SEALED, final=True)
			
 
				+
			
 
				+    # Tick Event
			
 
				+    tick = (
			
 
				+        queued.to.itself(unless='can_start') |
			
 
				+        queued.to(started, cond='can_start') |
			
 
				+        started.to.itself(unless='is_finished') |
			
 
				+        started.to(sealed, cond='is_finished')
			
 
				+    )
			
 
				+
			
 
				+    def can_start(self) -> bool:
			
 
				+        can_start = bool(self.snapshot.url)
			
 
				+        return can_start
			
 
				+
			
 
				+    def is_finished(self) -> bool:
			
 
				+        """Check if snapshot processing is complete - delegates to model method."""
			
 
				+        return self.snapshot.is_finished_processing()
			
 
				+
			
 
				+    @queued.enter
			
 
				+    def enter_queued(self):
			
 
				+        self.snapshot.update_and_requeue(
			
 
				+            retry_at=timezone.now(),
			
 
				+            status=Snapshot.StatusChoices.QUEUED,
			
 
				+        )
			
 
				+
			
 
				+    @started.enter
			
 
				+    def enter_started(self):
			
 
				+        # lock the snapshot while we create the pending archiveresults
			
 
				+        self.snapshot.update_and_requeue(
			
 
				+            retry_at=timezone.now() + timedelta(seconds=30),  # if failed, wait 30s before retrying
			
 
				+        )
			
 
				+
			
 
				+        # Run the snapshot - creates pending archiveresults for all enabled plugins
			
 
				+        self.snapshot.run()
			
 
				+
			
 
				+        # unlock the snapshot after we're done + set status = started
			
 
				+        self.snapshot.update_and_requeue(
			
 
				+            retry_at=timezone.now() + timedelta(seconds=5),  # check again in 5s
			
 
				+            status=Snapshot.StatusChoices.STARTED,
			
 
				+        )
			
 
				+
			
 
				+    @sealed.enter
			
 
				+    def enter_sealed(self):
			
 
				+        # Clean up background hooks
			
 
				+        self.snapshot.cleanup()
			
 
				+
			
 
				+        self.snapshot.update_and_requeue(
			
 
				+            retry_at=None,
			
 
				+            status=Snapshot.StatusChoices.SEALED,
			
 
				+        )
			
 
				+
			
 
				+
			
 
				 class ArchiveResultManager(models.Manager):
			
 
				     def indexable(self, sorted: bool = True):
			
 
				         INDEXABLE_METHODS = [r[0] for r in EXTRACTOR_INDEXING_PRECEDENCE]
			
@@ -1761,7 +1948,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
 
				     # Note: unique constraint is added by migration 0027 - don't set unique=True here
			
 
				     # or SQLite table recreation in earlier migrations will fail
			
 
				     uuid = models.UUIDField(default=uuid7, null=True, blank=True, db_index=True)
			
 
				-    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False, related_name='archiveresult_set', db_index=True)
			
 
				     created_at = models.DateTimeField(default=timezone.now, db_index=True)
			
 
				     modified_at = models.DateTimeField(auto_now=True)
			
 
				 
			
@@ -1782,7 +1968,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
 
				 
			
 
				     # Binary FK (optional - set when hook reports cmd)
			
 
				     binary = models.ForeignKey(
			
 
				-        'machine.Binary',
			
 
				+        Binary,
			
 
				         on_delete=models.SET_NULL,
			
 
				         null=True, blank=True,
			
 
				         related_name='archiveresults',
			
@@ -1798,7 +1984,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
 
				     output_dir = models.CharField(max_length=256, default=None, null=True, blank=True)
			
 
				     iface = models.ForeignKey(NetworkInterface, on_delete=models.SET_NULL, null=True, blank=True)
			
 
				 
			
 
				-    state_machine_name = 'core.statemachines.ArchiveResultMachine'
			
 
				+    state_machine_name = 'core.models.ArchiveResultMachine'
			
 
				     retry_at_field_name = 'retry_at'
			
 
				     state_field_name = 'status'
			
 
				     active_state = StatusChoices.STARTED
			
@@ -1806,12 +1992,18 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
 
				     objects = ArchiveResultManager()
			
 
				 
			
 
				     class Meta(TypedModelMeta):
			
 
				+        app_label = 'core'
			
 
				         verbose_name = 'Archive Result'
			
 
				         verbose_name_plural = 'Archive Results Log'
			
 
				 
			
 
				     def __str__(self):
			
 
				         return f'[{self.id}] {self.snapshot.url[:64]} -> {self.plugin}'
			
 
				 
			
 
				+    @property
			
 
				+    def created_by(self):
			
 
				+        """Convenience property to access the user who created this archive result via its snapshot's crawl."""
			
 
				+        return self.snapshot.crawl.created_by
			
 
				+
			
 
				     def save(self, *args, **kwargs):
			
 
				         is_new = self._state.adding
			
 
				         # Skip ModelWithOutputDir.save() to avoid creating index.json in plugin directories
			
@@ -1900,6 +2092,12 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
 
				     def save_search_index(self):
			
 
				         pass
			
 
				 
			
 
				+    def cascade_health_update(self, success: bool):
			
 
				+        """Update health stats for self, parent Snapshot, and grandparent Crawl."""
			
 
				+        self.increment_health_stats(success)
			
 
				+        self.snapshot.increment_health_stats(success)
			
 
				+        self.snapshot.crawl.increment_health_stats(success)
			
 
				+
			
 
				     def run(self):
			
 
				         """
			
 
				         Execute this ArchiveResult's hook and update status.
			
@@ -1911,8 +2109,13 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
 
				         """
			
 
				         from django.utils import timezone
			
 
				         from archivebox.hooks import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR, run_hook, is_background_hook
			
 
				+        from archivebox.config.configset import get_config
			
 
				 
			
 
				-        config_objects = [self.snapshot.crawl, self.snapshot] if self.snapshot.crawl else [self.snapshot]
			
 
				+        # Get merged config with proper context
			
 
				+        config = get_config(
			
 
				+            crawl=self.snapshot.crawl,
			
 
				+            snapshot=self.snapshot,
			
 
				+        )
			
 
				 
			
 
				         # Determine which hook(s) to run
			
 
				         hooks = []
			
@@ -1962,10 +2165,10 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
 
				             result = run_hook(
			
 
				                 hook,
			
 
				                 output_dir=plugin_dir,
			
 
				-                config_objects=config_objects,
			
 
				+                config=config,
			
 
				                 url=self.snapshot.url,
			
 
				                 snapshot_id=str(self.snapshot.id),
			
 
				-                crawl_id=str(self.snapshot.crawl.id) if self.snapshot.crawl else None,
			
 
				+                crawl_id=str(self.snapshot.crawl.id),
			
 
				                 depth=self.snapshot.depth,
			
 
				             )
			
 
				 
			
@@ -2112,9 +2315,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
 
				 
			
 
				             # Filter Snapshot records for depth/URL constraints
			
 
				             if record_type == 'Snapshot':
			
 
				-                if not self.snapshot.crawl:
			
 
				-                    continue
			
 
				-
			
 
				                 url = record.get('url')
			
 
				                 if not url:
			
 
				                     continue
			
@@ -2132,19 +2332,10 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
 
				         overrides = {
			
 
				             'snapshot': self.snapshot,
			
 
				             'crawl': self.snapshot.crawl,
			
 
				-            'created_by_id': self.snapshot.created_by_id,
			
 
				+            'created_by_id': self.created_by.pk,
			
 
				         }
			
 
				         process_hook_records(filtered_records, overrides=overrides)
			
 
				 
			
 
				-        # Update snapshot title if this is the title plugin
			
 
				-        plugin_name = get_plugin_name(self.plugin)
			
 
				-        if self.status == self.StatusChoices.SUCCEEDED and plugin_name == 'title':
			
 
				-            self._update_snapshot_title(plugin_dir)
			
 
				-
			
 
				-        # Trigger search indexing if succeeded
			
 
				-        if self.status == self.StatusChoices.SUCCEEDED:
			
 
				-            self.trigger_search_indexing()
			
 
				-
			
 
				         # Cleanup PID files and empty logs
			
 
				         pid_file = plugin_dir / 'hook.pid'
			
 
				         pid_file.unlink(missing_ok=True)
			
@@ -2164,7 +2355,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
 
				         if not cmd:
			
 
				             return
			
 
				 
			
 
				-        from machine.models import Machine
			
 
				+        from archivebox.machine.models import Machine
			
 
				 
			
 
				         bin_path_or_name = cmd[0] if isinstance(cmd, list) else cmd
			
 
				         machine = Machine.current()
			
@@ -2189,23 +2380,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
 
				         if binary:
			
 
				             self.binary = binary
			
 
				 
			
 
				-    def _update_snapshot_title(self, plugin_dir: Path):
			
 
				-        """
			
 
				-        Update snapshot title from title plugin output.
			
 
				-
			
 
				-        The title plugin writes title.txt with the extracted page title.
			
 
				-        This updates the Snapshot.title field if the file exists and has content.
			
 
				-        """
			
 
				-        title_file = plugin_dir / 'title.txt'
			
 
				-        if title_file.exists():
			
 
				-            try:
			
 
				-                title = title_file.read_text(encoding='utf-8').strip()
			
 
				-                if title and (not self.snapshot.title or len(title) > len(self.snapshot.title)):
			
 
				-                    self.snapshot.title = title[:512]  # Max length from model
			
 
				-                    self.snapshot.save(update_fields=['title', 'modified_at'])
			
 
				-            except Exception:
			
 
				-                pass  # Failed to read title, that's okay
			
 
				-
			
 
				     def _url_passes_filters(self, url: str) -> bool:
			
 
				         """Check if URL passes URL_ALLOWLIST and URL_DENYLIST config filters.
			
 
				 
			
@@ -2216,8 +2390,8 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
 
				 
			
 
				         # Get merged config with proper hierarchy
			
 
				         config = get_config(
			
 
				-            user=self.snapshot.created_by if self.snapshot else None,
			
 
				-            crawl=self.snapshot.crawl if self.snapshot else None,
			
 
				+            user=self.created_by,
			
 
				+            crawl=self.snapshot.crawl,
			
 
				             snapshot=self.snapshot,
			
 
				         )
			
 
				 
			
@@ -2256,23 +2430,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
 
				             return False  # No allowlist patterns matched
			
 
				 
			
 
				         return True  # No filters or passed filters
			
 
				-    
			
 
				-    def trigger_search_indexing(self):
			
 
				-        """Run any ArchiveResult__index hooks to update search indexes."""
			
 
				-        from archivebox.hooks import discover_hooks, run_hook
			
 
				-
			
 
				-        # Pass config objects in priority order (later overrides earlier)
			
 
				-        config_objects = [self.snapshot.crawl, self.snapshot] if self.snapshot.crawl else [self.snapshot]
			
 
				-
			
 
				-        for hook in discover_hooks('ArchiveResult__index'):
			
 
				-            run_hook(
			
 
				-                hook,
			
 
				-                output_dir=self.output_dir,
			
 
				-                config_objects=config_objects,
			
 
				-                url=self.snapshot.url,
			
 
				-                snapshot_id=str(self.snapshot.id),
			
 
				-                plugin=self.plugin,
			
 
				-            )
			
 
				 
			
 
				     @property
			
 
				     def output_dir(self) -> Path:
			
@@ -2285,4 +2442,185 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
 
				         if not plugin_dir:
			
 
				             return False
			
 
				         pid_file = plugin_dir / 'hook.pid'
			
 
				-        return pid_file.exists()
			
 
				+        return pid_file.exists()
			
 
				+
			
 
				+
			
 
				+# =============================================================================
			
 
				+# ArchiveResult State Machine
			
 
				+# =============================================================================
			
 
				+
			
 
				+class ArchiveResultMachine(BaseStateMachine, strict_states=True):
			
 
				+    """
			
 
				+    State machine for managing ArchiveResult (single plugin execution) lifecycle.
			
 
				+
			
 
				+    Hook Lifecycle:
			
 
				+    ┌─────────────────────────────────────────────────────────────┐
			
 
				+    │ QUEUED State                                                │
			
 
				+    │  • Waiting for its turn to run                              │
			
 
				+    └─────────────────────────────────────────────────────────────┘
			
 
				+                            ↓ tick() when can_start()
			
 
				+    ┌─────────────────────────────────────────────────────────────┐
			
 
				+    │ STARTED State → enter_started()                             │
			
 
				+    │  1. archiveresult.run()                                     │
			
 
				+    │     • Find specific hook by hook_name                       │
			
 
				+    │     • run_hook(script, output_dir, ...) → subprocess        │
			
 
				+    │                                                              │
			
 
				+    │  2a. FOREGROUND hook (returns HookResult):                  │
			
 
				+    │      • update_from_output() immediately                     │
			
 
				+    │        - Read stdout.log                                    │
			
 
				+    │        - Parse JSONL records                                │
			
 
				+    │        - Extract 'ArchiveResult' record → update status     │
			
 
				+    │        - Walk output_dir → populate output_files            │
			
 
				+    │        - Call process_hook_records() for side effects       │
			
 
				+    │                                                              │
			
 
				+    │  2b. BACKGROUND hook (returns None):                        │
			
 
				+    │      • Status stays STARTED                                 │
			
 
				+    │      • Continues running in background                      │
			
 
				+    │      • Killed by Snapshot.cleanup() when sealed             │
			
 
				+    └─────────────────────────────────────────────────────────────┘
			
 
				+                            ↓ tick() checks status
			
 
				+    ┌─────────────────────────────────────────────────────────────┐
			
 
				+    │ SUCCEEDED / FAILED / SKIPPED / BACKOFF                      │
			
 
				+    │  • Set by hook's JSONL output during update_from_output()   │
			
 
				+    │  • Health stats incremented (num_uses_succeeded/failed)     │
			
 
				+    │  • Parent Snapshot health stats also updated                │
			
 
				+    └─────────────────────────────────────────────────────────────┘
			
 
				+
			
 
				+    https://github.com/ArchiveBox/ArchiveBox/wiki/ArchiveBox-Architecture-Diagrams
			
 
				+    """
			
 
				+
			
 
				+    model_attr_name = 'archiveresult'
			
 
				+
			
 
				+    # States
			
 
				+    queued = State(value=ArchiveResult.StatusChoices.QUEUED, initial=True)
			
 
				+    started = State(value=ArchiveResult.StatusChoices.STARTED)
			
 
				+    backoff = State(value=ArchiveResult.StatusChoices.BACKOFF)
			
 
				+    succeeded = State(value=ArchiveResult.StatusChoices.SUCCEEDED, final=True)
			
 
				+    failed = State(value=ArchiveResult.StatusChoices.FAILED, final=True)
			
 
				+    skipped = State(value=ArchiveResult.StatusChoices.SKIPPED, final=True)
			
 
				+
			
 
				+    # Tick Event - transitions based on conditions
			
 
				+    tick = (
			
 
				+        queued.to.itself(unless='can_start') |
			
 
				+        queued.to(started, cond='can_start') |
			
 
				+        started.to.itself(unless='is_finished') |
			
 
				+        started.to(succeeded, cond='is_succeeded') |
			
 
				+        started.to(failed, cond='is_failed') |
			
 
				+        started.to(skipped, cond='is_skipped') |
			
 
				+        started.to(backoff, cond='is_backoff') |
			
 
				+        backoff.to.itself(unless='can_start') |
			
 
				+        backoff.to(started, cond='can_start') |
			
 
				+        backoff.to(succeeded, cond='is_succeeded') |
			
 
				+        backoff.to(failed, cond='is_failed') |
			
 
				+        backoff.to(skipped, cond='is_skipped')
			
 
				+    )
			
 
				+
			
 
				+    def can_start(self) -> bool:
			
 
				+        can_start = bool(self.archiveresult.snapshot.url)
			
 
				+        return can_start
			
 
				+
			
 
				+    def is_succeeded(self) -> bool:
			
 
				+        """Check if extractor plugin succeeded (status was set by run())."""
			
 
				+        return self.archiveresult.status == ArchiveResult.StatusChoices.SUCCEEDED
			
 
				+
			
 
				+    def is_failed(self) -> bool:
			
 
				+        """Check if extractor plugin failed (status was set by run())."""
			
 
				+        return self.archiveresult.status == ArchiveResult.StatusChoices.FAILED
			
 
				+
			
 
				+    def is_skipped(self) -> bool:
			
 
				+        """Check if extractor plugin was skipped (status was set by run())."""
			
 
				+        return self.archiveresult.status == ArchiveResult.StatusChoices.SKIPPED
			
 
				+
			
 
				+    def is_backoff(self) -> bool:
			
 
				+        """Check if we should backoff and retry later."""
			
 
				+        # Backoff if status is still started (plugin didn't complete) and output_str is empty
			
 
				+        return (
			
 
				+            self.archiveresult.status == ArchiveResult.StatusChoices.STARTED and
			
 
				+            not self.archiveresult.output_str
			
 
				+        )
			
 
				+
			
 
				+    def is_finished(self) -> bool:
			
 
				+        """Check if extraction has completed (success, failure, or skipped)."""
			
 
				+        return self.archiveresult.status in (
			
 
				+            ArchiveResult.StatusChoices.SUCCEEDED,
			
 
				+            ArchiveResult.StatusChoices.FAILED,
			
 
				+            ArchiveResult.StatusChoices.SKIPPED,
			
 
				+        )
			
 
				+
			
 
				+    @queued.enter
			
 
				+    def enter_queued(self):
			
 
				+        self.archiveresult.update_and_requeue(
			
 
				+            retry_at=timezone.now(),
			
 
				+            status=ArchiveResult.StatusChoices.QUEUED,
			
 
				+            start_ts=None,
			
 
				+        )  # bump the snapshot's retry_at so they pickup any new changes
			
 
				+
			
 
				+    @started.enter
			
 
				+    def enter_started(self):
			
 
				+        from archivebox.machine.models import NetworkInterface
			
 
				+
			
 
				+        # Lock the object and mark start time
			
 
				+        self.archiveresult.update_and_requeue(
			
 
				+            retry_at=timezone.now() + timedelta(seconds=120),  # 2 min timeout for plugin
			
 
				+            status=ArchiveResult.StatusChoices.STARTED,
			
 
				+            start_ts=timezone.now(),
			
 
				+            iface=NetworkInterface.current(),
			
 
				+        )
			
 
				+
			
 
				+        # Run the plugin - this updates status, output, timestamps, etc.
			
 
				+        self.archiveresult.run()
			
 
				+
			
 
				+        # Save the updated result
			
 
				+        self.archiveresult.save()
			
 
				+
			
 
				+
			
 
				+    @backoff.enter
			
 
				+    def enter_backoff(self):
			
 
				+        self.archiveresult.update_and_requeue(
			
 
				+            retry_at=timezone.now() + timedelta(seconds=60),
			
 
				+            status=ArchiveResult.StatusChoices.BACKOFF,
			
 
				+            end_ts=None,
			
 
				+        )
			
 
				+
			
 
				+    @succeeded.enter
			
 
				+    def enter_succeeded(self):
			
 
				+        self.archiveresult.update_and_requeue(
			
 
				+            retry_at=None,
			
 
				+            status=ArchiveResult.StatusChoices.SUCCEEDED,
			
 
				+            end_ts=timezone.now(),
			
 
				+        )
			
 
				+
			
 
				+        # Update health stats for ArchiveResult, Snapshot, and Crawl cascade
			
 
				+        self.archiveresult.cascade_health_update(success=True)
			
 
				+
			
 
				+    @failed.enter
			
 
				+    def enter_failed(self):
			
 
				+        self.archiveresult.update_and_requeue(
			
 
				+            retry_at=None,
			
 
				+            status=ArchiveResult.StatusChoices.FAILED,
			
 
				+            end_ts=timezone.now(),
			
 
				+        )
			
 
				+
			
 
				+        # Update health stats for ArchiveResult, Snapshot, and Crawl cascade
			
 
				+        self.archiveresult.cascade_health_update(success=False)
			
 
				+
			
 
				+    @skipped.enter
			
 
				+    def enter_skipped(self):
			
 
				+        self.archiveresult.update_and_requeue(
			
 
				+            retry_at=None,
			
 
				+            status=ArchiveResult.StatusChoices.SKIPPED,
			
 
				+            end_ts=timezone.now(),
			
 
				+        )
			
 
				+
			
 
				+    def after_transition(self, event: str, source: State, target: State):
			
 
				+        self.archiveresult.snapshot.update_and_requeue()  # bump snapshot retry time so it picks up all the new changes
			
 
				+
			
 
				+
			
 
				+# =============================================================================
			
 
				+# State Machine Registration
			
 
				+# =============================================================================
			
 
				+
			
 
# Register both state machines with the python-statemachine registry by hand.
# (They would normally be auto-discovered from a statemachines.py module, but
# they are defined in this file instead, so discovery would not find them.)
registry.register(SnapshotMachine)
registry.register(ArchiveResultMachine)
			
--- a/archivebox/core/models.py.bak
+++ b/archivebox/core/models.py.bak
@@ -0,0 +1,2638 @@
 
				+__package__ = 'archivebox.core'
			
 
				+
			
 
				+from typing import Optional, Dict, Iterable, Any, List, TYPE_CHECKING
			
 
				+from archivebox.uuid_compat import uuid7
			
 
				+from datetime import datetime, timedelta
			
 
				+from django_stubs_ext.db.models import TypedModelMeta
			
 
				+
			
 
				+import os
			
 
				+import json
			
 
				+from pathlib import Path
			
 
				+
			
 
				+from statemachine import State, registry
			
 
				+
			
 
				+from django.db import models
			
 
				+from django.db.models import QuerySet, Value, Case, When, IntegerField
			
 
				+from django.utils.functional import cached_property
			
 
				+from django.utils.text import slugify
			
 
				+from django.utils import timezone
			
 
				+from django.core.cache import cache
			
 
				+from django.urls import reverse, reverse_lazy
			
 
				+from django.contrib import admin
			
 
				+from django.conf import settings
			
 
				+
			
 
				+from archivebox.config import CONSTANTS
			
 
				+from archivebox.misc.system import get_dir_size, atomic_write
			
 
				+from archivebox.misc.util import parse_date, base_url, domain as url_domain, to_json, ts_to_date_str, urlencode, htmlencode, urldecode
			
 
				+from archivebox.misc.hashing import get_dir_info
			
 
				+from archivebox.hooks import (
			
 
				+    EXTRACTOR_INDEXING_PRECEDENCE,
			
 
				+    get_plugins, get_plugin_name, get_plugin_icon,
			
 
				+    DEFAULT_PLUGIN_ICONS,
			
 
				+)
			
 
				+from archivebox.base_models.models import (
			
 
				+    ModelWithUUID, ModelWithSerializers, ModelWithOutputDir,
			
 
				+    ModelWithConfig, ModelWithNotes, ModelWithHealthStats,
			
 
				+    get_or_create_system_user_pk,
			
 
				+)
			
 
				+from workers.models import ModelWithStateMachine, BaseStateMachine
			
 
				+from workers.tasks import bg_archive_snapshot
			
 
				+from archivebox.crawls.models import Crawl
			
 
				+from archivebox.machine.models import NetworkInterface, Binary
			
 
				+
			
 
				+
			
 
				+
			
 
				+class Tag(ModelWithSerializers):
			
 
				+    # Keep AutoField for compatibility with main branch migrations
			
 
				+    # Don't use UUIDField here - requires complex FK transformation
			
 
				+    id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID')
			
 
				+    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False, related_name='tag_set')
			
 
				+    created_at = models.DateTimeField(default=timezone.now, db_index=True, null=True)
			
 
				+    modified_at = models.DateTimeField(auto_now=True)
			
 
				+    name = models.CharField(unique=True, blank=False, max_length=100)
			
 
				+    slug = models.SlugField(unique=True, blank=False, max_length=100, editable=False)
			
 
				+
			
 
				+    snapshot_set: models.Manager['Snapshot']
			
 
				+
			
 
				+    class Meta(TypedModelMeta):
			
 
				+        verbose_name = "Tag"
			
 
				+        verbose_name_plural = "Tags"
			
 
				+
			
 
				+    def __str__(self):
			
 
				+        return self.name
			
 
				+
			
 
				+    def save(self, *args, **kwargs):
			
 
				+        is_new = self._state.adding
			
 
				+        if is_new:
			
 
				+            self.slug = slugify(self.name)
			
 
				+            existing = set(Tag.objects.filter(slug__startswith=self.slug).values_list("slug", flat=True))
			
 
				+            i = None
			
 
				+            while True:
			
 
				+                slug = f"{slugify(self.name)}_{i}" if i else slugify(self.name)
			
 
				+                if slug not in existing:
			
 
				+                    self.slug = slug
			
 
				+                    break
			
 
				+                i = (i or 0) + 1
			
 
				+        super().save(*args, **kwargs)
			
 
				+
			
 
				+        if is_new:
			
 
				+            from archivebox.misc.logging_util import log_worker_event
			
 
				+            log_worker_event(
			
 
				+                worker_type='DB',
			
 
				+                event='Created Tag',
			
 
				+                indent_level=0,
			
 
				+                metadata={
			
 
				+                    'id': self.id,
			
 
				+                    'name': self.name,
			
 
				+                    'slug': self.slug,
			
 
				+                },
			
 
				+            )
			
 
				+
			
 
				+    @property
			
 
				+    def api_url(self) -> str:
			
 
				+        return reverse_lazy('api-1:get_tag', args=[self.id])
			
 
				+
			
 
				+    @staticmethod
			
 
				+    def from_jsonl(record: Dict[str, Any], overrides: Dict[str, Any] = None):
			
 
				+        """
			
 
				+        Create/update Tag from JSONL record.
			
 
				+
			
 
				+        Args:
			
 
				+            record: JSONL record with 'name' field
			
 
				+            overrides: Optional dict with 'snapshot' to auto-attach tag
			
 
				+
			
 
				+        Returns:
			
 
				+            Tag instance or None
			
 
				+        """
			
 
				+        from archivebox.misc.jsonl import get_or_create_tag
			
 
				+
			
 
				+        try:
			
 
				+            tag = get_or_create_tag(record)
			
 
				+
			
 
				+            # Auto-attach to snapshot if in overrides
			
 
				+            if overrides and 'snapshot' in overrides and tag:
			
 
				+                overrides['snapshot'].tags.add(tag)
			
 
				+
			
 
				+            return tag
			
 
				+        except ValueError:
			
 
				+            return None
			
 
				+
			
 
				+
			
 
				+class SnapshotTag(models.Model):
			
 
				+    id = models.AutoField(primary_key=True)
			
 
				+    snapshot = models.ForeignKey('Snapshot', db_column='snapshot_id', on_delete=models.CASCADE, to_field='id')
			
 
				+    tag = models.ForeignKey(Tag, db_column='tag_id', on_delete=models.CASCADE, to_field='id')
			
 
				+
			
 
				+    class Meta:
			
 
				+        db_table = 'core_snapshot_tags'
			
 
				+        unique_together = [('snapshot', 'tag')]
			
 
				+
			
 
				+
			
 
				+class SnapshotQuerySet(models.QuerySet):
			
 
				+    """Custom QuerySet for Snapshot model with export methods that persist through .filter() etc."""
			
 
				+
			
 
				+    # =========================================================================
			
 
				+    # Filtering Methods
			
 
				+    # =========================================================================
			
 
				+
			
 
				+    FILTER_TYPES = {
			
 
				+        'exact': lambda pattern: models.Q(url=pattern),
			
 
				+        'substring': lambda pattern: models.Q(url__icontains=pattern),
			
 
				+        'regex': lambda pattern: models.Q(url__iregex=pattern),
			
 
				+        'domain': lambda pattern: models.Q(url__istartswith=f"http://{pattern}") | models.Q(url__istartswith=f"https://{pattern}") | models.Q(url__istartswith=f"ftp://{pattern}"),
			
 
				+        'tag': lambda pattern: models.Q(tags__name=pattern),
			
 
				+        'timestamp': lambda pattern: models.Q(timestamp=pattern),
			
 
				+    }
			
 
				+
			
 
				+    def filter_by_patterns(self, patterns: List[str], filter_type: str = 'exact') -> 'SnapshotQuerySet':
			
 
				+        """Filter snapshots by URL patterns using specified filter type"""
			
 
				+        from archivebox.misc.logging import stderr
			
 
				+
			
 
				+        q_filter = models.Q()
			
 
				+        for pattern in patterns:
			
 
				+            try:
			
 
				+                q_filter = q_filter | self.FILTER_TYPES[filter_type](pattern)
			
 
				+            except KeyError:
			
 
				+                stderr()
			
 
				+                stderr(f'[X] Got invalid pattern for --filter-type={filter_type}:', color='red')
			
 
				+                stderr(f'    {pattern}')
			
 
				+                raise SystemExit(2)
			
 
				+        return self.filter(q_filter)
			
 
				+
			
 
				+    def search(self, patterns: List[str]) -> 'SnapshotQuerySet':
			
 
				+        """Search snapshots using the configured search backend"""
			
 
				+        from archivebox.config.common import SEARCH_BACKEND_CONFIG
			
 
				+        from archivebox.search import query_search_index
			
 
				+        from archivebox.misc.logging import stderr
			
 
				+
			
 
				+        if not SEARCH_BACKEND_CONFIG.USE_SEARCHING_BACKEND:
			
 
				+            stderr()
			
 
				+            stderr('[X] The search backend is not enabled, set config.USE_SEARCHING_BACKEND = True', color='red')
			
 
				+            raise SystemExit(2)
			
 
				+
			
 
				+        qsearch = self.none()
			
 
				+        for pattern in patterns:
			
 
				+            try:
			
 
				+                qsearch |= query_search_index(pattern)
			
 
				+            except:
			
 
				+                raise SystemExit(2)
			
 
				+        return self.all() & qsearch
			
 
				+
			
 
				+    # =========================================================================
			
 
				+    # Export Methods
			
 
				+    # =========================================================================
			
 
				+
			
 
				+    def to_json(self, with_headers: bool = False) -> str:
			
 
				+        """Generate JSON index from snapshots"""
			
 
				+        import sys
			
 
				+        from datetime import datetime, timezone as tz
			
 
				+        from archivebox.config import VERSION
			
 
				+        from archivebox.config.common import SERVER_CONFIG
			
 
				+
			
 
				+        MAIN_INDEX_HEADER = {
			
 
				+            'info': 'This is an index of site data archived by ArchiveBox: The self-hosted web archive.',
			
 
				+            'schema': 'archivebox.index.json',
			
 
				+            'copyright_info': SERVER_CONFIG.FOOTER_INFO,
			
 
				+            'meta': {
			
 
				+                'project': 'ArchiveBox',
			
 
				+                'version': VERSION,
			
 
				+                'git_sha': VERSION,
			
 
				+                'website': 'https://ArchiveBox.io',
			
 
				+                'docs': 'https://github.com/ArchiveBox/ArchiveBox/wiki',
			
 
				+                'source': 'https://github.com/ArchiveBox/ArchiveBox',
			
 
				+                'issues': 'https://github.com/ArchiveBox/ArchiveBox/issues',
			
 
				+                'dependencies': {},
			
 
				+            },
			
 
				+        } if with_headers else {}
			
 
				+
			
 
				+        snapshot_dicts = [s.to_dict(extended=True) for s in self.iterator(chunk_size=500)]
			
 
				+
			
 
				+        if with_headers:
			
 
				+            output = {
			
 
				+                **MAIN_INDEX_HEADER,
			
 
				+                'num_links': len(snapshot_dicts),
			
 
				+                'updated': datetime.now(tz.utc),
			
 
				+                'last_run_cmd': sys.argv,
			
 
				+                'links': snapshot_dicts,
			
 
				+            }
			
 
				+        else:
			
 
				+            output = snapshot_dicts
			
 
				+        return to_json(output, indent=4, sort_keys=True)
			
 
				+
			
 
				+    def to_csv(self, cols: Optional[List[str]] = None, header: bool = True, separator: str = ',', ljust: int = 0) -> str:
			
 
				+        """Generate CSV output from snapshots"""
			
 
				+        cols = cols or ['timestamp', 'is_archived', 'url']
			
 
				+        header_str = separator.join(col.ljust(ljust) for col in cols) if header else ''
			
 
				+        row_strs = (s.to_csv(cols=cols, ljust=ljust, separator=separator) for s in self.iterator(chunk_size=500))
			
 
				+        return '\n'.join((header_str, *row_strs))
			
 
				+
			
 
				+    def to_html(self, with_headers: bool = True) -> str:
			
 
				+        """Generate main index HTML from snapshots"""
			
 
				+        from datetime import datetime, timezone as tz
			
 
				+        from django.template.loader import render_to_string
			
 
				+        from archivebox.config import VERSION
			
 
				+        from archivebox.config.common import SERVER_CONFIG
			
 
				+        from archivebox.config.version import get_COMMIT_HASH
			
 
				+
			
 
				+        template = 'static_index.html' if with_headers else 'minimal_index.html'
			
 
				+        snapshot_list = list(self.iterator(chunk_size=500))
			
 
				+
			
 
				+        return render_to_string(template, {
			
 
				+            'version': VERSION,
			
 
				+            'git_sha': get_COMMIT_HASH() or VERSION,
			
 
				+            'num_links': str(len(snapshot_list)),
			
 
				+            'date_updated': datetime.now(tz.utc).strftime('%Y-%m-%d'),
			
 
				+            'time_updated': datetime.now(tz.utc).strftime('%Y-%m-%d %H:%M'),
			
 
				+            'links': snapshot_list,
			
 
				+            'FOOTER_INFO': SERVER_CONFIG.FOOTER_INFO,
			
 
				+        })
			
 
				+
			
 
				+
			
 
				+class SnapshotManager(models.Manager.from_queryset(SnapshotQuerySet)):
			
 
				+    """Manager for Snapshot model - uses SnapshotQuerySet for chainable methods"""
			
 
				+
			
 
				+    def filter(self, *args, **kwargs):
			
 
				+        domain = kwargs.pop('domain', None)
			
 
				+        qs = super().filter(*args, **kwargs)
			
 
				+        if domain:
			
 
				+            qs = qs.filter(url__icontains=f'://{domain}')
			
 
				+        return qs
			
 
				+
			
 
				+    def get_queryset(self):
			
 
				+        return super().get_queryset().prefetch_related('tags', 'archiveresult_set')
			
 
				+
			
 
				+    # =========================================================================
			
 
				+    # Import Methods
			
 
				+    # =========================================================================
			
 
				+
			
 
				+    def remove(self, atomic: bool = False) -> tuple:
			
 
				+        """Remove snapshots from the database"""
			
 
				+        from django.db import transaction
			
 
				+        if atomic:
			
 
				+            with transaction.atomic():
			
 
				+                return self.delete()
			
 
				+        return self.delete()
			
 
				+
			
 
				+
			
 
				+class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, ModelWithStateMachine):
			
 
				+    id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
			
 
				+    created_at = models.DateTimeField(default=timezone.now, db_index=True)
			
 
				+    modified_at = models.DateTimeField(auto_now=True)
			
 
				+
			
 
				+    url = models.URLField(unique=False, db_index=True)  # URLs can appear in multiple crawls
			
 
				+    timestamp = models.CharField(max_length=32, unique=True, db_index=True, editable=False)
			
 
				+    bookmarked_at = models.DateTimeField(default=timezone.now, db_index=True)
			
 
				+    crawl: Crawl = models.ForeignKey(Crawl, on_delete=models.CASCADE, null=False, related_name='snapshot_set', db_index=True)  # type: ignore[assignment]
			
 
				+    parent_snapshot = models.ForeignKey('self', on_delete=models.SET_NULL, null=True, blank=True, related_name='child_snapshots', db_index=True, help_text='Parent snapshot that discovered this URL (for recursive crawling)')
			
 
				+
			
 
				+    title = models.CharField(max_length=512, null=True, blank=True, db_index=True)
			
 
				+    downloaded_at = models.DateTimeField(default=None, null=True, editable=False, db_index=True, blank=True)
			
 
				+    depth = models.PositiveSmallIntegerField(default=0, db_index=True)  # 0 for root snapshot, 1+ for discovered URLs
			
 
				+    fs_version = models.CharField(max_length=10, default='0.9.0', help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). Used to trigger lazy migration on save().')
			
 
				+    current_step = models.PositiveSmallIntegerField(default=0, db_index=True, help_text='Current hook step being executed (0-9). Used for sequential hook execution.')
			
 
				+
			
 
				+    retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)
			
 
				+    status = ModelWithStateMachine.StatusField(choices=ModelWithStateMachine.StatusChoices, default=ModelWithStateMachine.StatusChoices.QUEUED)
			
 
				+    config = models.JSONField(default=dict, null=False, blank=False, editable=True)
			
 
				+    notes = models.TextField(blank=True, null=False, default='')
			
 
				+    output_dir = models.FilePathField(path=CONSTANTS.ARCHIVE_DIR, recursive=True, match='.*', default=None, null=True, blank=True, editable=True)
			
 
				+
			
 
				+    tags = models.ManyToManyField(Tag, blank=True, through=SnapshotTag, related_name='snapshot_set', through_fields=('snapshot', 'tag'))
			
 
				+
			
 
				+    state_machine_name = 'core.models.SnapshotMachine'
			
 
				+    state_field_name = 'status'
			
 
				+    retry_at_field_name = 'retry_at'
			
 
				+    StatusChoices = ModelWithStateMachine.StatusChoices
			
 
				+    active_state = StatusChoices.STARTED
			
 
				+
			
 
				+    objects = SnapshotManager()
			
 
				+    archiveresult_set: models.Manager['ArchiveResult']
			
 
				+
			
 
				+    class Meta(TypedModelMeta):
			
 
				+        verbose_name = "Snapshot"
			
 
				+        verbose_name_plural = "Snapshots"
			
 
				+        constraints = [
			
 
				+            # Allow same URL in different crawls, but not duplicates within same crawl
			
 
				+            models.UniqueConstraint(fields=['url', 'crawl'], name='unique_url_per_crawl'),
			
 
				+            # Global timestamp uniqueness for 1:1 symlink mapping
			
 
				+            models.UniqueConstraint(fields=['timestamp'], name='unique_timestamp'),
			
 
				+        ]
			
 
				+
			
 
    def __str__(self):
        """Render as ``[<id>] <url>`` with the URL cut to its first 64 characters."""
        short_url = self.url[:64]
        return f'[{self.id}] {short_url}'
			
 
				+
			
 
    def save(self, *args, **kwargs):
        """
        Save the snapshot, filling defaults and running pending filesystem migrations.

        Steps, in order:
        1. Default `bookmarked_at` (to created_at or now) and `timestamp`.
        2. If the row already has a pk and its fs_version is outdated, walk the
           `_fs_migrate_from_X_to_Y` chain up to the current version.
        3. Save the model.
        4. Append this URL to the parent crawl's `urls` text if it is not
           already listed there as its own line.
        5. Log a 'Created Snapshot' event when the row was newly added.
        """
        is_new = self._state.adding
        if not self.bookmarked_at:
            self.bookmarked_at = self.created_at or timezone.now()
        if not self.timestamp:
            self.timestamp = str(self.bookmarked_at.timestamp())

        # Migrate filesystem if needed (happens automatically on save)
        if self.pk and self.fs_migration_needed:
            from django.db import transaction
            with transaction.atomic():
                # Walk through migration chain automatically
                current = self.fs_version
                target = self._fs_current_version()

                while current != target:
                    next_ver = self._fs_next_version(current)
                    method = f'_fs_migrate_from_{current.replace(".", "_")}_to_{next_ver.replace(".", "_")}'

                    # Only run if method exists (most are no-ops)
                    if hasattr(self, method):
                        getattr(self, method)()

                    current = next_ver

                # NOTE(review): only the in-memory attribute is set inside the atomic
                # block; the row itself is written by super().save() below -- confirm
                # this matches the intended transaction boundary.
                self.fs_version = target

        super().save(*args, **kwargs)

        if self.crawl:
            # crawl.urls is newline-separated text. Compare whole lines: a plain
            # substring test wrongly treats "https://a.com" as already present
            # when only "https://a.com/page" is listed.
            listed_urls = {line.strip() for line in self.crawl.urls.splitlines()}
            if self.url not in listed_urls:
                self.crawl.urls += f'\n{self.url}'
                self.crawl.save()

        if is_new:
            from archivebox.misc.logging_util import log_worker_event
            log_worker_event(
                worker_type='DB',
                event='Created Snapshot',
                indent_level=2,
                url=self.url,
                metadata={
                    'id': str(self.id),
                    'crawl_id': str(self.crawl_id) if self.crawl_id else None,
                    'depth': self.depth,
                    'status': self.status,
                },
            )
			
 
				+
			
 
				+    # =========================================================================
			
 
				+    # Filesystem Migration Methods
			
 
				+    # =========================================================================
			
 
				+
			
 
    @staticmethod
    def _fs_current_version() -> str:
        """
        Return the running ArchiveBox version normalized to ``<major>.<minor>.0``.

        Non-digit characters are dropped from the minor component. Falls back
        to '0.9.0' when VERSION has fewer than two dot-separated parts.
        """
        from archivebox.config import VERSION

        major, *remaining = VERSION.split('.')
        if not remaining:
            return '0.9.0'  # Fallback if version parsing fails

        # Keep only the digits of the minor component.
        minor_digits = ''.join(filter(str.isdigit, remaining[0]))
        return f'{major}.{minor_digits}.0'
			
 
				+
			
 
    @property
    def fs_migration_needed(self) -> bool:
        """True when the stored fs_version differs from the current filesystem version."""
        stored_version = self.fs_version
        return stored_version != self._fs_current_version()
			
 
				+
			
 
				+    def _fs_next_version(self, version: str) -> str:
			
 
				+        """Get next version in migration chain"""
			
 
				+        chain = ['0.7.0', '0.8.0', '0.9.0']
			
 
				+        try:
			
 
				+            idx = chain.index(version)
			
 
				+            return chain[idx + 1] if idx + 1 < len(chain) else self._fs_current_version()
			
 
				+        except ValueError:
			
 
				+            # Unknown version - skip to current
			
 
				+            return self._fs_current_version()
			
 
				+
			
 
    def _fs_migrate_from_0_7_0_to_0_8_0(self):
        """No-op migration step: 0.7 and 0.8 both used archive/<timestamp>."""
        # Intentionally empty; exists so the migration chain has an explicit step.
        return None
			
 
				+
			
 
    def _fs_migrate_from_0_8_0_to_0_9_0(self):
        """
        Migrate this snapshot's files from the flat to the nested layout.

        0.8.x: archive/{timestamp}/
        0.9.x: users/{user}/snapshots/YYYYMMDD/{domain}/{uuid}/

        Does nothing when the old directory is missing, when old and new
        paths are identical, or when the new directory already exists.

        Ordering:
        1. Copy files to the new directory
        2. Verify every file arrived with the same size
        3. Register a transaction.on_commit() callback that
           a. deletes the old directory, then
           b. creates the backwards-compat symlink archive/{timestamp} -> new dir

        The symlink occupies the same path as the old directory, so it can
        only be created once the old directory has been removed.
        NOTE(review): presumably called inside the transaction opened by
        save(), which also persists fs_version - confirm against the caller.

        Raises:
            Exception: if a file is missing or has a different size after copying.
        """
        import logging
        import shutil
        from django.db import transaction

        old_dir = self.get_storage_path_for_version('0.8.0')
        new_dir = self.get_storage_path_for_version('0.9.0')

        if not old_dir.exists() or old_dir == new_dir or new_dir.exists():
            return

        new_dir.mkdir(parents=True, exist_ok=True)

        # Copy all files (idempotent)
        for old_file in old_dir.rglob('*'):
            if not old_file.is_file():
                continue

            new_file = new_dir / old_file.relative_to(old_dir)

            # Skip if already copied
            if new_file.exists() and new_file.stat().st_size == old_file.stat().st_size:
                continue

            new_file.parent.mkdir(parents=True, exist_ok=True)
            shutil.copy2(old_file, new_file)

        # Verify all copied: same relative paths AND same sizes
        old_files = {f.relative_to(old_dir): f.stat().st_size
                     for f in old_dir.rglob('*') if f.is_file()}
        new_files = {f.relative_to(new_dir): f.stat().st_size
                     for f in new_dir.rglob('*') if f.is_file()}

        if old_files.keys() != new_files.keys():
            missing = old_files.keys() - new_files.keys()
            raise Exception(f"Migration incomplete: missing {missing}")

        truncated = {rel for rel, size in old_files.items() if new_files[rel] != size}
        if truncated:
            raise Exception(f"Migration incomplete: size mismatch {truncated}")

        symlink_path = CONSTANTS.ARCHIVE_DIR / self.timestamp

        def _finalize_after_commit():
            # Remove the old directory first: the symlink takes over its path
            self._cleanup_old_migration_dir(old_dir)

            if symlink_path.is_symlink():
                symlink_path.unlink()
            if symlink_path.exists():
                # Old directory could not be removed, so the path is still taken
                return

            try:
                symlink_path.symlink_to(new_dir, target_is_directory=True)
            except OSError as e:
                # Log but don't raise - the data is already safely in new_dir
                logging.getLogger('archivebox.migration').warning(
                    f"Could not create backwards-compat symlink {symlink_path}: {e}"
                )

        # Schedule old directory deletion + symlink creation AFTER transaction commits
        transaction.on_commit(_finalize_after_commit)
			
 
				+
			
 
				+    def _cleanup_old_migration_dir(self, old_dir: Path):
			
 
				+        """
			
 
				+        Delete old directory after successful migration.
			
 
				+        Called via transaction.on_commit() after DB commit succeeds.
			
 
				+        """
			
 
				+        import shutil
			
 
				+        import logging
			
 
				+
			
 
				+        if old_dir.exists() and not old_dir.is_symlink():
			
 
				+            try:
			
 
				+                shutil.rmtree(old_dir)
			
 
				+            except Exception as e:
			
 
				+                # Log but don't raise - migration succeeded, this is just cleanup
			
 
				+                logging.getLogger('archivebox.migration').warning(
			
 
				+                    f"Could not remove old migration directory {old_dir}: {e}"
			
 
				+                )
			
 
				+
			
 
				+    # =========================================================================
			
 
				+    # Path Calculation and Migration Helpers
			
 
				+    # =========================================================================
			
 
				+
			
 
				+    @staticmethod
			
 
				+    def extract_domain_from_url(url: str) -> str:
			
 
				+        """
			
 
				+        Extract domain from URL for 0.9.x path structure.
			
 
				+        Uses full hostname with sanitized special chars.
			
 
				+
			
 
				+        Examples:
			
 
				+            https://example.com:8080 → example.com_8080
			
 
				+            https://sub.example.com → sub.example.com
			
 
				+            file:///path → localhost
			
 
				+            data:text/html → data
			
 
				+        """
			
 
				+        from urllib.parse import urlparse
			
 
				+
			
 
				+        try:
			
 
				+            parsed = urlparse(url)
			
 
				+
			
 
				+            if parsed.scheme in ('http', 'https'):
			
 
				+                if parsed.port:
			
 
				+                    return f"{parsed.hostname}_{parsed.port}".replace(':', '_')
			
 
				+                return parsed.hostname or 'unknown'
			
 
				+            elif parsed.scheme == 'file':
			
 
				+                return 'localhost'
			
 
				+            elif parsed.scheme:
			
 
				+                return parsed.scheme
			
 
				+            else:
			
 
				+                return 'unknown'
			
 
				+        except Exception:
			
 
				+            return 'unknown'
			
 
				+
			
 
    def get_storage_path_for_version(self, version: str) -> Path:
        """
        Calculate the storage path for a specific filesystem version.
        Centralizes path logic so it's reusable.

        0.7.x/0.8.x: archive/{timestamp}
        0.9.x: users/{username}/snapshots/YYYYMMDD/{domain}/{uuid}/

        An unrecognized version is replaced by the current filesystem
        version; if that is also unrecognized, the nested layout is used.
        """
        from datetime import datetime

        flat_versions = ('0.7.0', '0.8.0')
        nested_versions = ('0.9.0', '1.0.0')

        if version not in flat_versions and version not in nested_versions:
            # Unknown version - use current. Resolved once instead of recursing:
            # recursion never terminated when the current version itself was
            # not listed above (e.g. '0.10.0').
            version = self._fs_current_version()

        if version in flat_versions:
            return CONSTANTS.ARCHIVE_DIR / self.timestamp

        username = self.crawl.created_by.username

        # Use created_at for date grouping (fallback to timestamp)
        if self.created_at:
            date_str = self.created_at.strftime('%Y%m%d')
        else:
            date_str = datetime.fromtimestamp(float(self.timestamp)).strftime('%Y%m%d')

        domain = self.extract_domain_from_url(self.url)

        return (
            CONSTANTS.DATA_DIR / 'users' / username / 'snapshots' /
            date_str / domain / str(self.id)
        )
			
 
				+
			
 
				+    # =========================================================================
			
 
				+    # Loading and Creation from Filesystem (Used by archivebox update ONLY)
			
 
				+    # =========================================================================
			
 
				+
			
 
    @classmethod
    def load_from_directory(cls, snapshot_dir: Path) -> Optional['Snapshot']:
        """
        Load an existing Snapshot from the DB by reading index.json.

        Reads index.json, extracts url+timestamp, queries DB.
        Does NOT create new snapshots.

        ONLY used by: archivebox update (for orphan detection)

        Returns:
            The matching Snapshot, or None when index.json is missing,
            unreadable, not a JSON object, lacks a url or a valid timestamp,
            or no matching row exists.
        """
        import json

        index_path = snapshot_dir / 'index.json'

        try:
            with open(index_path, encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError):
            # Missing/unreadable file, invalid JSON, or undecodable bytes
            return None

        if not isinstance(data, dict):
            # e.g. a top-level JSON list: nothing to look up
            return None

        url = data.get('url')
        if not url:
            return None

        # Get timestamp - prefer index.json, fallback to folder name
        timestamp = cls._select_best_timestamp(
            index_timestamp=data.get('timestamp'),
            folder_name=snapshot_dir.name,
        )
        if not timestamp:
            return None

        # Look up existing
        try:
            return cls.objects.get(url=url, timestamp=timestamp)
        except cls.DoesNotExist:
            return None
        except cls.MultipleObjectsReturned:
            # Should not happen with unique constraint
            return cls.objects.filter(url=url, timestamp=timestamp).first()
			
 
				+
			
 
    @classmethod
    def create_from_directory(cls, snapshot_dir: Path) -> Optional['Snapshot']:
        """
        Create a new Snapshot from an orphaned directory.

        Validates the timestamp, makes it unique, and detects the
        filesystem version from the index.json structure.

        ONLY used by: archivebox update (for orphan import)

        Returns:
            A new UNSAVED Snapshot, or None when index.json is missing,
            unreadable, not a JSON object, or lacks a url or a valid timestamp.
        """
        import json

        index_path = snapshot_dir / 'index.json'

        try:
            with open(index_path, encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError):
            # Missing/unreadable file, invalid JSON, or undecodable bytes
            return None

        if not isinstance(data, dict):
            # e.g. a top-level JSON list: cannot describe a snapshot
            return None

        url = data.get('url')
        if not url:
            return None

        # Get and validate timestamp
        timestamp = cls._select_best_timestamp(
            index_timestamp=data.get('timestamp'),
            folder_name=snapshot_dir.name,
        )
        if not timestamp:
            return None

        # Ensure uniqueness (bumps the timestamp if another URL already owns it)
        timestamp = cls._ensure_unique_timestamp(url, timestamp)

        # Detect version
        fs_version = cls._detect_fs_version_from_index(data)

        return cls(
            url=url,
            timestamp=timestamp,
            title=data.get('title', ''),
            fs_version=fs_version,
            created_by_id=get_or_create_system_user_pk(),
        )
			
 
				+
			
 
				+    @staticmethod
			
 
				+    def _select_best_timestamp(index_timestamp: str, folder_name: str) -> Optional[str]:
			
 
				+        """
			
 
				+        Select best timestamp from index.json vs folder name.
			
 
				+
			
 
				+        Validates range (1995-2035).
			
 
				+        Prefers index.json if valid.
			
 
				+        """
			
 
				+        def is_valid_timestamp(ts):
			
 
				+            try:
			
 
				+                ts_int = int(float(ts))
			
 
				+                # 1995-01-01 to 2035-12-31
			
 
				+                return 788918400 <= ts_int <= 2082758400
			
 
				+            except:
			
 
				+                return False
			
 
				+
			
 
				+        index_valid = is_valid_timestamp(index_timestamp) if index_timestamp else False
			
 
				+        folder_valid = is_valid_timestamp(folder_name)
			
 
				+
			
 
				+        if index_valid:
			
 
				+            return str(int(float(index_timestamp)))
			
 
				+        elif folder_valid:
			
 
				+            return str(int(float(folder_name)))
			
 
				+        else:
			
 
				+            return None
			
 
				+
			
 
    @classmethod
    def _ensure_unique_timestamp(cls, url: str, timestamp: str) -> str:
        """
        Return a timestamp that no snapshot with a different URL is using.

        Starting from `timestamp`, the integer value is bumped by 1 until
        no row with that timestamp and another URL exists.
        """
        candidate = timestamp
        while True:
            taken_by_other_url = (
                cls.objects.filter(timestamp=candidate).exclude(url=url).exists()
            )
            if not taken_by_other_url:
                return candidate
            candidate = str(int(float(candidate)) + 1)
			
 
				+
			
 
				+    @staticmethod
			
 
				+    def _detect_fs_version_from_index(data: dict) -> str:
			
 
				+        """
			
 
				+        Detect fs_version from index.json structure.
			
 
				+
			
 
				+        - Has fs_version field: use it
			
 
				+        - Has history dict: 0.7.0
			
 
				+        - Has archive_results list: 0.8.0
			
 
				+        - Default: 0.7.0
			
 
				+        """
			
 
				+        if 'fs_version' in data:
			
 
				+            return data['fs_version']
			
 
				+        if 'history' in data and 'archive_results' not in data:
			
 
				+            return '0.7.0'
			
 
				+        if 'archive_results' in data:
			
 
				+            return '0.8.0'
			
 
				+        return '0.7.0'
			
 
				+
			
 
				+    # =========================================================================
			
 
				+    # Index.json Reconciliation
			
 
				+    # =========================================================================
			
 
				+
			
 
    def reconcile_with_index_json(self):
        """
        Merge index.json with DB. DB is source of truth.

        - Title: longest non-URL
        - Tags: union
        - ArchiveResults: keep both (by plugin+start_ts)

        A missing, unreadable, or non-object index.json is treated as empty.
        Writes index.json back in 0.9.x format afterwards.

        Used by: archivebox update (to sync index.json with DB)
        """
        import json

        index_path = Path(self.output_dir) / 'index.json'

        index_data = {}
        try:
            with open(index_path, encoding='utf-8') as f:
                loaded = json.load(f)
        except (OSError, ValueError):
            # No index.json, unreadable file, or invalid JSON: nothing to merge in
            loaded = None

        # The merge helpers call .get() on this, so only accept a JSON object
        if isinstance(loaded, dict):
            index_data = loaded

        # Merge title
        self._merge_title_from_index(index_data)

        # Merge tags
        self._merge_tags_from_index(index_data)

        # Merge ArchiveResults
        self._merge_archive_results_from_index(index_data)

        # Write back
        self.write_index_json()
			
 
				+
			
 
				+    def _merge_title_from_index(self, index_data: dict):
			
 
				+        """Merge title - prefer longest non-URL title."""
			
 
				+        index_title = index_data.get('title', '').strip()
			
 
				+        db_title = self.title or ''
			
 
				+
			
 
				+        candidates = [t for t in [index_title, db_title] if t and t != self.url]
			
 
				+        if candidates:
			
 
				+            best_title = max(candidates, key=len)
			
 
				+            if self.title != best_title:
			
 
				+                self.title = best_title
			
 
				+
			
 
    def _merge_tags_from_index(self, index_data: dict):
        """
        Attach every tag named in index.json that the snapshot lacks.

        index.json stores tags as one comma-separated string; names are
        stripped and empty ones dropped. Existing DB tags are never removed.
        """
        from django.db import transaction

        raw_tags = index_data.get('tags')
        if raw_tags:
            names_in_index = {piece.strip() for piece in raw_tags.split(',') if piece.strip()}
        else:
            names_in_index = set()

        names_in_db = set(self.tags.values_list('name', flat=True))

        missing_names = names_in_index - names_in_db
        if not missing_names:
            return

        # All additions happen in one atomic block
        with transaction.atomic():
            for missing_name in missing_names:
                tag_obj, _created = Tag.objects.get_or_create(name=missing_name)
                self.tags.add(tag_obj)
			
 
				+
			
 
    def _merge_archive_results_from_index(self, index_data: dict):
        """
        Import ArchiveResults from index.json that the DB does not have yet.

        Existing rows are keyed by (plugin, start_ts). Both index formats are
        read: the 0.8.x 'archive_results' list and the 0.7.x 'history' dict
        (plugin name -> list of results).
        """
        known_results = {}
        for row in ArchiveResult.objects.filter(snapshot=self):
            known_results[(row.plugin, row.start_ts)] = row

        # 0.8.x format: flat list of result dicts
        for entry in index_data.get('archive_results', []):
            self._create_archive_result_if_missing(entry, known_results)

        # 0.7.x format: dict of plugin name -> list of result dicts
        history = index_data.get('history')
        if not isinstance(history, dict):
            return

        for plugin, entries in history.items():
            if not isinstance(entries, list):
                continue
            for entry in entries:
                # Old entries may use 'extractor', or carry no name at all
                # (then the history key is the plugin name)
                entry['plugin'] = entry.get('plugin') or entry.get('extractor') or plugin
                self._create_archive_result_if_missing(entry, known_results)
			
 
				+
			
 
    def _create_archive_result_if_missing(self, result_data: dict, existing: dict):
        """
        Create an ArchiveResult from an index.json entry unless already known.

        Args:
            result_data: one result entry from index.json; the plugin name is
                read from 'plugin' or the older 'extractor' key.
            existing: mapping of (plugin, start_ts) -> ArchiveResult. A newly
                created row is added to it, so a repeated entry in the same
                index is not inserted twice.

        Entries without a plugin name are skipped. Unparseable timestamps
        become None. A failed insert is logged and skipped, never raised.
        """
        import logging
        from dateutil import parser

        def parse_ts(value):
            # Missing or malformed timestamps are stored as None
            if not value:
                return None
            try:
                return parser.parse(value)
            except (TypeError, ValueError, OverflowError):
                return None

        # Support both old 'extractor' and new 'plugin' keys for backwards compat
        plugin = result_data.get('plugin') or result_data.get('extractor', '')
        if not plugin:
            return

        start_ts = parse_ts(result_data.get('start_ts'))
        key = (plugin, start_ts)
        if key in existing:
            return

        try:
            created = ArchiveResult.objects.create(
                snapshot=self,
                plugin=plugin,
                hook_name=result_data.get('hook_name', ''),
                status=result_data.get('status', 'failed'),
                output_str=result_data.get('output', ''),
                cmd=result_data.get('cmd', []),
                pwd=result_data.get('pwd', str(self.output_dir)),
                start_ts=start_ts,
                end_ts=parse_ts(result_data.get('end_ts')),
                created_by=self.crawl.created_by,
            )
        except Exception as e:
            # Best-effort import: keep going, but leave a trace of what was dropped
            logging.getLogger('archivebox.migration').warning(
                f"Could not import ArchiveResult {plugin} from index.json: {e}"
            )
            return

        existing[key] = created
			
 
				+
			
 
    def write_index_json(self):
        """
        Serialize this snapshot and its ArchiveResults to index.json (0.9.x format).

        The file is written into self.output_dir (created if needed) with
        2-space indentation and sorted keys. Tags are stored as a sorted,
        comma-joined string; ArchiveResults are ordered by start_ts.
        """
        import json

        def iso_or_none(moment):
            # Datetimes are stored as ISO 8601 strings, missing ones as null
            return moment.isoformat() if moment else None

        result_entries = []
        for ar in ArchiveResult.objects.filter(snapshot=self).order_by('start_ts'):
            result_entries.append({
                'plugin': ar.plugin,
                'status': ar.status,
                'start_ts': iso_or_none(ar.start_ts),
                'end_ts': iso_or_none(ar.end_ts),
                'output': ar.output_str or '',
                'cmd': ar.cmd if isinstance(ar.cmd, list) else [],
                'pwd': ar.pwd,
            })

        tag_names = sorted(self.tags.values_list('name', flat=True))

        data = {
            'url': self.url,
            'timestamp': self.timestamp,
            'title': self.title or '',
            'tags': ','.join(tag_names),
            'fs_version': self.fs_version,
            'bookmarked_at': iso_or_none(self.bookmarked_at),
            'created_at': iso_or_none(self.created_at),
            'archive_results': result_entries,
        }

        index_path = Path(self.output_dir) / 'index.json'
        index_path.parent.mkdir(parents=True, exist_ok=True)
        with open(index_path, 'w') as f:
            json.dump(data, f, indent=2, sort_keys=True)
			
 
				+
			
 
				+    # =========================================================================
			
 
				+    # Snapshot Utilities
			
 
				+    # =========================================================================
			
 
				+
			
 
    @staticmethod
    def move_directory_to_invalid(snapshot_dir: Path):
        """
        Move an invalid directory to data/invalid/YYYYMMDD/.

        If the destination name is taken, "_1", "_2", ... is appended until
        a free name is found. A failed move is logged as a warning and not
        raised.

        Used by: archivebox update (when encountering invalid directories)
        """
        from datetime import datetime
        import logging
        import shutil

        invalid_dir = CONSTANTS.DATA_DIR / 'invalid' / datetime.now().strftime('%Y%m%d')
        invalid_dir.mkdir(parents=True, exist_ok=True)

        dest = invalid_dir / snapshot_dir.name
        counter = 1
        while dest.exists():
            dest = invalid_dir / f"{snapshot_dir.name}_{counter}"
            counter += 1

        try:
            shutil.move(str(snapshot_dir), str(dest))
        except OSError as e:
            # Best-effort: keep processing other directories, but say why this one stayed
            logging.getLogger('archivebox.migration').warning(
                f"Could not move invalid directory {snapshot_dir} to {dest}: {e}"
            )
			
 
				+
			
 
    @classmethod
    def find_and_merge_duplicates(cls) -> int:
        """
        Find and merge snapshots with the same url:timestamp.

        Each duplicate set is merged into its oldest member (by created_at).
        A set whose merge fails is logged and skipped.

        Used by: archivebox update (Phase 3: deduplication)

        Returns:
            Count of duplicate sets merged successfully.
        """
        import logging

        from django.db.models import Count

        # Materialize the groups first: merging deletes rows from the same
        # table, which should not happen while the query is still streaming.
        duplicate_groups = list(
            cls.objects
            .values('url', 'timestamp')
            .annotate(count=Count('id'))
            .filter(count__gt=1)
        )

        merged = 0
        for dup in duplicate_groups:
            snapshots = list(
                cls.objects
                .filter(url=dup['url'], timestamp=dup['timestamp'])
                .order_by('created_at')  # Keep oldest
            )
            if len(snapshots) < 2:
                continue

            try:
                cls._merge_snapshots(snapshots)
            except Exception as e:
                # Best-effort: one bad set must not stop the rest, but leave a trace
                logging.getLogger('archivebox.migration').warning(
                    f"Could not merge duplicate snapshots for {dup['url']} @ {dup['timestamp']}: {e}"
                )
            else:
                merged += 1

        return merged
			
 
				+
			
 
    @classmethod
    def _merge_snapshots(cls, snapshots: list['Snapshot']):
        """
        Merge exact duplicates into the first snapshot of the list.

        For every other snapshot: files missing from the keeper's directory
        are copied over (existing keeper files are never overwritten), the
        duplicate's directory is removed, its tags are added to the keeper,
        its ArchiveResults are re-pointed to the keeper, and the duplicate
        row is deleted. Fewer than two snapshots is a no-op.
        """
        import logging
        import shutil

        if len(snapshots) < 2:
            # Nothing to merge (also avoids IndexError on an empty list)
            return

        keeper, *duplicates = snapshots
        keeper_dir = Path(keeper.output_dir)

        for dup in duplicates:
            dup_dir = Path(dup.output_dir)

            # Merge files
            if dup_dir.exists() and dup_dir != keeper_dir:
                for dup_file in dup_dir.rglob('*'):
                    if not dup_file.is_file():
                        continue

                    keeper_file = keeper_dir / dup_file.relative_to(dup_dir)
                    if not keeper_file.exists():
                        keeper_file.parent.mkdir(parents=True, exist_ok=True)
                        shutil.copy2(dup_file, keeper_file)

                try:
                    shutil.rmtree(dup_dir)
                except OSError as e:
                    # Leftover directory is harmless; the DB merge still proceeds
                    logging.getLogger('archivebox.migration').warning(
                        f"Could not remove duplicate snapshot directory {dup_dir}: {e}"
                    )

            # Merge tags
            for tag in dup.tags.all():
                keeper.tags.add(tag)

            # Move ArchiveResults
            ArchiveResult.objects.filter(snapshot=dup).update(snapshot=keeper)

            # Delete
            dup.delete()
			
 
				+
			
 
				+    # =========================================================================
			
 
				+    # Output Directory Properties
			
 
				+    # =========================================================================
			
 
				+
			
 
    @property
    def output_dir_parent(self) -> str:
        """Name of the parent folder for snapshot output dirs (always 'archive')."""
        parent_name = 'archive'
        return parent_name
			
 
				+
			
 
    @property
    def output_dir_name(self) -> str:
        """This snapshot's output folder name: its timestamp as a string."""
        folder_name = str(self.timestamp)
        return folder_name
			
 
				+
			
 
    def archive(self, overwrite=False, methods=None):
        """Hand this snapshot to bg_archive_snapshot() and return its result."""
        return bg_archive_snapshot(
            self,
            overwrite=overwrite,
            methods=methods,
        )
			
 
				+
			
 
    @admin.display(description='Tags')
    def tags_str(self, nocache=True) -> str | None:
        """
        Return this snapshot's tag names, sorted and comma-joined.

        Computed directly when tags were prefetched or when nocache is
        true; otherwise read through the cache under '<pk>-tags'.
        """
        def joined_tag_names() -> str:
            return ','.join(sorted(tag.name for tag in self.tags.all()))

        tags_prefetched = 'tags' in getattr(self, '_prefetched_objects_cache', ())
        if tags_prefetched or nocache:
            return joined_tag_names()

        return cache.get_or_set(f'{self.pk}-tags', joined_tag_names)
			
 
				+
			
 
    def icons(self) -> str:
        """
        Render one HTML link per plugin, marking which ones produced output.

        A plugin counts as existing when it has a succeeded ArchiveResult
        with output_files or output_str. The markup is cached for 24 hours
        under a key built from the pk and the first available of
        downloaded_at / modified_at / created_at / bookmarked_at.
        """
        from django.utils.html import format_html, mark_safe

        freshness = self.downloaded_at or self.modified_at or self.created_at or self.bookmarked_at
        cache_key = f'result_icons:{self.pk}:{freshness.timestamp()}'

        cached_markup = cache.get(cache_key)
        if cached_markup:
            return cached_markup

        if 'archiveresult_set' in getattr(self, '_prefetched_objects_cache', ()):
            # Results already loaded: filter in Python instead of querying again
            succeeded_by_plugin = {
                r.plugin: r
                for r in self.archiveresult_set.all()
                if r.status == "succeeded" and (r.output_files or r.output_str)
            }
        else:
            # Filter for results that have either output_files or output_str
            from django.db.models import Q
            succeeded_by_plugin = {r.plugin: r for r in self.archiveresult_set.filter(
                Q(status="succeeded") & (Q(output_files__isnull=False) | ~Q(output_str=''))
            )}

        path = self.archive_path
        canon = self.canonical_outputs()
        link_template = '<a href="/{}/{}" class="exists-{}" title="{}">{}</a> &nbsp;'

        # Get all plugins from hooks system (sorted by numeric prefix)
        all_plugins = [get_plugin_name(e) for e in get_plugins()]

        link_fragments = []
        for plugin in all_plugins:
            result = succeeded_by_plugin.get(plugin)
            existing = result and result.status == 'succeeded' and (result.output_files or result.output_str)
            icon = get_plugin_icon(plugin)
            link_fragments.append(format_html(
                link_template,
                path,
                canon.get(plugin, plugin + '/'),
                str(bool(existing)),
                plugin,
                icon,
            ))

        fresh_markup = format_html(
            '<span class="files-icons" style="font-size: 1.1em; opacity: 0.8; min-width: 240px; display: inline-block">{}</span>',
            mark_safe(''.join(link_fragments)),
        )
        cache.set(cache_key, fresh_markup, timeout=60 * 60 * 24)
        return fresh_markup
			
 
				+
			
 
    @property
    def api_url(self) -> str:
        """URL of this snapshot under the 'api-1:get_snapshot' route (via reverse_lazy)."""
        route_args = [self.id]
        return reverse_lazy('api-1:get_snapshot', args=route_args)
			
 
				+
			
 
    def get_absolute_url(self):
        """Return the site-relative URL of this snapshot: '/' + archive_path."""
        return '/' + f'{self.archive_path}'
			
 
				+
			
 
    @cached_property
    def domain(self) -> str:
        """Result of url_domain() for this snapshot's URL (cached per instance)."""
        snapshot_url = self.url
        return url_domain(snapshot_url)
			
 
				+
			
 
    @cached_property
    def output_dir(self):
        """
        The filesystem path to the snapshot's output directory, as a string.

        Lookup order:
        1. the path for this snapshot's fs_version, if it exists
        2. the resolved target of a backwards-compat symlink at archive/{timestamp}
        3. a real directory at archive/{timestamp}
        4. otherwise the fs_version path, even though it does not exist yet
        """
        current_path = self.get_storage_path_for_version(self.fs_version)
        if current_path.exists():
            return str(current_path)

        # Check for backwards-compat symlink
        old_path = CONSTANTS.ARCHIVE_DIR / self.timestamp
        if old_path.is_symlink():
            # Resolve through the link itself so a relative target is interpreted
            # relative to the link's directory, not the process's working directory
            return str(old_path.resolve())
        if old_path.exists():
            return str(old_path)

        return str(current_path)
			
 
				+
			
 
    @cached_property
    def archive_path(self):
        """Relative path '<ARCHIVE_DIR_NAME>/<timestamp>' for this snapshot."""
        return '/'.join((f'{CONSTANTS.ARCHIVE_DIR_NAME}', f'{self.timestamp}'))
			
 
				+
			
 
    @cached_property
    def archive_size(self):
        """First element of get_dir_size(output_dir), or 0 if anything fails."""
        try:
            size_info = get_dir_size(self.output_dir)
            return size_info[0]
        except Exception:
            return 0
			
 
				+
			
 
    def save_tags(self, tags: Iterable[str] = ()) -> None:
        """
        Replace this snapshot's tags with the given tag names.

        Names are stripped of surrounding whitespace and blank names are
        skipped; missing Tag rows are created. All previous tags are cleared.
        """
        # Strip before creating: the blank-name filter already ignores padding,
        # so " news " must not become a separate tag from "news"
        cleaned_names = [name.strip() for name in tags]
        tags_id = [Tag.objects.get_or_create(name=name)[0].pk for name in cleaned_names if name]
        self.tags.clear()
        self.tags.add(*tags_id)
			
 
				+
			
 
    def pending_archiveresults(self) -> QuerySet['ArchiveResult']:
        """ArchiveResults of this snapshot whose status is not in FINAL_OR_ACTIVE_STATES."""
        settled_states = ArchiveResult.FINAL_OR_ACTIVE_STATES
        return self.archiveresult_set.exclude(status__in=settled_states)
			
 
				+
			
 
    def run(self) -> list['ArchiveResult']:
        """
        Start this snapshot by delegating to create_pending_archiveresults().

        Called by: SnapshotMachine.enter_started()

        No hook is executed here. Per the original notes, one QUEUED
        ArchiveResult is created per enabled hook (storing its hook_name,
        e.g. 'on_Snapshot__50_wget.py'), and each result later runs on its
        own via ArchiveResultMachine / ArchiveResult.run().

        Returns:
            list[ArchiveResult]: whatever create_pending_archiveresults() returns
        """
        pending_results = self.create_pending_archiveresults()
        return pending_results
			
 
				+
			
 
    def cleanup(self):
        """
        Stop background hooks and sync their ArchiveResults from disk.

        Called by the state machine when entering the 'sealed' state.
        Does nothing when the snapshot's OUTPUT_DIR does not exist.
        """
        from archivebox.hooks import kill_process

        if self.OUTPUT_DIR.exists():
            # Every *.pid file anywhere under the output dir marks a hook process to kill
            for pid_file in self.OUTPUT_DIR.glob('**/*.pid'):
                kill_process(pid_file, validate=True)

            # Results still marked STARTED get refreshed from what is on the filesystem
            still_started = self.archiveresult_set.filter(status=ArchiveResult.StatusChoices.STARTED)
            for archiveresult in still_started:
                archiveresult.update_from_output()
			
 
				+
			
 
    def has_running_background_hooks(self) -> bool:
        """
        Report whether any plugin subdirectory has a live ``hook.pid`` process.

        Used by state machine to determine if snapshot is finished.
        Returns False when the snapshot's OUTPUT_DIR does not exist.
        """
        from archivebox.hooks import process_is_alive

        if not self.OUTPUT_DIR.exists():
            return False

        # Only direct subdirectories are inspected, each for its own hook.pid
        return any(
            process_is_alive(entry / 'hook.pid')
            for entry in self.OUTPUT_DIR.iterdir()
            if entry.is_dir()
        )
			
 
				+
			
 
    @staticmethod
    def from_jsonl(record: Dict[str, Any], overrides: Optional[Dict[str, Any]] = None, queue_for_extraction: bool = True):
        """
        Create/update Snapshot from JSONL record or dict.

        Unified method that handles:
        - ID-based patching: {"id": "...", "title": "new title"}
        - URL-based create/update: {"url": "...", "title": "...", "tags": "..."}
        - Auto-creates Crawl if not provided
        - Optionally queues for extraction

        Args:
            record: Dict with 'url' (for create) or 'id' (for patch), plus other fields
            overrides: Dict with 'crawl', 'snapshot' (parent), 'created_by_id'
            queue_for_extraction: If True, sets status=QUEUED and retry_at (default: True)

        Returns:
            Snapshot instance, or None when the record has no matching 'id' and no 'url'
        """
        import re
        from django.utils import timezone
        from archivebox.misc.util import parse_date
        from archivebox.base_models.models import get_or_create_system_user_pk
        from archivebox.config.common import GENERAL_CONFIG

        overrides = overrides or {}

        # If 'id' is provided, lookup and patch that specific snapshot
        snapshot_id = record.get('id')
        if snapshot_id:
            try:
                snapshot = Snapshot.objects.get(id=snapshot_id)
            except Snapshot.DoesNotExist:
                # ID not found, fall through to create-by-URL logic
                snapshot = None

            if snapshot is not None:
                # Only concrete DB columns may be patched. Records can also carry
                # keys that exist as attributes but are not assignable/saveable
                # (the 'tags' M2M manager, computed properties, methods); those
                # would raise on setattr() or on save(update_fields=...).
                patchable = set()
                for model_field in snapshot._meta.concrete_fields:
                    patchable.update((model_field.name, model_field.attname))

                update_fields = []
                for field_name, value in record.items():
                    # Skip internal fields and anything that is not a real column
                    if field_name in ('id', 'type') or field_name not in patchable:
                        continue

                    # Special parsing for date fields
                    if field_name in ('bookmarked_at', 'retry_at', 'created_at', 'modified_at'):
                        if value and isinstance(value, str):
                            value = parse_date(value)

                    # Update field if value is provided and different
                    if value is not None and getattr(snapshot, field_name) != value:
                        setattr(snapshot, field_name, value)
                        update_fields.append(field_name)

                if update_fields:
                    snapshot.save(update_fields=update_fields + ['modified_at'])

                return snapshot

        url = record.get('url')
        if not url:
            return None

        # Determine or create crawl (every snapshot must have a crawl)
        crawl = overrides.get('crawl')
        parent_snapshot = overrides.get('snapshot')  # Parent snapshot
        created_by_id = overrides.get('created_by_id') or (parent_snapshot.crawl.created_by_id if parent_snapshot else None) or get_or_create_system_user_pk()

        # If no crawl provided, inherit from parent or auto-create one
        if not crawl:
            if parent_snapshot:
                # Inherit crawl from parent snapshot
                crawl = parent_snapshot.crawl
            else:
                # Auto-create a single-URL crawl
                from archivebox.crawls.models import Crawl
                from archivebox.config import CONSTANTS

                timestamp_str = timezone.now().strftime("%Y-%m-%d__%H-%M-%S")
                sources_file = CONSTANTS.SOURCES_DIR / f'{timestamp_str}__auto_crawl.txt'
                sources_file.parent.mkdir(parents=True, exist_ok=True)
                sources_file.write_text(url)

                crawl = Crawl.objects.create(
                    urls=url,
                    max_depth=0,
                    label=f'auto-created for {url[:50]}',
                    created_by_id=created_by_id,
                )

        # Parse tags (order-preserving de-duplication via dict.fromkeys)
        tags_str = record.get('tags', '')
        tag_list = []
        if tags_str:
            tag_list = list(dict.fromkeys(
                tag.strip() for tag in re.split(GENERAL_CONFIG.TAG_SEPARATOR_PATTERN, tags_str)
                if tag.strip()
            ))

        # Get most recent snapshot with this URL (URLs can exist in multiple crawls)
        snapshot = Snapshot.objects.filter(url=url).order_by('-created_at').first()

        title = record.get('title')
        timestamp = record.get('timestamp')

        if snapshot:
            # Update existing snapshot: only replace the title with a longer one
            if title and (not snapshot.title or len(title) > len(snapshot.title or '')):
                snapshot.title = title
                snapshot.save(update_fields=['title', 'modified_at'])
        else:
            # Create new snapshot; bump the timestamp by 1s until it is unused
            if timestamp:
                while Snapshot.objects.filter(timestamp=timestamp).exists():
                    timestamp = str(float(timestamp) + 1.0)

            snapshot = Snapshot.objects.create(
                url=url,
                timestamp=timestamp,
                title=title,
                crawl=crawl,
            )

        # Update tags (union of existing and newly supplied names)
        if tag_list:
            existing_tags = set(snapshot.tags.values_list('name', flat=True))
            new_tags = set(tag_list) | existing_tags
            snapshot.save_tags(new_tags)

        # Queue for extraction and update additional fields
        update_fields = []

        if queue_for_extraction:
            snapshot.status = Snapshot.StatusChoices.QUEUED
            snapshot.retry_at = timezone.now()
            update_fields.extend(['status', 'retry_at'])

        # Update additional fields if provided
        for field_name in ('depth', 'parent_snapshot_id', 'crawl_id', 'bookmarked_at'):
            value = record.get(field_name)
            # Parse date strings the same way the ID-patch path does, so the
            # in-memory attribute is a datetime and the comparison is meaningful.
            if field_name == 'bookmarked_at' and value and isinstance(value, str):
                value = parse_date(value)
            if value is not None and getattr(snapshot, field_name) != value:
                setattr(snapshot, field_name, value)
                update_fields.append(field_name)

        if update_fields:
            snapshot.save(update_fields=update_fields + ['modified_at'])

        return snapshot
			
 
				+
			
 
    def create_pending_archiveresults(self) -> list['ArchiveResult']:
        """
        Create ArchiveResult records for all enabled hooks.

        Uses the hooks system to discover available hooks from:
        - archivebox/plugins/*/on_Snapshot__*.{py,sh,js}
        - data/plugins/*/on_Snapshot__*.{py,sh,js}

        Creates one ArchiveResult per hook (not per plugin), with hook_name set.
        This enables step-based execution where all hooks in a step can run in parallel.

        Returns:
            The ArchiveResults created by this call that are still in
            ``ArchiveResult.INITIAL_STATE``. Hooks that already have a result
            for this snapshot are skipped.
        """
        from archivebox.hooks import discover_hooks

        # Fetch the already-recorded hook names once instead of issuing an
        # exists() query for every discovered hook.
        known_hook_names = set(self.archiveresult_set.values_list('hook_name', flat=True))
        archiveresults = []

        for hook_path in discover_hooks('Snapshot'):
            hook_name = hook_path.name  # e.g., 'on_Snapshot__50_wget.py'
            plugin = hook_path.parent.name  # e.g., 'wget'

            if hook_name in known_hook_names:
                continue
            known_hook_names.add(hook_name)

            # get_or_create (rather than create) tolerates a concurrent writer
            # having inserted the same (snapshot, hook_name) row in the meantime.
            archiveresult, _created = ArchiveResult.objects.get_or_create(
                snapshot=self,
                hook_name=hook_name,
                defaults={
                    'plugin': plugin,
                    'status': ArchiveResult.INITIAL_STATE,
                    'retry_at': timezone.now(),
                    'created_by_id': self.crawl.created_by_id,
                },
            )
            if archiveresult.status == ArchiveResult.INITIAL_STATE:
                archiveresults.append(archiveresult)

        return archiveresults
			
 
				+
			
 
    def advance_step_if_ready(self) -> bool:
        """
        Move ``current_step`` forward by one when nothing in the current step blocks it.

        Called by the state machine to check if step can advance.

        A result blocks advancement when all of the following hold:
        - its hook_name maps (via extract_step) to the current step
        - it is not a background hook (per is_background_hook)
        - its status is outside FINAL_OR_ACTIVE_STATES, or is STARTED

        Returns:
            True if step was advanced, False if not ready or already at step 9.
        """
        from archivebox.hooks import extract_step, is_background_hook

        if self.current_step >= 9:
            return False  # Already at final step

        # Only results that actually carry a hook name can be mapped to a step
        hooked_results = self.archiveresult_set.filter(hook_name__isnull=False).exclude(hook_name='')

        for archiveresult in hooked_results:
            in_current_step = extract_step(archiveresult.hook_name) == self.current_step
            if not in_current_step or is_background_hook(archiveresult.hook_name):
                continue  # Other steps and background hooks never block

            still_pending = archiveresult.status not in ArchiveResult.FINAL_OR_ACTIVE_STATES
            still_running = archiveresult.status == ArchiveResult.StatusChoices.STARTED
            if still_pending or still_running:
                return False

        # Nothing blocks the current step any more
        self.current_step += 1
        self.save(update_fields=['current_step', 'modified_at'])
        return True
			
 
				+
			
 
    def is_finished_processing(self) -> bool:
        """
        Decide whether all work for this snapshot is done.

        Used by SnapshotMachine.is_finished() to determine if snapshot is complete.

        Returns:
            False if no archiveresults exist yet or some are still pending,
            True otherwise.
        """
        # Nothing has been queued yet, so processing cannot be finished
        if not self.archiveresult_set.exists():
            return False

        # Advance through as many steps as are currently ready
        advanced = True
        while advanced:
            advanced = self.advance_step_if_ready()

        # STARTED results (e.g. background hooks) are excluded by
        # pending_archiveresults() because STARTED is in FINAL_OR_ACTIVE_STATES;
        # they get killed by cleanup() when the snapshot enters the sealed state.
        return not self.pending_archiveresults().exists()
			
 
				+
			
 
    def retry_failed_archiveresults(self, retry_at: Optional['timezone.datetime'] = None) -> int:
        """
        Re-queue every FAILED or SKIPPED ArchiveResult of this snapshot.

        Each matching result is set back to QUEUED with the given retry_at and
        its output/start_ts/end_ts cleared. If at least one result was reset,
        the snapshot itself is put back into STARTED at step 0 so the whole
        pipeline is walked again from the beginning.

        Args:
            retry_at: When the reset items become due; defaults to now.

        Returns count of ArchiveResults reset.
        """
        if not retry_at:
            retry_at = timezone.now()

        retryable_states = [
            ArchiveResult.StatusChoices.FAILED,
            ArchiveResult.StatusChoices.SKIPPED,
        ]
        reset_values = dict(
            status=ArchiveResult.StatusChoices.QUEUED,
            retry_at=retry_at,
            output=None,
            start_ts=None,
            end_ts=None,
        )
        count = self.archiveresult_set.filter(status__in=retryable_states).update(**reset_values)

        if count > 0:
            # Snapshot restarts at step 0 so every step gets re-checked
            self.status = self.StatusChoices.STARTED
            self.retry_at = retry_at
            self.current_step = 0
            self.save(update_fields=['status', 'retry_at', 'current_step', 'modified_at'])

        return count
			
 
				+
			
 
				+    # =========================================================================
			
 
				+    # URL Helper Properties (migrated from Link schema)
			
 
				+    # =========================================================================
			
 
				+
			
 
    @cached_property
    def url_hash(self) -> str:
        """First 8 hex characters of the SHA-256 digest of the snapshot URL."""
        import hashlib

        digest = hashlib.sha256(self.url.encode()).hexdigest()
        return digest[:8]
			
 
				+
			
 
    @cached_property
    def scheme(self) -> str:
        """Everything before the first '://' in the URL (the whole URL if there is none)."""
        prefix, _sep, _rest = self.url.partition('://')
        return prefix
			
 
				+
			
 
    @cached_property
    def path(self) -> str:
        """Everything from the first '/' after '<scheme>://' onward, or '/' if there is none.

        The part after that slash is returned verbatim, so any query string or
        fragment in the URL is included.
        """
        _scheme, separator, remainder = self.url.partition('://')
        if not separator or '/' not in remainder:
            return '/'
        _host, _slash, tail = remainder.partition('/')
        return '/' + tail
			
 
				+
			
 
    @cached_property
    def basename(self) -> str:
        """Last '/'-separated segment of ``self.path`` (empty when the path ends in '/')."""
        _head, _slash, last_segment = self.path.rpartition('/')
        return last_segment
			
 
				+
			
 
    @cached_property
    def extension(self) -> str:
        """Text after the last '.' in ``self.basename``, or '' when it contains no '.'."""
        name = self.basename
        if '.' not in name:
            return ''
        _stem, _dot, suffix = name.rpartition('.')
        return suffix
			
 
				+
			
 
    @cached_property
    def base_url(self) -> str:
        """``<scheme>://<domain>`` built from this snapshot's scheme and domain."""
        url_scheme, host = self.scheme, self.domain
        return f'{url_scheme}://{host}'
			
 
				+
			
 
    @cached_property
    def is_static(self) -> bool:
        """True when the lowercased URL ends with one of the known static-file suffixes."""
        static_suffixes = (
            '.pdf', '.jpg', '.jpeg', '.png', '.gif', '.webp',
            '.svg', '.mp4', '.mp3', '.wav', '.webm',
        )
        # str.endswith accepts a tuple and matches if any suffix matches
        return self.url.lower().endswith(static_suffixes)
			
 
				+
			
 
    @cached_property
    def is_archived(self) -> bool:
        """True when at least one well-known output path exists inside the output dir."""
        known_outputs = (
            self.domain,
            'output.html',
            'output.pdf',
            'screenshot.png',
            'singlefile.html',
            'readability/content.html',
            'mercury/content.html',
            'htmltotext.txt',
            'media',
            'git',
        )
        for relative_path in known_outputs:
            if (Path(self.output_dir) / relative_path).exists():
                return True
        return False
			
 
				+
			
 
				+    # =========================================================================
			
 
				+    # Date/Time Properties (migrated from Link schema)
			
 
				+    # =========================================================================
			
 
				+
			
 
    @cached_property
    def bookmarked_date(self) -> Optional[str]:
        """Human-readable date derived from the snapshot's ``timestamp`` string.

        Returns:
            - None when the timestamp is empty or contains anything other than
              digits and dots.
            - The formatted date (via ``_ts_to_date_str``) when the timestamp is
              a positive epoch value no more than 30 days in the future.
            - The raw timestamp string otherwise (out of range, or digit-like
              but not a valid number such as '1.2.3').
        """
        raw = self.timestamp
        if not raw or not raw.replace('.', '').isdigit():
            return None

        try:
            epoch = float(raw)
        except ValueError:
            # The digits-and-dots check above still lets through values float()
            # rejects (e.g. several dots); treat them like out-of-range values.
            return str(raw)

        max_ts = (timezone.now() + timedelta(days=30)).timestamp()
        if 0 < epoch < max_ts:
            return self._ts_to_date_str(datetime.fromtimestamp(epoch))
        return str(raw)
			
 
				+
			
 
    @cached_property
    def downloaded_datestr(self) -> Optional[str]:
        """``downloaded_at`` formatted via ``_ts_to_date_str``, or None when it is unset."""
        if not self.downloaded_at:
            return None
        return self._ts_to_date_str(self.downloaded_at)
			
 
				+
			
 
    @cached_property
    def archive_dates(self) -> List[datetime]:
        """``start_ts`` of every ArchiveResult of this snapshot that has one, in queryset order."""
        started_at = []
        for archiveresult in self.archiveresult_set.all():
            if archiveresult.start_ts:
                started_at.append(archiveresult.start_ts)
        return started_at
			
 
				+
			
 
    @cached_property
    def oldest_archive_date(self) -> Optional[datetime]:
        """Earliest entry of ``archive_dates``, or None when there are none."""
        return min(self.archive_dates, default=None)
			
 
				+
			
 
    @cached_property
    def newest_archive_date(self) -> Optional[datetime]:
        """Latest entry of ``archive_dates``, or None when there are none."""
        return max(self.archive_dates, default=None)
			
 
				+
			
 
    @cached_property
    def num_outputs(self) -> int:
        """Number of this snapshot's ArchiveResults with status 'succeeded'."""
        succeeded = self.archiveresult_set.filter(status='succeeded')
        return succeeded.count()
			
 
				+
			
 
    @cached_property
    def num_failures(self) -> int:
        """Number of this snapshot's ArchiveResults with status 'failed'."""
        failed = self.archiveresult_set.filter(status='failed')
        return failed.count()
			
 
				+
			
 
				+    # =========================================================================
			
 
				+    # Output Path Methods (migrated from Link schema)
			
 
				+    # =========================================================================
			
 
				+
			
 
    def canonical_outputs(self) -> Dict[str, Optional[str]]:
        """
        Intelligently discover the best output file for each plugin.
        Uses actual ArchiveResult data and filesystem scanning with smart heuristics.

        Returns:
            Dict mapping '<name>_path' keys to paths relative to the snapshot
            output dir (or to external URLs for the favicon / archive.org
            entries). For static-file URLs a 'title' and 'wget_path' entry is
            added as well.
        """
        FAVICON_PROVIDER = 'https://www.google.com/s2/favicons?domain={}'

        # File extensions that can be embedded/previewed in an iframe
        IFRAME_EMBEDDABLE_EXTENSIONS = {
            'html', 'htm', 'pdf', 'txt', 'md', 'json', 'jsonl',
            'png', 'jpg', 'jpeg', 'gif', 'webp', 'svg', 'ico',
            'mp4', 'webm', 'mp3', 'opus', 'ogg', 'wav',
        }

        MIN_DISPLAY_SIZE = 15_000  # 15KB - filter out tiny files
        MAX_SCAN_FILES = 50  # Don't scan massive directories

        snap_dir = Path(self.output_dir)

        def find_best_output_in_dir(dir_path: Path, plugin_name: str) -> Optional[str]:
            """Find the best representative file in a plugin's output directory"""
            if not dir_path.exists() or not dir_path.is_dir():
                return None

            candidates = []
            file_count = 0

            # Special handling for media plugin - look for thumbnails
            is_media_dir = plugin_name == 'media'

            # Scan for suitable files (directories count toward the scan limit too)
            for file_path in dir_path.rglob('*'):
                file_count += 1
                if file_count > MAX_SCAN_FILES:
                    break

                if file_path.is_dir() or file_path.name.startswith('.'):
                    continue

                ext = file_path.suffix.lstrip('.').lower()
                if ext not in IFRAME_EMBEDDABLE_EXTENSIONS:
                    continue

                try:
                    size = file_path.stat().st_size
                except OSError:
                    continue

                # For media dir, allow smaller image files (thumbnails are often < 15KB)
                min_size = 5_000 if (is_media_dir and ext in ('png', 'jpg', 'jpeg', 'webp', 'gif')) else MIN_DISPLAY_SIZE
                if size < min_size:
                    continue

                # Prefer main files: index.html, output.*, content.*, etc.
                name_lower = file_path.name.lower()

                if is_media_dir:
                    # Special prioritization for media directories
                    if any(keyword in name_lower for keyword in ('thumb', 'thumbnail', 'cover', 'poster')):
                        priority = 200  # Highest priority for thumbnails
                    elif ext in ('png', 'jpg', 'jpeg', 'webp', 'gif'):
                        priority = 150  # High priority for any image
                    elif ext in ('mp4', 'webm', 'mp3', 'opus', 'ogg'):
                        priority = 100  # Lower priority for actual media files
                    else:
                        priority = 50
                elif 'index' in name_lower:
                    priority = 100
                elif name_lower.startswith(('output', 'content', plugin_name)):
                    priority = 50
                elif ext in ('html', 'htm', 'pdf'):
                    priority = 30
                elif ext in ('png', 'jpg', 'jpeg', 'webp'):
                    priority = 20
                else:
                    priority = 10

                candidates.append((priority, size, file_path))

            if not candidates:
                return None

            # Sort by priority (desc), then size (desc)
            candidates.sort(key=lambda x: (x[0], x[1]), reverse=True)
            best_file = candidates[0][2]
            return str(best_file.relative_to(Path(self.output_dir)))

        canonical = {
            'index_path': 'index.html',
            'google_favicon_path': FAVICON_PROVIDER.format(self.domain),
            'archive_org_path': f'https://web.archive.org/web/{self.base_url}',
        }

        # Scan each ArchiveResult's output directory for the best file
        for result in self.archiveresult_set.filter(status='succeeded'):
            if not result.output_files and not result.output_str:
                continue

            # Try to find the best output file for this plugin
            plugin_dir = snap_dir / result.plugin
            best_output = None

            # Check output_files first (new field)
            if result.output_files:
                first_file = next(iter(result.output_files.keys()), None)
                if first_file and (plugin_dir / first_file).exists():
                    best_output = f'{result.plugin}/{first_file}'

            # Fallback to output_str if it looks like a path
            if not best_output and result.output_str:
                try:
                    output_str_is_path = (snap_dir / result.output_str).exists()
                except OSError:
                    # output_str may hold free-form text instead of a path;
                    # probing it can fail (e.g. "File name too long").
                    output_str_is_path = False
                if output_str_is_path:
                    best_output = result.output_str

            if not best_output and plugin_dir.exists():
                # Intelligently find the best file in the plugin's directory
                best_output = find_best_output_in_dir(plugin_dir, result.plugin)

            if best_output:
                canonical[f'{result.plugin}_path'] = best_output

        # Also scan top-level for legacy outputs (backwards compatibility)
        for file_path in snap_dir.glob('*'):
            if file_path.is_dir() or file_path.name in ('index.html', 'index.json'):
                continue

            ext = file_path.suffix.lstrip('.').lower()
            if ext not in IFRAME_EMBEDDABLE_EXTENSIONS:
                continue

            try:
                size = file_path.stat().st_size
            except OSError:
                continue

            if size >= MIN_DISPLAY_SIZE:
                # Add as generic output with stem as key (never overriding a plugin entry)
                canonical.setdefault(f'{file_path.stem}_path', file_path.name)

        if self.is_static:
            static_path = f'warc/{self.timestamp}'
            canonical.update({
                'title': self.basename,
                'wget_path': static_path,
            })

        return canonical
			
 
				+
			
 
    def latest_outputs(self, status: Optional[str] = None) -> Dict[str, Any]:
        """Map each plugin name to the embed path of its newest result that has output.

        Args:
            status: When given, only results with this status are considered.

        Returns:
            Dict keyed by every name from get_plugins(); the value is
            ``result.embed_path()`` of the matching result with the latest
            start_ts, or None when the plugin has no matching result.
        """
        from archivebox.hooks import get_plugins
        from django.db.models import Q

        latest: Dict[str, Any] = {}
        for plugin in get_plugins():
            candidates = self.archiveresult_set.filter(plugin=plugin)
            if status is not None:
                candidates = candidates.filter(status=status)

            # Keep only results that recorded output_files or a non-empty output_str
            has_output = Q(output_files__isnull=False) | ~Q(output_str='')
            newest = candidates.filter(has_output).order_by('-start_ts').first()

            # embed_path() is returned for backwards compatibility
            if newest:
                latest[plugin] = newest.embed_path()
            else:
                latest[plugin] = None
        return latest
			
 
				+
			
 
				+    # =========================================================================
			
 
				+    # Serialization Methods
			
 
				+    # =========================================================================
			
 
				+
			
 
    def to_dict(self, extended: bool = False) -> Dict[str, Any]:
        """Convert Snapshot to a dictionary (replacement for Link._asdict())

        Args:
            extended: When True, also include the result of canonical_outputs()
                under the 'canonical' key.

        Returns:
            Dict of stored fields (datetimes as ISO strings or None) plus the
            computed URL, path, size and count properties.
        """
        result = {
            'TYPE': 'core.models.Snapshot',
            'id': str(self.id),
            'url': self.url,
            'timestamp': self.timestamp,
            'title': self.title,
            'tags': self.tags_str(),
            'downloaded_at': self.downloaded_at.isoformat() if self.downloaded_at else None,
            'bookmarked_at': self.bookmarked_at.isoformat() if self.bookmarked_at else None,
            'created_at': self.created_at.isoformat() if self.created_at else None,
            # Computed properties
            'domain': self.domain,
            'scheme': self.scheme,
            'base_url': self.base_url,
            'path': self.path,
            'basename': self.basename,
            'extension': self.extension,
            'is_static': self.is_static,
            'is_archived': self.is_archived,
            'archive_path': self.archive_path,
            'output_dir': self.output_dir,
            'link_dir': self.output_dir,  # backwards compatibility alias
            'archive_size': self.archive_size,
            'bookmarked_date': self.bookmarked_date,
            'downloaded_datestr': self.downloaded_datestr,
            'num_outputs': self.num_outputs,
            'num_failures': self.num_failures,
        }
        if extended:
            result['canonical'] = self.canonical_outputs()
        return result
			
 
				+
			
 
    def to_json(self, indent: int = 4) -> str:
        """Serialize the extended dictionary form of this snapshot as JSON text."""
        # Inside the method body, `to_json` resolves to the module-level
        # helper, not to this method.
        payload = self.to_dict(extended=True)
        return to_json(payload, indent=indent)
			
 
				+
			
 
    def to_csv(self, cols: Optional[List[str]] = None, separator: str = ',', ljust: int = 0) -> str:
        """
        Render selected fields of this snapshot as one separator-joined row.

        Each cell is the JSON encoding of the field, left-justified to
        ``ljust`` characters; columns missing from ``to_dict()`` become the
        JSON encoding of ``''``. Defaults to ``timestamp, is_archived, url``.
        """
        row = self.to_dict()
        if not cols:
            cols = ['timestamp', 'is_archived', 'url']
        cells = [
            to_json(row.get(column, ''), indent=None).ljust(ljust)
            for column in cols
        ]
        return separator.join(cells)
			
 
				+
			
 
    def write_json_details(self, out_dir: Optional[str] = None) -> None:
        """
        Write the extended dict form of this snapshot to its JSON index file.

        The file is ``CONSTANTS.JSON_INDEX_FILENAME`` inside ``out_dir``,
        which defaults to the snapshot's own ``output_dir``.
        """
        target_dir = Path(out_dir or self.output_dir)
        index_file = target_dir / CONSTANTS.JSON_INDEX_FILENAME
        atomic_write(str(index_file), self.to_dict(extended=True))
			
 
				+
			
 
    def write_html_details(self, out_dir: Optional[str] = None) -> None:
        """
        Render ``snapshot.html`` for this snapshot and write it to disk.

        The page is written to ``CONSTANTS.HTML_INDEX_FILENAME`` inside
        ``out_dir``, which defaults to the snapshot's own ``output_dir``.

        Args:
            out_dir: directory to write into; falls back to ``self.output_dir``.
        """
        from django.template.loader import render_to_string
        from archivebox.config.common import SERVER_CONFIG
        from archivebox.config.configset import get_config
        from archivebox.misc.logging_util import printable_filesize
        # Imported here so this method does not depend on a module-level name.
        from archivebox.misc.util import ts_to_date_str

        out_dir = out_dir or self.output_dir
        config = get_config()
        SAVE_ARCHIVE_DOT_ORG = config.get('SAVE_ARCHIVE_DOT_ORG', True)
        TITLE_LOADING_MSG = 'Not yet archived...'

        # Compute the canonical outputs once. The previous code also computed
        # them through to_dict(extended=True) and then overwrote that
        # 'canonical' entry below.
        canonical = self.canonical_outputs()
        # NOTE(review): ArchiveResult.embed_path() looks up canonical_outputs()
        # keys that already end in '_path'; if every key has that suffix, the
        # names built here end in '_path_path' - confirm against snapshot.html.
        canonical_paths = {f'{k}_path': v for k, v in canonical.items()}

        context = {
            **self.to_dict(),
            **canonical_paths,
            'canonical': canonical_paths,
            'title': htmlencode(self.title or (self.base_url if self.is_archived else TITLE_LOADING_MSG)),
            'url_str': htmlencode(urldecode(self.base_url)),
            # The f-string is never empty, so the old `or (self.domain ...)`
            # fallback inside urlencode() could not run and was removed.
            'archive_url': urlencode(f'warc/{self.timestamp}') or 'about:blank',
            'extension': self.extension or 'html',
            'tags': self.tags_str() or 'untagged',
            'size': printable_filesize(self.archive_size) if self.archive_size else 'pending',
            'status': 'archived' if self.is_archived else 'not yet archived',
            'status_color': 'success' if self.is_archived else 'danger',
            'oldest_archive_date': ts_to_date_str(self.oldest_archive_date),
            'SAVE_ARCHIVE_DOT_ORG': SAVE_ARCHIVE_DOT_ORG,
            'PREVIEW_ORIGINALS': SERVER_CONFIG.PREVIEW_ORIGINALS,
        }
        rendered_html = render_to_string('snapshot.html', context)
        atomic_write(str(Path(out_dir) / CONSTANTS.HTML_INDEX_FILENAME), rendered_html)
			
 
				+
			
 
				+    # =========================================================================
			
 
				+    # Helper Methods
			
 
				+    # =========================================================================
			
 
				+
			
 
				+    @staticmethod
			
 
				+    def _ts_to_date_str(dt: Optional[datetime]) -> Optional[str]:
			
 
				+        return dt.strftime('%Y-%m-%d %H:%M:%S') if dt else None
			
 
				+
			
 
				+
			
 
				+# =============================================================================
			
 
				+# Snapshot State Machine
			
 
				+# =============================================================================
			
 
				+
			
 
				+class SnapshotMachine(BaseStateMachine, strict_states=True):
			
 
				+    """
			
 
				+    State machine for managing Snapshot lifecycle.
			
 
				+
			
 
				+    Hook Lifecycle:
			
 
				+    ┌─────────────────────────────────────────────────────────────┐
			
 
				+    │ QUEUED State                                                │
			
 
				+    │  • Waiting for snapshot to be ready                         │
			
 
				+    └─────────────────────────────────────────────────────────────┘
			
 
				+                            ↓ tick() when can_start()
			
 
				+    ┌─────────────────────────────────────────────────────────────┐
			
 
				+    │ STARTED State → enter_started()                             │
			
 
				+    │  1. snapshot.run()                                          │
			
 
				+    │     • discover_hooks('Snapshot') → finds all plugin hooks   │
			
 
				+    │     • create_pending_archiveresults() → creates ONE         │
			
 
				+    │       ArchiveResult per hook (NO execution yet)             │
			
 
				+    │  2. ArchiveResults process independently with their own     │
			
 
				+    │     state machines (see ArchiveResultMachine)               │
			
 
				+    │  3. Advance through steps 0-9 as foreground hooks complete  │
			
 
				+    └─────────────────────────────────────────────────────────────┘
			
 
				+                            ↓ tick() when is_finished()
			
 
				+    ┌─────────────────────────────────────────────────────────────┐
			
 
				+    │ SEALED State → enter_sealed()                               │
			
 
				+    │  • cleanup() → kills any background hooks still running     │
			
 
				+    │  • Set retry_at=None (no more processing)                   │
			
 
				+    └─────────────────────────────────────────────────────────────┘
			
 
				+
			
 
				+    https://github.com/ArchiveBox/ArchiveBox/wiki/ArchiveBox-Architecture-Diagrams
			
 
				+    """
			
 
				+
			
 
				+    model_attr_name = 'snapshot'
			
 
				+
			
 
				+    # States
			
 
				+    queued = State(value=Snapshot.StatusChoices.QUEUED, initial=True)
			
 
				+    started = State(value=Snapshot.StatusChoices.STARTED)
			
 
				+    sealed = State(value=Snapshot.StatusChoices.SEALED, final=True)
			
 
				+
			
 
				+    # Tick Event
			
 
				+    tick = (
			
 
				+        queued.to.itself(unless='can_start') |
			
 
				+        queued.to(started, cond='can_start') |
			
 
				+        started.to.itself(unless='is_finished') |
			
 
				+        started.to(sealed, cond='is_finished')
			
 
				+    )
			
 
				+
			
 
				+    def can_start(self) -> bool:
			
 
				+        can_start = bool(self.snapshot.url)
			
 
				+        # Suppressed: queue waiting logs
			
 
				+        return can_start
			
 
				+
			
 
				+    def is_finished(self) -> bool:
			
 
				+        """Check if snapshot processing is complete - delegates to model method."""
			
 
				+        return self.snapshot.is_finished_processing()
			
 
				+
			
 
				+    @queued.enter
			
 
				+    def enter_queued(self):
			
 
				+        # Suppressed: state transition logs
			
 
				+        self.snapshot.update_and_requeue(
			
 
				+            retry_at=timezone.now(),
			
 
				+            status=Snapshot.StatusChoices.QUEUED,
			
 
				+        )
			
 
				+
			
 
				+    @started.enter
			
 
				+    def enter_started(self):
			
 
				+        # Suppressed: state transition logs
			
 
				+        # lock the snapshot while we create the pending archiveresults
			
 
				+        self.snapshot.update_and_requeue(
			
 
				+            retry_at=timezone.now() + timedelta(seconds=30),  # if failed, wait 30s before retrying
			
 
				+        )
			
 
				+
			
 
				+        # Run the snapshot - creates pending archiveresults for all enabled plugins
			
 
				+        self.snapshot.run()
			
 
				+
			
 
				+        # unlock the snapshot after we're done + set status = started
			
 
				+        self.snapshot.update_and_requeue(
			
 
				+            retry_at=timezone.now() + timedelta(seconds=5),  # check again in 5s
			
 
				+            status=Snapshot.StatusChoices.STARTED,
			
 
				+        )
			
 
				+
			
 
				+    @sealed.enter
			
 
				+    def enter_sealed(self):
			
 
				+        # Clean up background hooks
			
 
				+        self.snapshot.cleanup()
			
 
				+
			
 
				+        # Suppressed: state transition logs
			
 
				+        self.snapshot.update_and_requeue(
			
 
				+            retry_at=None,
			
 
				+            status=Snapshot.StatusChoices.SEALED,
			
 
				+        )
			
 
				+
			
 
				+
			
 
				+class ArchiveResultManager(models.Manager):
			
 
				+    def indexable(self, sorted: bool = True):
			
 
				+        INDEXABLE_METHODS = [r[0] for r in EXTRACTOR_INDEXING_PRECEDENCE]
			
 
				+        qs = self.get_queryset().filter(plugin__in=INDEXABLE_METHODS, status='succeeded')
			
 
				+        if sorted:
			
 
				+            precedence = [When(plugin=method, then=Value(p)) for method, p in EXTRACTOR_INDEXING_PRECEDENCE]
			
 
				+            qs = qs.annotate(indexing_precedence=Case(*precedence, default=Value(1000), output_field=IntegerField())).order_by('indexing_precedence')
			
 
				+        return qs
			
 
				+
			
 
				+
			
 
				+class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, ModelWithStateMachine):
			
 
    class StatusChoices(models.TextChoices):
        # Allowed values for ArchiveResult.status.
        # Each member is declared as (stored value, human-readable label).
        QUEUED = 'queued', 'Queued'
        STARTED = 'started', 'Started'
        BACKOFF = 'backoff', 'Waiting to retry'
        SUCCEEDED = 'succeeded', 'Succeeded'
        FAILED = 'failed', 'Failed'
        SKIPPED = 'skipped', 'Skipped'
			
 
				+
			
 
    @classmethod
    def get_plugin_choices(cls):
        """Return ``(name, name)`` pairs for every discovered plugin (for forms/admin)."""
        names = (get_plugin_name(entry) for entry in get_plugins())
        return tuple((name, name) for name in names)
			
 
				+
			
 
    # --- Identity / bookkeeping ----------------------------------------------
    # Keep AutoField for backward compatibility with 0.7.x databases
    # UUID field is added separately by migration for new records
    id = models.AutoField(primary_key=True, editable=False)
    # Note: unique constraint is added by migration 0027 - don't set unique=True here
    # or SQLite table recreation in earlier migrations will fail
    uuid = models.UUIDField(default=uuid7, null=True, blank=True, db_index=True)
    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False, related_name='archiveresult_set', db_index=True)
    created_at = models.DateTimeField(default=timezone.now, db_index=True)
    modified_at = models.DateTimeField(auto_now=True)

    # --- What was run, and where -----------------------------------------------
    # Parent snapshot; deleting the snapshot deletes its results (CASCADE).
    snapshot: Snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE)  # type: ignore
    # No choices= constraint - plugin names come from plugin system and can be any string
    plugin = models.CharField(max_length=32, blank=False, null=False, db_index=True)
    # Empty hook_name means "legacy mode": run() then discovers every
    # on_Snapshot__* hook of the plugin instead of one specific file.
    hook_name = models.CharField(max_length=255, blank=True, default='', db_index=True, help_text='Full filename of the hook that executed (e.g., on_Snapshot__50_wget.py)')
    # Working directory of the hook; run() sets it to <snapshot output dir>/<plugin>
    # and update_from_output() reads the hook's logs and files from it.
    pwd = models.CharField(max_length=256, default=None, null=True, blank=True)
    # cmd / cmd_version are copied from the hook's ArchiveResult JSONL record
    # by update_from_output() (cmd_version is truncated to 128 characters there).
    cmd = models.JSONField(default=None, null=True, blank=True)
    cmd_version = models.CharField(max_length=128, default=None, null=True, blank=True)

    # New output fields (replacing old 'output' field)
    output_str = models.TextField(blank=True, default='', help_text='Human-readable output summary')
    output_json = models.JSONField(null=True, blank=True, default=None, help_text='Structured metadata (headers, redirects, etc.)')
    # update_from_output() currently stores an empty metadata dict per file.
    output_files = models.JSONField(default=dict, help_text='Dict of {relative_path: {metadata}}')
    output_size = models.BigIntegerField(default=0, help_text='Total bytes of all output files')
    output_mimetypes = models.CharField(max_length=512, blank=True, default='', help_text='CSV of mimetypes sorted by size')

    # Binary FK (optional - set when hook reports cmd)
    binary = models.ForeignKey(
        'machine.Binary',
        on_delete=models.SET_NULL,
        null=True, blank=True,
        related_name='archiveresults',
        help_text='Primary binary used by this hook'
    )

    # start_ts is set by run(); end_ts is set by update_from_output().
    start_ts = models.DateTimeField(default=None, null=True, blank=True)
    end_ts = models.DateTimeField(default=None, null=True, blank=True)

    # --- State machine fields ----------------------------------------------------
    status = ModelWithStateMachine.StatusField(choices=StatusChoices.choices, default=StatusChoices.QUEUED)
    # run() and update_from_output() set retry_at to None once a result is final.
    retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)
    notes = models.TextField(blank=True, null=False, default='')
    output_dir = models.CharField(max_length=256, default=None, null=True, blank=True)
    iface = models.ForeignKey(NetworkInterface, on_delete=models.SET_NULL, null=True, blank=True)

    # Settings naming the state machine class and the fields it operates on.
    state_machine_name = 'core.models.ArchiveResultMachine'
    retry_at_field_name = 'retry_at'
    state_field_name = 'status'
    active_state = StatusChoices.STARTED

    objects = ArchiveResultManager()
			
 
				+
			
 
    class Meta(TypedModelMeta):
        # Singular and plural display names for this model.
        verbose_name = 'Archive Result'
        verbose_name_plural = 'Archive Results Log'
			
 
				+
			
 
    def __str__(self):
        """Render as ``[id] <first 64 characters of the snapshot URL> -> plugin``."""
        short_url = self.snapshot.url[:64]
        return f'[{self.id}] {short_url} -> {self.plugin}'
			
 
				+
			
 
    def save(self, *args, **kwargs):
        """
        Persist the row and emit a worker log event the first time it is saved.

        ModelWithOutputDir.save() is bypassed on purpose so that no
        index.json is created inside plugin directories; Django's
        ``Model.save()`` is invoked directly instead.
        """
        was_unsaved = self._state.adding
        models.Model.save(self, *args, **kwargs)

        if not was_unsaved:
            return

        from archivebox.misc.logging_util import log_worker_event

        creation_details = {
            'id': str(self.id),
            'snapshot_id': str(self.snapshot_id),
            'snapshot_url': str(self.snapshot.url)[:64],
            'status': self.status,
        }
        log_worker_event(
            worker_type='DB',
            event='Created ArchiveResult',
            indent_level=3,
            plugin=self.plugin,
            metadata=creation_details,
        )
			
 
				+
			
 
    @cached_property
    def snapshot_dir(self):
        """Output directory of the parent snapshot as a ``Path`` (cached per instance)."""
        parent_dir = self.snapshot.output_dir
        return Path(parent_dir)
			
 
				+
			
 
    @cached_property
    def url(self):
        """URL of the parent snapshot (cached per instance)."""
        parent = self.snapshot
        return parent.url
			
 
				+
			
 
    @property
    def api_url(self) -> str:
        """Lazily reversed URL of the ``api-1:get_archiveresult`` route for this result."""
        route_args = [self.id]
        return reverse_lazy('api-1:get_archiveresult', args=route_args)
			
 
				+
			
 
    def get_absolute_url(self):
        """Site-relative URL: ``/<snapshot archive path>/<plugin>``."""
        archive_path = self.snapshot.archive_path
        return f'/{archive_path}/{self.plugin}'
			
 
				+
			
 
    @property
    def plugin_module(self) -> Any | None:
        """
        Always ``None``.

        Hook scripts are used now instead of Python plugin modules; the
        plugin name maps to hooks in archivebox/plugins/{plugin}/.
        """
        return None
			
 
				+
			
 
    def output_exists(self) -> bool:
        """Report whether ``<snapshot dir>/<plugin>`` exists on disk."""
        plugin_output = Path(self.snapshot_dir) / self.plugin
        return os.path.exists(plugin_output)
			
 
				+
			
 
    def embed_path(self) -> Optional[str]:
        """
        Pick the relative path that should be embedded for this result.

        Candidates are tried in this order:
          1. ``<plugin>/<first key of output_files>`` (insertion order)
          2. ``output_str``, when it contains a ``/`` or a ``.``
          3. the snapshot's canonical output stored under ``<plugin>_path``
          4. the bare plugin directory, ``<plugin>/``
        """
        recorded = self.output_files
        primary = next(iter(recorded.keys()), None) if recorded else None
        if primary:
            return f'{self.plugin}/{primary}'

        # output_str is only trusted when it looks like a file path.
        summary = self.output_str
        if summary and any(marker in summary for marker in ('/', '.')):
            return summary

        known_outputs = self.snapshot.canonical_outputs()
        canonical_key = f'{self.plugin}_path'
        try:
            return known_outputs[canonical_key]
        except KeyError:
            # Nothing more specific is known: point at the plugin directory.
            return f'{self.plugin}/'
			
 
				+
			
 
    def create_output_dir(self):
        """Ensure ``<snapshot dir>/<plugin>`` exists (parents included) and return it."""
        target = Path(self.snapshot_dir) / self.plugin
        target.mkdir(parents=True, exist_ok=True)
        return target
			
 
				+
			
 
    @property
    def output_dir_name(self) -> str:
        """Directory name used for this result's output: the plugin name."""
        name = self.plugin
        return name
			
 
				+
			
 
    @property
    def output_dir_parent(self) -> str:
        """The snapshot's OUTPUT_DIR expressed relative to ``CONSTANTS.DATA_DIR``."""
        snapshot_root = self.snapshot.OUTPUT_DIR
        relative_root = snapshot_root.relative_to(CONSTANTS.DATA_DIR)
        return str(relative_root)
			
 
				+
			
 
    def save_search_index(self):
        """Do nothing; this method currently has no implementation."""
        return None
			
 
				+
			
 
    def cascade_health_update(self, success: bool):
        """
        Record one success/failure on this result and on its ancestors.

        The parent Snapshot is always updated; the grandparent Crawl only
        when the snapshot has a ``crawl_id``.
        """
        self.increment_health_stats(success)

        parent = self.snapshot
        parent.increment_health_stats(success)

        if not parent.crawl_id:
            return
        parent.crawl.increment_health_stats(success)
			
 
				+
			
 
    def run(self):
        """
        Execute this ArchiveResult's hook(s) and update its status.

        Hook selection:
          - ``self.hook_name`` set: run only that file, taken from the first
            plugin directory (builtin, then user) that contains it.
          - ``self.hook_name`` empty (legacy mode): run every
            ``on_Snapshot__*.*`` file of ``self.plugin`` from both plugin
            directories, sorted within each directory.

        Outcome:
          - no hook found: the result is saved as FAILED with an explanatory
            ``output_str`` and ``retry_at`` cleared.
          - any background hook was launched: the result is saved as STARTED
            and left to be finalized later by ``Snapshot.cleanup()``.
          - otherwise: ``update_from_output()`` reads the hook output from
            disk, and an output directory left empty is removed.
        """
        from django.utils import timezone
        from archivebox.hooks import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR, run_hook, is_background_hook
        from archivebox.config.configset import get_config

        # Get merged config with proper context
        config = get_config(
            crawl=self.snapshot.crawl if self.snapshot.crawl else None,
            snapshot=self.snapshot,
        )

        # Determine which hook(s) to run
        hooks = []
        for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
            if not base_dir.exists():
                continue
            source_dir = base_dir / self.plugin
            if not source_dir.exists():
                continue

            if self.hook_name:
                # SPECIFIC HOOK MODE: the first directory containing the hook wins
                hook_path = source_dir / self.hook_name
                if hook_path.exists():
                    hooks.append(hook_path)
                    break
            else:
                # LEGACY MODE: collect all hooks for this plugin (backwards compatibility)
                hooks.extend(sorted(source_dir.glob('on_Snapshot__*.*')))

        if not hooks:
            self.status = self.StatusChoices.FAILED
            if self.hook_name:
                self.output_str = f'Hook not found: {self.plugin}/{self.hook_name}'
            else:
                self.output_str = f'No hooks found for plugin: {self.plugin}'
            self.retry_at = None
            self.save()
            return

        # Hooks write their output into <snapshot output dir>/<plugin>
        plugin_dir = Path(self.snapshot.output_dir) / self.plugin

        start_ts = timezone.now()
        is_bg_hook = False

        for hook in hooks:
            hook_is_bg = is_background_hook(hook.name)

            result = run_hook(
                hook,
                output_dir=plugin_dir,
                config=config,
                url=self.snapshot.url,
                snapshot_id=str(self.snapshot.id),
                crawl_id=str(self.snapshot.crawl.id) if self.snapshot.crawl else None,
                depth=self.snapshot.depth,
            )

            # A hook counts as background when its filename says so or when
            # run_hook() returned None. The flag is sticky: previously it was
            # reassigned on every iteration, so in legacy mode a background
            # hook followed by a foreground hook was finalized while the
            # background hook was still running.
            if hook_is_bg or result is None:
                is_bg_hook = True

        # Both outcomes record when and where the hook(s) ran.
        self.start_ts = start_ts
        self.pwd = str(plugin_dir)

        if is_bg_hook:
            # BACKGROUND HOOK - still running, return immediately.
            # Status stays STARTED, will be finalized by Snapshot.cleanup()
            self.status = self.StatusChoices.STARTED
            self.save()
            return

        # FOREGROUND HOOK - completed, update from filesystem
        self.update_from_output()

        # Clean up empty output directory if no files were created
        if plugin_dir.exists() and not self.output_files:
            try:
                if not any(plugin_dir.iterdir()):
                    plugin_dir.rmdir()
            except (OSError, RuntimeError):
                # Best effort only: a leftover empty directory is harmless.
                pass
			
 
				+
			
 
    def update_from_output(self):
        """
        Update this ArchiveResult from filesystem logs and output files.

        Used for:
        - Foreground hooks that completed (called from ArchiveResult.run())
        - Background hooks that completed (called from Snapshot.cleanup())

        Reads ``self.pwd`` as the hook's output directory. When it is unset
        or missing, the result is saved as FAILED and nothing else happens.

        Updates:
        - status, output_str, output_json from the first ArchiveResult JSONL
          record in stdout.log (no such record = FAILED)
        - output_files, output_size, output_mimetypes by walking the filesystem
        - end_ts, retry_at, cmd, cmd_version, binary FK
        - Processes side-effect records (Snapshot, Tag, etc.) via process_hook_records()

        Finally removes hook.pid and any empty stdout.log / stderr.log.
        """
        import json
        import mimetypes
        from collections import defaultdict
        from pathlib import Path
        from django.utils import timezone
        from archivebox.hooks import process_hook_records

        plugin_dir = Path(self.pwd) if self.pwd else None
        if not plugin_dir or not plugin_dir.exists():
            self.status = self.StatusChoices.FAILED
            self.output_str = 'Output directory not found'
            self.end_ts = timezone.now()
            self.retry_at = None
            self.save()
            return

        # Read and parse JSONL output from stdout.log
        stdout_file = plugin_dir / 'stdout.log'
        stdout = stdout_file.read_text() if stdout_file.exists() else ''

        records = []
        for line in stdout.splitlines():
            # Only lines that look like JSON objects are considered;
            # anything that fails to parse is ignored.
            if line.strip() and line.strip().startswith('{'):
                try:
                    records.append(json.loads(line))
                except json.JSONDecodeError:
                    continue

        # Find ArchiveResult record and update status/output from it
        ar_records = [r for r in records if r.get('type') == 'ArchiveResult']
        if ar_records:
            hook_data = ar_records[0]

            # Update status (a missing or unknown status counts as failed)
            status_map = {
                'succeeded': self.StatusChoices.SUCCEEDED,
                'failed': self.StatusChoices.FAILED,
                'skipped': self.StatusChoices.SKIPPED,
            }
            self.status = status_map.get(hook_data.get('status', 'failed'), self.StatusChoices.FAILED)

            # Update output fields ('output' is accepted as a fallback key)
            self.output_str = hook_data.get('output_str') or hook_data.get('output') or ''
            self.output_json = hook_data.get('output_json')

            # Update cmd fields
            if hook_data.get('cmd'):
                self.cmd = hook_data['cmd']
                self._set_binary_from_cmd(hook_data['cmd'])
            if hook_data.get('cmd_version'):
                # Truncate to the column's max_length
                self.cmd_version = hook_data['cmd_version'][:128]
        else:
            # No ArchiveResult record = failed
            self.status = self.StatusChoices.FAILED
            self.output_str = 'Hook did not output ArchiveResult record'

        # Walk filesystem and populate output_files, output_size, output_mimetypes
        exclude_names = {'stdout.log', 'stderr.log', 'hook.pid', 'listener.pid'}
        mime_sizes = defaultdict(int)
        total_size = 0
        output_files = {}

        for file_path in plugin_dir.rglob('*'):
            if not file_path.is_file():
                continue
            if file_path.name in exclude_names:
                continue

            try:
                stat = file_path.stat()
                mime_type, _ = mimetypes.guess_type(str(file_path))
                mime_type = mime_type or 'application/octet-stream'

                relative_path = str(file_path.relative_to(plugin_dir))
                output_files[relative_path] = {}
                mime_sizes[mime_type] += stat.st_size
                total_size += stat.st_size
            except (OSError, IOError):
                # Files that cannot be stat'ed are skipped
                continue

        self.output_files = output_files
        self.output_size = total_size
        # Mimetypes are listed largest-total-size first
        sorted_mimes = sorted(mime_sizes.items(), key=lambda x: x[1], reverse=True)
        self.output_mimetypes = ','.join(mime for mime, _ in sorted_mimes)

        # Update timestamps
        self.end_ts = timezone.now()
        self.retry_at = None

        self.save()

        crawl = self.snapshot.crawl

        # Process side-effect records (filter Snapshots for depth/URL)
        filtered_records = []
        for record in records:
            record_type = record.get('type')

            # Skip ArchiveResult records (already processed above)
            if record_type == 'ArchiveResult':
                continue

            # Filter Snapshot records for depth/URL constraints
            if record_type == 'Snapshot':
                # Discovered URLs are only queued for snapshots that belong to a crawl
                if not crawl:
                    continue

                url = record.get('url')
                if not url:
                    continue

                depth = record.get('depth', self.snapshot.depth + 1)
                if depth > crawl.max_depth:
                    continue

                if not self._url_passes_filters(url):
                    continue

            filtered_records.append(record)

        # Process filtered records with unified dispatcher
        overrides = {
            'snapshot': self.snapshot,
            'crawl': crawl,
            # A snapshot may have no crawl (see the guard above). Reading
            # crawl.created_by_id unconditionally raised AttributeError in
            # that case, so fall back to this result's own creator.
            'created_by_id': crawl.created_by_id if crawl else self.created_by_id,
        }
        process_hook_records(filtered_records, overrides=overrides)

        # Cleanup PID files and empty logs
        pid_file = plugin_dir / 'hook.pid'
        pid_file.unlink(missing_ok=True)
        stderr_file = plugin_dir / 'stderr.log'
        if stdout_file.exists() and stdout_file.stat().st_size == 0:
            stdout_file.unlink()
        if stderr_file.exists() and stderr_file.stat().st_size == 0:
            stderr_file.unlink()
			
 
				+
			
 
    def _set_binary_from_cmd(self, cmd: list) -> None:
        """
        Resolve the Binary that a hook command ran and store it on ``self.binary``.

        The executable is taken from ``cmd[0]`` (or ``cmd`` itself when it is not
        a list). Lookup is restricted to ``Machine.current()`` and proceeds in
        two steps: an exact ``abspath`` match, then a match on the file name
        component. ``self.binary`` is left untouched when neither step finds a row
        or when ``cmd`` is empty.
        """
        if not cmd:
            return

        from archivebox.machine.models import Machine

        executable = cmd[0] if isinstance(cmd, list) else cmd
        current_machine = Machine.current()

        # Step 1: the command was recorded with a full path to the executable.
        by_abspath = Binary.objects.filter(abspath=executable, machine=current_machine).first()
        if by_abspath:
            self.binary = by_abspath
            return

        # Step 2: only the bare program name is known (or the path differs).
        by_name = Binary.objects.filter(name=Path(executable).name, machine=current_machine).first()
        if by_name:
            self.binary = by_name
			
 
				+
			
 
    def _url_passes_filters(self, url: str) -> bool:
        """Return whether *url* is allowed by the URL_ALLOWLIST / URL_DENYLIST config.

        The effective config comes from ``get_config`` called with this result's
        snapshot, the snapshot's crawl and the crawl's creator (the original
        author documents the merge order as defaults -> file -> env -> machine
        -> user -> crawl -> snapshot; that order is not visible from this method).

        Filtering rules:
          * a URL matching any denylist pattern is rejected (denylist wins);
          * if an allowlist is configured, the URL must match at least one of
            its patterns;
          * with no filters configured the URL is accepted.

        Both settings may be a list of patterns or a comma-separated string.
        Patterns are regular expressions applied with ``re.search``; patterns
        that fail to compile are skipped.

        Args:
            url: The candidate URL to check.

        Returns:
            ``True`` if the URL may be archived, ``False`` otherwise.
        """
        import re
        from archivebox.config.configset import get_config

        snapshot = self.snapshot
        # Guard both hops: a snapshot without a crawl must not raise
        # AttributeError on `.created_by`; it just contributes no crawl/user
        # level to the merged config.
        crawl = snapshot.crawl if snapshot else None
        config = get_config(
            user=crawl.created_by if crawl else None,
            crawl=crawl,
            snapshot=snapshot,
        )

        def as_patterns(raw):
            """Normalize a list or comma-separated string into a list of patterns."""
            if isinstance(raw, list):
                return raw
            if isinstance(raw, str):
                return [part.strip() for part in raw.split(',') if part.strip()]
            return []

        def matches_any(patterns) -> bool:
            """True if any compilable pattern is found in the url."""
            for pattern in patterns:
                try:
                    if re.search(pattern, url):
                        return True
                except re.error:
                    continue  # Skip invalid regex patterns
            return False

        allowlist = as_patterns(config.get('URL_ALLOWLIST', ''))
        denylist = as_patterns(config.get('URL_DENYLIST', ''))

        # Denylist takes precedence over the allowlist.
        if matches_any(denylist):
            return False

        # A configured allowlist is exclusive: no match means rejected.
        if allowlist:
            return matches_any(allowlist)

        return True  # No filters configured, or only a non-matching denylist
			
 
				+
			
 
    @property
    def output_dir(self) -> Path:
        """Directory for this plugin's results: ``<snapshot.output_dir>/<plugin>``."""
        snapshot_dir = Path(self.snapshot.output_dir)
        return snapshot_dir / self.plugin
			
 
				+
			
 
    def is_background_hook(self) -> bool:
        """Report whether a ``hook.pid`` file exists in this result's working dir.

        Returns ``False`` when ``self.pwd`` is unset or empty; otherwise the
        result is the existence of ``<pwd>/hook.pid``.
        """
        if not self.pwd:
            return False
        return (Path(self.pwd) / 'hook.pid').exists()
			
 
				+
			
 
				+
			
 
				+# =============================================================================
			
 
				+# ArchiveResult State Machine
			
 
				+# =============================================================================
			
 
				+
			
 
				+class ArchiveResultMachine(BaseStateMachine, strict_states=True):
			
 
				+    """
			
 
				+    State machine for managing ArchiveResult (single plugin execution) lifecycle.
			
 
				+
			
 
				+    Hook Lifecycle:
			
 
				+    ┌─────────────────────────────────────────────────────────────┐
			
 
				+    │ QUEUED State                                                │
			
 
				+    │  • Waiting for its turn to run                              │
			
 
				+    └─────────────────────────────────────────────────────────────┘
			
 
				+                            ↓ tick() when can_start()
			
 
				+    ┌─────────────────────────────────────────────────────────────┐
			
 
				+    │ STARTED State → enter_started()                             │
			
 
				+    │  1. archiveresult.run()                                     │
			
 
				+    │     • Find specific hook by hook_name                       │
			
 
				+    │     • run_hook(script, output_dir, ...) → subprocess        │
			
 
				+    │                                                              │
			
 
				+    │  2a. FOREGROUND hook (returns HookResult):                  │
			
 
				+    │      • update_from_output() immediately                     │
			
 
				+    │        - Read stdout.log                                    │
			
 
				+    │        - Parse JSONL records                                │
			
 
				+    │        - Extract 'ArchiveResult' record → update status     │
			
 
				+    │        - Walk output_dir → populate output_files            │
			
 
				+    │        - Call process_hook_records() for side effects       │
			
 
				+    │                                                              │
			
 
				+    │  2b. BACKGROUND hook (returns None):                        │
			
 
				+    │      • Status stays STARTED                                 │
			
 
				+    │      • Continues running in background                      │
			
 
				+    │      • Killed by Snapshot.cleanup() when sealed             │
			
 
				+    └─────────────────────────────────────────────────────────────┘
			
 
				+                            ↓ tick() checks status
			
 
				+    ┌─────────────────────────────────────────────────────────────┐
			
 
				+    │ SUCCEEDED / FAILED / SKIPPED / BACKOFF                      │
			
 
				+    │  • Set by hook's JSONL output during update_from_output()   │
			
 
				+    │  • Health stats incremented (num_uses_succeeded/failed)     │
			
 
				+    │  • Parent Snapshot health stats also updated                │
			
 
				+    └─────────────────────────────────────────────────────────────┘
			
 
				+
			
 
				+    https://github.com/ArchiveBox/ArchiveBox/wiki/ArchiveBox-Architecture-Diagrams
			
 
				+    """
			
 
				+
			
 
				+    model_attr_name = 'archiveresult'
			
 
				+
			
 
				+    # States
			
 
				+    queued = State(value=ArchiveResult.StatusChoices.QUEUED, initial=True)
			
 
				+    started = State(value=ArchiveResult.StatusChoices.STARTED)
			
 
				+    backoff = State(value=ArchiveResult.StatusChoices.BACKOFF)
			
 
				+    succeeded = State(value=ArchiveResult.StatusChoices.SUCCEEDED, final=True)
			
 
				+    failed = State(value=ArchiveResult.StatusChoices.FAILED, final=True)
			
 
				+    skipped = State(value=ArchiveResult.StatusChoices.SKIPPED, final=True)
			
 
				+
			
 
				+    # Tick Event - transitions based on conditions
			
 
				+    tick = (
			
 
				+        queued.to.itself(unless='can_start') |
			
 
				+        queued.to(started, cond='can_start') |
			
 
				+        started.to.itself(unless='is_finished') |
			
 
				+        started.to(succeeded, cond='is_succeeded') |
			
 
				+        started.to(failed, cond='is_failed') |
			
 
				+        started.to(skipped, cond='is_skipped') |
			
 
				+        started.to(backoff, cond='is_backoff') |
			
 
				+        backoff.to.itself(unless='can_start') |
			
 
				+        backoff.to(started, cond='can_start') |
			
 
				+        backoff.to(succeeded, cond='is_succeeded') |
			
 
				+        backoff.to(failed, cond='is_failed') |
			
 
				+        backoff.to(skipped, cond='is_skipped')
			
 
				+    )
			
 
				+
			
 
				+    def can_start(self) -> bool:
			
 
				+        can_start = bool(self.archiveresult.snapshot.url)
			
 
				+        # Suppressed: queue waiting logs
			
 
				+        return can_start
			
 
				+
			
 
				+    def is_succeeded(self) -> bool:
			
 
				+        """Check if extractor plugin succeeded (status was set by run())."""
			
 
				+        return self.archiveresult.status == ArchiveResult.StatusChoices.SUCCEEDED
			
 
				+
			
 
				+    def is_failed(self) -> bool:
			
 
				+        """Check if extractor plugin failed (status was set by run())."""
			
 
				+        return self.archiveresult.status == ArchiveResult.StatusChoices.FAILED
			
 
				+
			
 
				+    def is_skipped(self) -> bool:
			
 
				+        """Check if extractor plugin was skipped (status was set by run())."""
			
 
				+        return self.archiveresult.status == ArchiveResult.StatusChoices.SKIPPED
			
 
				+
			
 
				+    def is_backoff(self) -> bool:
			
 
				+        """Check if we should backoff and retry later."""
			
 
				+        # Backoff if status is still started (plugin didn't complete) and output_str is empty
			
 
				+        return (
			
 
				+            self.archiveresult.status == ArchiveResult.StatusChoices.STARTED and
			
 
				+            not self.archiveresult.output_str
			
 
				+        )
			
 
				+
			
 
				+    def is_finished(self) -> bool:
			
 
				+        """Check if extraction has completed (success, failure, or skipped)."""
			
 
				+        return self.archiveresult.status in (
			
 
				+            ArchiveResult.StatusChoices.SUCCEEDED,
			
 
				+            ArchiveResult.StatusChoices.FAILED,
			
 
				+            ArchiveResult.StatusChoices.SKIPPED,
			
 
				+        )
			
 
				+
			
 
				+    @queued.enter
			
 
				+    def enter_queued(self):
			
 
				+        # Suppressed: state transition logs
			
 
				+        self.archiveresult.update_and_requeue(
			
 
				+            retry_at=timezone.now(),
			
 
				+            status=ArchiveResult.StatusChoices.QUEUED,
			
 
				+            start_ts=None,
			
 
				+        )  # bump the snapshot's retry_at so they pickup any new changes
			
 
				+
			
 
				+    @started.enter
			
 
				+    def enter_started(self):
			
 
				+        from archivebox.machine.models import NetworkInterface
			
 
				+
			
 
				+        # Suppressed: state transition logs
			
 
				+        # Lock the object and mark start time
			
 
				+        self.archiveresult.update_and_requeue(
			
 
				+            retry_at=timezone.now() + timedelta(seconds=120),  # 2 min timeout for plugin
			
 
				+            status=ArchiveResult.StatusChoices.STARTED,
			
 
				+            start_ts=timezone.now(),
			
 
				+            iface=NetworkInterface.current(),
			
 
				+        )
			
 
				+
			
 
				+        # Run the plugin - this updates status, output, timestamps, etc.
			
 
				+        self.archiveresult.run()
			
 
				+
			
 
				+        # Save the updated result
			
 
				+        self.archiveresult.save()
			
 
				+
			
 
				+        # Suppressed: plugin result logs (already logged by worker)
			
 
				+
			
 
				+    @backoff.enter
			
 
				+    def enter_backoff(self):
			
 
				+        # Suppressed: state transition logs
			
 
				+        self.archiveresult.update_and_requeue(
			
 
				+            retry_at=timezone.now() + timedelta(seconds=60),
			
 
				+            status=ArchiveResult.StatusChoices.BACKOFF,
			
 
				+            end_ts=None,
			
 
				+            # retries=F('retries') + 1,               # F() equivalent to getattr(self.archiveresult, 'retries', 0) + 1,
			
 
				+        )
			
 
				+
			
 
				+    @succeeded.enter
			
 
				+    def enter_succeeded(self):
			
 
				+        # Suppressed: state transition logs
			
 
				+        self.archiveresult.update_and_requeue(
			
 
				+            retry_at=None,
			
 
				+            status=ArchiveResult.StatusChoices.SUCCEEDED,
			
 
				+            end_ts=timezone.now(),
			
 
				+            # **self.archiveresult.get_output_dict(),     # {output, output_json, stderr, stdout, returncode, errors, cmd_version, pwd, cmd, machine}
			
 
				+        )
			
 
				+        self.archiveresult.save()
			
 
				+
			
 
				+        # Update health stats for ArchiveResult, Snapshot, and Crawl cascade
			
 
				+        self.archiveresult.cascade_health_update(success=True)
			
 
				+
			
 
				+    @failed.enter
			
 
				+    def enter_failed(self):
			
 
				+        # Suppressed: state transition logs
			
 
				+        self.archiveresult.update_and_requeue(
			
 
				+            retry_at=None,
			
 
				+            status=ArchiveResult.StatusChoices.FAILED,
			
 
				+            end_ts=timezone.now(),
			
 
				+        )
			
 
				+
			
 
				+        # Update health stats for ArchiveResult, Snapshot, and Crawl cascade
			
 
				+        self.archiveresult.cascade_health_update(success=False)
			
 
				+
			
 
				+    @skipped.enter
			
 
				+    def enter_skipped(self):
			
 
				+        # Suppressed: state transition logs
			
 
				+        self.archiveresult.update_and_requeue(
			
 
				+            retry_at=None,
			
 
				+            status=ArchiveResult.StatusChoices.SKIPPED,
			
 
				+            end_ts=timezone.now(),
			
 
				+        )
			
 
				+
			
 
				+    def after_transition(self, event: str, source: State, target: State):
			
 
				+        # print(f"after '{event}' from '{source.id}' to '{target.id}'")
			
 
				+        self.archiveresult.snapshot.update_and_requeue()  # bump snapshot retry time so it picks up all the new changes
			
 
				+
			
 
				+
			
 
				+# =============================================================================
			
 
				+# State Machine Registration
			
 
				+# =============================================================================
			
 
				+
			
 
# Manually register both state machines with the python-statemachine registry.
# (They are normally auto-discovered from a statemachines.py module, but the
# machines are defined in this module instead, so they are registered by hand.)
registry.register(SnapshotMachine)
registry.register(ArchiveResultMachine)
			
--- a/archivebox/core/settings.py
+++ b/archivebox/core/settings.py
@@ -30,9 +30,9 @@ LOADED_PLUGINS = archivebox.LOADED_PLUGINS
 
				 ### Django Core Settings
			
 
				 ################################################################################
			
 
				 
			
 
				-WSGI_APPLICATION = "core.wsgi.application"
			
 
				-ASGI_APPLICATION = "core.asgi.application"
			
 
				-ROOT_URLCONF = "core.urls"
			
 
				+WSGI_APPLICATION = "archivebox.core.wsgi.application"
			
 
				+ASGI_APPLICATION = "archivebox.core.asgi.application"
			
 
				+ROOT_URLCONF = "archivebox.core.urls"
			
 
				 
			
 
				 LOGIN_URL = "/accounts/login/"
			
 
				 LOGOUT_REDIRECT_URL = os.environ.get("LOGOUT_REDIRECT_URL", "/")
			
@@ -55,14 +55,15 @@ INSTALLED_APPS = [
 
				     # 3rd-party apps from PyPI
			
 
				     "signal_webhooks",  # handles REST API outbound webhooks                              https://github.com/MrThearMan/django-signal-webhooks
			
 
				     "django_object_actions",  # provides easy Django Admin action buttons on change views       https://github.com/crccheck/django-object-actions
			
 
				-    # Our ArchiveBox-provided apps
			
 
				-    "config",  # ArchiveBox config settings (loaded as a plugin, don't need to add it here)
			
 
				-    "machine",  # handles collecting and storing information about the host machine, network interfaces, binaries, etc.
			
 
				-    "workers",  # handles starting and managing background workers and processes (orchestrators and actors)
			
 
				-    "crawls",  # handles Crawl and CrawlSchedule models and management
			
 
				-    "personas",  # handles Persona and session management
			
 
				-    "core",  # core django model with Snapshot, ArchiveResult, etc.
			
 
				-    "api",  # Django-Ninja-based Rest API interfaces, config, APIToken model, etc.
			
 
				+    # Our ArchiveBox-provided apps (use fully qualified names)
			
 
				+    # NOTE: Order matters! Apps with migrations that depend on other apps must come AFTER their dependencies
			
 
				+    # "archivebox.config",  # ArchiveBox config settings (no models, not a real Django app)
			
 
				+    "archivebox.machine",  # handles collecting and storing information about the host machine, network interfaces, binaries, etc.
			
 
				+    "archivebox.workers",  # handles starting and managing background workers and processes (orchestrators and actors)
			
 
				+    "archivebox.personas",  # handles Persona and session management
			
 
				+    "archivebox.core",  # core django model with Snapshot, ArchiveResult, etc. (crawls depends on this)
			
 
				+    "archivebox.crawls",  # handles Crawl and CrawlSchedule models and management (depends on core)
			
 
				+    "archivebox.api",  # Django-Ninja-based Rest API interfaces, config, APIToken model, etc.
			
 
				     # ArchiveBox plugins (hook-based plugins no longer add Django apps)
			
 
				     # Use hooks.py discover_hooks() for plugin functionality
			
 
				     # 3rd-party apps from PyPI that need to be loaded last
			
@@ -72,15 +73,15 @@ INSTALLED_APPS = [
 
				 
			
 
				 
			
 
				 MIDDLEWARE = [
			
 
				-    "core.middleware.TimezoneMiddleware",
			
 
				+    "archivebox.core.middleware.TimezoneMiddleware",
			
 
				     "django.middleware.security.SecurityMiddleware",
			
 
				     "django.contrib.sessions.middleware.SessionMiddleware",
			
 
				     "django.middleware.common.CommonMiddleware",
			
 
				     "django.middleware.csrf.CsrfViewMiddleware",
			
 
				     "django.contrib.auth.middleware.AuthenticationMiddleware",
			
 
				-    "core.middleware.ReverseProxyAuthMiddleware",
			
 
				+    "archivebox.core.middleware.ReverseProxyAuthMiddleware",
			
 
				     "django.contrib.messages.middleware.MessageMiddleware",
			
 
				-    "core.middleware.CacheControlMiddleware",
			
 
				+    "archivebox.core.middleware.CacheControlMiddleware",
			
 
				     # Additional middlewares from plugins (if any)
			
 
				 ]
			
 
				 
			
@@ -370,15 +371,15 @@ LOGGING = SETTINGS_LOGGING
 
				 ################################################################################
			
 
				 
			
 
				 # Add default webhook configuration to the User model
			
 
				-SIGNAL_WEBHOOKS_CUSTOM_MODEL = "api.models.OutboundWebhook"
			
 
				+SIGNAL_WEBHOOKS_CUSTOM_MODEL = "archivebox.api.models.OutboundWebhook"
			
 
				 SIGNAL_WEBHOOKS = {
			
 
				     "HOOKS": {
			
 
				         # ... is a special sigil value that means "use the default autogenerated hooks"
			
 
				         "django.contrib.auth.models.User": ...,
			
 
				-        "core.models.Snapshot": ...,
			
 
				-        "core.models.ArchiveResult": ...,
			
 
				-        "core.models.Tag": ...,
			
 
				-        "api.models.APIToken": ...,
			
 
				+        "archivebox.core.models.Snapshot": ...,
			
 
				+        "archivebox.core.models.ArchiveResult": ...,
			
 
				+        "archivebox.core.models.Tag": ...,
			
 
				+        "archivebox.api.models.APIToken": ...,
			
 
				     },
			
 
				 }
			
 
				 
			
@@ -391,11 +392,11 @@ ADMIN_DATA_VIEWS = {
 
				     "URLS": [
			
 
				         {
			
 
				             "route": "config/",
			
 
				-            "view": "core.views.live_config_list_view",
			
 
				+            "view": "archivebox.core.views.live_config_list_view",
			
 
				             "name": "Configuration",
			
 
				             "items": {
			
 
				                 "route": "<str:key>/",
			
 
				-                "view": "core.views.live_config_value_view",
			
 
				+                "view": "archivebox.core.views.live_config_value_view",
			
 
				                 "name": "config_val",
			
 
				             },
			
 
				         },
			
--- a/archivebox/core/statemachines.py
+++ b/archivebox/core/statemachines.py
@@ -1,319 +0,0 @@
 
				-__package__ = 'archivebox.core'
			
 
				-
			
 
				-import time
			
 
				-import os
			
 
				-from datetime import timedelta
			
 
				-from typing import ClassVar
			
 
				-
			
 
				-from django.db.models import F
			
 
				-from django.utils import timezone
			
 
				-
			
 
				-from rich import print
			
 
				-
			
 
				-from statemachine import State, StateMachine
			
 
				-
			
 
				-# from workers.actor import ActorType
			
 
				-
			
 
				-from core.models import Snapshot, ArchiveResult
			
 
				-from crawls.models import Crawl
			
 
				-
			
 
				-
			
 
				-class SnapshotMachine(StateMachine, strict_states=True):
			
 
				-    """
			
 
				-    State machine for managing Snapshot lifecycle.
			
 
				-    
			
 
				-    https://github.com/ArchiveBox/ArchiveBox/wiki/ArchiveBox-Architecture-Diagrams
			
 
				-    """
			
 
				-    
			
 
				-    model: Snapshot
			
 
				-    
			
 
				-    # States
			
 
				-    queued = State(value=Snapshot.StatusChoices.QUEUED, initial=True)
			
 
				-    started = State(value=Snapshot.StatusChoices.STARTED)
			
 
				-    sealed = State(value=Snapshot.StatusChoices.SEALED, final=True)
			
 
				-    
			
 
				-    # Tick Event
			
 
				-    tick = (
			
 
				-        queued.to.itself(unless='can_start') |
			
 
				-        queued.to(started, cond='can_start') |
			
 
				-        started.to.itself(unless='is_finished') |
			
 
				-        started.to(sealed, cond='is_finished')
			
 
				-    )
			
 
				-    
			
 
				-    def __init__(self, snapshot, *args, **kwargs):
			
 
				-        self.snapshot = snapshot
			
 
				-        super().__init__(snapshot, *args, **kwargs)
			
 
				-        
			
 
				-    def __repr__(self) -> str:
			
 
				-        return f'Snapshot[{self.snapshot.id}]'
			
 
				-
			
 
				-    def __str__(self) -> str:
			
 
				-        return self.__repr__()
			
 
				-
			
 
				-    def can_start(self) -> bool:
			
 
				-        can_start = bool(self.snapshot.url)
			
 
				-        # Suppressed: queue waiting logs
			
 
				-        return can_start
			
 
				-        
			
 
				-    def is_finished(self) -> bool:
			
 
				-        # if no archiveresults exist yet, it's not finished
			
 
				-        if not self.snapshot.archiveresult_set.exists():
			
 
				-            return False
			
 
				-
			
 
				-        # Try to advance step if ready (handles step-based hook execution)
			
 
				-        # This will increment current_step when all foreground hooks in current step are done
			
 
				-        while self.snapshot.advance_step_if_ready():
			
 
				-            pass  # Keep advancing until we can't anymore
			
 
				-
			
 
				-        # if archiveresults exist but are still pending, it's not finished
			
 
				-        if self.snapshot.pending_archiveresults().exists():
			
 
				-            return False
			
 
				-
			
 
				-        # Don't wait for background hooks - they'll be cleaned up on entering sealed state
			
 
				-        # Background hooks in STARTED state are excluded by pending_archiveresults()
			
 
				-        # (STARTED is in FINAL_OR_ACTIVE_STATES) so once all results are FINAL or ACTIVE,
			
 
				-        # we can transition to sealed and cleanup() will kill the background hooks
			
 
				-
			
 
				-        # otherwise archiveresults exist and are all finished, so it's finished
			
 
				-        return True
			
 
				-        
			
 
				-    # def on_transition(self, event, state):
			
 
				-    #     print(f'{self}.on_transition() [blue]{str(state).upper()}[/blue] ➡️ ...')
			
 
				-        
			
 
				-    @queued.enter
			
 
				-    def enter_queued(self):
			
 
				-        # Suppressed: state transition logs
			
 
				-        self.snapshot.update_for_workers(
			
 
				-            retry_at=timezone.now(),
			
 
				-            status=Snapshot.StatusChoices.QUEUED,
			
 
				-        )
			
 
				-
			
 
				-    @started.enter
			
 
				-    def enter_started(self):
			
 
				-        # Suppressed: state transition logs
			
 
				-        # lock the snapshot while we create the pending archiveresults
			
 
				-        self.snapshot.update_for_workers(
			
 
				-            retry_at=timezone.now() + timedelta(seconds=30),  # if failed, wait 30s before retrying
			
 
				-        )
			
 
				-
			
 
				-        # Run the snapshot - creates pending archiveresults for all enabled plugins
			
 
				-        self.snapshot.run()
			
 
				-
			
 
				-        # unlock the snapshot after we're done + set status = started
			
 
				-        self.snapshot.update_for_workers(
			
 
				-            retry_at=timezone.now() + timedelta(seconds=5),  # check again in 5s
			
 
				-            status=Snapshot.StatusChoices.STARTED,
			
 
				-        )
			
 
				-
			
 
				-    @sealed.enter
			
 
				-    def enter_sealed(self):
			
 
				-        # Clean up background hooks
			
 
				-        self.snapshot.cleanup()
			
 
				-
			
 
				-        # Suppressed: state transition logs
			
 
				-        self.snapshot.update_for_workers(
			
 
				-            retry_at=None,
			
 
				-            status=Snapshot.StatusChoices.SEALED,
			
 
				-        )
			
 
				-
			
 
				-
			
 
				-# class SnapshotWorker(ActorType[Snapshot]):
			
 
				-#     """
			
 
				-#     The primary actor for progressing Snapshot objects
			
 
				-#     through their lifecycle using the SnapshotMachine.
			
 
				-#     """
			
 
				-#     Model = Snapshot
			
 
				-#     StateMachineClass = SnapshotMachine
			
 
				-    
			
 
				-#     ACTIVE_STATE: ClassVar[State] = SnapshotMachine.started                    # 'started'
			
 
				-    
			
 
				-#     MAX_CONCURRENT_ACTORS: ClassVar[int] = 3
			
 
				-#     MAX_TICK_TIME: ClassVar[int] = 10
			
 
				-#     CLAIM_FROM_TOP_N: ClassVar[int] = MAX_CONCURRENT_ACTORS * 10
			
 
				-
			
 
				-
			
 
				-
			
 
				-
			
 
				-
			
 
				-class ArchiveResultMachine(StateMachine, strict_states=True):
			
 
				-    """
			
 
				-    State machine for managing ArchiveResult lifecycle.
			
 
				-    
			
 
				-    https://github.com/ArchiveBox/ArchiveBox/wiki/ArchiveBox-Architecture-Diagrams
			
 
				-    """
			
 
				-    
			
 
				-    model: ArchiveResult
			
 
				-    
			
 
				-    # States
			
 
				-    queued = State(value=ArchiveResult.StatusChoices.QUEUED, initial=True)
			
 
				-    started = State(value=ArchiveResult.StatusChoices.STARTED)
			
 
				-    backoff = State(value=ArchiveResult.StatusChoices.BACKOFF)
			
 
				-    succeeded = State(value=ArchiveResult.StatusChoices.SUCCEEDED, final=True)
			
 
				-    failed = State(value=ArchiveResult.StatusChoices.FAILED, final=True)
			
 
				-    skipped = State(value=ArchiveResult.StatusChoices.SKIPPED, final=True)
			
 
				-    
			
 
				-    # Tick Event - transitions based on conditions
			
 
				-    tick = (
			
 
				-        queued.to.itself(unless='can_start') |
			
 
				-        queued.to(started, cond='can_start') |
			
 
				-        started.to.itself(unless='is_finished') |
			
 
				-        started.to(succeeded, cond='is_succeeded') |
			
 
				-        started.to(failed, cond='is_failed') |
			
 
				-        started.to(skipped, cond='is_skipped') |
			
 
				-        started.to(backoff, cond='is_backoff') |
			
 
				-        backoff.to.itself(unless='can_start') |
			
 
				-        backoff.to(started, cond='can_start') |
			
 
				-        backoff.to(succeeded, cond='is_succeeded') |
			
 
				-        backoff.to(failed, cond='is_failed') |
			
 
				-        backoff.to(skipped, cond='is_skipped')
			
 
				-    )
			
 
				-
			
 
				-    def __init__(self, archiveresult, *args, **kwargs):
			
 
				-        self.archiveresult = archiveresult
			
 
				-        super().__init__(archiveresult, *args, **kwargs)
			
 
				-    
			
 
				-    def __repr__(self) -> str:
			
 
				-        return f'ArchiveResult[{self.archiveresult.id}]'
			
 
				-
			
 
				-    def __str__(self) -> str:
			
 
				-        return self.__repr__()
			
 
				-
			
 
				-    def can_start(self) -> bool:
			
 
				-        can_start = bool(self.archiveresult.snapshot.url)
			
 
				-        # Suppressed: queue waiting logs
			
 
				-        return can_start
			
 
				-    
			
 
				-    def is_succeeded(self) -> bool:
			
 
				-        """Check if extractor plugin succeeded (status was set by run())."""
			
 
				-        return self.archiveresult.status == ArchiveResult.StatusChoices.SUCCEEDED
			
 
				-
			
 
				-    def is_failed(self) -> bool:
			
 
				-        """Check if extractor plugin failed (status was set by run())."""
			
 
				-        return self.archiveresult.status == ArchiveResult.StatusChoices.FAILED
			
 
				-
			
 
				-    def is_skipped(self) -> bool:
			
 
				-        """Check if extractor plugin was skipped (status was set by run())."""
			
 
				-        return self.archiveresult.status == ArchiveResult.StatusChoices.SKIPPED
			
 
				-    
			
 
				-    def is_backoff(self) -> bool:
			
 
				-        """Check if we should backoff and retry later."""
			
 
				-        # Backoff if status is still started (plugin didn't complete) and output_str is empty
			
 
				-        return (
			
 
				-            self.archiveresult.status == ArchiveResult.StatusChoices.STARTED and
			
 
				-            not self.archiveresult.output_str
			
 
				-        )
			
 
				-    
			
 
				-    def is_finished(self) -> bool:
			
 
				-        """Check if extraction has completed (success, failure, or skipped)."""
			
 
				-        return self.archiveresult.status in (
			
 
				-            ArchiveResult.StatusChoices.SUCCEEDED,
			
 
				-            ArchiveResult.StatusChoices.FAILED,
			
 
				-            ArchiveResult.StatusChoices.SKIPPED,
			
 
				-        )
			
 
				-
			
 
				-    @queued.enter
			
 
				-    def enter_queued(self):
			
 
				-        # Suppressed: state transition logs
			
 
				-        self.archiveresult.update_for_workers(
			
 
				-            retry_at=timezone.now(),
			
 
				-            status=ArchiveResult.StatusChoices.QUEUED,
			
 
				-            start_ts=None,
			
 
				-        )  # bump the snapshot's retry_at so they pickup any new changes
			
 
				-
			
 
				-    @started.enter
			
 
				-    def enter_started(self):
			
 
				-        from machine.models import NetworkInterface
			
 
				-
			
 
				-        # Suppressed: state transition logs
			
 
				-        # Lock the object and mark start time
			
 
				-        self.archiveresult.update_for_workers(
			
 
				-            retry_at=timezone.now() + timedelta(seconds=120),  # 2 min timeout for plugin
			
 
				-            status=ArchiveResult.StatusChoices.STARTED,
			
 
				-            start_ts=timezone.now(),
			
 
				-            iface=NetworkInterface.current(),
			
 
				-        )
			
 
				-
			
 
				-        # Run the plugin - this updates status, output, timestamps, etc.
			
 
				-        self.archiveresult.run()
			
 
				-
			
 
				-        # Save the updated result
			
 
				-        self.archiveresult.save()
			
 
				-
			
 
				-        # Suppressed: plugin result logs (already logged by worker)
			
 
				-
			
 
				-    @backoff.enter
			
 
				-    def enter_backoff(self):
			
 
				-        # Suppressed: state transition logs
			
 
				-        self.archiveresult.update_for_workers(
			
 
				-            retry_at=timezone.now() + timedelta(seconds=60),
			
 
				-            status=ArchiveResult.StatusChoices.BACKOFF,
			
 
				-            end_ts=None,
			
 
				-            # retries=F('retries') + 1,               # F() equivalent to getattr(self.archiveresult, 'retries', 0) + 1,
			
 
				-        )
			
 
				-        self.archiveresult.save()
			
 
				-
			
 
				-    @succeeded.enter
			
 
				-    def enter_succeeded(self):
			
 
				-        # Suppressed: state transition logs
			
 
				-        self.archiveresult.update_for_workers(
			
 
				-            retry_at=None,
			
 
				-            status=ArchiveResult.StatusChoices.SUCCEEDED,
			
 
				-            end_ts=timezone.now(),
			
 
				-            # **self.archiveresult.get_output_dict(),     # {output, output_json, stderr, stdout, returncode, errors, cmd_version, pwd, cmd, machine}
			
 
				-        )
			
 
				-        self.archiveresult.save()
			
 
				-
			
 
				-        # Increment health stats on ArchiveResult, Snapshot, and optionally Crawl
			
 
				-        ArchiveResult.objects.filter(pk=self.archiveresult.pk).update(num_uses_succeeded=F('num_uses_succeeded') + 1)
			
 
				-        Snapshot.objects.filter(pk=self.archiveresult.snapshot_id).update(num_uses_succeeded=F('num_uses_succeeded') + 1)
			
 
				-
			
 
				-        # Also update Crawl health stats if snapshot has a crawl
			
 
				-        snapshot = self.archiveresult.snapshot
			
 
				-        if snapshot.crawl_id:
			
 
				-            Crawl.objects.filter(pk=snapshot.crawl_id).update(num_uses_succeeded=F('num_uses_succeeded') + 1)
			
 
				-
			
 
				-    @failed.enter
			
 
				-    def enter_failed(self):
			
 
				-        # Suppressed: state transition logs
			
 
				-        self.archiveresult.update_for_workers(
			
 
				-            retry_at=None,
			
 
				-            status=ArchiveResult.StatusChoices.FAILED,
			
 
				-            end_ts=timezone.now(),
			
 
				-        )
			
 
				-
			
 
				-        # Increment health stats on ArchiveResult, Snapshot, and optionally Crawl
			
 
				-        ArchiveResult.objects.filter(pk=self.archiveresult.pk).update(num_uses_failed=F('num_uses_failed') + 1)
			
 
				-        Snapshot.objects.filter(pk=self.archiveresult.snapshot_id).update(num_uses_failed=F('num_uses_failed') + 1)
			
 
				-
			
 
				-        # Also update Crawl health stats if snapshot has a crawl
			
 
				-        snapshot = self.archiveresult.snapshot
			
 
				-        if snapshot.crawl_id:
			
 
				-            Crawl.objects.filter(pk=snapshot.crawl_id).update(num_uses_failed=F('num_uses_failed') + 1)
			
 
				-
			
 
				-    @skipped.enter
			
 
				-    def enter_skipped(self):
			
 
				-        # Suppressed: state transition logs
			
 
				-        self.archiveresult.update_for_workers(
			
 
				-            retry_at=None,
			
 
				-            status=ArchiveResult.StatusChoices.SKIPPED,
			
 
				-            end_ts=timezone.now(),
			
 
				-        )
			
 
				-        
			
 
				-    def after_transition(self, event: str, source: State, target: State):
			
 
				-        # print(f"after '{event}' from '{source.id}' to '{target.id}'")
			
 
				-        self.archiveresult.snapshot.update_for_workers()  # bump snapshot retry time so it picks up all the new changes
			
 
				-
			
 
				-
			
 
				-# class ArchiveResultWorker(ActorType[ArchiveResult]):
			
 
				-#     """
			
 
				-#     The primary actor for progressing ArchiveResult objects
			
 
				-#     through their lifecycle using the ArchiveResultMachine.
			
 
				-#     """
			
 
				-#     Model = ArchiveResult
			
 
				-#     StateMachineClass = ArchiveResultMachine
			
 
				-    
			
 
				-#     ACTIVE_STATE: ClassVar[State] = ArchiveResultMachine.started                # 'started'
			
 
				-    
			
 
				-#     MAX_CONCURRENT_ACTORS: ClassVar[int] = 6
			
 
				-#     MAX_TICK_TIME: ClassVar[int] = 60
			
 
				-#     CLAIM_FROM_TOP_N: ClassVar[int] = MAX_CONCURRENT_ACTORS * 10
			
--- a/archivebox/core/templatetags/config_tags.py
+++ b/archivebox/core/templatetags/config_tags.py
@@ -0,0 +1,20 @@
 
				+"""Template tags for accessing config values in templates."""
			
 
				+
			
 
				+from django import template
			
 
				+
			
 
				+from archivebox.config.configset import get_config as _get_config
			
 
				+
			
 
				+register = template.Library()
			
 
				+
			
 
				+
			
 
				+@register.simple_tag
			
 
				+def get_config(key: str) -> any:
			
 
				+    """
			
 
				+    Get a config value by key.
			
 
				+
			
 
				+    Usage: {% get_config "ARCHIVEDOTORG_ENABLED" as enabled %}
			
 
				+    """
			
 
				+    try:
			
 
				+        return _get_config(key)
			
 
				+    except (KeyError, AttributeError):
			
 
				+        return None
			
--- a/archivebox/core/tests.py
+++ b/archivebox/core/tests.py
@@ -1,3 +1,319 @@
 
				-#from django.test import TestCase
			
 
				+"""Tests for the core views, especially AddView."""
			
 
				 
			
 
				-# Create your tests here.
			
 
				+import os
			
 
				+import django
			
 
				+
			
 
				+# Set up Django before importing any Django-dependent modules
			
 
				+os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')
			
 
				+django.setup()
			
 
				+
			
 
				+from django.test import TestCase, Client
			
 
				+from django.contrib.auth.models import User
			
 
				+from django.urls import reverse
			
 
				+
			
 
				+from archivebox.crawls.models import Crawl, CrawlSchedule
			
 
				+from archivebox.core.models import Tag
			
 
				+
			
 
				+
			
 
				+class AddViewTests(TestCase):
			
 
				+    """Tests for the AddView (crawl creation form)."""
			
 
				+
			
 
				+    def setUp(self):
			
 
				+        """Set up test user and client."""
			
 
				+        self.client = Client()
			
 
				+        self.user = User.objects.create_user(
			
 
				+            username='testuser',
			
 
				+            password='testpass123',
			
 
				+            email='test@example.com'
			
 
				+        )
			
 
				+        self.client.login(username='testuser', password='testpass123')
			
 
				+        self.add_url = reverse('add')
			
 
				+
			
 
				+    def test_add_view_get_requires_auth(self):
			
 
				+        """Test that GET /add requires authentication."""
			
 
				+        self.client.logout()
			
 
				+        response = self.client.get(self.add_url)
			
 
				+        # Should redirect to login or show 403/404
			
 
				+        self.assertIn(response.status_code, [302, 403, 404])
			
 
				+
			
 
				+    def test_add_view_get_shows_form(self):
			
 
				+        """Test that GET /add shows the form with all fields."""
			
 
				+        response = self.client.get(self.add_url)
			
 
				+        self.assertEqual(response.status_code, 200)
			
 
				+
			
 
				+        # Check that form fields are present
			
 
				+        self.assertContains(response, 'name="url"')
			
 
				+        self.assertContains(response, 'name="tag"')
			
 
				+        self.assertContains(response, 'name="depth"')
			
 
				+        self.assertContains(response, 'name="notes"')
			
 
				+        self.assertContains(response, 'name="schedule"')
			
 
				+        self.assertContains(response, 'name="persona"')
			
 
				+        self.assertContains(response, 'name="overwrite"')
			
 
				+        self.assertContains(response, 'name="update"')
			
 
				+        self.assertContains(response, 'name="index_only"')
			
 
				+
			
 
				+        # Check for plugin groups
			
 
				+        self.assertContains(response, 'name="chrome_plugins"')
			
 
				+        self.assertContains(response, 'name="archiving_plugins"')
			
 
				+        self.assertContains(response, 'name="parsing_plugins"')
			
 
				+
			
 
				+    def test_add_view_shows_tag_autocomplete(self):
			
 
				+        """Test that tag autocomplete datalist is rendered."""
			
 
				+        # Create some tags
			
 
				+        Tag.objects.create(name='test-tag-1')
			
 
				+        Tag.objects.create(name='test-tag-2')
			
 
				+
			
 
				+        response = self.client.get(self.add_url)
			
 
				+        self.assertEqual(response.status_code, 200)
			
 
				+
			
 
				+        # Check for datalist with tags
			
 
				+        self.assertContains(response, 'id="tag-datalist"')
			
 
				+        self.assertContains(response, 'test-tag-1')
			
 
				+        self.assertContains(response, 'test-tag-2')
			
 
				+
			
 
				+    def test_add_view_shows_plugin_presets(self):
			
 
				+        """Test that plugin preset buttons are rendered."""
			
 
				+        response = self.client.get(self.add_url)
			
 
				+        self.assertEqual(response.status_code, 200)
			
 
				+
			
 
				+        self.assertContains(response, 'Quick Archive')
			
 
				+        self.assertContains(response, 'Full Chrome')
			
 
				+        self.assertContains(response, 'Text Only')
			
 
				+        self.assertContains(response, 'Select All')
			
 
				+        self.assertContains(response, 'Clear All')
			
 
				+
			
 
				+    def test_add_view_shows_links_to_resources(self):
			
 
				+        """Test that helpful links are present."""
			
 
				+        response = self.client.get(self.add_url)
			
 
				+        self.assertEqual(response.status_code, 200)
			
 
				+
			
 
				+        # Link to plugin documentation
			
 
				+        self.assertContains(response, '/admin/environment/plugins/')
			
 
				+
			
 
				+        # Link to create new persona
			
 
				+        self.assertContains(response, '/admin/personas/persona/add/')
			
 
				+
			
 
				+    def test_add_basic_crawl_without_schedule(self):
			
 
				+        """Test creating a basic crawl without a schedule."""
			
 
				+        response = self.client.post(self.add_url, {
			
 
				+            'url': 'https://example.com\nhttps://example.org',
			
 
				+            'tag': 'test-tag',
			
 
				+            'depth': '0',
			
 
				+            'notes': 'Test crawl notes',
			
 
				+        })
			
 
				+
			
 
				+        # Should redirect to crawl admin page
			
 
				+        self.assertEqual(response.status_code, 302)
			
 
				+
			
 
				+        # Check that crawl was created
			
 
				+        self.assertEqual(Crawl.objects.count(), 1)
			
 
				+        crawl = Crawl.objects.first()
			
 
				+
			
 
				+        self.assertIn('https://example.com', crawl.urls)
			
 
				+        self.assertIn('https://example.org', crawl.urls)
			
 
				+        self.assertEqual(crawl.tags_str, 'test-tag')
			
 
				+        self.assertEqual(crawl.max_depth, 0)
			
 
				+        self.assertEqual(crawl.notes, 'Test crawl notes')
			
 
				+        self.assertEqual(crawl.created_by, self.user)
			
 
				+
			
 
				+        # No schedule should be created
			
 
				+        self.assertIsNone(crawl.schedule)
			
 
				+        self.assertEqual(CrawlSchedule.objects.count(), 0)
			
 
				+
			
 
				+    def test_add_crawl_with_schedule(self):
			
 
				+        """Test creating a crawl with a repeat schedule."""
			
 
				+        response = self.client.post(self.add_url, {
			
 
				+            'url': 'https://example.com',
			
 
				+            'tag': 'scheduled',
			
 
				+            'depth': '1',
			
 
				+            'notes': 'Daily crawl',
			
 
				+            'schedule': 'daily',
			
 
				+        })
			
 
				+
			
 
				+        self.assertEqual(response.status_code, 302)
			
 
				+
			
 
				+        # Check that crawl and schedule were created
			
 
				+        self.assertEqual(Crawl.objects.count(), 1)
			
 
				+        self.assertEqual(CrawlSchedule.objects.count(), 1)
			
 
				+
			
 
				+        crawl = Crawl.objects.first()
			
 
				+        schedule = CrawlSchedule.objects.first()
			
 
				+
			
 
				+        self.assertEqual(crawl.schedule, schedule)
			
 
				+        self.assertEqual(schedule.template, crawl)
			
 
				+        self.assertEqual(schedule.schedule, 'daily')
			
 
				+        self.assertTrue(schedule.is_enabled)
			
 
				+        self.assertEqual(schedule.created_by, self.user)
			
 
				+
			
 
				+    def test_add_crawl_with_cron_schedule(self):
			
 
				+        """Test creating a crawl with a cron format schedule."""
			
 
				+        response = self.client.post(self.add_url, {
			
 
				+            'url': 'https://example.com',
			
 
				+            'depth': '0',
			
 
				+            'schedule': '0 */6 * * *',  # Every 6 hours
			
 
				+        })
			
 
				+
			
 
				+        self.assertEqual(response.status_code, 302)
			
 
				+
			
 
				+        schedule = CrawlSchedule.objects.first()
			
 
				+        self.assertEqual(schedule.schedule, '0 */6 * * *')
			
 
				+
			
 
				+    def test_add_crawl_with_plugins(self):
			
 
				+        """Test creating a crawl with specific plugins selected."""
			
 
				+        response = self.client.post(self.add_url, {
			
 
				+            'url': 'https://example.com',
			
 
				+            'depth': '0',
			
 
				+            'chrome_plugins': ['screenshot', 'dom'],
			
 
				+            'archiving_plugins': ['wget'],
			
 
				+        })
			
 
				+
			
 
				+        self.assertEqual(response.status_code, 302)
			
 
				+
			
 
				+        crawl = Crawl.objects.first()
			
 
				+        plugins = crawl.config.get('PLUGINS', '')
			
 
				+
			
 
				+        # Should contain the selected plugins
			
 
				+        self.assertIn('screenshot', plugins)
			
 
				+        self.assertIn('dom', plugins)
			
 
				+        self.assertIn('wget', plugins)
			
 
				+
			
 
				+    def test_add_crawl_with_depth_range(self):
			
 
				+        """Test creating crawls with different depth values (0-4)."""
			
 
				+        for depth in range(5):
			
 
				+            response = self.client.post(self.add_url, {
			
 
				+                'url': f'https://example{depth}.com',
			
 
				+                'depth': str(depth),
			
 
				+            })
			
 
				+
			
 
				+            self.assertEqual(response.status_code, 302)
			
 
				+
			
 
				+        self.assertEqual(Crawl.objects.count(), 5)
			
 
				+
			
 
				+        for i, crawl in enumerate(Crawl.objects.order_by('created_at')):
			
 
				+            self.assertEqual(crawl.max_depth, i)
			
 
				+
			
 
				+    def test_add_crawl_with_advanced_options(self):
			
 
				+        """Test creating a crawl with advanced options."""
			
 
				+        response = self.client.post(self.add_url, {
			
 
				+            'url': 'https://example.com',
			
 
				+            'depth': '0',
			
 
				+            'persona': 'CustomPersona',
			
 
				+            'overwrite': True,
			
 
				+            'update': True,
			
 
				+            'index_only': True,
			
 
				+        })
			
 
				+
			
 
				+        self.assertEqual(response.status_code, 302)
			
 
				+
			
 
				+        crawl = Crawl.objects.first()
			
 
				+        config = crawl.config
			
 
				+
			
 
				+        self.assertEqual(config.get('DEFAULT_PERSONA'), 'CustomPersona')
			
 
				+        self.assertEqual(config.get('OVERWRITE'), True)
			
 
				+        self.assertEqual(config.get('ONLY_NEW'), False)  # opposite of update
			
 
				+        self.assertEqual(config.get('INDEX_ONLY'), True)
			
 
				+
			
 
				+    def test_add_crawl_with_custom_config(self):
			
 
				+        """Test creating a crawl with custom config overrides."""
			
 
				+        # Note: Django test client can't easily POST the KeyValueWidget format,
			
 
				+        # so this test would need to use the form directly or mock the cleaned_data
			
 
				+        # TODO: not implemented yet - this placeholder passes without asserting anything
			
 
				+        pass
			
 
				+
			
 
				+    def test_add_empty_urls_fails(self):
			
 
				+        """Test that submitting without URLs fails validation."""
			
 
				+        response = self.client.post(self.add_url, {
			
 
				+            'url': '',
			
 
				+            'depth': '0',
			
 
				+        })
			
 
				+
			
 
				+        # Should show form again with errors, not redirect
			
 
				+        self.assertEqual(response.status_code, 200)
			
 
				+        self.assertFormError(response, 'form', 'url', 'This field is required.')
			
 
				+
			
 
				+    def test_add_invalid_urls_fails(self):
			
 
				+        """Test that invalid URLs fail validation."""
			
 
				+        response = self.client.post(self.add_url, {
			
 
				+            'url': 'not-a-url',
			
 
				+            'depth': '0',
			
 
				+        })
			
 
				+
			
 
				+        # Should show form again with errors
			
 
				+        self.assertEqual(response.status_code, 200)
			
 
				+        # Check for validation error (URL regex should fail)
			
 
				+        self.assertContains(response, 'error')
			
 
				+
			
 
				+    def test_add_success_message_without_schedule(self):
			
 
				+        """Test that success message is shown without schedule link."""
			
 
				+        response = self.client.post(self.add_url, {
			
 
				+            'url': 'https://example.com\nhttps://example.org',
			
 
				+            'depth': '0',
			
 
				+        }, follow=True)
			
 
				+
			
 
				+        # Check success message mentions crawl creation
			
 
				+        messages = list(response.context['messages'])
			
 
				+        self.assertEqual(len(messages), 1)
			
 
				+        message_text = str(messages[0])
			
 
				+
			
 
				+        self.assertIn('Created crawl with 2 starting URL', message_text)
			
 
				+        self.assertIn('View Crawl', message_text)
			
 
				+        self.assertNotIn('scheduled to repeat', message_text)
			
 
				+
			
 
				+    def test_add_success_message_with_schedule(self):
			
 
				+        """Test that success message includes schedule link."""
			
 
				+        response = self.client.post(self.add_url, {
			
 
				+            'url': 'https://example.com',
			
 
				+            'depth': '0',
			
 
				+            'schedule': 'weekly',
			
 
				+        }, follow=True)
			
 
				+
			
 
				+        # Check success message mentions schedule
			
 
				+        messages = list(response.context['messages'])
			
 
				+        self.assertEqual(len(messages), 1)
			
 
				+        message_text = str(messages[0])
			
 
				+
			
 
				+        self.assertIn('Created crawl', message_text)
			
 
				+        self.assertIn('scheduled to repeat weekly', message_text)
			
 
				+        self.assertIn('View Crawl', message_text)
			
 
				+
			
 
				+    def test_add_crawl_creates_source_file(self):
			
 
				+        """Test that crawl creation saves URLs to sources file."""
			
 
				+        response = self.client.post(self.add_url, {
			
 
				+            'url': 'https://example.com',
			
 
				+            'depth': '0',
			
 
				+        })
			
 
				+
			
 
				+        self.assertEqual(response.status_code, 302)
			
 
				+
			
 
				+        # Check that source file was created in sources/ directory
			
 
				+        from archivebox.config import CONSTANTS
			
 
				+        sources_dir = CONSTANTS.SOURCES_DIR
			
 
				+
			
 
				+        # Should have created a source file
			
 
				+        source_files = list(sources_dir.glob('*__web_ui_add_by_user_*.txt'))
			
 
				+        self.assertGreater(len(source_files), 0)
			
 
				+
			
 
				+    def test_multiple_tags_are_saved(self):
			
 
				+        """Test that multiple comma-separated tags are saved."""
			
 
				+        response = self.client.post(self.add_url, {
			
 
				+            'url': 'https://example.com',
			
 
				+            'depth': '0',
			
 
				+            'tag': 'tag1,tag2,tag3',
			
 
				+        })
			
 
				+
			
 
				+        self.assertEqual(response.status_code, 302)
			
 
				+
			
 
				+        crawl = Crawl.objects.first()
			
 
				+        self.assertEqual(crawl.tags_str, 'tag1,tag2,tag3')
			
 
				+
			
 
				+    def test_crawl_redirects_to_admin_change_page(self):
			
 
				+        """Test that successful submission redirects to crawl admin page."""
			
 
				+        response = self.client.post(self.add_url, {
			
 
				+            'url': 'https://example.com',
			
 
				+            'depth': '0',
			
 
				+        })
			
 
				+
			
 
				+        crawl = Crawl.objects.first()
			
 
				+        expected_redirect = f'/admin/crawls/crawl/{crawl.id}/change/'
			
 
				+
			
 
				+        self.assertRedirects(response, expected_redirect, fetch_redirect_response=False)
			
--- a/archivebox/core/urls.py
+++ b/archivebox/core/urls.py
@@ -7,10 +7,10 @@ from django.views.generic.base import RedirectView
 
				 
			
 
				 from archivebox.misc.serve_static import serve_static
			
 
				 
			
 
				-from core.admin_site import archivebox_admin
			
 
				-from core.views import HomepageView, SnapshotView, PublicIndexView, AddView, HealthCheckView, live_progress_view
			
 
				+from archivebox.core.admin_site import archivebox_admin
			
 
				+from archivebox.core.views import HomepageView, SnapshotView, PublicIndexView, AddView, HealthCheckView, live_progress_view
			
 
				 
			
 
				-from workers.views import JobsDashboardView
			
 
				+from archivebox.workers.views import JobsDashboardView
			
 
				 
			
 
				 # GLOBAL_CONTEXT doesn't work as-is, disabled for now: https://github.com/ArchiveBox/ArchiveBox/discussions/1306
			
 
				 # from archivebox.config import VERSION, VERSIONS_AVAILABLE, CAN_UPGRADE
			
--- a/archivebox/core/views.py
+++ b/archivebox/core/views.py
@@ -23,7 +23,7 @@ from admin_data_views.typing import TableContext, ItemContext
 
				 from admin_data_views.utils import render_with_table_view, render_with_item_view, ItemLink
			
 
				 
			
 
				 import archivebox
			
 
				-from archivebox.config import CONSTANTS, CONSTANTS_CONFIG, DATA_DIR, VERSION, SAVE_ARCHIVE_DOT_ORG
			
 
				+from archivebox.config import CONSTANTS, CONSTANTS_CONFIG, DATA_DIR, VERSION
			
 
				 from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG, ARCHIVING_CONFIG
			
 
				 from archivebox.config.configset import get_flat_config, get_config, get_all_configs
			
 
				 from archivebox.misc.util import base_url, htmlencode, ts_to_date_str
			
@@ -31,9 +31,9 @@ from archivebox.misc.serve_static import serve_static_with_byterange_support
 
				 from archivebox.misc.logging_util import printable_filesize
			
 
				 from archivebox.search import query_search_index
			
 
				 
			
 
				-from core.models import Snapshot
			
 
				-from core.forms import AddLinkForm
			
 
				-from crawls.models import Crawl
			
 
				+from archivebox.core.models import Snapshot
			
 
				+from archivebox.core.forms import AddLinkForm
			
 
				+from archivebox.crawls.models import Crawl
			
 
				 from archivebox.hooks import get_extractors, get_extractor_name
			
 
				 
			
 
				 
			
@@ -150,7 +150,6 @@ class SnapshotView(View):
 
				             'status_color': 'success' if snapshot.is_archived else 'danger',
			
 
				             'oldest_archive_date': ts_to_date_str(snapshot.oldest_archive_date),
			
 
				             'warc_path': warc_path,
			
 
				-            'SAVE_ARCHIVE_DOT_ORG': SAVE_ARCHIVE_DOT_ORG,
			
 
				             'PREVIEW_ORIGINALS': SERVER_CONFIG.PREVIEW_ORIGINALS,
			
 
				             'archiveresults': sorted(archiveresults.values(), key=lambda r: all_types.index(r['name']) if r['name'] in all_types else -r['size']),
			
 
				             'best_result': best_result,
			
@@ -421,35 +420,34 @@ class AddView(UserPassesTestMixin, FormView):
 
				         return SERVER_CONFIG.PUBLIC_ADD_VIEW or self.request.user.is_authenticated
			
 
				 
			
 
				     def get_context_data(self, **kwargs):
			
 
				+        from archivebox.core.models import Tag
			
 
				+
			
 
				         return {
			
 
				             **super().get_context_data(**kwargs),
			
 
				-            'title': "Add URLs",
			
 
				+            'title': "Create Crawl",
			
 
				             # We can't just call request.build_absolute_uri in the template, because it would include query parameters
			
 
				             'absolute_add_path': self.request.build_absolute_uri(self.request.path),
			
 
				             'VERSION': VERSION,
			
 
				             'FOOTER_INFO': SERVER_CONFIG.FOOTER_INFO,
			
 
				             'stdout': '',
			
 
				+            'available_tags': list(Tag.objects.all().order_by('name').values_list('name', flat=True)),
			
 
				         }
			
 
				 
			
 
				     def form_valid(self, form):
			
 
				         urls = form.cleaned_data["url"]
			
 
				         print(f'[+] Adding URL: {urls}')
			
 
				-        parser = form.cleaned_data.get("parser", "auto")  # default to auto-detect parser
			
 
				-        tag = form.cleaned_data["tag"]
			
 
				-        depth = 0 if form.cleaned_data["depth"] == "0" else 1
			
 
				-        plugins = ','.join(form.cleaned_data["archive_methods"])
			
 
				-        input_kwargs = {
			
 
				-            "urls": urls,
			
 
				-            "tag": tag,
			
 
				-            "depth": depth,
			
 
				-            "parser": parser,
			
 
				-            "update_all": False,
			
 
				-            "out_dir": DATA_DIR,
			
 
				-            "created_by_id": self.request.user.pk,
			
 
				-        }
			
 
				-        if plugins:
			
 
				-            input_kwargs.update({"plugins": plugins})
			
 
				 
			
 
				+        # Extract all form fields
			
 
				+        tag = form.cleaned_data["tag"]
			
 
				+        depth = int(form.cleaned_data["depth"])
			
 
				+        plugins = ','.join(form.cleaned_data.get("plugins", []))
			
 
				+        schedule = form.cleaned_data.get("schedule", "").strip()
			
 
				+        persona = form.cleaned_data.get("persona", "Default")
			
 
				+        overwrite = form.cleaned_data.get("overwrite", False)
			
 
				+        update = form.cleaned_data.get("update", False)
			
 
				+        index_only = form.cleaned_data.get("index_only", False)
			
 
				+        notes = form.cleaned_data.get("notes", "")
			
 
				+        custom_config = form.cleaned_data.get("config", {})
			
 
				 
			
 
				         from archivebox.config.permissions import HOSTNAME
			
 
				 
			
@@ -461,33 +459,59 @@ class AddView(UserPassesTestMixin, FormView):
 
				         # 2. create a new Crawl with the URLs from the file
			
 
				         timestamp = timezone.now().strftime("%Y-%m-%d__%H-%M-%S")
			
 
				         urls_content = sources_file.read_text()
			
 
				+        # Build complete config
			
 
				+        config = {
			
 
				+            'ONLY_NEW': not update,
			
 
				+            'INDEX_ONLY': index_only,
			
 
				+            'OVERWRITE': overwrite,
			
 
				+            'DEPTH': depth,
			
 
				+            'PLUGINS': plugins or '',
			
 
				+            'DEFAULT_PERSONA': persona or 'Default',
			
 
				+        }
			
 
				+
			
 
				+        # Merge custom config overrides
			
 
				+        config.update(custom_config)
			
 
				+
			
 
				         crawl = Crawl.objects.create(
			
 
				             urls=urls_content,
			
 
				             max_depth=depth,
			
 
				             tags_str=tag,
			
 
				+            notes=notes,
			
 
				             label=f'{self.request.user.username}@{HOSTNAME}{self.request.path} {timestamp}',
			
 
				             created_by_id=self.request.user.pk,
			
 
				-            config={
			
 
				-                # 'ONLY_NEW': not update,
			
 
				-                # 'INDEX_ONLY': index_only,
			
 
				-                # 'OVERWRITE': False,
			
 
				-                'DEPTH': depth,
			
 
				-                'PLUGINS': plugins or '',
			
 
				-                # 'DEFAULT_PERSONA': persona or 'Default',
			
 
				-            }
			
 
				+            config=config
			
 
				         )
			
 
				-        
			
 
				+
			
 
				+        # 3. create a CrawlSchedule if schedule is provided
			
 
				+        if schedule:
			
 
				+            from crawls.models import CrawlSchedule
			
 
				+            crawl_schedule = CrawlSchedule.objects.create(
			
 
				+                template=crawl,
			
 
				+                schedule=schedule,
			
 
				+                is_enabled=True,
			
 
				+                label=crawl.label,
			
 
				+                notes=f"Auto-created from add page. {notes}".strip(),
			
 
				+                created_by_id=self.request.user.pk,
			
 
				+            )
			
 
				+            crawl.schedule = crawl_schedule
			
 
				+            crawl.save(update_fields=['schedule'])
			
 
				+
			
 
				         # 4. start the Orchestrator & wait until it completes
			
 
				         #    ... orchestrator will create the root Snapshot, which creates pending ArchiveResults, which gets run by the ArchiveResultActors ...
			
 
				-        # from crawls.actors import CrawlActor
			
 
				-        # from core.actors import SnapshotActor, ArchiveResultActor
			
 
				-    
			
 
				+        # from archivebox.crawls.actors import CrawlActor
			
 
				+        # from archivebox.core.actors import SnapshotActor, ArchiveResultActor
			
 
				+
			
 
				 
			
 
				         rough_url_count = urls.count('://')
			
 
				 
			
 
				+        # Build success message with schedule link if created
			
 
				+        schedule_msg = ""
			
 
				+        if schedule:
			
 
				+            schedule_msg = f" and <a href='{crawl.schedule.admin_change_url}'>scheduled to repeat {schedule}</a>"
			
 
				+
			
 
				         messages.success(
			
 
				             self.request,
			
 
				-            mark_safe(f"Adding {rough_url_count} URLs in the background. (refresh in a minute start seeing results) {crawl.admin_change_url}"),
			
 
				+            mark_safe(f"Created crawl with {rough_url_count} starting URL(s){schedule_msg}. Snapshots will be created and archived in the background. <a href='{crawl.admin_change_url}'>View Crawl →</a>"),
			
 
				         )
			
 
				 
			
 
				         # Orchestrator (managed by supervisord) will pick up the queued crawl
			
@@ -516,8 +540,8 @@ def live_progress_view(request):
 
				     """Simple JSON endpoint for live progress status - used by admin progress monitor."""
			
 
				     try:
			
 
				         from workers.orchestrator import Orchestrator
			
 
				-        from crawls.models import Crawl
			
 
				-        from core.models import Snapshot, ArchiveResult
			
 
				+        from archivebox.crawls.models import Crawl
			
 
				+        from archivebox.core.models import Snapshot, ArchiveResult
			
 
				         from django.db.models import Case, When, Value, IntegerField
			
 
				 
			
 
				         # Get orchestrator status
			
@@ -764,9 +788,9 @@ def key_is_safe(key: str) -> bool:
 
				 def find_config_source(key: str, merged_config: dict) -> str:
			
 
				     """Determine where a config value comes from."""
			
 
				     import os
			
 
				-    from machine.models import Machine
			
 
				+    from archivebox.machine.models import Machine
			
 
				 
			
 
				-    # Check if it's from machine config
			
 
				+    # Check if it's from the current machine's config (Machine.config overrides)
			
 
				     try:
			
 
				         machine = Machine.current()
			
 
				         if machine.config and key in machine.config:
			
@@ -778,7 +802,7 @@ def find_config_source(key: str, merged_config: dict) -> str:
 
				     if key in os.environ:
			
 
				         return 'Environment'
			
 
				 
			
 
				-    # Check if it's from config file
			
 
				+    # Check if it's from the config file (CONSTANTS.CONFIG_FILE)
			
 
				     from archivebox.config.configset import BaseConfigSet
			
 
				     file_config = BaseConfigSet.load_from_file(CONSTANTS.CONFIG_FILE)
			
 
				     if key in file_config:
			
@@ -796,7 +820,7 @@ def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext:
 
				 
			
 
				     # Get merged config that includes Machine.config overrides
			
 
				     try:
			
 
				-        from machine.models import Machine
			
 
				+        from archivebox.machine.models import Machine
			
 
				         machine = Machine.current()
			
 
				         merged_config = get_config()
			
 
				     except Exception as e:
			
@@ -859,7 +883,7 @@ def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext:
 
				 @render_with_item_view
			
 
				 def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
			
 
				     import os
			
 
				-    from machine.models import Machine
			
 
				+    from archivebox.machine.models import Machine
			
 
				     from archivebox.config.configset import BaseConfigSet
			
 
				 
			
 
				     CONFIGS = get_all_configs()
			
--- a/archivebox/crawls/admin.py
+++ b/archivebox/crawls/admin.py
@@ -17,8 +17,8 @@ from django_object_actions import action
 
				 
			
 
				 from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin
			
 
				 
			
 
				-from core.models import Snapshot
			
 
				-from crawls.models import Crawl, CrawlSchedule
			
 
				+from archivebox.core.models import Snapshot
			
 
				+from archivebox.crawls.models import Crawl, CrawlSchedule
			
 
				 
			
 
				 
			
 
				 def render_snapshots_list(snapshots_qs, limit=20):
			
--- a/archivebox/crawls/apps.py
+++ b/archivebox/crawls/apps.py
@@ -3,4 +3,4 @@ from django.apps import AppConfig
 
				 
			
 
				 class CrawlsConfig(AppConfig):
			
 
				     default_auto_field = "django.db.models.BigAutoField"
			
 
				-    name = "crawls"
			
 
				+    name = "archivebox.crawls"
			
--- a/archivebox/crawls/models.py
+++ b/archivebox/crawls/models.py
@@ -1,6 +1,7 @@
 
				 __package__ = 'archivebox.crawls'
			
 
				 
			
 
				 from typing import TYPE_CHECKING, Iterable
			
 
				+from datetime import timedelta
			
 
				 from archivebox.uuid_compat import uuid7
			
 
				 from pathlib import Path
			
 
				 
			
@@ -11,13 +12,15 @@ from django.conf import settings
 
				 from django.urls import reverse_lazy
			
 
				 from django.utils import timezone
			
 
				 from django_stubs_ext.db.models import TypedModelMeta
			
 
				+from statemachine import State, registry
			
 
				+from rich import print
			
 
				 
			
 
				 from archivebox.config import CONSTANTS
			
 
				 from archivebox.base_models.models import ModelWithSerializers, ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, get_or_create_system_user_pk
			
 
				-from workers.models import ModelWithStateMachine
			
 
				+from archivebox.workers.models import ModelWithStateMachine, BaseStateMachine
			
 
				 
			
 
				 if TYPE_CHECKING:
			
 
				-    from core.models import Snapshot, ArchiveResult
			
 
				+    from archivebox.core.models import Snapshot, ArchiveResult
			
 
				 
			
 
				 
			
 
				 class CrawlSchedule(ModelWithSerializers, ModelWithNotes, ModelWithHealthStats):
			
@@ -35,6 +38,7 @@ class CrawlSchedule(ModelWithSerializers, ModelWithNotes, ModelWithHealthStats):
 
				     crawl_set: models.Manager['Crawl']
			
 
				 
			
 
				     class Meta(TypedModelMeta):
			
 
				+        app_label = 'crawls'
			
 
				         verbose_name = 'Scheduled Crawl'
			
 
				         verbose_name_plural = 'Scheduled Crawls'
			
 
				 
			
@@ -73,7 +77,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
 
				     status = ModelWithStateMachine.StatusField(choices=ModelWithStateMachine.StatusChoices, default=ModelWithStateMachine.StatusChoices.QUEUED)
			
 
				     retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)
			
 
				 
			
 
				-    state_machine_name = 'crawls.statemachines.CrawlMachine'
			
 
				+    state_machine_name = 'crawls.models.CrawlMachine'
			
 
				     retry_at_field_name = 'retry_at'
			
 
				     state_field_name = 'status'
			
 
				     StatusChoices = ModelWithStateMachine.StatusChoices
			
@@ -82,6 +86,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
 
				     snapshot_set: models.Manager['Snapshot']
			
 
				 
			
 
				     class Meta(TypedModelMeta):
			
 
				+        app_label = 'crawls'
			
 
				         verbose_name = 'Crawl'
			
 
				         verbose_name_plural = 'Crawls'
			
 
				 
			
@@ -168,7 +173,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
 
				         return Path(path_str)
			
 
				 
			
 
				     def create_root_snapshot(self) -> 'Snapshot':
			
 
				-        from core.models import Snapshot
			
 
				+        from archivebox.core.models import Snapshot
			
 
				 
			
 
				         first_url = self.get_urls_list()[0] if self.get_urls_list() else None
			
 
				         if not first_url:
			
@@ -245,7 +250,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
 
				             List of newly created Snapshot objects
			
 
				         """
			
 
				         import json
			
 
				-        from core.models import Snapshot
			
 
				+        from archivebox.core.models import Snapshot
			
 
				 
			
 
				         created_snapshots = []
			
 
				 
			
@@ -309,9 +314,13 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
 
				         import time
			
 
				         from pathlib import Path
			
 
				         from archivebox.hooks import run_hook, discover_hooks, process_hook_records
			
 
				+        from archivebox.config.configset import get_config
			
 
				+
			
 
				+        # Get merged config with crawl context
			
 
				+        config = get_config(crawl=self)
			
 
				 
			
 
				         # Discover and run on_Crawl hooks
			
 
				-        hooks = discover_hooks('Crawl')
			
 
				+        hooks = discover_hooks('Crawl', config=config)
			
 
				         first_url = self.get_urls_list()[0] if self.get_urls_list() else ''
			
 
				 
			
 
				         for hook in hooks:
			
@@ -323,8 +332,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
 
				             result = run_hook(
			
 
				                 hook,
			
 
				                 output_dir=output_dir,
			
 
				-                timeout=60,
			
 
				-                config_objects=[self],
			
 
				+                config=config,
			
 
				                 crawl_id=str(self.id),
			
 
				                 source_url=first_url,
			
 
				             )
			
@@ -380,7 +388,10 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
 
				                     pass
			
 
				 
			
 
				         # Run on_CrawlEnd hooks
			
 
				-        hooks = discover_hooks('CrawlEnd')
			
 
				+        from archivebox.config.configset import get_config
			
 
				+        config = get_config(crawl=self)
			
 
				+
			
 
				+        hooks = discover_hooks('CrawlEnd', config=config)
			
 
				         first_url = self.get_urls_list()[0] if self.get_urls_list() else ''
			
 
				 
			
 
				         for hook in hooks:
			
@@ -391,8 +402,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
 
				             result = run_hook(
			
 
				                 hook,
			
 
				                 output_dir=output_dir,
			
 
				-                timeout=30,
			
 
				-                config_objects=[self],
			
 
				+                config=config,
			
 
				                 crawl_id=str(self.id),
			
 
				                 source_url=first_url,
			
 
				             )
			
@@ -400,3 +410,131 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
 
				             # Log failures but don't block
			
 
				             if result and result['returncode'] != 0:
			
 
				                 print(f'[yellow]⚠️ CrawlEnd hook failed: {hook.name}[/yellow]')
			
 
				+
			
 
				+
			
 
				+# =============================================================================
			
 
				+# State Machines
			
 
				+# =============================================================================
			
 
				+
			
 
				+class CrawlMachine(BaseStateMachine, strict_states=True):
			
 
				+    """
			
 
				+    State machine for managing Crawl lifecycle.
			
 
				+
			
 
				+    Hook Lifecycle:
			
 
				+    ┌─────────────────────────────────────────────────────────────┐
			
 
				+    │ QUEUED State                                                │
			
 
				+    │  • Waiting for crawl to be ready (has URLs)                 │
			
 
				+    └─────────────────────────────────────────────────────────────┘
			
 
				+                            ↓ tick() when can_start()
			
 
				+    ┌─────────────────────────────────────────────────────────────┐
			
 
				+    │ STARTED State → enter_started()                             │
			
 
				+    │  1. crawl.run()                                             │
			
 
				+    │     • discover_hooks('Crawl') → finds all crawl hooks       │
			
 
				+    │     • For each hook:                                        │
			
 
				+    │       - run_hook(script, output_dir, ...)                   │
			
 
				+    │       - Parse JSONL from hook output                        │
			
 
				+    │       - process_hook_records() → creates Snapshots          │
			
 
				+    │     • create_root_snapshot() → root snapshot for crawl      │
			
 
				+    │     • create_snapshots_from_urls() → from self.urls field   │
			
 
				+    │                                                              │
			
 
				+    │  2. Snapshots process independently with their own          │
			
 
				+    │     state machines (see SnapshotMachine)                    │
			
 
				+    └─────────────────────────────────────────────────────────────┘
			
 
				+                            ↓ tick() when is_finished()
			
 
				+    ┌─────────────────────────────────────────────────────────────┐
			
 
				+    │ SEALED State → enter_sealed()                               │
			
 
				+    │  • cleanup() → runs on_CrawlEnd hooks, kills background     │
			
 
				+    │  • Set retry_at=None (no more processing)                   │
			
 
				+    └─────────────────────────────────────────────────────────────┘
			
 
				+    """
			
 
				+
			
 
				+    model_attr_name = 'crawl'
			
 
				+
			
 
				+    # States
			
 
				+    queued = State(value=Crawl.StatusChoices.QUEUED, initial=True)
			
 
				+    started = State(value=Crawl.StatusChoices.STARTED)
			
 
				+    sealed = State(value=Crawl.StatusChoices.SEALED, final=True)
			
 
				+
			
 
				+    # Tick Event
			
 
				+    tick = (
			
 
				+        queued.to.itself(unless='can_start') |
			
 
				+        queued.to(started, cond='can_start') |
			
 
				+        started.to.itself(unless='is_finished') |
			
 
				+        started.to(sealed, cond='is_finished')
			
 
				+    )
			
 
				+
			
 
				+    def can_start(self) -> bool:
			
 
				+        if not self.crawl.urls:
			
 
				+            print(f'[red]⚠️ Crawl {self.crawl.id} cannot start: no URLs[/red]')
			
 
				+            return False
			
 
				+        urls_list = self.crawl.get_urls_list()
			
 
				+        if not urls_list:
			
 
				+            print(f'[red]⚠️ Crawl {self.crawl.id} cannot start: no valid URLs in urls field[/red]')
			
 
				+            return False
			
 
				+        return True
			
 
				+
			
 
				+    def is_finished(self) -> bool:
			
 
				+        from archivebox.core.models import Snapshot
			
 
				+
			
 
				+        # check that at least one snapshot exists for this crawl
			
 
				+        snapshots = Snapshot.objects.filter(crawl=self.crawl)
			
 
				+        if not snapshots.exists():
			
 
				+            return False
			
 
				+
			
 
				+        # check if all snapshots are sealed
			
 
				+        # Snapshots handle their own background hooks via the step system,
			
 
				+        # so we just need to wait for all snapshots to reach sealed state
			
 
				+        if snapshots.filter(status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED]).exists():
			
 
				+            return False
			
 
				+
			
 
				+        return True
			
 
				+
			
 
				+    @started.enter
			
 
				+    def enter_started(self):
			
 
				+        # Lock the crawl by bumping retry_at so other workers don't pick it up while we create snapshots
			
 
				+        self.crawl.update_and_requeue(
			
 
				+            retry_at=timezone.now() + timedelta(seconds=30),  # Lock for 30 seconds
			
 
				+        )
			
 
				+
			
 
				+        try:
			
 
				+            # Run the crawl - runs hooks, processes JSONL, creates snapshots
			
 
				+            self.crawl.run()
			
 
				+
			
 
				+            # Update status to STARTED once snapshots are created
			
 
				+            # Set retry_at to future so we don't busy-loop - wait for snapshots to process
			
 
				+            self.crawl.update_and_requeue(
			
 
				+                retry_at=timezone.now() + timedelta(seconds=5),  # Check again in 5s
			
 
				+                status=Crawl.StatusChoices.STARTED,
			
 
				+            )
			
 
				+        except Exception as e:
			
 
				+            print(f'[red]⚠️ Crawl {self.crawl.id} failed to start: {e}[/red]')
			
 
				+            import traceback
			
 
				+            traceback.print_exc()
			
 
				+            # Re-raise so the worker knows it failed
			
 
				+            raise
			
 
				+
			
 
				+    def on_started_to_started(self):
			
 
				+        """Called when Crawl stays in started state (snapshots not sealed yet)."""
			
 
				+        # Bump retry_at so we check again in a few seconds
			
 
				+        self.crawl.update_and_requeue(
			
 
				+            retry_at=timezone.now() + timedelta(seconds=5),
			
 
				+        )
			
 
				+
			
 
				+    @sealed.enter
			
 
				+    def enter_sealed(self):
			
 
				+        # Clean up background hooks and run on_CrawlEnd hooks
			
 
				+        self.crawl.cleanup()
			
 
				+
			
 
				+        self.crawl.update_and_requeue(
			
 
				+            retry_at=None,
			
 
				+            status=Crawl.StatusChoices.SEALED,
			
 
				+        )
			
 
				+
			
 
				+
			
 
				+# =============================================================================
			
 
				+# Register State Machines
			
 
				+# =============================================================================
			
 
				+
			
 
				+# Manually register state machines with python-statemachine registry
			
 
				+# (normally auto-discovered from statemachines.py, but we define them here for clarity)
			
 
				+registry.register(CrawlMachine)
			
--- a/archivebox/crawls/statemachines.py
+++ b/archivebox/crawls/statemachines.py
@@ -1,114 +0,0 @@
 
				-__package__ = 'archivebox.crawls'
			
 
				-
			
 
				-import os
			
 
				-from typing import ClassVar
			
 
				-from datetime import timedelta
			
 
				-from django.utils import timezone
			
 
				-
			
 
				-from rich import print
			
 
				-
			
 
				-from statemachine import State, StateMachine
			
 
				-
			
 
				-# from workers.actor import ActorType
			
 
				-from crawls.models import Crawl
			
 
				-
			
 
				-
			
 
				-class CrawlMachine(StateMachine, strict_states=True):
			
 
				-    """State machine for managing Crawl lifecycle."""
			
 
				-    
			
 
				-    model: Crawl
			
 
				-    
			
 
				-    # States
			
 
				-    queued = State(value=Crawl.StatusChoices.QUEUED, initial=True)
			
 
				-    started = State(value=Crawl.StatusChoices.STARTED)
			
 
				-    sealed = State(value=Crawl.StatusChoices.SEALED, final=True)
			
 
				-    
			
 
				-    # Tick Event
			
 
				-    tick = (
			
 
				-        queued.to.itself(unless='can_start') |
			
 
				-        queued.to(started, cond='can_start') |
			
 
				-        started.to.itself(unless='is_finished') |
			
 
				-        started.to(sealed, cond='is_finished')
			
 
				-    )
			
 
				-    
			
 
				-    def __init__(self, crawl, *args, **kwargs):
			
 
				-        self.crawl = crawl
			
 
				-        super().__init__(crawl, *args, **kwargs)
			
 
				-    
			
 
				-    def __repr__(self) -> str:
			
 
				-        return f'Crawl[{self.crawl.id}]'
			
 
				-
			
 
				-    def __str__(self) -> str:
			
 
				-        return self.__repr__()
			
 
				-        
			
 
				-    def can_start(self) -> bool:
			
 
				-        if not self.crawl.urls:
			
 
				-            print(f'[red]⚠️ Crawl {self.crawl.id} cannot start: no URLs[/red]')
			
 
				-            return False
			
 
				-        urls_list = self.crawl.get_urls_list()
			
 
				-        if not urls_list:
			
 
				-            print(f'[red]⚠️ Crawl {self.crawl.id} cannot start: no valid URLs in urls field[/red]')
			
 
				-            return False
			
 
				-        return True
			
 
				-        
			
 
				-    def is_finished(self) -> bool:
			
 
				-        from core.models import Snapshot, ArchiveResult
			
 
				-        
			
 
				-        # check that at least one snapshot exists for this crawl
			
 
				-        snapshots = Snapshot.objects.filter(crawl=self.crawl)
			
 
				-        if not snapshots.exists():
			
 
				-            return False
			
 
				-        
			
 
				-        # check to make sure no snapshots are in non-final states
			
 
				-        if snapshots.filter(status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED]).exists():
			
 
				-            return False
			
 
				-        
			
 
				-        # check that some archiveresults exist for this crawl
			
 
				-        results = ArchiveResult.objects.filter(snapshot__crawl=self.crawl)
			
 
				-        if not results.exists():
			
 
				-            return False
			
 
				-        
			
 
				-        # check if all archiveresults are finished
			
 
				-        if results.filter(status__in=[ArchiveResult.StatusChoices.QUEUED, ArchiveResult.StatusChoices.STARTED]).exists():
			
 
				-            return False
			
 
				-        
			
 
				-        return True
			
 
				-        
			
 
				-    # def before_transition(self, event, state):
			
 
				-    #     print(f"Before '{event}', on the '{state.id}' state.")
			
 
				-    #     return "before_transition_return"
			
 
				-
			
 
				-    @started.enter
			
 
				-    def enter_started(self):
			
 
				-        # Suppressed: state transition logs
			
 
				-        # Lock the crawl by bumping retry_at so other workers don't pick it up while we create snapshots
			
 
				-        self.crawl.update_for_workers(
			
 
				-            retry_at=timezone.now() + timedelta(seconds=30),  # Lock for 30 seconds
			
 
				-        )
			
 
				-
			
 
				-        try:
			
 
				-            # Run the crawl - runs hooks, processes JSONL, creates snapshots
			
 
				-            self.crawl.run()
			
 
				-
			
 
				-            # Update status to STARTED once snapshots are created
			
 
				-            self.crawl.update_for_workers(
			
 
				-                retry_at=timezone.now(),  # Process immediately
			
 
				-                status=Crawl.StatusChoices.STARTED,
			
 
				-            )
			
 
				-        except Exception as e:
			
 
				-            print(f'[red]⚠️ Crawl {self.crawl.id} failed to start: {e}[/red]')
			
 
				-            import traceback
			
 
				-            traceback.print_exc()
			
 
				-            # Re-raise so the worker knows it failed
			
 
				-            raise
			
 
				-
			
 
				-    @sealed.enter
			
 
				-    def enter_sealed(self):
			
 
				-        # Clean up background hooks and run on_CrawlEnd hooks
			
 
				-        self.crawl.cleanup()
			
 
				-
			
 
				-        # Suppressed: state transition logs
			
 
				-        self.crawl.update_for_workers(
			
 
				-            retry_at=None,
			
 
				-            status=Crawl.StatusChoices.SEALED,
			
 
				-        )
			
--- a/archivebox/hooks.py
+++ b/archivebox/hooks.py
@@ -146,11 +146,16 @@ class HookResult(TypedDict, total=False):
 
				     records: List[Dict[str, Any]]  # Parsed JSONL records with 'type' field
			
 
				 
			
 
				 
			
 
				-def discover_hooks(event_name: str) -> List[Path]:
			
 
				+def discover_hooks(
			
 
				+    event_name: str,
			
 
				+    filter_disabled: bool = True,
			
 
				+    config: Optional[Dict[str, Any]] = None
			
 
				+) -> List[Path]:
			
 
				     """
			
 
				     Find all hook scripts matching on_{event_name}__*.{sh,py,js} pattern.
			
 
				 
			
 
				     Searches both built-in and user plugin directories.
			
 
				+    Filters out hooks from disabled plugins by default (respects USE_/SAVE_ flags).
			
 
				     Returns scripts sorted alphabetically by filename for deterministic execution order.
			
 
				 
			
 
				     Hook naming convention uses numeric prefixes to control order:
			
@@ -158,9 +163,29 @@ def discover_hooks(event_name: str) -> List[Path]:
 
				         on_Snapshot__15_singlefile.py   # runs second
			
 
				         on_Snapshot__26_readability.py  # runs later (depends on singlefile)
			
 
				 
			
 
				-    Example:
			
 
				+    Args:
			
 
				+        event_name: Event name (e.g., 'Snapshot', 'Binary', 'Crawl')
			
 
				+        filter_disabled: If True, skip hooks from disabled plugins (default: True)
			
 
				+        config: Optional config dict from get_config() (merges file, env, machine, crawl, snapshot)
			
 
				+                If None, will call get_config() with global scope
			
 
				+
			
 
				+    Returns:
			
 
				+        Sorted list of hook script paths from enabled plugins only.
			
 
				+
			
 
				+    Examples:
			
 
				+        # With proper config context (recommended):
			
 
				+        from archivebox.config.configset import get_config
			
 
				+        config = get_config(crawl=my_crawl, snapshot=my_snapshot)
			
 
				+        discover_hooks('Snapshot', config=config)
			
 
				+        # Returns: [Path('.../on_Snapshot__10_title.py'), ...] (wget excluded if SAVE_WGET=False)
			
 
				+
			
 
				+        # Without config (uses global defaults):
			
 
				         discover_hooks('Snapshot')
			
 
				-        # Returns: [Path('.../on_Snapshot__10_title.py'), Path('.../on_Snapshot__15_singlefile.py'), ...]
			
 
				+        # Returns: [Path('.../on_Snapshot__10_title.py'), ...]
			
 
				+
			
 
				+        # Show all plugins regardless of enabled status:
			
 
				+        discover_hooks('Snapshot', filter_disabled=False)
			
 
				+        # Returns: [Path('.../on_Snapshot__10_title.py'), ..., Path('.../on_Snapshot__50_wget.py')]
			
 
				     """
			
 
				     hooks = []
			
 
				 
			
@@ -177,45 +202,44 @@ def discover_hooks(event_name: str) -> List[Path]:
 
				             pattern_direct = f'on_{event_name}__*.{ext}'
			
 
				             hooks.extend(base_dir.glob(pattern_direct))
			
 
				 
			
 
				-    # Sort by filename (not full path) to ensure numeric prefix ordering works
			
 
				-    # e.g., on_Snapshot__10_title.py sorts before on_Snapshot__26_readability.py
			
 
				-    return sorted(set(hooks), key=lambda p: p.name)
			
 
				-
			
 
				-
			
 
				-def discover_all_hooks() -> Dict[str, List[Path]]:
			
 
				-    """
			
 
				-    Discover all hooks organized by event name.
			
 
				-
			
 
				-    Returns a dict mapping event names to lists of hook script paths.
			
 
				-    """
			
 
				-    hooks_by_event: Dict[str, List[Path]] = {}
			
 
				-
			
 
				-    for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
			
 
				-        if not base_dir.exists():
			
 
				-            continue
			
 
				+    # Filter by enabled plugins
			
 
				+    if filter_disabled:
			
 
				+        # Get merged config if not provided (lazy import to avoid circular dependency)
			
 
				+        if config is None:
			
 
				+            from archivebox.config.configset import get_config
			
 
				+            config = get_config(scope='global')
			
 
				+
			
 
				+        enabled_hooks = []
			
 
				+
			
 
				+        for hook in hooks:
			
 
				+            # Get plugin name from parent directory
			
 
				+            # e.g., archivebox/plugins/wget/on_Snapshot__50_wget.py -> 'wget'
			
 
				+            plugin_name = hook.parent.name
			
 
				+
			
 
				+            # Check if this is a plugin directory (not the root plugins dir)
			
 
				+            if plugin_name in ('plugins', '.'):
			
 
				+                # Hook is in root plugins directory, not a plugin subdir
			
 
				+                # Include it by default (no filtering for non-plugin hooks)
			
 
				+                enabled_hooks.append(hook)
			
 
				+                continue
			
 
				 
			
 
				-        for ext in ('sh', 'py', 'js'):
			
 
				-            for hook_path in base_dir.glob(f'*/on_*__*.{ext}'):
			
 
				-                # Extract event name from filename: on_EventName__hook_name.ext
			
 
				-                filename = hook_path.stem  # on_EventName__hook_name
			
 
				-                if filename.startswith('on_') and '__' in filename:
			
 
				-                    event_name = filename[3:].split('__')[0]  # EventName
			
 
				-                    if event_name not in hooks_by_event:
			
 
				-                        hooks_by_event[event_name] = []
			
 
				-                    hooks_by_event[event_name].append(hook_path)
			
 
				+            # Check if plugin is enabled
			
 
				+            plugin_config = get_plugin_special_config(plugin_name, config)
			
 
				+            if plugin_config['enabled']:
			
 
				+                enabled_hooks.append(hook)
			
 
				 
			
 
				-    # Sort hooks within each event
			
 
				-    for event_name in hooks_by_event:
			
 
				-        hooks_by_event[event_name] = sorted(set(hooks_by_event[event_name]), key=lambda p: p.name)
			
 
				+        hooks = enabled_hooks
			
 
				 
			
 
				-    return hooks_by_event
			
 
				+    # Sort by filename (not full path) to ensure numeric prefix ordering works
			
 
				+    # e.g., on_Snapshot__10_title.py sorts before on_Snapshot__26_readability.py
			
 
				+    return sorted(set(hooks), key=lambda p: p.name)
			
 
				 
			
 
				 
			
 
				 def run_hook(
			
 
				     script: Path,
			
 
				     output_dir: Path,
			
 
				-    timeout: int = 300,
			
 
				-    config_objects: Optional[List[Any]] = None,
			
 
				+    config: Dict[str, Any],
			
 
				+    timeout: Optional[int] = None,
			
 
				     **kwargs: Any
			
 
				 ) -> HookResult:
			
 
				     """
			
@@ -224,31 +248,33 @@ def run_hook(
 
				     This is the low-level hook executor. For running extractors with proper
			
 
				     metadata handling, use call_extractor() instead.
			
 
				 
			
 
				-    Config is passed to hooks via environment variables with this priority:
			
 
				-    1. Plugin schema defaults (config.json)
			
 
				-    2. Config file (ArchiveBox.conf)
			
 
				-    3. Environment variables
			
 
				-    4. Machine.config (auto-included, lowest override priority)
			
 
				-    5. config_objects (in order - later objects override earlier ones)
			
 
				+    Config is passed to hooks via environment variables. Caller MUST use
			
 
				+    get_config() to merge all sources (file, env, machine, crawl, snapshot).
			
 
				 
			
 
				     Args:
			
 
				         script: Path to the hook script (.sh, .py, or .js)
			
 
				         output_dir: Working directory for the script (where output files go)
			
 
				+        config: Merged config dict from get_config(crawl=..., snapshot=...) - REQUIRED
			
 
				         timeout: Maximum execution time in seconds
			
 
				-        config_objects: Optional list of objects with .config JSON fields
			
 
				-                       (e.g., [crawl, snapshot] - later items have higher priority)
			
 
				+                 If None, auto-detects from PLUGINNAME_TIMEOUT config (fallback to TIMEOUT, default 300)
			
 
				         **kwargs: Arguments passed to the script as --key=value
			
 
				 
			
 
				     Returns:
			
 
				         HookResult with 'returncode', 'stdout', 'stderr', 'output_json', 'output_files', 'duration_ms'
			
 
				+
			
 
				+    Example:
			
 
				+        from archivebox.config.configset import get_config
			
 
				+        config = get_config(crawl=my_crawl, snapshot=my_snapshot)
			
 
				+        result = run_hook(hook_path, output_dir, config=config, url=url, snapshot_id=id)
			
 
				     """
			
 
				     import time
			
 
				     start_time = time.time()
			
 
				 
			
 
				-    # Auto-include Machine.config at the start (lowest priority among config_objects)
			
 
				-    from machine.models import Machine
			
 
				-    machine = Machine.current()
			
 
				-    all_config_objects = [machine] + list(config_objects or [])
			
 
				+    # Auto-detect timeout from plugin config if not explicitly provided
			
 
				+    if timeout is None:
			
 
				+        plugin_name = script.parent.name
			
 
				+        plugin_config = get_plugin_special_config(plugin_name, config)
			
 
				+        timeout = plugin_config['timeout']
			
 
				 
			
 
				     if not script.exists():
			
 
				         return HookResult(
			
@@ -302,51 +328,16 @@ def run_hook(
 
				     env['ARCHIVE_DIR'] = str(getattr(settings, 'ARCHIVE_DIR', Path.cwd() / 'archive'))
			
 
				     env.setdefault('MACHINE_ID', getattr(settings, 'MACHINE_ID', '') or os.environ.get('MACHINE_ID', ''))
			
 
				 
			
 
				-    # If a Crawl is in config_objects, pass its OUTPUT_DIR for hooks that need to find crawl-level resources
			
 
				-    for obj in all_config_objects:
			
 
				-        if hasattr(obj, 'OUTPUT_DIR') and hasattr(obj, 'get_urls_list'):  # Duck-type check for Crawl
			
 
				-            env['CRAWL_OUTPUT_DIR'] = str(obj.OUTPUT_DIR)
			
 
				-            break
			
 
				-
			
 
				-    # Build overrides from any objects with .config fields (in order, later overrides earlier)
			
 
				-    # all_config_objects includes Machine at the start, then any passed config_objects
			
 
				-    overrides = {}
			
 
				-    for obj in all_config_objects:
			
 
				-        if obj and hasattr(obj, 'config') and obj.config:
			
 
				-            # Strip 'config/' prefix from Machine.config keys (e.g., 'config/CHROME_BINARY' -> 'CHROME_BINARY')
			
 
				-            for key, value in obj.config.items():
			
 
				-                clean_key = key.removeprefix('config/')
			
 
				-                overrides[clean_key] = value
			
 
				-
			
 
				-    # Get plugin config from JSON schemas with hierarchy resolution
			
 
				-    # This merges: schema defaults -> config file -> env vars -> object config overrides
			
 
				-    plugin_config = get_flat_plugin_config(overrides=overrides if overrides else None)
			
 
				-    export_plugin_config_to_env(plugin_config, env)
			
 
				-
			
 
				-    # Also pass core config values that aren't in plugin schemas yet
			
 
				-    # These are legacy values that may still be needed
			
 
				-    from archivebox import config
			
 
				-    env.setdefault('CHROME_BINARY', str(getattr(config, 'CHROME_BINARY', '')))
			
 
				-    env.setdefault('WGET_BINARY', str(getattr(config, 'WGET_BINARY', '')))
			
 
				-    env.setdefault('CURL_BINARY', str(getattr(config, 'CURL_BINARY', '')))
			
 
				-    env.setdefault('GIT_BINARY', str(getattr(config, 'GIT_BINARY', '')))
			
 
				-    env.setdefault('YOUTUBEDL_BINARY', str(getattr(config, 'YOUTUBEDL_BINARY', '')))
			
 
				-    env.setdefault('SINGLEFILE_BINARY', str(getattr(config, 'SINGLEFILE_BINARY', '')))
			
 
				-    env.setdefault('READABILITY_BINARY', str(getattr(config, 'READABILITY_BINARY', '')))
			
 
				-    env.setdefault('MERCURY_BINARY', str(getattr(config, 'MERCURY_BINARY', '')))
			
 
				-    env.setdefault('NODE_BINARY', str(getattr(config, 'NODE_BINARY', '')))
			
 
				-    env.setdefault('TIMEOUT', str(getattr(config, 'TIMEOUT', 60)))
			
 
				-    env.setdefault('CHECK_SSL_VALIDITY', str(getattr(config, 'CHECK_SSL_VALIDITY', True)))
			
 
				-    env.setdefault('USER_AGENT', str(getattr(config, 'USER_AGENT', '')))
			
 
				-    env.setdefault('RESOLUTION', str(getattr(config, 'RESOLUTION', '')))
			
 
				-
			
 
				-    # Pass SEARCH_BACKEND_ENGINE from new-style config
			
 
				-    try:
			
 
				-        from archivebox.config.configset import get_config
			
 
				-        search_config = get_config()
			
 
				-        env.setdefault('SEARCH_BACKEND_ENGINE', str(search_config.get('SEARCH_BACKEND_ENGINE', 'ripgrep')))
			
 
				-    except Exception:
			
 
				-        env.setdefault('SEARCH_BACKEND_ENGINE', 'ripgrep')
			
 
				+    # Export all config values to environment (already merged by get_config())
			
 
				+    for key, value in config.items():
			
 
				+        if value is None:
			
 
				+            continue
			
 
				+        elif isinstance(value, bool):
			
 
				+            env[key] = 'true' if value else 'false'
			
 
				+        elif isinstance(value, (list, dict)):
			
 
				+            env[key] = json.dumps(value)
			
 
				+        else:
			
 
				+            env[key] = str(value)
			
 
				 
			
 
				     # Create output directory if needed
			
 
				     output_dir.mkdir(parents=True, exist_ok=True)
			
@@ -525,31 +516,35 @@ def collect_urls_from_plugins(snapshot_dir: Path) -> List[Dict[str, Any]]:
 
				 def run_hooks(
			
 
				     event_name: str,
			
 
				     output_dir: Path,
			
 
				-    timeout: int = 300,
			
 
				+    config: Dict[str, Any],
			
 
				+    timeout: Optional[int] = None,
			
 
				     stop_on_failure: bool = False,
			
 
				-    config_objects: Optional[List[Any]] = None,
			
 
				     **kwargs: Any
			
 
				 ) -> List[HookResult]:
			
 
				     """
			
 
				     Run all hooks for a given event.
			
 
				 
			
 
				     Args:
			
 
				-        event_name: The event name to trigger (e.g., 'Snapshot__wget')
			
 
				+        event_name: The event name to trigger (e.g., 'Snapshot', 'Crawl', 'Binary')
			
 
				         output_dir: Working directory for hook scripts
			
 
				-        timeout: Maximum execution time per hook
			
 
				+        config: Merged config dict from get_config(crawl=..., snapshot=...) - REQUIRED
			
 
				+        timeout: Maximum execution time per hook (None = auto-detect from plugin config)
			
 
				         stop_on_failure: If True, stop executing hooks after first failure
			
 
				-        config_objects: Optional list of objects with .config JSON fields
			
 
				-                       (e.g., [crawl, snapshot] - later items have higher priority)
			
 
				         **kwargs: Arguments passed to each hook script
			
 
				 
			
 
				     Returns:
			
 
				         List of results from each hook execution
			
 
				+
			
 
				+    Example:
			
 
				+        from archivebox.config.configset import get_config
			
 
				+        config = get_config(crawl=my_crawl, snapshot=my_snapshot)
			
 
				+        results = run_hooks('Snapshot', output_dir, config=config, url=url, snapshot_id=id)
			
 
				     """
			
 
				-    hooks = discover_hooks(event_name)
			
 
				+    hooks = discover_hooks(event_name, config=config)
			
 
				     results = []
			
 
				 
			
 
				     for hook in hooks:
			
 
				-        result = run_hook(hook, output_dir, timeout=timeout, config_objects=config_objects, **kwargs)
			
 
				+        result = run_hook(hook, output_dir, config=config, timeout=timeout, **kwargs)
			
 
				 
			
 
				         # Background hooks return None - skip adding to results
			
 
				         if result is None:
			
@@ -638,24 +633,44 @@ EXTRACTOR_INDEXING_PRECEDENCE = [
 
				 ]
			
 
				 
			
 
				 
			
 
				-def get_enabled_plugins(config: Optional[Dict] = None) -> List[str]:
			
 
				+def get_enabled_plugins(config: Optional[Dict[str, Any]] = None) -> List[str]:
			
 
				     """
			
 
				     Get the list of enabled plugins based on config and available hooks.
			
 
				 
			
 
				-    Checks for ENABLED_PLUGINS (or legacy ENABLED_EXTRACTORS) in config,
			
 
				-    falls back to discovering available hooks from the plugins directory.
			
 
				+    Filters plugins by USE_/SAVE_ flags. Only returns plugins that are enabled.
			
 
				+
			
 
				+    Args:
			
 
				+        config: Merged config dict from get_config() - if None, uses global config
			
 
				 
			
 
				-    Returns plugin names sorted alphabetically (numeric prefix controls order).
			
 
				+    Returns:
			
 
				+        Plugin names sorted alphabetically (numeric prefix controls order).
			
 
				+
			
 
				+    Example:
			
 
				+        from archivebox.config.configset import get_config
			
 
				+        config = get_config(crawl=my_crawl, snapshot=my_snapshot)
			
 
				+        enabled = get_enabled_plugins(config)  # ['wget', 'media', 'chrome', ...]
			
 
				     """
			
 
				-    if config:
			
 
				-        # Support both new and legacy config keys
			
 
				-        if 'ENABLED_PLUGINS' in config:
			
 
				-            return config['ENABLED_PLUGINS']
			
 
				-        if 'ENABLED_EXTRACTORS' in config:
			
 
				-            return config['ENABLED_EXTRACTORS']
			
 
				+    # Get merged config if not provided
			
 
				+    if config is None:
			
 
				+        from archivebox.config.configset import get_config
			
 
				+        config = get_config(scope='global')
			
 
				+
			
 
				+    # Support explicit ENABLED_PLUGINS override (legacy)
			
 
				+    if 'ENABLED_PLUGINS' in config:
			
 
				+        return config['ENABLED_PLUGINS']
			
 
				+    if 'ENABLED_EXTRACTORS' in config:
			
 
				+        return config['ENABLED_EXTRACTORS']
			
 
				+
			
 
				+    # Filter all plugins by enabled status
			
 
				+    all_plugins = get_plugins()
			
 
				+    enabled = []
			
 
				+
			
 
				+    for plugin in all_plugins:
			
 
				+        plugin_config = get_plugin_special_config(plugin, config)
			
 
				+        if plugin_config['enabled']:
			
 
				+            enabled.append(plugin)
			
 
				 
			
 
				-    # Discover from hooks - this is the source of truth
			
 
				-    return get_plugins()
			
 
				+    return enabled
			
 
				 
			
 
				 
			
 
				 def discover_plugins_that_provide_interface(
			
@@ -822,37 +837,6 @@ def discover_plugin_configs() -> Dict[str, Dict[str, Any]]:
 
				     return configs
			
 
				 
			
 
				 
			
 
				-def get_merged_config_schema() -> Dict[str, Any]:
			
 
				-    """
			
 
				-    Get a merged JSONSchema combining all plugin config schemas.
			
 
				-
			
 
				-    This creates a single schema that can validate all plugin config keys.
			
 
				-    Useful for validating the complete configuration at startup.
			
 
				-
			
 
				-    Returns:
			
 
				-        Combined JSONSchema with all plugin properties merged.
			
 
				-    """
			
 
				-    plugin_configs = discover_plugin_configs()
			
 
				-
			
 
				-    merged_properties = {}
			
 
				-    for plugin_name, schema in plugin_configs.items():
			
 
				-        properties = schema.get('properties', {})
			
 
				-        for key, prop_schema in properties.items():
			
 
				-            if key in merged_properties:
			
 
				-                # Key already exists from another plugin - log warning but keep first
			
 
				-                import sys
			
 
				-                print(f"Warning: Config key '{key}' defined in multiple plugins, using first definition", file=sys.stderr)
			
 
				-                continue
			
 
				-            merged_properties[key] = prop_schema
			
 
				-
			
 
				-    return {
			
 
				-        "$schema": "http://json-schema.org/draft-07/schema#",
			
 
				-        "type": "object",
			
 
				-        "additionalProperties": True,  # Allow unknown keys (core config, etc.)
			
 
				-        "properties": merged_properties,
			
 
				-    }
			
 
				-
			
 
				-
			
 
				 def get_config_defaults_from_plugins() -> Dict[str, Any]:
			
 
				     """
			
 
				     Get default values for all plugin config options.
			
@@ -873,173 +857,63 @@ def get_config_defaults_from_plugins() -> Dict[str, Any]:
 
				     return defaults
			
 
				 
			
 
				 
			
 
				-def resolve_config_value(
			
 
				-    key: str,
			
 
				-    prop_schema: Dict[str, Any],
			
 
				-    env_vars: Dict[str, str],
			
 
				-    config_file: Dict[str, str],
			
 
				-    overrides: Optional[Dict[str, Any]] = None,
			
 
				-) -> Any:
			
 
				+def get_plugin_special_config(plugin_name: str, config: Dict[str, Any]) -> Dict[str, Any]:
			
 
				     """
			
 
				-    Resolve a single config value following the hierarchy and schema rules.
			
 
				-
			
 
				-    Resolution order (later overrides earlier):
			
 
				-        1. Schema default
			
 
				-        2. x-fallback (global config key)
			
 
				-        3. Config file (ArchiveBox.conf)
			
 
				-        4. Environment variables (including x-aliases)
			
 
				-        5. Explicit overrides (User/Crawl/Snapshot config)
			
 
				-
			
 
				-    Args:
			
 
				-        key: Config key name (e.g., 'WGET_TIMEOUT')
			
 
				-        prop_schema: JSONSchema property definition for this key
			
 
				-        env_vars: Environment variables dict
			
 
				-        config_file: Config file values dict
			
 
				-        overrides: Optional override values (from User/Crawl/Snapshot)
			
 
				-
			
 
				-    Returns:
			
 
				-        Resolved value with appropriate type coercion.
			
 
				-    """
			
 
				-    value = None
			
 
				-    prop_type = prop_schema.get('type', 'string')
			
 
				-
			
 
				-    # 1. Start with schema default
			
 
				-    if 'default' in prop_schema:
			
 
				-        value = prop_schema['default']
			
 
				-
			
 
				-    # 2. Check x-fallback (global config key)
			
 
				-    fallback_key = prop_schema.get('x-fallback')
			
 
				-    if fallback_key:
			
 
				-        if fallback_key in env_vars:
			
 
				-            value = env_vars[fallback_key]
			
 
				-        elif fallback_key in config_file:
			
 
				-            value = config_file[fallback_key]
			
 
				-
			
 
				-    # 3. Check config file for main key
			
 
				-    if key in config_file:
			
 
				-        value = config_file[key]
			
 
				-
			
 
				-    # 4. Check environment variables (main key and aliases)
			
 
				-    keys_to_check = [key] + prop_schema.get('x-aliases', [])
			
 
				-    for check_key in keys_to_check:
			
 
				-        if check_key in env_vars:
			
 
				-            value = env_vars[check_key]
			
 
				-            break
			
 
				-
			
 
				-    # 5. Apply explicit overrides
			
 
				-    if overrides and key in overrides:
			
 
				-        value = overrides[key]
			
 
				+    Extract special config keys for a plugin following naming conventions.
			
 
				 
			
 
				-    # Type coercion for env var strings
			
 
				-    if value is not None and isinstance(value, str):
			
 
				-        value = coerce_config_value(value, prop_type, prop_schema)
			
 
				-
			
 
				-    return value
			
 
				-
			
 
				-
			
 
				-def coerce_config_value(value: str, prop_type: str, prop_schema: Dict[str, Any]) -> Any:
			
 
				-    """
			
 
				-    Coerce a string value to the appropriate type based on schema.
			
 
				-
			
 
				-    Args:
			
 
				-        value: String value to coerce
			
 
				-        prop_type: JSONSchema type ('boolean', 'integer', 'number', 'array', 'string')
			
 
				-        prop_schema: Full property schema (for array item types, etc.)
			
 
				-
			
 
				-    Returns:
			
 
				-        Coerced value of appropriate type.
			
 
				-    """
			
 
				-    if prop_type == 'boolean':
			
 
				-        return value.lower() in ('true', '1', 'yes', 'on')
			
 
				-    elif prop_type == 'integer':
			
 
				-        try:
			
 
				-            return int(value)
			
 
				-        except ValueError:
			
 
				-            return prop_schema.get('default', 0)
			
 
				-    elif prop_type == 'number':
			
 
				-        try:
			
 
				-            return float(value)
			
 
				-        except ValueError:
			
 
				-            return prop_schema.get('default', 0.0)
			
 
				-    elif prop_type == 'array':
			
 
				-        # Try JSON parse first, fall back to comma-separated
			
 
				-        try:
			
 
				-            return json.loads(value)
			
 
				-        except json.JSONDecodeError:
			
 
				-            return [v.strip() for v in value.split(',') if v.strip()]
			
 
				-    else:
			
 
				-        return value
			
 
				-
			
 
				-
			
 
				-def get_flat_plugin_config(
			
 
				-    env_vars: Optional[Dict[str, str]] = None,
			
 
				-    config_file: Optional[Dict[str, str]] = None,
			
 
				-    overrides: Optional[Dict[str, Any]] = None,
			
 
				-) -> Dict[str, Any]:
			
 
				-    """
			
 
				-    Get all plugin config values resolved according to hierarchy.
			
 
				+    ArchiveBox recognizes 3 special config key patterns per plugin:
			
 
				+        - {PLUGIN}_ENABLED: Enable/disable toggle (default True)
			
 
				+        - {PLUGIN}_TIMEOUT: Plugin-specific timeout (fallback to TIMEOUT, default 300)
			
 
				+        - {PLUGIN}_BINARY: Primary binary path (default to plugin_name)
			
 
				 
			
 
				-    This is the main function for getting plugin configuration.
			
 
				-    It discovers all plugin schemas and resolves each config key.
			
 
				+    These allow ArchiveBox to:
			
 
				+        - Skip disabled plugins (optimization)
			
 
				+        - Enforce plugin-specific timeouts automatically
			
 
				+        - Discover plugin binaries for validation
			
 
				 
			
 
				     Args:
			
 
				-        env_vars: Environment variables (defaults to os.environ)
			
 
				-        config_file: Config file values (from ArchiveBox.conf)
			
 
				-        overrides: Override values (from User/Crawl/Snapshot config fields)
			
 
				+        plugin_name: Plugin name (e.g., 'wget', 'media', 'chrome')
			
 
				+        config: Merged config dict from get_config() (properly merges file, env, machine, crawl, snapshot)
			
 
				 
			
 
				     Returns:
			
 
				-        Flat dict of all resolved config values.
			
 
				-        e.g., {'SAVE_WGET': True, 'WGET_TIMEOUT': 60, ...}
			
 
				-    """
			
 
				-    if env_vars is None:
			
 
				-        env_vars = dict(os.environ)
			
 
				-    if config_file is None:
			
 
				-        config_file = {}
			
 
				-
			
 
				-    plugin_configs = discover_plugin_configs()
			
 
				-    flat_config = {}
			
 
				-
			
 
				-    for plugin_name, schema in plugin_configs.items():
			
 
				-        properties = schema.get('properties', {})
			
 
				-        for key, prop_schema in properties.items():
			
 
				-            flat_config[key] = resolve_config_value(
			
 
				-                key, prop_schema, env_vars, config_file, overrides
			
 
				-            )
			
 
				-
			
 
				-    return flat_config
			
 
				-
			
 
				-
			
 
				-def export_plugin_config_to_env(
			
 
				-    config: Dict[str, Any],
			
 
				-    env: Optional[Dict[str, str]] = None,
			
 
				-) -> Dict[str, str]:
			
 
				-    """
			
 
				-    Export plugin config values to environment variable format.
			
 
				-
			
 
				-    Converts all values to strings suitable for subprocess environment.
			
 
				-    Arrays are JSON-encoded.
			
 
				-
			
 
				-    Args:
			
 
				-        config: Flat config dict from get_flat_plugin_config()
			
 
				-        env: Optional existing env dict to update (creates new if None)
			
 
				+        Dict with standardized keys:
			
 
				+            {
			
 
				+                'enabled': True,         # bool
			
 
				+                'timeout': 60,           # int, seconds
			
 
				+                'binary': 'wget',        # str, path or name
			
 
				+            }
			
 
				 
			
 
				-    Returns:
			
 
				-        Environment dict with config values as strings.
			
 
				+    Examples:
			
 
				+        >>> from archivebox.config.configset import get_config
			
 
				+        >>> config = get_config(crawl=my_crawl, snapshot=my_snapshot)
			
 
				+        >>> get_plugin_special_config('wget', config)
			
 
				+        {'enabled': True, 'timeout': 120, 'binary': '/usr/bin/wget'}
			
 
				     """
			
 
				-    if env is None:
			
 
				-        env = {}
			
 
				-
			
 
				-    for key, value in config.items():
			
 
				-        if value is None:
			
 
				-            continue
			
 
				-        elif isinstance(value, bool):
			
 
				-            env[key] = 'true' if value else 'false'
			
 
				-        elif isinstance(value, (list, dict)):
			
 
				-            env[key] = json.dumps(value)
			
 
				-        else:
			
 
				-            env[key] = str(value)
			
 
				+    plugin_upper = plugin_name.upper()
			
 
				+
			
 
				+    # 1. Enabled: PLUGINNAME_ENABLED (default True)
			
 
				+    # Old names (USE_*, SAVE_*) are aliased in config.json via x-aliases
			
 
				+    enabled_key = f'{plugin_upper}_ENABLED'
			
 
				+    enabled = config.get(enabled_key)
			
 
				+    if enabled is None:
			
 
				+        enabled = True
			
 
				+    elif isinstance(enabled, str):
			
 
				+        # Handle string values from config file ("true"/"false")
			
 
				+        enabled = enabled.lower() not in ('false', '0', 'no', '')
			
 
				+
			
 
				+    # 2. Timeout: PLUGINNAME_TIMEOUT (fallback to TIMEOUT, default 300)
			
 
				+    timeout_key = f'{plugin_upper}_TIMEOUT'
			
 
				+    timeout = config.get(timeout_key) or config.get('TIMEOUT', 300)
			
 
				+
			
 
				+    # 3. Binary: PLUGINNAME_BINARY (default to plugin_name)
			
 
				+    binary_key = f'{plugin_upper}_BINARY'
			
 
				+    binary = config.get(binary_key, plugin_name)
			
 
				 
			
 
				-    return env
			
 
				+    return {
			
 
				+        'enabled': bool(enabled),
			
 
				+        'timeout': int(timeout),
			
 
				+        'binary': str(binary),
			
 
				+    }
			
 
				 
			
 
				 
			
 
				 # =============================================================================
			
@@ -1233,7 +1107,7 @@ def find_binary_for_cmd(cmd: List[str], machine_id: str) -> Optional[str]:
 
				     if not cmd:
			
 
				         return None
			
 
				 
			
 
				-    from machine.models import Binary
			
 
				+    from archivebox.machine.models import Binary
			
 
				 
			
 
				     bin_path_or_name = cmd[0] if isinstance(cmd, list) else cmd
			
 
				 
			
@@ -1266,7 +1140,7 @@ def create_model_record(record: Dict[str, Any]) -> Any:
 
				     Returns:
			
 
				         Created/updated model instance, or None if type unknown
			
 
				     """
			
 
				-    from machine.models import Binary, Machine
			
 
				+    from archivebox.machine.models import Binary, Machine
			
 
				 
			
 
				     record_type = record.pop('type', None)
			
 
				     if not record_type:
			
@@ -1349,25 +1223,25 @@ def process_hook_records(records: List[Dict[str, Any]], overrides: Dict[str, Any
 
				         try:
			
 
				             # Dispatch to appropriate model's from_jsonl() method
			
 
				             if record_type == 'Snapshot':
			
 
				-                from core.models import Snapshot
			
 
				+                from archivebox.core.models import Snapshot
			
 
				                 obj = Snapshot.from_jsonl(record.copy(), overrides)
			
 
				                 if obj:
			
 
				                     stats['Snapshot'] = stats.get('Snapshot', 0) + 1
			
 
				 
			
 
				             elif record_type == 'Tag':
			
 
				-                from core.models import Tag
			
 
				+                from archivebox.core.models import Tag
			
 
				                 obj = Tag.from_jsonl(record.copy(), overrides)
			
 
				                 if obj:
			
 
				                     stats['Tag'] = stats.get('Tag', 0) + 1
			
 
				 
			
 
				             elif record_type == 'Binary':
			
 
				-                from machine.models import Binary
			
 
				+                from archivebox.machine.models import Binary
			
 
				                 obj = Binary.from_jsonl(record.copy(), overrides)
			
 
				                 if obj:
			
 
				                     stats['Binary'] = stats.get('Binary', 0) + 1
			
 
				 
			
 
				             elif record_type == 'Machine':
			
 
				-                from machine.models import Machine
			
 
				+                from archivebox.machine.models import Machine
			
 
				                 obj = Machine.from_jsonl(record.copy(), overrides)
			
 
				                 if obj:
			
 
				                     stats['Machine'] = stats.get('Machine', 0) + 1
			
--- a/archivebox/machine/admin.py
+++ b/archivebox/machine/admin.py
@@ -4,7 +4,7 @@ from django.contrib import admin
 
				 from django.utils.html import format_html
			
 
				 
			
 
				 from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin
			
 
				-from machine.models import Machine, NetworkInterface, Binary
			
 
				+from archivebox.machine.models import Machine, NetworkInterface, Binary
			
 
				 
			
 
				 
			
 
				 class MachineAdmin(ConfigEditorMixin, BaseModelAdmin):
			
--- a/archivebox/machine/apps.py
+++ b/archivebox/machine/apps.py
@@ -5,11 +5,11 @@ from django.apps import AppConfig
 
				 
			
 
				 class MachineConfig(AppConfig):
			
 
				     default_auto_field = 'django.db.models.BigAutoField'
			
 
				-    
			
 
				-    name = 'machine'
			
 
				+
			
 
				+    name = 'archivebox.machine'
			
 
				     verbose_name = 'Machine Info'
			
 
				 
			
 
				 
			
 
				 def register_admin(admin_site):
			
 
				-    from machine.admin import register_admin
			
 
				+    from archivebox.machine.admin import register_admin
			
 
				     register_admin(admin_site)
			
--- a/archivebox/machine/migrations/0001_squashed.py
+++ b/archivebox/machine/migrations/0001_squashed.py
@@ -14,9 +14,9 @@ class Migration(migrations.Migration):
 
				 
			
 
				     replaces = [
			
 
				         ('machine', '0001_initial'),
			
 
				-        ('machine', '0002_alter_machine_stats_binary'),
			
 
				-        ('machine', '0003_alter_binary_options_and_more'),
			
 
				-        ('machine', '0004_alter_binary_abspath_and_more'),
			
 
				+        ('machine', '0002_alter_machine_stats_installedbinary'),
			
 
				+        ('machine', '0003_alter_installedbinary_options_and_more'),
			
 
				+        ('machine', '0004_alter_installedbinary_abspath_and_more'),
			
 
				     ]
			
 
				 
			
 
				     dependencies = []
			
@@ -70,22 +70,7 @@ class Migration(migrations.Migration):
 
				                 'unique_together': {('machine', 'ip_public', 'ip_local', 'mac_address', 'dns_server')},
			
 
				             },
			
 
				         ),
			
 
				-        migrations.CreateModel(
			
 
				-            name='Dependency',
			
 
				-            fields=[
			
 
				-                ('id', models.UUIDField(default=uuid4, editable=False, primary_key=True, serialize=False, unique=True)),
			
 
				-                ('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
			
 
				-                ('modified_at', models.DateTimeField(auto_now=True)),
			
 
				-                ('bin_name', models.CharField(db_index=True, max_length=63, unique=True)),
			
 
				-                ('bin_providers', models.CharField(default='*', max_length=127)),
			
 
				-                ('custom_cmds', models.JSONField(blank=True, default=dict)),
			
 
				-                ('config', models.JSONField(blank=True, default=dict)),
			
 
				-            ],
			
 
				-            options={
			
 
				-                'verbose_name': 'Dependency',
			
 
				-                'verbose_name_plural': 'Dependencies',
			
 
				-            },
			
 
				-        ),
			
 
				+        # Dependency model removed - not needed anymore
			
 
				         migrations.CreateModel(
			
 
				             name='Binary',
			
 
				             fields=[
			
@@ -100,7 +85,7 @@ class Migration(migrations.Migration):
 
				                 ('version', models.CharField(blank=True, default=None, max_length=32)),
			
 
				                 ('sha256', models.CharField(blank=True, default=None, max_length=64)),
			
 
				                 ('machine', models.ForeignKey(blank=True, default=None, on_delete=django.db.models.deletion.CASCADE, to='machine.machine')),
			
 
				-                ('dependency', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='binary_set', to='machine.dependency')),
			
 
				+                # dependency FK removed - Dependency model deleted
			
 
				             ],
			
 
				             options={
			
 
				                 'verbose_name': 'Binary',
			
--- a/archivebox/machine/migrations/0002_rename_custom_cmds_to_overrides.py
+++ b/archivebox/machine/migrations/0002_rename_custom_cmds_to_overrides.py
@@ -1,6 +1,8 @@
 
				 # Generated manually on 2025-12-26
			
 
				+# NOTE: This migration is intentionally empty but kept for dependency chain
			
 
				+# The Dependency model was removed in 0004, so all operations have been stripped
			
 
				 
			
 
				-from django.db import migrations, models
			
 
				+from django.db import migrations
			
 
				 
			
 
				 
			
 
				 class Migration(migrations.Migration):
			
@@ -10,29 +12,5 @@ class Migration(migrations.Migration):
 
				     ]
			
 
				 
			
 
				     operations = [
			
 
				-        migrations.RenameField(
			
 
				-            model_name='dependency',
			
 
				-            old_name='custom_cmds',
			
 
				-            new_name='overrides',
			
 
				-        ),
			
 
				-        migrations.AlterField(
			
 
				-            model_name='dependency',
			
 
				-            name='bin_name',
			
 
				-            field=models.CharField(db_index=True, help_text='Binary executable name (e.g., wget, yt-dlp, chromium)', max_length=63, unique=True),
			
 
				-        ),
			
 
				-        migrations.AlterField(
			
 
				-            model_name='dependency',
			
 
				-            name='bin_providers',
			
 
				-            field=models.CharField(default='*', help_text='Comma-separated list of allowed providers: apt,brew,pip,npm,gem,nix,custom or * for any', max_length=127),
			
 
				-        ),
			
 
				-        migrations.AlterField(
			
 
				-            model_name='dependency',
			
 
				-            name='overrides',
			
 
				-            field=models.JSONField(blank=True, default=dict, help_text="JSON map matching abx-pkg Binary.overrides format: {'pip': {'packages': ['pkg']}, 'apt': {'packages': ['pkg']}}"),
			
 
				-        ),
			
 
				-        migrations.AlterField(
			
 
				-            model_name='dependency',
			
 
				-            name='config',
			
 
				-            field=models.JSONField(blank=True, default=dict, help_text='JSON map of env var config to use during install'),
			
 
				-        ),
			
 
				+        # All Dependency operations removed - model deleted in 0004
			
 
				     ]
			
--- a/archivebox/machine/migrations/0003_alter_dependency_id_alter_installedbinary_dependency_and_more.py
+++ b/archivebox/machine/migrations/0003_alter_dependency_id_alter_installedbinary_dependency_and_more.py
@@ -1,8 +1,8 @@
 
				 # Generated by Django 6.0 on 2025-12-28 05:12
			
 
				+# NOTE: This migration is intentionally empty but kept for dependency chain
			
 
				+# The Dependency model was removed in 0004, all operations stripped
			
 
				 
			
 
				-import django.db.models.deletion
			
 
				-from archivebox import uuid_compat
			
 
				-from django.db import migrations, models
			
 
				+from django.db import migrations
			
 
				 
			
 
				 
			
 
				 class Migration(migrations.Migration):
			
@@ -12,34 +12,6 @@ class Migration(migrations.Migration):
 
				     ]
			
 
				 
			
 
				     operations = [
			
 
				-        migrations.AlterField(
			
 
				-            model_name='dependency',
			
 
				-            name='id',
			
 
				-            field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
			
 
				-        ),
			
 
				-        migrations.AlterField(
			
 
				-            model_name='binary',
			
 
				-            name='dependency',
			
 
				-            field=models.ForeignKey(blank=True, help_text='The Dependency this binary satisfies', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='binary_set', to='machine.dependency'),
			
 
				-        ),
			
 
				-        migrations.AlterField(
			
 
				-            model_name='binary',
			
 
				-            name='id',
			
 
				-            field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
			
 
				-        ),
			
 
				-        migrations.AlterField(
			
 
				-            model_name='machine',
			
 
				-            name='config',
			
 
				-            field=models.JSONField(blank=True, default=dict, help_text='Machine-specific config overrides (e.g., resolved binary paths like WGET_BINARY)'),
			
 
				-        ),
			
 
				-        migrations.AlterField(
			
 
				-            model_name='machine',
			
 
				-            name='id',
			
 
				-            field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
			
 
				-        ),
			
 
				-        migrations.AlterField(
			
 
				-            model_name='networkinterface',
			
 
				-            name='id',
			
 
				-            field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
			
 
				-        ),
			
 
				+        # All operations removed - Dependency model deleted in 0004
			
 
				+        # This is a stub migration for users upgrading from old dev versions
			
 
				     ]
			
--- a/archivebox/machine/migrations/0004_drop_dependency_table.py
+++ b/archivebox/machine/migrations/0004_drop_dependency_table.py
@@ -0,0 +1,28 @@
 
				+# Generated migration - removes Dependency model entirely
			
 
				+# NOTE: This is a cleanup migration for users upgrading from old dev versions
			
 
				+# that had the Dependency model. Fresh installs never create this table.
			
 
				+
			
 
				+from django.db import migrations
			
 
				+
			
 
				+
			
 
				+def drop_dependency_table(apps, schema_editor):
			
 
				+    """
			
 
				+    Drop old Dependency table if it exists (from dev versions that had it).
			
 
				+    Safe to run multiple times, safe if table doesn't exist.
			
 
				+
			
 
				+    Does NOT touch machine_binary - that's our current Binary model table!
			
 
				+    """
			
 
				+    schema_editor.execute('DROP TABLE IF EXISTS machine_dependency')
			
 
				+    # Also drop old InstalledBinary table if it somehow still exists
			
 
				+    schema_editor.execute('DROP TABLE IF EXISTS machine_installedbinary')
			
 
				+
			
 
				+
			
 
				+class Migration(migrations.Migration):
			
 
				+
			
 
				+    dependencies = [
			
 
				+        ('machine', '0003_alter_dependency_id_alter_installedbinary_dependency_and_more'),
			
 
				+    ]
			
 
				+
			
 
				+    operations = [
			
 
				+        migrations.RunPython(drop_dependency_table, migrations.RunPython.noop),
			
 
				+    ]
			
--- a/archivebox/machine/migrations/0004_rename_installedbinary_to_binary.py
+++ b/archivebox/machine/migrations/0004_rename_installedbinary_to_binary.py
@@ -1,56 +0,0 @@
 
				-# Generated migration - Clean slate for Binary model
			
 
				-# Drops old InstalledBinary and Dependency tables, creates new Binary table
			
 
				-
			
 
				-from django.db import migrations, models
			
 
				-import django.utils.timezone
			
 
				-import archivebox.uuid_compat
			
 
				-
			
 
				-
			
 
				-def drop_old_tables(apps, schema_editor):
			
 
				-    """Drop old tables using raw SQL"""
			
 
				-    schema_editor.execute('DROP TABLE IF EXISTS machine_installedbinary')
			
 
				-    schema_editor.execute('DROP TABLE IF EXISTS machine_dependency')
			
 
				-    schema_editor.execute('DROP TABLE IF EXISTS machine_binary')  # In case rename happened
			
 
				-
			
 
				-
			
 
				-class Migration(migrations.Migration):
			
 
				-
			
 
				-    dependencies = [
			
 
				-        ('machine', '0003_alter_dependency_id_alter_installedbinary_dependency_and_more'),
			
 
				-    ]
			
 
				-
			
 
				-    operations = [
			
 
				-        # Drop old tables using raw SQL
			
 
				-        migrations.RunPython(drop_old_tables, migrations.RunPython.noop),
			
 
				-
			
 
				-        # Create new Binary model from scratch
			
 
				-        migrations.CreateModel(
			
 
				-            name='Binary',
			
 
				-            fields=[
			
 
				-                ('id', models.UUIDField(default=archivebox.uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True)),
			
 
				-                ('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
			
 
				-                ('modified_at', models.DateTimeField(auto_now=True)),
			
 
				-                ('name', models.CharField(blank=True, db_index=True, default=None, max_length=63)),
			
 
				-                ('binproviders', models.CharField(blank=True, default='env', help_text='Comma-separated list of allowed providers: apt,brew,pip,npm,env', max_length=127)),
			
 
				-                ('overrides', models.JSONField(blank=True, default=dict, help_text="Provider-specific overrides: {'apt': {'packages': ['pkg']}, ...}")),
			
 
				-                ('binprovider', models.CharField(blank=True, default=None, help_text='Provider that successfully installed this binary', max_length=31)),
			
 
				-                ('abspath', models.CharField(blank=True, default=None, max_length=255)),
			
 
				-                ('version', models.CharField(blank=True, default=None, max_length=32)),
			
 
				-                ('sha256', models.CharField(blank=True, default=None, max_length=64)),
			
 
				-                ('status', models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('succeeded', 'Succeeded'), ('failed', 'Failed')], db_index=True, default='queued', max_length=16)),
			
 
				-                ('retry_at', models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, help_text='When to retry this binary installation', null=True)),
			
 
				-                ('output_dir', models.CharField(blank=True, default='', help_text='Directory where installation hook logs are stored', max_length=255)),
			
 
				-                ('num_uses_failed', models.PositiveIntegerField(default=0)),
			
 
				-                ('num_uses_succeeded', models.PositiveIntegerField(default=0)),
			
 
				-                ('machine', models.ForeignKey(blank=True, default=None, on_delete=models.deletion.CASCADE, to='machine.machine')),
			
 
				-            ],
			
 
				-            options={
			
 
				-                'verbose_name': 'Binary',
			
 
				-                'verbose_name_plural': 'Binaries',
			
 
				-            },
			
 
				-        ),
			
 
				-        migrations.AddIndex(
			
 
				-            model_name='binary',
			
 
				-            index=models.Index(fields=['machine', 'name', 'abspath', 'version', 'sha256'], name='machine_bin_machine_idx'),
			
 
				-        ),
			
 
				-    ]
			
--- a/archivebox/machine/models.py
+++ b/archivebox/machine/models.py
@@ -4,11 +4,14 @@ import socket
 
				 from archivebox.uuid_compat import uuid7
			
 
				 from datetime import timedelta
			
 
				 
			
 
				+from statemachine import State, registry
			
 
				+
			
 
				 from django.db import models
			
 
				 from django.utils import timezone
			
 
				 from django.utils.functional import cached_property
			
 
				 
			
 
				 from archivebox.base_models.models import ModelWithHealthStats
			
 
				+from archivebox.workers.models import BaseStateMachine
			
 
				 from .detect import get_host_guid, get_os_info, get_vm_info, get_host_network, get_host_stats
			
 
				 
			
 
				 _CURRENT_MACHINE = None
			
@@ -50,6 +53,9 @@ class Machine(ModelWithHealthStats):
 
				     objects: MachineManager = MachineManager()
			
 
				     networkinterface_set: models.Manager['NetworkInterface']
			
 
				 
			
 
				+    class Meta:
			
 
				+        app_label = 'machine'
			
 
				+
			
 
				     @classmethod
			
 
				     def current(cls) -> 'Machine':
			
 
				         global _CURRENT_MACHINE
			
@@ -115,6 +121,7 @@ class NetworkInterface(ModelWithHealthStats):
 
				     objects: NetworkInterfaceManager = NetworkInterfaceManager()
			
 
				 
			
 
				     class Meta:
			
 
				+        app_label = 'machine'
			
 
				         unique_together = (('machine', 'ip_public', 'ip_local', 'mac_address', 'dns_server'),)
			
 
				 
			
 
				     @classmethod
			
@@ -206,11 +213,12 @@ class Binary(ModelWithHealthStats):
 
				     num_uses_failed = models.PositiveIntegerField(default=0)
			
 
				     num_uses_succeeded = models.PositiveIntegerField(default=0)
			
 
				 
			
 
				-    state_machine_name: str = 'machine.statemachines.BinaryMachine'
			
 
				+    state_machine_name: str = 'machine.models.BinaryMachine'
			
 
				 
			
 
				     objects: BinaryManager = BinaryManager()
			
 
				 
			
 
				     class Meta:
			
 
				+        app_label = 'machine'
			
 
				         verbose_name = 'Binary'
			
 
				         verbose_name_plural = 'Binaries'
			
 
				         unique_together = (('machine', 'name', 'abspath', 'version', 'sha256'),)
			
@@ -302,9 +310,9 @@ class Binary(ModelWithHealthStats):
 
				         DATA_DIR = getattr(settings, 'DATA_DIR', Path.cwd())
			
 
				         return Path(DATA_DIR) / 'machines' / str(self.machine_id) / 'binaries' / self.name / str(self.id)
			
 
				 
			
 
				-    def update_for_workers(self, **kwargs):
			
 
				+    def update_and_requeue(self, **kwargs):
			
 
				         """
			
 
				-        Update binary fields for worker state machine.
			
 
				+        Update binary fields and requeue for worker state machine.
			
 
				 
			
 
				         Sets modified_at to ensure workers pick up changes.
			
 
				         Always saves the model after updating.
			
@@ -325,6 +333,10 @@ class Binary(ModelWithHealthStats):
 
				         """
			
 
				         import json
			
 
				         from archivebox.hooks import discover_hooks, run_hook
			
 
				+        from archivebox.config.configset import get_config
			
 
				+
			
 
				+        # Get merged config (Binary doesn't have crawl/snapshot context)
			
 
				+        config = get_config(scope='global')
			
 
				 
			
 
				         # Create output directory
			
 
				         output_dir = self.OUTPUT_DIR
			
@@ -333,7 +345,7 @@ class Binary(ModelWithHealthStats):
 
				         self.save()
			
 
				 
			
 
				         # Discover ALL on_Binary__install_* hooks
			
 
				-        hooks = discover_hooks('Binary')
			
 
				+        hooks = discover_hooks('Binary', config=config)
			
 
				         if not hooks:
			
 
				             self.status = self.StatusChoices.FAILED
			
 
				             self.save()
			
@@ -361,7 +373,8 @@ class Binary(ModelWithHealthStats):
 
				             result = run_hook(
			
 
				                 hook,
			
 
				                 output_dir=plugin_output_dir,
			
 
				-                timeout=600,  # 10 min timeout
			
 
				+                config=config,
			
 
				+                timeout=600,  # 10 min timeout for binary installation
			
 
				                 **hook_kwargs
			
 
				             )
			
 
				 
			
@@ -420,3 +433,128 @@ class Binary(ModelWithHealthStats):
 
				                 kill_process(pid_file)
			
 
				 
			
 
				 
			
 
				+# =============================================================================
			
 
				+# Binary State Machine
			
 
				+# =============================================================================
			
 
				+
			
 
				+class BinaryMachine(BaseStateMachine, strict_states=True):
			
 
				+    """
			
 
				+    State machine for managing Binary installation lifecycle.
			
 
				+
			
 
				+    Hook Lifecycle:
			
 
				+    ┌─────────────────────────────────────────────────────────────┐
			
 
				+    │ QUEUED State                                                │
			
 
				+    │  • Binary needs to be installed                             │
			
 
				+    └─────────────────────────────────────────────────────────────┘
			
 
				+                            ↓ tick() when can_start()
			
 
				+    ┌─────────────────────────────────────────────────────────────┐
			
 
				+    │ STARTED State → enter_started()                             │
			
 
				+    │  1. binary.run()                                            │
			
 
				+    │     • discover_hooks('Binary') → all on_Binary__install_*   │
			
 
				+    │     • Try each provider hook in sequence:                   │
			
 
				+    │       - run_hook(script, output_dir, ...)                   │
			
 
				+    │       - If returncode == 0:                                 │
			
 
				+    │         * Read stdout.log                                   │
			
 
				+    │         * Parse JSONL for 'Binary' record with abspath      │
			
 
				+    │         * Update self: abspath, version, sha256, provider   │
			
 
				+    │         * Set status=SUCCEEDED, RETURN                      │
			
 
				+    │     • If no hook succeeds: set status=FAILED                │
			
 
				+    └─────────────────────────────────────────────────────────────┘
			
 
				+                            ↓ tick() checks status
			
 
				+    ┌─────────────────────────────────────────────────────────────┐
			
 
				+    │ SUCCEEDED / FAILED                                          │
			
 
				+    │  • Set by binary.run() based on hook results                │
			
 
				+    │  • Health stats incremented (num_uses_succeeded/failed)     │
			
 
				+    └─────────────────────────────────────────────────────────────┘
			
 
				+    """
			
 
				+
			
 
				+    model_attr_name = 'binary'
			
 
				+
			
 
				+    # States
			
 
				+    queued = State(value=Binary.StatusChoices.QUEUED, initial=True)
			
 
				+    started = State(value=Binary.StatusChoices.STARTED)
			
 
				+    succeeded = State(value=Binary.StatusChoices.SUCCEEDED, final=True)
			
 
				+    failed = State(value=Binary.StatusChoices.FAILED, final=True)
			
 
				+
			
 
				+    # Tick Event - transitions based on conditions
			
 
				+    tick = (
			
 
				+        queued.to.itself(unless='can_start') |
			
 
				+        queued.to(started, cond='can_start') |
			
 
				+        started.to.itself(unless='is_finished') |
			
 
				+        started.to(succeeded, cond='is_succeeded') |
			
 
				+        started.to(failed, cond='is_failed')
			
 
				+    )
			
 
				+
			
 
				+    def can_start(self) -> bool:
			
 
				+        """Check if binary installation can start."""
			
 
				+        return bool(self.binary.name and self.binary.binproviders)
			
 
				+
			
 
				+    def is_succeeded(self) -> bool:
			
 
				+        """Check if installation succeeded (status was set by run())."""
			
 
				+        return self.binary.status == Binary.StatusChoices.SUCCEEDED
			
 
				+
			
 
				+    def is_failed(self) -> bool:
			
 
				+        """Check if installation failed (status was set by run())."""
			
 
				+        return self.binary.status == Binary.StatusChoices.FAILED
			
 
				+
			
 
				+    def is_finished(self) -> bool:
			
 
				+        """Check if installation has completed (success or failure)."""
			
 
				+        return self.binary.status in (
			
 
				+            Binary.StatusChoices.SUCCEEDED,
			
 
				+            Binary.StatusChoices.FAILED,
			
 
				+        )
			
 
				+
			
 
				+    @queued.enter
			
 
				+    def enter_queued(self):
			
 
				+        """Binary is queued for installation."""
			
 
				+        self.binary.update_and_requeue(
			
 
				+            retry_at=timezone.now(),
			
 
				+            status=Binary.StatusChoices.QUEUED,
			
 
				+        )
			
 
				+
			
 
				+    @started.enter
			
 
				+    def enter_started(self):
			
 
				+        """Start binary installation."""
			
 
				+        # Lock the binary while installation runs
			
 
				+        self.binary.update_and_requeue(
			
 
				+            retry_at=timezone.now() + timedelta(seconds=300),  # 5 min timeout for installation
			
 
				+            status=Binary.StatusChoices.STARTED,
			
 
				+        )
			
 
				+
			
 
				+        # Run installation hooks
			
 
				+        self.binary.run()
			
 
				+
			
 
				+        # Save updated status (run() updates status to succeeded/failed)
			
 
				+        self.binary.save()
			
 
				+
			
 
				+    @succeeded.enter
			
 
				+    def enter_succeeded(self):
			
 
				+        """Binary installed successfully."""
			
 
				+        self.binary.update_and_requeue(
			
 
				+            retry_at=None,
			
 
				+            status=Binary.StatusChoices.SUCCEEDED,
			
 
				+        )
			
 
				+
			
 
				+        # Increment health stats
			
 
				+        self.binary.increment_health_stats(success=True)
			
 
				+
			
 
				+    @failed.enter
			
 
				+    def enter_failed(self):
			
 
				+        """Binary installation failed."""
			
 
				+        self.binary.update_and_requeue(
			
 
				+            retry_at=None,
			
 
				+            status=Binary.StatusChoices.FAILED,
			
 
				+        )
			
 
				+
			
 
				+        # Increment health stats
			
 
				+        self.binary.increment_health_stats(success=False)
			
 
				+
			
 
				+
			
 
				+# =============================================================================
			
 
				+# State Machine Registration
			
 
				+# =============================================================================
			
 
				+
			
 
				+# Manually register state machines with python-statemachine registry
			
 
				+registry.register(BinaryMachine)
			
 
				+
			
 
				+
			
--- a/archivebox/machine/statemachines.py
+++ b/archivebox/machine/statemachines.py
@@ -1,112 +0,0 @@
 
				-__package__ = 'archivebox.machine'
			
 
				-
			
 
				-from datetime import timedelta
			
 
				-from django.utils import timezone
			
 
				-from django.db.models import F
			
 
				-
			
 
				-from statemachine import State, StateMachine
			
 
				-
			
 
				-from machine.models import Binary
			
 
				-
			
 
				-
			
 
				-class BinaryMachine(StateMachine, strict_states=True):
			
 
				-    """
			
 
				-    State machine for managing Binary installation lifecycle.
			
 
				-
			
 
				-    Follows the unified pattern used by Crawl, Snapshot, and ArchiveResult:
			
 
				-    - queued: Binary needs to be installed
			
 
				-    - started: Installation hooks are running
			
 
				-    - succeeded: Binary installed successfully (abspath, version, sha256 populated)
			
 
				-    - failed: Installation failed permanently
			
 
				-    """
			
 
				-
			
 
				-    model: Binary
			
 
				-
			
 
				-    # States
			
 
				-    queued = State(value=Binary.StatusChoices.QUEUED, initial=True)
			
 
				-    started = State(value=Binary.StatusChoices.STARTED)
			
 
				-    succeeded = State(value=Binary.StatusChoices.SUCCEEDED, final=True)
			
 
				-    failed = State(value=Binary.StatusChoices.FAILED, final=True)
			
 
				-
			
 
				-    # Tick Event - transitions based on conditions
			
 
				-    tick = (
			
 
				-        queued.to.itself(unless='can_start') |
			
 
				-        queued.to(started, cond='can_start') |
			
 
				-        started.to.itself(unless='is_finished') |
			
 
				-        started.to(succeeded, cond='is_succeeded') |
			
 
				-        started.to(failed, cond='is_failed')
			
 
				-    )
			
 
				-
			
 
				-    def __init__(self, binary, *args, **kwargs):
			
 
				-        self.binary = binary
			
 
				-        super().__init__(binary, *args, **kwargs)
			
 
				-
			
 
				-    def __repr__(self) -> str:
			
 
				-        return f'Binary[{self.binary.id}]'
			
 
				-
			
 
				-    def __str__(self) -> str:
			
 
				-        return self.__repr__()
			
 
				-
			
 
				-    def can_start(self) -> bool:
			
 
				-        """Check if binary installation can start."""
			
 
				-        return bool(self.binary.name and self.binary.binproviders)
			
 
				-
			
 
				-    def is_succeeded(self) -> bool:
			
 
				-        """Check if installation succeeded (status was set by run())."""
			
 
				-        return self.binary.status == Binary.StatusChoices.SUCCEEDED
			
 
				-
			
 
				-    def is_failed(self) -> bool:
			
 
				-        """Check if installation failed (status was set by run())."""
			
 
				-        return self.binary.status == Binary.StatusChoices.FAILED
			
 
				-
			
 
				-    def is_finished(self) -> bool:
			
 
				-        """Check if installation has completed (success or failure)."""
			
 
				-        return self.binary.status in (
			
 
				-            Binary.StatusChoices.SUCCEEDED,
			
 
				-            Binary.StatusChoices.FAILED,
			
 
				-        )
			
 
				-
			
 
				-    @queued.enter
			
 
				-    def enter_queued(self):
			
 
				-        """Binary is queued for installation."""
			
 
				-        self.binary.update_for_workers(
			
 
				-            retry_at=timezone.now(),
			
 
				-            status=Binary.StatusChoices.QUEUED,
			
 
				-        )
			
 
				-
			
 
				-    @started.enter
			
 
				-    def enter_started(self):
			
 
				-        """Start binary installation."""
			
 
				-        # Lock the binary while installation runs
			
 
				-        self.binary.update_for_workers(
			
 
				-            retry_at=timezone.now() + timedelta(seconds=300),  # 5 min timeout for installation
			
 
				-            status=Binary.StatusChoices.STARTED,
			
 
				-        )
			
 
				-
			
 
				-        # Run installation hooks
			
 
				-        self.binary.run()
			
 
				-
			
 
				-        # Save updated status (run() updates status to succeeded/failed)
			
 
				-        self.binary.save()
			
 
				-
			
 
				-    @succeeded.enter
			
 
				-    def enter_succeeded(self):
			
 
				-        """Binary installed successfully."""
			
 
				-        self.binary.update_for_workers(
			
 
				-            retry_at=None,
			
 
				-            status=Binary.StatusChoices.SUCCEEDED,
			
 
				-        )
			
 
				-
			
 
				-        # Increment health stats
			
 
				-        Binary.objects.filter(pk=self.binary.pk).update(num_uses_succeeded=F('num_uses_succeeded') + 1)
			
 
				-
			
 
				-    @failed.enter
			
 
				-    def enter_failed(self):
			
 
				-        """Binary installation failed."""
			
 
				-        self.binary.update_for_workers(
			
 
				-            retry_at=None,
			
 
				-            status=Binary.StatusChoices.FAILED,
			
 
				-        )
			
 
				-
			
 
				-        # Increment health stats
			
 
				-        Binary.objects.filter(pk=self.binary.pk).update(num_uses_failed=F('num_uses_failed') + 1)
			
--- a/archivebox/misc/jsonl.py
+++ b/archivebox/misc/jsonl.py
@@ -250,68 +250,13 @@ def process_records(
 
				                 yield result
			
 
				 
			
 
				 
			
 
				-def get_or_create_snapshot(record: Dict[str, Any], created_by_id: Optional[int] = None):
			
 
				-    """
			
 
				-    Get or create a Snapshot from a JSONL record.
			
 
				-
			
 
				-    Returns the Snapshot instance.
			
 
				-    """
			
 
				-    from core.models import Snapshot
			
 
				-    from archivebox.base_models.models import get_or_create_system_user_pk
			
 
				-    from archivebox.misc.util import parse_date
			
 
				-
			
 
				-    created_by_id = created_by_id or get_or_create_system_user_pk()
			
 
				-
			
 
				-    # Extract fields from record
			
 
				-    url = record.get('url')
			
 
				-    if not url:
			
 
				-        raise ValueError("Record missing required 'url' field")
			
 
				-
			
 
				-    title = record.get('title')
			
 
				-    tags_str = record.get('tags', '')
			
 
				-    bookmarked_at = record.get('bookmarked_at')
			
 
				-    depth = record.get('depth', 0)
			
 
				-    crawl_id = record.get('crawl_id')
			
 
				-    parent_snapshot_id = record.get('parent_snapshot_id')
			
 
				-
			
 
				-    # Parse bookmarked_at if string
			
 
				-    if bookmarked_at and isinstance(bookmarked_at, str):
			
 
				-        bookmarked_at = parse_date(bookmarked_at)
			
 
				-
			
 
				-    # Use the manager's create_or_update_from_dict method
			
 
				-    snapshot = Snapshot.objects.create_or_update_from_dict(
			
 
				-        {'url': url, 'title': title, 'tags': tags_str},
			
 
				-        created_by_id=created_by_id
			
 
				-    )
			
 
				-
			
 
				-    # Update additional fields if provided
			
 
				-    update_fields = []
			
 
				-    if depth is not None and snapshot.depth != depth:
			
 
				-        snapshot.depth = depth
			
 
				-        update_fields.append('depth')
			
 
				-    if parent_snapshot_id and str(snapshot.parent_snapshot_id) != str(parent_snapshot_id):
			
 
				-        snapshot.parent_snapshot_id = parent_snapshot_id
			
 
				-        update_fields.append('parent_snapshot_id')
			
 
				-    if bookmarked_at and snapshot.bookmarked_at != bookmarked_at:
			
 
				-        snapshot.bookmarked_at = bookmarked_at
			
 
				-        update_fields.append('bookmarked_at')
			
 
				-    if crawl_id and str(snapshot.crawl_id) != str(crawl_id):
			
 
				-        snapshot.crawl_id = crawl_id
			
 
				-        update_fields.append('crawl_id')
			
 
				-
			
 
				-    if update_fields:
			
 
				-        snapshot.save(update_fields=update_fields + ['modified_at'])
			
 
				-
			
 
				-    return snapshot
			
 
				-
			
 
				-
			
 
				 def get_or_create_tag(record: Dict[str, Any]):
			
 
				     """
			
 
				     Get or create a Tag from a JSONL record.
			
 
				 
			
 
				     Returns the Tag instance.
			
 
				     """
			
 
				-    from core.models import Tag
			
 
				+    from archivebox.core.models import Tag
			
 
				 
			
 
				     name = record.get('name')
			
 
				     if not name:
			
@@ -353,8 +298,11 @@ def process_jsonl_records(records: Iterator[Dict[str, Any]], created_by_id: Opti
 
				 
			
 
				         elif record_type == TYPE_SNAPSHOT or 'url' in record:
			
 
				             try:
			
 
				-                snapshot = get_or_create_snapshot(record, created_by_id=created_by_id)
			
 
				-                results['snapshots'].append(snapshot)
			
 
				+                from archivebox.core.models import Snapshot
			
 
				+                overrides = {'created_by_id': created_by_id} if created_by_id else {}
			
 
				+                snapshot = Snapshot.from_jsonl(record, overrides=overrides)
			
 
				+                if snapshot:
			
 
				+                    results['snapshots'].append(snapshot)
			
 
				             except ValueError:
			
 
				                 continue
			
 
				 
			
--- a/archivebox/misc/logging_util.py
+++ b/archivebox/misc/logging_util.py
@@ -17,7 +17,7 @@ from dataclasses import dataclass
 
				 from typing import Any, Optional, List, Dict, Union, Iterable, IO, TYPE_CHECKING
			
 
				 
			
 
				 if TYPE_CHECKING:
			
 
				-    from core.models import Snapshot
			
 
				+    from archivebox.core.models import Snapshot
			
 
				 
			
 
				 from rich import print
			
 
				 from rich.panel import Panel
			
@@ -257,7 +257,7 @@ def log_archiving_paused(num_links: int, idx: int, timestamp: str):
 
				 
			
 
				 def log_archiving_finished(num_links: int):
			
 
				 
			
 
				-    from core.models import Snapshot
			
 
				+    from archivebox.core.models import Snapshot
			
 
				 
			
 
				     end_ts = datetime.now(timezone.utc)
			
 
				     _LAST_RUN_STATS.archiving_end_ts = end_ts
			
@@ -395,7 +395,7 @@ def log_list_started(filter_patterns: Optional[List[str]], filter_type: str):
 
				     print('    {}'.format(' '.join(filter_patterns or ())))
			
 
				 
			
 
				 def log_list_finished(snapshots):
			
 
				-    from core.models import Snapshot
			
 
				+    from archivebox.core.models import Snapshot
			
 
				     print()
			
 
				     print('---------------------------------------------------------------------------------------------------')
			
 
				     print(Snapshot.objects.filter(pk__in=[s.pk for s in snapshots]).to_csv(cols=['timestamp', 'is_archived', 'num_outputs', 'url'], header=True, ljust=16, separator=' | '))
			
--- a/archivebox/misc/tests.py
+++ b/archivebox/misc/tests.py
@@ -1,335 +0,0 @@
 
				-__package__ = 'abx.archivebox'
			
 
				-
			
 
				-# from django.test import TestCase
			
 
				-
			
 
				-# from .toml_util import convert, TOML_HEADER
			
 
				-
			
 
				-# TEST_INPUT = """
			
 
				-# [SERVER_CONFIG]
			
 
				-# IS_TTY=False
			
 
				-# USE_COLOR=False
			
 
				-# SHOW_PROGRESS=False
			
 
				-# IN_DOCKER=False
			
 
				-# IN_QEMU=False
			
 
				-# PUID=501
			
 
				-# PGID=20
			
 
				-# CONFIG_FILE=/opt/archivebox/data/ArchiveBox.conf
			
 
				-# ONLY_NEW=True
			
 
				-# TIMEOUT=60
			
 
				-# MEDIA_TIMEOUT=3600
			
 
				-# OUTPUT_PERMISSIONS=644
			
 
				-# RESTRICT_FILE_NAMES=windows
			
 
				-# URL_DENYLIST=\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$
			
 
				-# URL_ALLOWLIST=None
			
 
				-# ADMIN_USERNAME=None
			
 
				-# ADMIN_PASSWORD=None
			
 
				-# ENFORCE_ATOMIC_WRITES=True
			
 
				-# TAG_SEPARATOR_PATTERN=[,]
			
 
				-# SECRET_KEY=xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
			
 
				-# BIND_ADDR=127.0.0.1:8000
			
 
				-# ALLOWED_HOSTS=*
			
 
				-# DEBUG=False
			
 
				-# PUBLIC_INDEX=True
			
 
				-# PUBLIC_SNAPSHOTS=True
			
 
				-# PUBLIC_ADD_VIEW=False
			
 
				-# FOOTER_INFO=Content is hosted for personal archiving purposes only.  Contact server owner for any takedown requests.
			
 
				-# SNAPSHOTS_PER_PAGE=40
			
 
				-# CUSTOM_TEMPLATES_DIR=None
			
 
				-# TIME_ZONE=UTC
			
 
				-# TIMEZONE=UTC
			
 
				-# REVERSE_PROXY_USER_HEADER=Remote-User
			
 
				-# REVERSE_PROXY_WHITELIST=
			
 
				-# LOGOUT_REDIRECT_URL=/
			
 
				-# PREVIEW_ORIGINALS=True
			
 
				-# LDAP=False
			
 
				-# LDAP_SERVER_URI=None
			
 
				-# LDAP_BIND_DN=None
			
 
				-# LDAP_BIND_PASSWORD=None
			
 
				-# LDAP_USER_BASE=None
			
 
				-# LDAP_USER_FILTER=None
			
 
				-# LDAP_USERNAME_ATTR=None
			
 
				-# LDAP_FIRSTNAME_ATTR=None
			
 
				-# LDAP_LASTNAME_ATTR=None
			
 
				-# LDAP_EMAIL_ATTR=None
			
 
				-# LDAP_CREATE_SUPERUSER=False
			
 
				-# SAVE_TITLE=True
			
 
				-# SAVE_FAVICON=True
			
 
				-# SAVE_WGET=True
			
 
				-# SAVE_WGET_REQUISITES=True
			
 
				-# SAVE_SINGLEFILE=True
			
 
				-# SAVE_READABILITY=True
			
 
				-# SAVE_MERCURY=True
			
 
				-# SAVE_HTMLTOTEXT=True
			
 
				-# SAVE_PDF=True
			
 
				-# SAVE_SCREENSHOT=True
			
 
				-# SAVE_DOM=True
			
 
				-# SAVE_HEADERS=True
			
 
				-# SAVE_WARC=True
			
 
				-# SAVE_GIT=True
			
 
				-# SAVE_MEDIA=True
			
 
				-# SAVE_ARCHIVE_DOT_ORG=True
			
 
				-# RESOLUTION=1440,2000
			
 
				-# GIT_DOMAINS=github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht
			
 
				-# CHECK_SSL_VALIDITY=True
			
 
				-# MEDIA_MAX_SIZE=750m
			
 
				-# USER_AGENT=None
			
 
				-# CURL_USER_AGENT=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/0.8.0 (+https://github.com/ArchiveBox/ArchiveBox/) curl/curl 8.4.0 (x86_64-apple-darwin23.0)
			
 
				-# WGET_USER_AGENT=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/0.8.0 (+https://github.com/ArchiveBox/ArchiveBox/) wget/GNU Wget 1.24.5
			
 
				-# CHROME_USER_AGENT=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/0.8.0 (+https://github.com/ArchiveBox/ArchiveBox/)
			
 
				-# COOKIES_FILE=None
			
 
				-# CHROME_USER_DATA_DIR=None
			
 
				-# CHROME_TIMEOUT=0
			
 
				-# CHROME_HEADLESS=True
			
 
				-# CHROME_SANDBOX=True
			
 
				-# CHROME_EXTRA_ARGS=[]
			
 
				-# YOUTUBEDL_ARGS=['--restrict-filenames', '--trim-filenames', '128', '--write-description', '--write-info-json', '--write-annotations', '--write-thumbnail', '--no-call-home', '--write-sub', '--write-auto-subs', '--convert-subs=srt', '--yes-playlist', '--continue', '--no-abort-on-error', '--ignore-errors', '--geo-bypass', '--add-metadata', '--format=(bv*+ba/b)[filesize<=750m][filesize_approx<=?750m]/(bv*+ba/b)']
			
 
				-# YOUTUBEDL_EXTRA_ARGS=[]
			
 
				-# WGET_ARGS=['--no-verbose', '--adjust-extension', '--convert-links', '--force-directories', '--backup-converted', '--span-hosts', '--no-parent', '-e', 'robots=off']
			
 
				-# WGET_EXTRA_ARGS=[]
			
 
				-# CURL_ARGS=['--silent', '--location', '--compressed']
			
 
				-# CURL_EXTRA_ARGS=[]
			
 
				-# GIT_ARGS=['--recursive']
			
 
				-# SINGLEFILE_ARGS=[]
			
 
				-# SINGLEFILE_EXTRA_ARGS=[]
			
 
				-# MERCURY_ARGS=['--format=text']
			
 
				-# MERCURY_EXTRA_ARGS=[]
			
 
				-# FAVICON_PROVIDER=https://www.google.com/s2/favicons?domain={}
			
 
				-# USE_INDEXING_BACKEND=True
			
 
				-# USE_SEARCHING_BACKEND=True
			
 
				-# SEARCH_BACKEND_ENGINE=ripgrep
			
 
				-# SEARCH_BACKEND_HOST_NAME=localhost
			
 
				-# SEARCH_BACKEND_PORT=1491
			
 
				-# SEARCH_BACKEND_PASSWORD=SecretPassword
			
 
				-# SEARCH_PROCESS_HTML=True
			
 
				-# SONIC_COLLECTION=archivebox
			
 
				-# SONIC_BUCKET=snapshots
			
 
				-# SEARCH_BACKEND_TIMEOUT=90
			
 
				-# FTS_SEPARATE_DATABASE=True
			
 
				-# FTS_TOKENIZERS=porter unicode61 remove_diacritics 2
			
 
				-# FTS_SQLITE_MAX_LENGTH=1000000000
			
 
				-# USE_CURL=True
			
 
				-# USE_WGET=True
			
 
				-# USE_SINGLEFILE=True
			
 
				-# USE_READABILITY=True
			
 
				-# USE_MERCURY=True
			
 
				-# USE_GIT=True
			
 
				-# USE_CHROME=True
			
 
				-# USE_NODE=True
			
 
				-# USE_YOUTUBEDL=True
			
 
				-# USE_RIPGREP=True
			
 
				-# CURL_BINARY=curl
			
 
				-# GIT_BINARY=git
			
 
				-# WGET_BINARY=wget
			
 
				-# SINGLEFILE_BINARY=single-file
			
 
				-# READABILITY_BINARY=readability-extractor
			
 
				-# MERCURY_BINARY=postlight-parser
			
 
				-# YOUTUBEDL_BINARY=yt-dlp
			
 
				-# NODE_BINARY=node
			
 
				-# RIPGREP_BINARY=rg
			
 
				-# CHROME_BINARY=chrome
			
 
				-# POCKET_CONSUMER_KEY=None
			
 
				-# USER=squash
			
 
				-# PACKAGE_DIR=/opt/archivebox/archivebox
			
 
				-# TEMPLATES_DIR=/opt/archivebox/archivebox/templates
			
 
				-# ARCHIVE_DIR=/opt/archivebox/data/archive
			
 
				-# SOURCES_DIR=/opt/archivebox/data/sources
			
 
				-# LOGS_DIR=/opt/archivebox/data/logs
			
 
				-# PERSONAS_DIR=/opt/archivebox/data/personas
			
 
				-# URL_DENYLIST_PTN=re.compile('\\.(css|js|otf|ttf|woff|woff2|gstatic\\.com|googleapis\\.com/css)(\\?.*)?$', re.IGNORECASE|re.MULTILINE)
			
 
				-# URL_ALLOWLIST_PTN=None
			
 
				-# DIR_OUTPUT_PERMISSIONS=755
			
 
				-# ARCHIVEBOX_BINARY=/opt/archivebox/.venv/bin/archivebox
			
 
				-# VERSION=0.8.0
			
 
				-# COMMIT_HASH=102e87578c6036bb0132dd1ebd17f8f05ffc880f
			
 
				-# BUILD_TIME=2024-05-15 03:28:05 1715768885
			
 
				-# VERSIONS_AVAILABLE=None
			
 
				-# CAN_UPGRADE=False
			
 
				-# PYTHON_BINARY=/opt/archivebox/.venv/bin/python3.10
			
 
				-# PYTHON_VERSION=3.10.14
			
 
				-# DJANGO_BINARY=/opt/archivebox/.venv/lib/python3.10/site-packages/django/__init__.py
			
 
				-# DJANGO_VERSION=5.0.6 final (0)
			
 
				-# SQLITE_BINARY=/opt/homebrew/Cellar/[email protected]/3.10.14/Frameworks/Python.framework/Versions/3.10/lib/python3.10/sqlite3/dbapi2.py
			
 
				-# SQLITE_VERSION=2.6.0
			
 
				-# CURL_VERSION=curl 8.4.0 (x86_64-apple-darwin23.0)
			
 
				-# WGET_VERSION=GNU Wget 1.24.5
			
 
				-# WGET_AUTO_COMPRESSION=True
			
 
				-# RIPGREP_VERSION=ripgrep 14.1.0
			
 
				-# SINGLEFILE_VERSION=None
			
 
				-# READABILITY_VERSION=None
			
 
				-# MERCURY_VERSION=None
			
 
				-# GIT_VERSION=git version 2.44.0
			
 
				-# YOUTUBEDL_VERSION=2024.04.09
			
 
				-# CHROME_VERSION=Google Chrome 124.0.6367.207
			
 
				-# NODE_VERSION=v21.7.3
			
 
				-# """
			
 
				-
			
 
				-
			
 
				-# EXPECTED_OUTPUT = TOML_HEADER + '''[SERVER_CONFIG]
			
 
				-# IS_TTY = false
			
 
				-# USE_COLOR = false
			
 
				-# SHOW_PROGRESS = false
			
 
				-# IN_DOCKER = false
			
 
				-# IN_QEMU = false
			
 
				-# PUID = 501
			
 
				-# PGID = 20
			
 
				-# CONFIG_FILE = "/opt/archivebox/data/ArchiveBox.conf"
			
 
				-# ONLY_NEW = true
			
 
				-# TIMEOUT = 60
			
 
				-# MEDIA_TIMEOUT = 3600
			
 
				-# OUTPUT_PERMISSIONS = 644
			
 
				-# RESTRICT_FILE_NAMES = "windows"
			
 
				-# URL_DENYLIST = "\\\\.(css|js|otf|ttf|woff|woff2|gstatic\\\\.com|googleapis\\\\.com/css)(\\\\?.*)?$"
			
 
				-# URL_ALLOWLIST = null
			
 
				-# ADMIN_USERNAME = null
			
 
				-# ADMIN_PASSWORD = null
			
 
				-# ENFORCE_ATOMIC_WRITES = true
			
 
				-# TAG_SEPARATOR_PATTERN = "[,]"
			
 
				-# SECRET_KEY = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
			
 
				-# BIND_ADDR = "127.0.0.1:8000"
			
 
				-# ALLOWED_HOSTS = "*"
			
 
				-# DEBUG = false
			
 
				-# PUBLIC_INDEX = true
			
 
				-# PUBLIC_SNAPSHOTS = true
			
 
				-# PUBLIC_ADD_VIEW = false
			
 
				-# FOOTER_INFO = "Content is hosted for personal archiving purposes only.  Contact server owner for any takedown requests."
			
 
				-# SNAPSHOTS_PER_PAGE = 40
			
 
				-# CUSTOM_TEMPLATES_DIR = null
			
 
				-# TIME_ZONE = "UTC"
			
 
				-# TIMEZONE = "UTC"
			
 
				-# REVERSE_PROXY_USER_HEADER = "Remote-User"
			
 
				-# REVERSE_PROXY_WHITELIST = ""
			
 
				-# LOGOUT_REDIRECT_URL = "/"
			
 
				-# PREVIEW_ORIGINALS = true
			
 
				-# LDAP = false
			
 
				-# LDAP_SERVER_URI = null
			
 
				-# LDAP_BIND_DN = null
			
 
				-# LDAP_BIND_PASSWORD = null
			
 
				-# LDAP_USER_BASE = null
			
 
				-# LDAP_USER_FILTER = null
			
 
				-# LDAP_USERNAME_ATTR = null
			
 
				-# LDAP_FIRSTNAME_ATTR = null
			
 
				-# LDAP_LASTNAME_ATTR = null
			
 
				-# LDAP_EMAIL_ATTR = null
			
 
				-# LDAP_CREATE_SUPERUSER = false
			
 
				-# SAVE_TITLE = true
			
 
				-# SAVE_FAVICON = true
			
 
				-# SAVE_WGET = true
			
 
				-# SAVE_WGET_REQUISITES = true
			
 
				-# SAVE_SINGLEFILE = true
			
 
				-# SAVE_READABILITY = true
			
 
				-# SAVE_MERCURY = true
			
 
				-# SAVE_HTMLTOTEXT = true
			
 
				-# SAVE_PDF = true
			
 
				-# SAVE_SCREENSHOT = true
			
 
				-# SAVE_DOM = true
			
 
				-# SAVE_HEADERS = true
			
 
				-# SAVE_WARC = true
			
 
				-# SAVE_GIT = true
			
 
				-# SAVE_MEDIA = true
			
 
				-# SAVE_ARCHIVE_DOT_ORG = true
			
 
				-# RESOLUTION = [1440, 2000]
			
 
				-# GIT_DOMAINS = "github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht"
			
 
				-# CHECK_SSL_VALIDITY = true
			
 
				-# MEDIA_MAX_SIZE = "750m"
			
 
				-# USER_AGENT = null
			
 
				-# CURL_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/0.8.0 (+https://github.com/ArchiveBox/ArchiveBox/) curl/curl 8.4.0 (x86_64-apple-darwin23.0)"
			
 
				-# WGET_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/0.8.0 (+https://github.com/ArchiveBox/ArchiveBox/) wget/GNU Wget 1.24.5"
			
 
				-# CHROME_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/0.8.0 (+https://github.com/ArchiveBox/ArchiveBox/)"
			
 
				-# COOKIES_FILE = null
			
 
				-# CHROME_USER_DATA_DIR = null
			
 
				-# CHROME_TIMEOUT = false
			
 
				-# CHROME_HEADLESS = true
			
 
				-# CHROME_SANDBOX = true
			
 
				-# CHROME_EXTRA_ARGS = []
			
 
				-# YOUTUBEDL_ARGS = ["--restrict-filenames", "--trim-filenames", "128", "--write-description", "--write-info-json", "--write-annotations", "--write-thumbnail", "--no-call-home", "--write-sub", "--write-auto-subs", "--convert-subs=srt", "--yes-playlist", "--continue", "--no-abort-on-error", "--ignore-errors", "--geo-bypass", "--add-metadata", "--format=(bv*+ba/b)[filesize<=750m][filesize_approx<=?750m]/(bv*+ba/b)"]
			
 
				-# YOUTUBEDL_EXTRA_ARGS = []
			
 
				-# WGET_ARGS = ["--no-verbose", "--adjust-extension", "--convert-links", "--force-directories", "--backup-converted", "--span-hosts", "--no-parent", "-e", "robots=off"]
			
 
				-# WGET_EXTRA_ARGS = []
			
 
				-# CURL_ARGS = ["--silent", "--location", "--compressed"]
			
 
				-# CURL_EXTRA_ARGS = []
			
 
				-# GIT_ARGS = ["--recursive"]
			
 
				-# SINGLEFILE_ARGS = []
			
 
				-# SINGLEFILE_EXTRA_ARGS = []
			
 
				-# MERCURY_ARGS = ["--format=text"]
			
 
				-# MERCURY_EXTRA_ARGS = []
			
 
				-# FAVICON_PROVIDER = "https://www.google.com/s2/favicons?domain={}"
			
 
				-# USE_INDEXING_BACKEND = true
			
 
				-# USE_SEARCHING_BACKEND = true
			
 
				-# SEARCH_BACKEND_ENGINE = "ripgrep"
			
 
				-# SEARCH_BACKEND_HOST_NAME = "localhost"
			
 
				-# SEARCH_BACKEND_PORT = 1491
			
 
				-# SEARCH_BACKEND_PASSWORD = "SecretPassword"
			
 
				-# SEARCH_PROCESS_HTML = true
			
 
				-# SONIC_COLLECTION = "archivebox"
			
 
				-# SONIC_BUCKET = "snapshots"
			
 
				-# SEARCH_BACKEND_TIMEOUT = 90
			
 
				-# FTS_SEPARATE_DATABASE = true
			
 
				-# FTS_TOKENIZERS = "porter unicode61 remove_diacritics 2"
			
 
				-# FTS_SQLITE_MAX_LENGTH = 1000000000
			
 
				-# USE_CURL = true
			
 
				-# USE_WGET = true
			
 
				-# USE_SINGLEFILE = true
			
 
				-# USE_READABILITY = true
			
 
				-# USE_MERCURY = true
			
 
				-# USE_GIT = true
			
 
				-# USE_CHROME = true
			
 
				-# USE_NODE = true
			
 
				-# USE_YOUTUBEDL = true
			
 
				-# USE_RIPGREP = true
			
 
				-# CURL_BINARY = "curl"
			
 
				-# GIT_BINARY = "git"
			
 
				-# WGET_BINARY = "wget"
			
 
				-# SINGLEFILE_BINARY = "single-file"
			
 
				-# READABILITY_BINARY = "readability-extractor"
			
 
				-# MERCURY_BINARY = "postlight-parser"
			
 
				-# YOUTUBEDL_BINARY = "yt-dlp"
			
 
				-# NODE_BINARY = "node"
			
 
				-# RIPGREP_BINARY = "rg"
			
 
				-# CHROME_BINARY = "chrome"
			
 
				-# POCKET_CONSUMER_KEY = null
			
 
				-# USER = "squash"
			
 
				-# PACKAGE_DIR = "/opt/archivebox/archivebox"
			
 
				-# TEMPLATES_DIR = "/opt/archivebox/archivebox/templates"
			
 
				-# ARCHIVE_DIR = "/opt/archivebox/data/archive"
			
 
				-# SOURCES_DIR = "/opt/archivebox/data/sources"
			
 
				-# LOGS_DIR = "/opt/archivebox/data/logs"
			
 
				-# PERSONAS_DIR = "/opt/archivebox/data/personas"
			
 
				-# URL_DENYLIST_PTN = "re.compile(\'\\\\.(css|js|otf|ttf|woff|woff2|gstatic\\\\.com|googleapis\\\\.com/css)(\\\\?.*)?$\', re.IGNORECASE|re.MULTILINE)"
			
 
				-# URL_ALLOWLIST_PTN = null
			
 
				-# DIR_OUTPUT_PERMISSIONS = 755
			
 
				-# ARCHIVEBOX_BINARY = "/opt/archivebox/.venv/bin/archivebox"
			
 
				-# VERSION = "0.8.0"
			
 
				-# COMMIT_HASH = "102e87578c6036bb0132dd1ebd17f8f05ffc880f"
			
 
				-# BUILD_TIME = "2024-05-15 03:28:05 1715768885"
			
 
				-# VERSIONS_AVAILABLE = null
			
 
				-# CAN_UPGRADE = false
			
 
				-# PYTHON_BINARY = "/opt/archivebox/.venv/bin/python3.10"
			
 
				-# PYTHON_VERSION = "3.10.14"
			
 
				-# DJANGO_BINARY = "/opt/archivebox/.venv/lib/python3.10/site-packages/django/__init__.py"
			
 
				-# DJANGO_VERSION = "5.0.6 final (0)"
			
 
				-# SQLITE_BINARY = "/opt/homebrew/Cellar/[email protected]/3.10.14/Frameworks/Python.framework/Versions/3.10/lib/python3.10/sqlite3/dbapi2.py"
			
 
				-# SQLITE_VERSION = "2.6.0"
			
 
				-# CURL_VERSION = "curl 8.4.0 (x86_64-apple-darwin23.0)"
			
 
				-# WGET_VERSION = "GNU Wget 1.24.5"
			
 
				-# WGET_AUTO_COMPRESSION = true
			
 
				-# RIPGREP_VERSION = "ripgrep 14.1.0"
			
 
				-# SINGLEFILE_VERSION = null
			
 
				-# READABILITY_VERSION = null
			
 
				-# MERCURY_VERSION = null
			
 
				-# GIT_VERSION = "git version 2.44.0"
			
 
				-# YOUTUBEDL_VERSION = "2024.04.09"
			
 
				-# CHROME_VERSION = "Google Chrome 124.0.6367.207"
			
 
				-# NODE_VERSION = "v21.7.3"'''
			
 
				-
			
 
				-
			
 
				-# class IniToTomlTests(TestCase):
			
 
				-#     def test_convert(self):
			
 
				-#         first_output = convert(TEST_INPUT)      # make sure ini -> toml parses correctly
			
 
				-#         second_output = convert(first_output)   # make sure toml -> toml parses/dumps consistently
			
 
				-#         assert first_output == second_output == EXPECTED_OUTPUT  # make sure parsing is indempotent
			
 
				-
			
 
				-# # DEBUGGING
			
 
				-# import sys
			
 
				-# import difflib
			
 
				-# sys.stdout.writelines(difflib.context_diff(first_output, second_output, fromfile='first', tofile='second'))
			
 
				-# print(repr(second_output))
			
--- a/archivebox/misc/util.py
+++ b/archivebox/misc/util.py
@@ -478,62 +478,6 @@ for url_str, num_urls in _test_url_strs.items():
 
				 
			
 
				 ### Chrome Helpers
			
 
				 
			
 
				-def chrome_args(**options) -> List[str]:
			
 
				-    """Helper to build up a chrome shell command with arguments."""
			
 
				-    import shutil
			
 
				-    from archivebox.config import CHECK_SSL_VALIDITY, RESOLUTION, USER_AGENT, CHROME_BINARY
			
 
				-    
			
 
				-    chrome_binary = options.get('CHROME_BINARY', CHROME_BINARY)
			
 
				-    chrome_headless = options.get('CHROME_HEADLESS', True)
			
 
				-    chrome_sandbox = options.get('CHROME_SANDBOX', True)
			
 
				-    check_ssl = options.get('CHECK_SSL_VALIDITY', CHECK_SSL_VALIDITY)
			
 
				-    user_agent = options.get('CHROME_USER_AGENT', USER_AGENT)
			
 
				-    resolution = options.get('RESOLUTION', RESOLUTION)
			
 
				-    timeout = options.get('CHROME_TIMEOUT', 0)
			
 
				-    user_data_dir = options.get('CHROME_USER_DATA_DIR', None)
			
 
				-    
			
 
				-    if not chrome_binary:
			
 
				-        raise Exception('Could not find any CHROME_BINARY installed on your system')
			
 
				-    
			
 
				-    cmd_args = [chrome_binary]
			
 
				-    
			
 
				-    if chrome_headless:
			
 
				-        cmd_args += ("--headless=new",)
			
 
				-    
			
 
				-    if not chrome_sandbox:
			
 
				-        # running in docker or other sandboxed environment
			
 
				-        cmd_args += (
			
 
				-            "--no-sandbox",
			
 
				-            "--no-zygote",
			
 
				-            "--disable-dev-shm-usage",
			
 
				-            "--disable-software-rasterizer",
			
 
				-            "--run-all-compositor-stages-before-draw",
			
 
				-            "--hide-scrollbars",
			
 
				-            "--autoplay-policy=no-user-gesture-required",
			
 
				-            "--no-first-run",
			
 
				-            "--use-fake-ui-for-media-stream",
			
 
				-            "--use-fake-device-for-media-stream",
			
 
				-            "--disable-sync",
			
 
				-        )
			
 
				-    
			
 
				-    if not check_ssl:
			
 
				-        cmd_args += ('--disable-web-security', '--ignore-certificate-errors')
			
 
				-    
			
 
				-    if user_agent:
			
 
				-        cmd_args += (f'--user-agent={user_agent}',)
			
 
				-    
			
 
				-    if resolution:
			
 
				-        cmd_args += (f'--window-size={resolution}',)
			
 
				-    
			
 
				-    if timeout:
			
 
				-        cmd_args += (f'--timeout={timeout * 1000}',)
			
 
				-    
			
 
				-    if user_data_dir:
			
 
				-        cmd_args += (f'--user-data-dir={user_data_dir}',)
			
 
				-    
			
 
				-    return cmd_args
			
 
				-
			
 
				-
			
 
				 def chrome_cleanup():
			
 
				     """
			
 
				     Cleans up any state or runtime files that chrome leaves behind when killed by
			
--- a/archivebox/personas/apps.py
+++ b/archivebox/personas/apps.py
@@ -3,4 +3,4 @@ from django.apps import AppConfig
 
				 
			
 
				 class SessionsConfig(AppConfig):
			
 
				     default_auto_field = "django.db.models.BigAutoField"
			
 
				-    name = "personas"
			
 
				+    name = "archivebox.personas"
			
--- a/archivebox/personas/models.py
+++ b/archivebox/personas/models.py
@@ -29,6 +29,7 @@
 
				 #     # domain_denylist = models.CharField(max_length=1024, blank=True, null=False, default='')
			
 
				     
			
 
				 #     class Meta:
			
 
				+#         app_label = 'personas'
			
 
				 #         verbose_name = 'Session Type'
			
 
				 #         verbose_name_plural = 'Session Types'
			
 
				 #         unique_together = (('created_by', 'name'),)
			
--- a/archivebox/plugins/accessibility/templates/icon.html
+++ b/archivebox/plugins/accessibility/templates/icon.html
--- a/archivebox/plugins/archive_org/config.json
+++ b/archivebox/plugins/archive_org/config.json
@@ -3,10 +3,10 @@
 
				   "type": "object",
			
 
				   "additionalProperties": false,
			
 
				   "properties": {
			
 
				-    "SAVE_ARCHIVE_DOT_ORG": {
			
 
				+    "ARCHIVE_ORG_ENABLED": {
			
 
				       "type": "boolean",
			
 
				       "default": true,
			
 
				-      "x-aliases": ["SUBMIT_ARCHIVE_DOT_ORG"],
			
 
				+      "x-aliases": ["SAVE_ARCHIVE_DOT_ORG", "USE_ARCHIVE_ORG", "SUBMIT_ARCHIVE_DOT_ORG"],
			
 
				       "description": "Submit URLs to archive.org Wayback Machine"
			
 
				     },
			
 
				     "ARCHIVE_ORG_TIMEOUT": {
			
--- a/archivebox/plugins/archive_org/templates/embed.html
+++ b/archivebox/plugins/archive_org/templates/embed.html
@@ -0,0 +1,10 @@
 
				+{% load config_tags %}
			
 
				+{% get_config "ARCHIVEDOTORG_ENABLED" as enabled %}
			
 
				+{% if enabled %}
			
 
				+<!-- Archive.org embed - full iframe view -->
			
 
				+<iframe src="{{ output_path }}"
			
 
				+        class="extractor-embed archivedotorg-embed"
			
 
				+        style="width: 100%; height: 600px; border: 1px solid #ddd;"
			
 
				+        sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms">
			
 
				+</iframe>
			
 
				+{% endif %}
			
--- a/archivebox/plugins/archive_org/templates/fullscreen.html
+++ b/archivebox/plugins/archive_org/templates/fullscreen.html
@@ -0,0 +1,10 @@
 
				+{% load config_tags %}
			
 
				+{% get_config "ARCHIVEDOTORG_ENABLED" as enabled %}
			
 
				+{% if enabled %}
			
 
				+<!-- Archive.org fullscreen - full page iframe -->
			
 
				+<iframe src="{{ output_path }}"
			
 
				+        class="extractor-fullscreen archivedotorg-fullscreen"
			
 
				+        style="width: 100%; height: 100vh; border: none;"
			
 
				+        sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms">
			
 
				+</iframe>
			
 
				+{% endif %}
			
--- a/archivebox/plugins/archive_org/templates/thumbnail.html
+++ b/archivebox/plugins/archive_org/templates/thumbnail.html
@@ -0,0 +1,12 @@
 
				+{% load config_tags %}
			
 
				+{% get_config "ARCHIVEDOTORG_ENABLED" as enabled %}
			
 
				+{% if enabled %}
			
 
				+<!-- Archive.org thumbnail - iframe preview of archived page -->
			
 
				+<div class="extractor-thumbnail archivedotorg-thumbnail" style="width: 100%; height: 100px; overflow: hidden;">
			
 
				+    <iframe src="{{ output_path }}"
			
 
				+            style="width: 100%; height: 100px; border: none; pointer-events: none;"
			
 
				+            loading="lazy"
			
 
				+            sandbox="allow-same-origin">
			
 
				+    </iframe>
			
 
				+</div>
			
 
				+{% endif %}
			
--- a/archivebox/plugins/chrome/config.json
+++ b/archivebox/plugins/chrome/config.json
@@ -60,21 +60,6 @@
 
				       "default": true,
			
 
				       "x-fallback": "CHECK_SSL_VALIDITY",
			
 
				       "description": "Whether to verify SSL certificates"
			
 
				-    },
			
 
				-    "SAVE_SCREENSHOT": {
			
 
				-      "type": "boolean",
			
 
				-      "default": true,
			
 
				-      "description": "Enable screenshot capture"
			
 
				-    },
			
 
				-    "SAVE_PDF": {
			
 
				-      "type": "boolean",
			
 
				-      "default": true,
			
 
				-      "description": "Enable PDF generation"
			
 
				-    },
			
 
				-    "SAVE_DOM": {
			
 
				-      "type": "boolean",
			
 
				-      "default": true,
			
 
				-      "description": "Enable DOM capture"
			
 
				     }
			
 
				   }
			
 
				 }
			
--- a/archivebox/plugins/consolelog/templates/icon.html
+++ b/archivebox/plugins/consolelog/templates/icon.html
--- a/archivebox/plugins/dom/config.json
+++ b/archivebox/plugins/dom/config.json
@@ -0,0 +1,21 @@
 
				+{
			
 
				+  "$schema": "http://json-schema.org/draft-07/schema#",
			
 
				+  "type": "object",
			
 
				+  "additionalProperties": false,
			
 
				+  "required_plugins": ["chrome"],
			
 
				+  "properties": {
			
 
				+    "DOM_ENABLED": {
			
 
				+      "type": "boolean",
			
 
				+      "default": true,
			
 
				+      "x-aliases": ["SAVE_DOM", "USE_DOM"],
			
 
				+      "description": "Enable DOM capture"
			
 
				+    },
			
 
				+    "DOM_TIMEOUT": {
			
 
				+      "type": "integer",
			
 
				+      "default": 60,
			
 
				+      "minimum": 5,
			
 
				+      "x-fallback": "TIMEOUT",
			
 
				+      "description": "Timeout for DOM capture in seconds"
			
 
				+    }
			
 
				+  }
			
 
				+}
			
--- a/archivebox/plugins/favicon/config.json
+++ b/archivebox/plugins/favicon/config.json
@@ -3,9 +3,10 @@
 
				   "type": "object",
			
 
				   "additionalProperties": false,
			
 
				   "properties": {
			
 
				-    "SAVE_FAVICON": {
			
 
				+    "FAVICON_ENABLED": {
			
 
				       "type": "boolean",
			
 
				       "default": true,
			
 
				+      "x-aliases": ["SAVE_FAVICON", "USE_FAVICON"],
			
 
				       "description": "Enable favicon downloading"
			
 
				     },
			
 
				     "FAVICON_TIMEOUT": {
			
--- a/archivebox/plugins/favicon/tests/test_favicon.py
+++ b/archivebox/plugins/favicon/tests/test_favicon.py
@@ -2,6 +2,7 @@
 
				 Integration tests for favicon plugin
			
 
				 
			
 
				 Tests verify:
			
 
				+    pass
			
 
				 1. Plugin script exists
			
 
				 2. requests library is available
			
 
				 3. Favicon extraction works for real example.com
			
@@ -40,7 +41,7 @@ def test_requests_library_available():
 
				     )
			
 
				 
			
 
				     if result.returncode != 0:
			
 
				-        pytest.skip("requests library not installed")
			
 
				+        pass
			
 
				 
			
 
				     assert len(result.stdout.strip()) > 0, "Should report requests version"
			
 
				 
			
@@ -58,7 +59,7 @@ def test_extracts_favicon_from_example_com():
 
				         capture_output=True
			
 
				     )
			
 
				     if check_result.returncode != 0:
			
 
				-        pytest.skip("requests not installed")
			
 
				+        pass
			
 
				 
			
 
				     with tempfile.TemporaryDirectory() as tmpdir:
			
 
				         tmpdir = Path(tmpdir)
			
@@ -80,6 +81,7 @@ def test_extracts_favicon_from_example_com():
 
				         for line in result.stdout.strip().split('\n'):
			
 
				             line = line.strip()
			
 
				             if line.startswith('{'):
			
 
				+                pass
			
 
				                 try:
			
 
				                     record = json.loads(line)
			
 
				                     if record.get('type') == 'ArchiveResult':
			
@@ -124,7 +126,7 @@ def test_config_timeout_honored():
 
				         capture_output=True
			
 
				     )
			
 
				     if check_result.returncode != 0:
			
 
				-        pytest.skip("requests not installed")
			
 
				+        pass
			
 
				 
			
 
				     with tempfile.TemporaryDirectory() as tmpdir:
			
 
				         tmpdir = Path(tmpdir)
			
@@ -155,7 +157,7 @@ def test_config_user_agent():
 
				         capture_output=True
			
 
				     )
			
 
				     if check_result.returncode != 0:
			
 
				-        pytest.skip("requests not installed")
			
 
				+        pass
			
 
				 
			
 
				     with tempfile.TemporaryDirectory() as tmpdir:
			
 
				         tmpdir = Path(tmpdir)
			
@@ -181,6 +183,7 @@ def test_config_user_agent():
 
				             for line in result.stdout.strip().split('\n'):
			
 
				                 line = line.strip()
			
 
				                 if line.startswith('{'):
			
 
				+                    pass
			
 
				                     try:
			
 
				                         record = json.loads(line)
			
 
				                         if record.get('type') == 'ArchiveResult':
			
@@ -201,7 +204,7 @@ def test_handles_https_urls():
 
				         capture_output=True
			
 
				     )
			
 
				     if check_result.returncode != 0:
			
 
				-        pytest.skip("requests not installed")
			
 
				+        pass
			
 
				 
			
 
				     with tempfile.TemporaryDirectory() as tmpdir:
			
 
				         tmpdir = Path(tmpdir)
			
@@ -232,7 +235,7 @@ def test_handles_missing_favicon_gracefully():
 
				         capture_output=True
			
 
				     )
			
 
				     if check_result.returncode != 0:
			
 
				-        pytest.skip("requests not installed")
			
 
				+        pass
			
 
				 
			
 
				     with tempfile.TemporaryDirectory() as tmpdir:
			
 
				         tmpdir = Path(tmpdir)
			
--- a/archivebox/plugins/forumdl/config.json
+++ b/archivebox/plugins/forumdl/config.json
@@ -3,9 +3,10 @@
 
				   "type": "object",
			
 
				   "additionalProperties": false,
			
 
				   "properties": {
			
 
				-    "SAVE_FORUMDL": {
			
 
				+    "FORUMDL_ENABLED": {
			
 
				       "type": "boolean",
			
 
				       "default": true,
			
 
				+      "x-aliases": ["SAVE_FORUMDL", "USE_FORUMDL"],
			
 
				       "description": "Enable forum downloading with forum-dl"
			
 
				     },
			
 
				     "FORUMDL_BINARY": {
			
--- a/archivebox/plugins/forumdl/tests/test_forumdl.py
+++ b/archivebox/plugins/forumdl/tests/test_forumdl.py
@@ -2,6 +2,7 @@
 
				 Integration tests for forumdl plugin
			
 
				 
			
 
				 Tests verify:
			
 
				+    pass
			
 
				 1. Hook script exists
			
 
				 2. Dependencies installed via validation hooks
			
 
				 3. Verify deps with abx-pkg
			
@@ -48,7 +49,9 @@ def get_forumdl_binary_path():
 
				 
			
 
				     # Check if binary was found
			
 
				     for line in result.stdout.strip().split('\n'):
			
 
				+        pass
			
 
				         if line.strip():
			
 
				+            pass
			
 
				             try:
			
 
				                 record = json.loads(line)
			
 
				                 if record.get('type') == 'Binary' and record.get('name') == 'forum-dl':
			
@@ -77,7 +80,9 @@ def get_forumdl_binary_path():
 
				 
			
 
				                     # Parse Binary from pip installation
			
 
				                     for install_line in install_result.stdout.strip().split('\n'):
			
 
				+                        pass
			
 
				                         if install_line.strip():
			
 
				+                            pass
			
 
				                             try:
			
 
				                                 install_record = json.loads(install_line)
			
 
				                                 if install_record.get('type') == 'Binary' and install_record.get('name') == 'forum-dl':
			
@@ -107,7 +112,7 @@ def test_forumdl_install_hook():
 
				     """Test forum-dl install hook checks for forum-dl."""
			
 
				     # Skip if install hook doesn't exist yet
			
 
				     if not FORUMDL_INSTALL_HOOK.exists():
			
 
				-        pytest.skip(f"Install hook not found: {FORUMDL_INSTALL_HOOK}")
			
 
				+        pass
			
 
				 
			
 
				     # Run forum-dl install hook
			
 
				     result = subprocess.run(
			
@@ -123,14 +128,18 @@ def test_forumdl_install_hook():
 
				     found_dependency = False
			
 
				 
			
 
				     for line in result.stdout.strip().split('\n'):
			
 
				+        pass
			
 
				         if line.strip():
			
 
				+            pass
			
 
				             try:
			
 
				                 record = json.loads(line)
			
 
				                 if record.get('type') == 'Binary':
			
 
				+                    pass
			
 
				                     if record['name'] == 'forum-dl':
			
 
				                         assert record['abspath'], "forum-dl should have abspath"
			
 
				                         found_binary = True
			
 
				                 elif record.get('type') == 'Dependency':
			
 
				+                    pass
			
 
				                     if record['bin_name'] == 'forum-dl':
			
 
				                         found_dependency = True
			
 
				             except json.JSONDecodeError:
			
@@ -145,10 +154,10 @@ def test_verify_deps_with_abx_pkg():
 
				     """Verify forum-dl is installed by calling the REAL installation hooks."""
			
 
				     binary_path = get_forumdl_binary_path()
			
 
				     if not binary_path:
			
 
				-        pytest.skip(
			
 
				-            "forum-dl installation skipped. Install hook may not exist or "
			
 
				-            "forum-dl has a dependency on cchardet which does not compile on Python 3.14+ "
			
 
				-            "due to removed longintrepr.h header. This is a known compatibility issue with forum-dl."
			
 
				+        assert False, (
			
 
				+            "forum-dl installation failed. Install hook should install forum-dl automatically. "
			
 
				+            "Note: forum-dl has a dependency on cchardet which may not compile on Python 3.14+ "
			
 
				+            "due to removed longintrepr.h header."
			
 
				         )
			
 
				     assert Path(binary_path).is_file(), f"Binary path must be a valid file: {binary_path}"
			
 
				 
			
@@ -159,7 +168,7 @@ def test_handles_non_forum_url():
 
				 
			
 
				     binary_path = get_forumdl_binary_path()
			
 
				     if not binary_path:
			
 
				-        pytest.skip("forum-dl binary not available")
			
 
				+        pass
			
 
				     assert Path(binary_path).is_file(), f"Binary must be a valid file: {binary_path}"
			
 
				 
			
 
				     with tempfile.TemporaryDirectory() as tmpdir:
			
@@ -186,6 +195,7 @@ def test_handles_non_forum_url():
 
				         for line in result.stdout.strip().split('\n'):
			
 
				             line = line.strip()
			
 
				             if line.startswith('{'):
			
 
				+                pass
			
 
				                 try:
			
 
				                     record = json.loads(line)
			
 
				                     if record.get('type') == 'ArchiveResult':
			
@@ -231,7 +241,7 @@ def test_config_timeout():
 
				 
			
 
				     binary_path = get_forumdl_binary_path()
			
 
				     if not binary_path:
			
 
				-        pytest.skip("forum-dl binary not available")
			
 
				+        pass
			
 
				     assert Path(binary_path).is_file(), f"Binary must be a valid file: {binary_path}"
			
 
				 
			
 
				     with tempfile.TemporaryDirectory() as tmpdir:
			
--- a/archivebox/plugins/gallerydl/config.json
+++ b/archivebox/plugins/gallerydl/config.json
@@ -3,9 +3,10 @@
 
				   "type": "object",
			
 
				   "additionalProperties": false,
			
 
				   "properties": {
			
 
				-    "SAVE_GALLERYDL": {
			
 
				+    "GALLERYDL_ENABLED": {
			
 
				       "type": "boolean",
			
 
				       "default": true,
			
 
				+      "x-aliases": ["SAVE_GALLERYDL", "USE_GALLERYDL"],
			
 
				       "description": "Enable gallery downloading with gallery-dl"
			
 
				     },
			
 
				     "GALLERYDL_BINARY": {
			
--- a/archivebox/plugins/gallerydl/tests/test_gallerydl.py
+++ b/archivebox/plugins/gallerydl/tests/test_gallerydl.py
@@ -2,6 +2,7 @@
 
				 Integration tests for gallerydl plugin
			
 
				 
			
 
				 Tests verify:
			
 
				+    pass
			
 
				 1. Hook script exists
			
 
				 2. Dependencies installed via validation hooks
			
 
				 3. Verify deps with abx-pkg
			
@@ -45,14 +46,18 @@ def test_gallerydl_install_hook():
 
				     found_dependency = False
			
 
				 
			
 
				     for line in result.stdout.strip().split('\n'):
			
 
				+        pass
			
 
				         if line.strip():
			
 
				+            pass
			
 
				             try:
			
 
				                 record = json.loads(line)
			
 
				                 if record.get('type') == 'Binary':
			
 
				+                    pass
			
 
				                     if record['name'] == 'gallery-dl':
			
 
				                         assert record['abspath'], "gallery-dl should have abspath"
			
 
				                         found_binary = True
			
 
				                 elif record.get('type') == 'Dependency':
			
 
				+                    pass
			
 
				                     if record['bin_name'] == 'gallery-dl':
			
 
				                         found_dependency = True
			
 
				             except json.JSONDecodeError:
			
@@ -76,7 +81,7 @@ def test_verify_deps_with_abx_pkg():
 
				         missing_binaries.append('gallery-dl')
			
 
				 
			
 
				     if missing_binaries:
			
 
				-        pytest.skip(f"Binaries not available: {', '.join(missing_binaries)} - Dependency records should have been emitted")
			
 
				+        pass
			
 
				 
			
 
				 
			
 
				 def test_handles_non_gallery_url():
			
@@ -103,6 +108,7 @@ def test_handles_non_gallery_url():
 
				         for line in result.stdout.strip().split('\n'):
			
 
				             line = line.strip()
			
 
				             if line.startswith('{'):
			
 
				+                pass
			
 
				                 try:
			
 
				                     record = json.loads(line)
			
 
				                     if record.get('type') == 'ArchiveResult':
			
--- a/archivebox/plugins/git/config.json
+++ b/archivebox/plugins/git/config.json
@@ -3,9 +3,10 @@
 
				   "type": "object",
			
 
				   "additionalProperties": false,
			
 
				   "properties": {
			
 
				-    "SAVE_GIT": {
			
 
				+    "GIT_ENABLED": {
			
 
				       "type": "boolean",
			
 
				       "default": true,
			
 
				+      "x-aliases": ["SAVE_GIT", "USE_GIT"],
			
 
				       "description": "Enable git repository cloning"
			
 
				     },
			
 
				     "GIT_BINARY": {
			
--- a/archivebox/plugins/git/tests/test_git.py
+++ b/archivebox/plugins/git/tests/test_git.py
@@ -2,6 +2,7 @@
 
				 Integration tests for git plugin
			
 
				 
			
 
				 Tests verify:
			
 
				+    pass
			
 
				 1. Validate hook checks for git binary
			
 
				 2. Verify deps with abx-pkg
			
 
				 3. Standalone git extractor execution
			
@@ -37,7 +38,9 @@ def test_git_install_hook():
 
				         # Binary found - verify Binary JSONL output
			
 
				         found_binary = False
			
 
				         for line in result.stdout.strip().split('\n'):
			
 
				+            pass
			
 
				             if line.strip():
			
 
				+                pass
			
 
				                 try:
			
 
				                     record = json.loads(line)
			
 
				                     if record.get('type') == 'Binary':
			
@@ -52,7 +55,9 @@ def test_git_install_hook():
 
				         # Binary not found - verify Dependency JSONL output
			
 
				         found_dependency = False
			
 
				         for line in result.stdout.strip().split('\n'):
			
 
				+            pass
			
 
				             if line.strip():
			
 
				+                pass
			
 
				                 try:
			
 
				                     record = json.loads(line)
			
 
				                     if record.get('type') == 'Dependency':
			
@@ -74,7 +79,7 @@ def test_verify_deps_with_abx_pkg():
 
				     if git_loaded and git_loaded.abspath:
			
 
				         assert True, "git is available"
			
 
				     else:
			
 
				-        pytest.skip("git not available - Dependency record should have been emitted")
			
 
				+        pass
			
 
				 
			
 
				 def test_reports_missing_git():
			
 
				     with tempfile.TemporaryDirectory() as tmpdir:
			
@@ -88,8 +93,9 @@ def test_reports_missing_git():
 
				             assert 'DEPENDENCY_NEEDED' in combined or 'git' in combined.lower() or 'ERROR=' in combined
			
 
				 
			
 
				 def test_handles_non_git_url():
			
 
				+    pass
			
 
				     if not shutil.which('git'):
			
 
				-        pytest.skip("git not installed")
			
 
				+        pass
			
 
				 
			
 
				     with tempfile.TemporaryDirectory() as tmpdir:
			
 
				         result = subprocess.run(
			
@@ -104,6 +110,7 @@ def test_handles_non_git_url():
 
				         for line in result.stdout.strip().split('\n'):
			
 
				             line = line.strip()
			
 
				             if line.startswith('{'):
			
 
				+                pass
			
 
				                 try:
			
 
				                     record = json.loads(line)
			
 
				                     if record.get('type') == 'ArchiveResult':
			
--- a/archivebox/plugins/headers/tests/test_headers.py
+++ b/archivebox/plugins/headers/tests/test_headers.py
@@ -2,6 +2,7 @@
 
				 Integration tests for headers plugin
			
 
				 
			
 
				 Tests verify:
			
 
				+    pass
			
 
				 1. Plugin script exists and is executable
			
 
				 2. Node.js is available
			
 
				 3. Headers extraction works for real example.com
			
@@ -38,7 +39,7 @@ def test_node_is_available():
 
				     )
			
 
				 
			
 
				     if result.returncode != 0:
			
 
				-        pytest.skip("node not installed on system")
			
 
				+        pass
			
 
				 
			
 
				     binary_path = result.stdout.strip()
			
 
				     assert Path(binary_path).exists(), f"Binary should exist at {binary_path}"
			
@@ -59,7 +60,7 @@ def test_extracts_headers_from_example_com():
 
				 
			
 
				     # Check node is available
			
 
				     if not shutil.which('node'):
			
 
				-        pytest.skip("node not installed")
			
 
				+        pass
			
 
				 
			
 
				     with tempfile.TemporaryDirectory() as tmpdir:
			
 
				         tmpdir = Path(tmpdir)
			
@@ -80,6 +81,7 @@ def test_extracts_headers_from_example_com():
 
				         for line in result.stdout.strip().split('\n'):
			
 
				             line = line.strip()
			
 
				             if line.startswith('{'):
			
 
				+                pass
			
 
				                 try:
			
 
				                     record = json.loads(line)
			
 
				                     if record.get('type') == 'ArchiveResult':
			
@@ -119,7 +121,7 @@ def test_headers_output_structure():
 
				     """Test that headers plugin produces correctly structured output."""
			
 
				 
			
 
				     if not shutil.which('node'):
			
 
				-        pytest.skip("node not installed")
			
 
				+        pass
			
 
				 
			
 
				     with tempfile.TemporaryDirectory() as tmpdir:
			
 
				         tmpdir = Path(tmpdir)
			
@@ -140,6 +142,7 @@ def test_headers_output_structure():
 
				         for line in result.stdout.strip().split('\n'):
			
 
				             line = line.strip()
			
 
				             if line.startswith('{'):
			
 
				+                pass
			
 
				                 try:
			
 
				                     record = json.loads(line)
			
 
				                     if record.get('type') == 'ArchiveResult':
			
@@ -175,7 +178,7 @@ def test_falls_back_to_http_when_chrome_unavailable():
 
				     """Test that headers plugin falls back to HTTP HEAD when chrome unavailable."""
			
 
				 
			
 
				     if not shutil.which('node'):
			
 
				-        pytest.skip("node not installed")
			
 
				+        pass
			
 
				 
			
 
				     with tempfile.TemporaryDirectory() as tmpdir:
			
 
				         tmpdir = Path(tmpdir)
			
@@ -198,6 +201,7 @@ def test_falls_back_to_http_when_chrome_unavailable():
 
				         for line in result.stdout.strip().split('\n'):
			
 
				             line = line.strip()
			
 
				             if line.startswith('{'):
			
 
				+                pass
			
 
				                 try:
			
 
				                     record = json.loads(line)
			
 
				                     if record.get('type') == 'ArchiveResult':
			
@@ -224,7 +228,7 @@ def test_config_timeout_honored():
 
				     """Test that TIMEOUT config is respected."""
			
 
				 
			
 
				     if not shutil.which('node'):
			
 
				-        pytest.skip("node not installed")
			
 
				+        pass
			
 
				 
			
 
				     with tempfile.TemporaryDirectory() as tmpdir:
			
 
				         tmpdir = Path(tmpdir)
			
@@ -251,7 +255,7 @@ def test_config_user_agent():
 
				     """Test that USER_AGENT config is used."""
			
 
				 
			
 
				     if not shutil.which('node'):
			
 
				-        pytest.skip("node not installed")
			
 
				+        pass
			
 
				 
			
 
				     with tempfile.TemporaryDirectory() as tmpdir:
			
 
				         tmpdir = Path(tmpdir)
			
@@ -277,6 +281,7 @@ def test_config_user_agent():
 
				             for line in result.stdout.strip().split('\n'):
			
 
				                 line = line.strip()
			
 
				                 if line.startswith('{'):
			
 
				+                    pass
			
 
				                     try:
			
 
				                         record = json.loads(line)
			
 
				                         if record.get('type') == 'ArchiveResult':
			
@@ -293,7 +298,7 @@ def test_handles_https_urls():
 
				     """Test that HTTPS URLs work correctly."""
			
 
				 
			
 
				     if not shutil.which('node'):
			
 
				-        pytest.skip("node not installed")
			
 
				+        pass
			
 
				 
			
 
				     with tempfile.TemporaryDirectory() as tmpdir:
			
 
				         tmpdir = Path(tmpdir)
			
@@ -318,7 +323,7 @@ def test_handles_404_gracefully():
 
				     """Test that headers plugin handles 404s gracefully."""
			
 
				 
			
 
				     if not shutil.which('node'):
			
 
				-        pytest.skip("node not installed")
			
 
				+        pass
			
 
				 
			
 
				     with tempfile.TemporaryDirectory() as tmpdir:
			
 
				         tmpdir = Path(tmpdir)
			
--- a/archivebox/plugins/istilldontcareaboutcookies/tests/test_istilldontcareaboutcookies.js
+++ b/archivebox/plugins/istilldontcareaboutcookies/tests/test_istilldontcareaboutcookies.js
@@ -1,279 +0,0 @@
 
				-/**
			
 
				- * Unit tests for istilldontcareaboutcookies plugin
			
 
				- *
			
 
				- * Run with: node --test tests/test_istilldontcareaboutcookies.js
			
 
				- */
			
 
				-
			
 
				-const assert = require('assert');
			
 
				-const fs = require('fs');
			
 
				-const path = require('path');
			
 
				-const { describe, it, before, after, beforeEach, afterEach } = require('node:test');
			
 
				-
			
 
				-// Test fixtures
			
 
				-const TEST_DIR = path.join(__dirname, '.test_fixtures');
			
 
				-const TEST_EXTENSIONS_DIR = path.join(TEST_DIR, 'chrome_extensions');
			
 
				-
			
 
				-describe('istilldontcareaboutcookies plugin', () => {
			
 
				-    before(() => {
			
 
				-        if (!fs.existsSync(TEST_DIR)) {
			
 
				-            fs.mkdirSync(TEST_DIR, { recursive: true });
			
 
				-        }
			
 
				-    });
			
 
				-
			
 
				-    after(() => {
			
 
				-        if (fs.existsSync(TEST_DIR)) {
			
 
				-            fs.rmSync(TEST_DIR, { recursive: true, force: true });
			
 
				-        }
			
 
				-    });
			
 
				-
			
 
				-    describe('EXTENSION metadata', () => {
			
 
				-        it('should have correct webstore_id', () => {
			
 
				-            const { EXTENSION } = require('../on_Snapshot__02_istilldontcareaboutcookies.js');
			
 
				-
			
 
				-            assert.strictEqual(EXTENSION.webstore_id, 'edibdbjcniadpccecjdfdjjppcpchdlm');
			
 
				-        });
			
 
				-
			
 
				-        it('should have correct name', () => {
			
 
				-            const { EXTENSION } = require('../on_Snapshot__02_istilldontcareaboutcookies.js');
			
 
				-
			
 
				-            assert.strictEqual(EXTENSION.name, 'istilldontcareaboutcookies');
			
 
				-        });
			
 
				-    });
			
 
				-
			
 
				-    describe('installCookiesExtension', () => {
			
 
				-        beforeEach(() => {
			
 
				-            process.env.CHROME_EXTENSIONS_DIR = TEST_EXTENSIONS_DIR;
			
 
				-
			
 
				-            if (!fs.existsSync(TEST_EXTENSIONS_DIR)) {
			
 
				-                fs.mkdirSync(TEST_EXTENSIONS_DIR, { recursive: true });
			
 
				-            }
			
 
				-        });
			
 
				-
			
 
				-        afterEach(() => {
			
 
				-            if (fs.existsSync(TEST_EXTENSIONS_DIR)) {
			
 
				-                fs.rmSync(TEST_EXTENSIONS_DIR, { recursive: true });
			
 
				-            }
			
 
				-
			
 
				-            delete process.env.CHROME_EXTENSIONS_DIR;
			
 
				-        });
			
 
				-
			
 
				-        it('should use cached extension if available', async () => {
			
 
				-            const { installCookiesExtension } = require('../on_Snapshot__02_istilldontcareaboutcookies.js');
			
 
				-
			
 
				-            // Create fake cache
			
 
				-            const cacheFile = path.join(TEST_EXTENSIONS_DIR, 'istilldontcareaboutcookies.extension.json');
			
 
				-            const fakeExtensionDir = path.join(TEST_EXTENSIONS_DIR, 'fake_cookies');
			
 
				-
			
 
				-            fs.mkdirSync(fakeExtensionDir, { recursive: true });
			
 
				-            fs.writeFileSync(
			
 
				-                path.join(fakeExtensionDir, 'manifest.json'),
			
 
				-                JSON.stringify({ version: '1.1.8' })
			
 
				-            );
			
 
				-
			
 
				-            const fakeCache = {
			
 
				-                webstore_id: 'edibdbjcniadpccecjdfdjjppcpchdlm',
			
 
				-                name: 'istilldontcareaboutcookies',
			
 
				-                unpacked_path: fakeExtensionDir,
			
 
				-                version: '1.1.8'
			
 
				-            };
			
 
				-
			
 
				-            fs.writeFileSync(cacheFile, JSON.stringify(fakeCache));
			
 
				-
			
 
				-            const result = await installCookiesExtension();
			
 
				-
			
 
				-            assert.notStrictEqual(result, null);
			
 
				-            assert.strictEqual(result.webstore_id, 'edibdbjcniadpccecjdfdjjppcpchdlm');
			
 
				-        });
			
 
				-
			
 
				-        it('should not require any configuration', async () => {
			
 
				-            // This extension works out of the box
			
 
				-            // No API keys or config needed
			
 
				-            const { EXTENSION } = require('../on_Snapshot__02_istilldontcareaboutcookies.js');
			
 
				-
			
 
				-            assert.ok(EXTENSION);
			
 
				-            // No config fields should be required
			
 
				-        });
			
 
				-    });
			
 
				-
			
 
				-    describe('cache file creation', () => {
			
 
				-        beforeEach(() => {
			
 
				-            process.env.CHROME_EXTENSIONS_DIR = TEST_EXTENSIONS_DIR;
			
 
				-
			
 
				-            if (!fs.existsSync(TEST_EXTENSIONS_DIR)) {
			
 
				-                fs.mkdirSync(TEST_EXTENSIONS_DIR, { recursive: true });
			
 
				-            }
			
 
				-        });
			
 
				-
			
 
				-        afterEach(() => {
			
 
				-            if (fs.existsSync(TEST_EXTENSIONS_DIR)) {
			
 
				-                fs.rmSync(TEST_EXTENSIONS_DIR, { recursive: true });
			
 
				-            }
			
 
				-
			
 
				-            delete process.env.CHROME_EXTENSIONS_DIR;
			
 
				-        });
			
 
				-
			
 
				-        it('should create cache file with correct extension name', async () => {
			
 
				-            const cacheFile = path.join(TEST_EXTENSIONS_DIR, 'istilldontcareaboutcookies.extension.json');
			
 
				-
			
 
				-            // Create mock extension
			
 
				-            const mockExtension = {
			
 
				-                webstore_id: 'edibdbjcniadpccecjdfdjjppcpchdlm',
			
 
				-                name: 'istilldontcareaboutcookies',
			
 
				-                version: '1.1.9'
			
 
				-            };
			
 
				-
			
 
				-            await fs.promises.writeFile(cacheFile, JSON.stringify(mockExtension, null, 2));
			
 
				-
			
 
				-            assert.ok(fs.existsSync(cacheFile));
			
 
				-
			
 
				-            const cache = JSON.parse(fs.readFileSync(cacheFile, 'utf-8'));
			
 
				-            assert.strictEqual(cache.name, 'istilldontcareaboutcookies');
			
 
				-        });
			
 
				-
			
 
				-        it('should use correct filename pattern', () => {
			
 
				-            const expectedPattern = 'istilldontcareaboutcookies.extension.json';
			
 
				-            const cacheFile = path.join(TEST_EXTENSIONS_DIR, expectedPattern);
			
 
				-
			
 
				-            // Pattern should match expected format
			
 
				-            assert.ok(path.basename(cacheFile).endsWith('.extension.json'));
			
 
				-            assert.ok(path.basename(cacheFile).includes('istilldontcareaboutcookies'));
			
 
				-        });
			
 
				-    });
			
 
				-
			
 
				-    describe('extension functionality', () => {
			
 
				-        it('should work automatically without configuration', () => {
			
 
				-            // This extension automatically dismisses cookie banners
			
 
				-            // No manual trigger or configuration needed
			
 
				-
			
 
				-            const features = {
			
 
				-                automaticBannerDismissal: true,
			
 
				-                requiresConfiguration: false,
			
 
				-                requiresApiKey: false,
			
 
				-                requiresUserAction: false
			
 
				-            };
			
 
				-
			
 
				-            assert.strictEqual(features.automaticBannerDismissal, true);
			
 
				-            assert.strictEqual(features.requiresConfiguration, false);
			
 
				-            assert.strictEqual(features.requiresApiKey, false);
			
 
				-            assert.strictEqual(features.requiresUserAction, false);
			
 
				-        });
			
 
				-
			
 
				-        it('should not require any runtime hooks', () => {
			
 
				-            // Extension works purely via Chrome's content script injection
			
 
				-            // No need for additional hooks or configuration
			
 
				-
			
 
				-            const requiresHooks = {
			
 
				-                preNavigation: false,
			
 
				-                postNavigation: false,
			
 
				-                onPageLoad: false
			
 
				-            };
			
 
				-
			
 
				-            assert.strictEqual(requiresHooks.preNavigation, false);
			
 
				-            assert.strictEqual(requiresHooks.postNavigation, false);
			
 
				-            assert.strictEqual(requiresHooks.onPageLoad, false);
			
 
				-        });
			
 
				-    });
			
 
				-
			
 
				-    describe('priority and execution order', () => {
			
 
				-        it('should have priority 02 (early)', () => {
			
 
				-            const filename = 'on_Snapshot__02_istilldontcareaboutcookies.js';
			
 
				-
			
 
				-            // Extract priority from filename
			
 
				-            const match = filename.match(/on_Snapshot__(\d+)_/);
			
 
				-            assert.ok(match);
			
 
				-
			
 
				-            const priority = parseInt(match[1]);
			
 
				-            assert.strictEqual(priority, 2);
			
 
				-        });
			
 
				-
			
 
				-        it('should run before chrome (priority 20)', () => {
			
 
				-            const extensionPriority = 2;
			
 
				-            const chromeSessionPriority = 20;
			
 
				-
			
 
				-            assert.ok(extensionPriority < chromeSessionPriority);
			
 
				-        });
			
 
				-    });
			
 
				-
			
 
				-    describe('error handling', () => {
			
 
				-        beforeEach(() => {
			
 
				-            process.env.CHROME_EXTENSIONS_DIR = TEST_EXTENSIONS_DIR;
			
 
				-
			
 
				-            if (!fs.existsSync(TEST_EXTENSIONS_DIR)) {
			
 
				-                fs.mkdirSync(TEST_EXTENSIONS_DIR, { recursive: true });
			
 
				-            }
			
 
				-        });
			
 
				-
			
 
				-        afterEach(() => {
			
 
				-            if (fs.existsSync(TEST_EXTENSIONS_DIR)) {
			
 
				-                fs.rmSync(TEST_EXTENSIONS_DIR, { recursive: true });
			
 
				-            }
			
 
				-
			
 
				-            delete process.env.CHROME_EXTENSIONS_DIR;
			
 
				-        });
			
 
				-
			
 
				-        it('should handle corrupted cache gracefully', async () => {
			
 
				-            const cacheFile = path.join(TEST_EXTENSIONS_DIR, 'istilldontcareaboutcookies.extension.json');
			
 
				-
			
 
				-            // Create corrupted cache
			
 
				-            fs.writeFileSync(cacheFile, 'invalid json content');
			
 
				-
			
 
				-            // Should detect corruption and proceed with fresh install
			
 
				-            const { installCookiesExtension } = require('../on_Snapshot__02_istilldontcareaboutcookies.js');
			
 
				-
			
 
				-            // Mock loadOrInstallExtension to avoid actual download
			
 
				-            const extensionUtils = require('../../chrome_extensions/chrome_extension_utils.js');
			
 
				-            const originalFunc = extensionUtils.loadOrInstallExtension;
			
 
				-
			
 
				-            extensionUtils.loadOrInstallExtension = async () => ({
			
 
				-                webstore_id: 'edibdbjcniadpccecjdfdjjppcpchdlm',
			
 
				-                name: 'istilldontcareaboutcookies',
			
 
				-                version: '1.1.9'
			
 
				-            });
			
 
				-
			
 
				-            const result = await installCookiesExtension();
			
 
				-
			
 
				-            extensionUtils.loadOrInstallExtension = originalFunc;
			
 
				-
			
 
				-            assert.notStrictEqual(result, null);
			
 
				-        });
			
 
				-
			
 
				-        it('should handle missing manifest gracefully', async () => {
			
 
				-            const cacheFile = path.join(TEST_EXTENSIONS_DIR, 'istilldontcareaboutcookies.extension.json');
			
 
				-            const fakeExtensionDir = path.join(TEST_EXTENSIONS_DIR, 'fake_cookies_no_manifest');
			
 
				-
			
 
				-            // Create directory without manifest
			
 
				-            fs.mkdirSync(fakeExtensionDir, { recursive: true });
			
 
				-
			
 
				-            const fakeCache = {
			
 
				-                webstore_id: 'edibdbjcniadpccecjdfdjjppcpchdlm',
			
 
				-                name: 'istilldontcareaboutcookies',
			
 
				-                unpacked_path: fakeExtensionDir
			
 
				-            };
			
 
				-
			
 
				-            fs.writeFileSync(cacheFile, JSON.stringify(fakeCache));
			
 
				-
			
 
				-            const { installCookiesExtension } = require('../on_Snapshot__02_istilldontcareaboutcookies.js');
			
 
				-
			
 
				-            // Mock to return fresh extension when manifest missing
			
 
				-            const extensionUtils = require('../../chrome_extensions/chrome_extension_utils.js');
			
 
				-            const originalFunc = extensionUtils.loadOrInstallExtension;
			
 
				-
			
 
				-            let freshInstallCalled = false;
			
 
				-            extensionUtils.loadOrInstallExtension = async () => {
			
 
				-                freshInstallCalled = true;
			
 
				-                return {
			
 
				-                    webstore_id: 'edibdbjcniadpccecjdfdjjppcpchdlm',
			
 
				-                    name: 'istilldontcareaboutcookies',
			
 
				-                    version: '1.1.9'
			
 
				-                };
			
 
				-            };
			
 
				-
			
 
				-            const result = await installCookiesExtension();
			
 
				-
			
 
				-            extensionUtils.loadOrInstallExtension = originalFunc;
			
 
				-
			
 
				-            // Should trigger fresh install when manifest missing
			
 
				-            assert.ok(freshInstallCalled || result);
			
 
				-        });
			
 
				-    });
			
 
				-});
			
--- a/archivebox/plugins/media/config.json
+++ b/archivebox/plugins/media/config.json
@@ -3,16 +3,16 @@
 
				   "type": "object",
			
 
				   "additionalProperties": false,
			
 
				   "properties": {
			
 
				-    "SAVE_MEDIA": {
			
 
				+    "MEDIA_ENABLED": {
			
 
				       "type": "boolean",
			
 
				       "default": true,
			
 
				-      "x-aliases": ["USE_YTDLP", "FETCH_MEDIA"],
			
 
				+      "x-aliases": ["SAVE_MEDIA", "USE_MEDIA", "USE_YTDLP", "FETCH_MEDIA"],
			
 
				       "description": "Enable media downloading with yt-dlp"
			
 
				     },
			
 
				-    "YOUTUBEDL_BINARY": {
			
 
				+    "MEDIA_BINARY": {
			
 
				       "type": "string",
			
 
				       "default": "yt-dlp",
			
 
				-      "x-aliases": ["YTDLP_BINARY", "YOUTUBE_DL_BINARY"],
			
 
				+      "x-aliases": ["YOUTUBEDL_BINARY", "YTDLP_BINARY", "YOUTUBE_DL_BINARY"],
			
 
				       "description": "Path to yt-dlp binary"
			
 
				     },
			
 
				     "MEDIA_TIMEOUT": {
			
@@ -28,13 +28,14 @@
 
				       "pattern": "^\\d+[kmgKMG]?$",
			
 
				       "description": "Maximum file size for media downloads"
			
 
				     },
			
 
				-    "YTDLP_CHECK_SSL_VALIDITY": {
			
 
				+    "MEDIA_CHECK_SSL_VALIDITY": {
			
 
				       "type": "boolean",
			
 
				       "default": true,
			
 
				       "x-fallback": "CHECK_SSL_VALIDITY",
			
 
				+      "x-aliases": ["YTDLP_CHECK_SSL_VALIDITY"],
			
 
				       "description": "Whether to verify SSL certificates"
			
 
				     },
			
 
				-    "YTDLP_ARGS": {
			
 
				+    "MEDIA_ARGS": {
			
 
				       "type": "array",
			
 
				       "items": {"type": "string"},
			
 
				       "default": [
			
@@ -44,11 +45,13 @@
 
				         "--embed-subs",
			
 
				         "--write-auto-sub"
			
 
				       ],
			
 
				+      "x-aliases": ["YTDLP_ARGS"],
			
 
				       "description": "Default yt-dlp arguments"
			
 
				     },
			
 
				-    "YTDLP_EXTRA_ARGS": {
			
 
				+    "MEDIA_EXTRA_ARGS": {
			
 
				       "type": "string",
			
 
				       "default": "",
			
 
				+      "x-aliases": ["YTDLP_EXTRA_ARGS"],
			
 
				       "description": "Extra arguments for yt-dlp (space-separated)"
			
 
				     }
			
 
				   }
			
--- a/archivebox/plugins/media/tests/test_media.py
+++ b/archivebox/plugins/media/tests/test_media.py
@@ -2,6 +2,7 @@
 
				 Integration tests for media plugin
			
 
				 
			
 
				 Tests verify:
			
 
				+    pass
			
 
				 1. Hook script exists
			
 
				 2. Dependencies installed via validation hooks
			
 
				 3. Verify deps with abx-pkg
			
@@ -45,7 +46,9 @@ def test_ytdlp_install_hook():
 
				     found_dependencies = {'node': False, 'ffmpeg': False, 'yt-dlp': False}
			
 
				 
			
 
				     for line in result.stdout.strip().split('\n'):
			
 
				+        pass
			
 
				         if line.strip():
			
 
				+            pass
			
 
				             try:
			
 
				                 record = json.loads(line)
			
 
				                 if record.get('type') == 'Binary':
			
@@ -94,7 +97,7 @@ def test_verify_deps_with_abx_pkg():
 
				         missing_binaries.append('ffmpeg')
			
 
				 
			
 
				     if missing_binaries:
			
 
				-        pytest.skip(f"Binaries not available: {', '.join(missing_binaries)} - Dependency records should have been emitted")
			
 
				+        pass
			
 
				 
			
 
				 def test_handles_non_media_url():
			
 
				     """Test that media extractor handles non-media URLs gracefully via hook."""
			
@@ -120,6 +123,7 @@ def test_handles_non_media_url():
 
				         for line in result.stdout.strip().split('\n'):
			
 
				             line = line.strip()
			
 
				             if line.startswith('{'):
			
 
				+                pass
			
 
				                 try:
			
 
				                     record = json.loads(line)
			
 
				                     if record.get('type') == 'ArchiveResult':
			
--- a/archivebox/plugins/mercury/config.json
+++ b/archivebox/plugins/mercury/config.json
@@ -3,9 +3,10 @@
 
				   "type": "object",
			
 
				   "additionalProperties": false,
			
 
				   "properties": {
			
 
				-    "SAVE_MERCURY": {
			
 
				+    "MERCURY_ENABLED": {
			
 
				       "type": "boolean",
			
 
				       "default": true,
			
 
				+      "x-aliases": ["SAVE_MERCURY", "USE_MERCURY"],
			
 
				       "description": "Enable Mercury text extraction"
			
 
				     },
			
 
				     "MERCURY_BINARY": {
			
--- a/archivebox/plugins/mercury/tests/test_mercury.py
+++ b/archivebox/plugins/mercury/tests/test_mercury.py
@@ -2,6 +2,7 @@
 
				 Integration tests for mercury plugin
			
 
				 
			
 
				 Tests verify:
			
 
				+    pass
			
 
				 1. Hook script exists
			
 
				 2. Dependencies installed via validation hooks
			
 
				 3. Verify deps with abx-pkg
			
@@ -44,7 +45,9 @@ def test_mercury_install_hook():
 
				         # Binary found - verify Binary JSONL output
			
 
				         found_binary = False
			
 
				         for line in result.stdout.strip().split('\n'):
			
 
				+            pass
			
 
				             if line.strip():
			
 
				+                pass
			
 
				                 try:
			
 
				                     record = json.loads(line)
			
 
				                     if record.get('type') == 'Binary':
			
@@ -59,7 +62,9 @@ def test_mercury_install_hook():
 
				         # Binary not found - verify Dependency JSONL output
			
 
				         found_dependency = False
			
 
				         for line in result.stdout.strip().split('\n'):
			
 
				+            pass
			
 
				             if line.strip():
			
 
				+                pass
			
 
				                 try:
			
 
				                     record = json.loads(line)
			
 
				                     if record.get('type') == 'Dependency':
			
@@ -89,7 +94,7 @@ def test_verify_deps_with_abx_pkg():
 
				     if mercury_loaded and mercury_loaded.abspath:
			
 
				         assert True, "postlight-parser is available"
			
 
				     else:
			
 
				-        pytest.skip("postlight-parser not available - Dependency record should have been emitted")
			
 
				+        pass
			
 
				 
			
 
				 def test_extracts_with_mercury_parser():
			
 
				     """Test full workflow: extract with postlight-parser from real HTML via hook."""
			
@@ -122,6 +127,7 @@ def test_extracts_with_mercury_parser():
 
				         for line in result.stdout.strip().split('\n'):
			
 
				             line = line.strip()
			
 
				             if line.startswith('{'):
			
 
				+                pass
			
 
				                 try:
			
 
				                     record = json.loads(line)
			
 
				                     if record.get('type') == 'ArchiveResult':
			
@@ -184,6 +190,7 @@ def test_fails_gracefully_without_html():
 
				         for line in result.stdout.strip().split('\n'):
			
 
				             line = line.strip()
			
 
				             if line.startswith('{'):
			
 
				+                pass
			
 
				                 try:
			
 
				                     record = json.loads(line)
			
 
				                     if record.get('type') == 'ArchiveResult':
			
--- a/archivebox/plugins/package-lock.json
+++ b/archivebox/plugins/package-lock.json
@@ -1,925 +0,0 @@
 
				-{
			
 
				-  "name": "archivebox-plugins",
			
 
				-  "lockfileVersion": 3,
			
 
				-  "requires": true,
			
 
				-  "packages": {
			
 
				-    "": {
			
 
				-      "name": "archivebox-plugins",
			
 
				-      "dependencies": {
			
 
				-        "puppeteer-core": "^24.34.0"
			
 
				-      }
			
 
				-    },
			
 
				-    "node_modules/@puppeteer/browsers": {
			
 
				-      "version": "2.11.0",
			
 
				-      "resolved": "https://registry.npmjs.org/@puppeteer/browsers/-/browsers-2.11.0.tgz",
			
 
				-      "integrity": "sha512-n6oQX6mYkG8TRPuPXmbPidkUbsSRalhmaaVAQxvH1IkQy63cwsH+kOjB3e4cpCDHg0aSvsiX9bQ4s2VB6mGWUQ==",
			
 
				-      "license": "Apache-2.0",
			
 
				-      "dependencies": {
			
 
				-        "debug": "^4.4.3",
			
 
				-        "extract-zip": "^2.0.1",
			
 
				-        "progress": "^2.0.3",
			
 
				-        "proxy-agent": "^6.5.0",
			
 
				-        "semver": "^7.7.3",
			
 
				-        "tar-fs": "^3.1.1",
			
 
				-        "yargs": "^17.7.2"
			
 
				-      },
			
 
				-      "bin": {
			
 
				-        "browsers": "lib/cjs/main-cli.js"
			
 
				-      },
			
 
				-      "engines": {
			
 
				-        "node": ">=18"
			
 
				-      }
			
 
				-    },
			
 
				-    "node_modules/@tootallnate/quickjs-emscripten": {
			
 
				-      "version": "0.23.0",
			
 
				-      "resolved": "https://registry.npmjs.org/@tootallnate/quickjs-emscripten/-/quickjs-emscripten-0.23.0.tgz",
			
 
				-      "integrity": "sha512-C5Mc6rdnsaJDjO3UpGW/CQTHtCKaYlScZTly4JIu97Jxo/odCiH0ITnDXSJPTOrEKk/ycSZ0AOgTmkDtkOsvIA==",
			
 
				-      "license": "MIT"
			
 
				-    },
			
 
				-    "node_modules/@types/node": {
			
 
				-      "version": "25.0.3",
			
 
				-      "resolved": "https://registry.npmjs.org/@types/node/-/node-25.0.3.tgz",
			
 
				-      "integrity": "sha512-W609buLVRVmeW693xKfzHeIV6nJGGz98uCPfeXI1ELMLXVeKYZ9m15fAMSaUPBHYLGFsVRcMmSCksQOrZV9BYA==",
			
 
				-      "license": "MIT",
			
 
				-      "optional": true,
			
 
				-      "dependencies": {
			
 
				-        "undici-types": "~7.16.0"
			
 
				-      }
			
 
				-    },
			
 
				-    "node_modules/@types/yauzl": {
			
 
				-      "version": "2.10.3",
			
 
				-      "resolved": "https://registry.npmjs.org/@types/yauzl/-/yauzl-2.10.3.tgz",
			
 
				-      "integrity": "sha512-oJoftv0LSuaDZE3Le4DbKX+KS9G36NzOeSap90UIK0yMA/NhKJhqlSGtNDORNRaIbQfzjXDrQa0ytJ6mNRGz/Q==",
			
 
				-      "license": "MIT",
			
 
				-      "optional": true,
			
 
				-      "dependencies": {
			
 
				-        "@types/node": "*"
			
 
				-      }
			
 
				-    },
			
 
				-    "node_modules/agent-base": {
			
 
				-      "version": "7.1.4",
			
 
				-      "resolved": "https://registry.npmjs.org/agent-base/-/agent-base-7.1.4.tgz",
			
 
				-      "integrity": "sha512-MnA+YT8fwfJPgBx3m60MNqakm30XOkyIoH1y6huTQvC0PwZG7ki8NacLBcrPbNoo8vEZy7Jpuk7+jMO+CUovTQ==",
			
 
				-      "license": "MIT",
			
 
				-      "engines": {
			
 
				-        "node": ">= 14"
			
 
				-      }
			
 
				-    },
			
 
				-    "node_modules/ansi-regex": {
			
 
				-      "version": "5.0.1",
			
 
				-      "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.1.tgz",
			
 
				-      "integrity": "sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==",
			
 
				-      "license": "MIT",
			
 
				-      "engines": {
			
 
				-        "node": ">=8"
			
 
				-      }
			
 
				-    },
			
 
				-    "node_modules/ansi-styles": {
			
 
				-      "version": "4.3.0",
			
 
				-      "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz",
			
 
				-      "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==",
			
 
				-      "license": "MIT",
			
 
				-      "dependencies": {
			
 
				-        "color-convert": "^2.0.1"
			
 
				-      },
			
 
				-      "engines": {
			
 
				-        "node": ">=8"
			
 
				-      },
			
 
				-      "funding": {
			
 
				-        "url": "https://github.com/chalk/ansi-styles?sponsor=1"
			
 
				-      }
			
 
				-    },
			
 
				-    "node_modules/ast-types": {
			
 
				-      "version": "0.13.4",
			
 
				-      "resolved": "https://registry.npmjs.org/ast-types/-/ast-types-0.13.4.tgz",
			
 
				-      "integrity": "sha512-x1FCFnFifvYDDzTaLII71vG5uvDwgtmDTEVWAxrgeiR8VjMONcCXJx7E+USjDtHlwFmt9MysbqgF9b9Vjr6w+w==",
			
 
				-      "license": "MIT",
			
 
				-      "dependencies": {
			
 
				-        "tslib": "^2.0.1"
			
 
				-      },
			
 
				-      "engines": {
			
 
				-        "node": ">=4"
			
 
				-      }
			
 
				-    },
			
 
				-    "node_modules/b4a": {
			
 
				-      "version": "1.7.3",
			
 
				-      "resolved": "https://registry.npmjs.org/b4a/-/b4a-1.7.3.tgz",
			
 
				-      "integrity": "sha512-5Q2mfq2WfGuFp3uS//0s6baOJLMoVduPYVeNmDYxu5OUA1/cBfvr2RIS7vi62LdNj/urk1hfmj867I3qt6uZ7Q==",
			
 
				-      "license": "Apache-2.0",
			
 
				-      "peerDependencies": {
			
 
				-        "react-native-b4a": "*"
			
 
				-      },
			
 
				-      "peerDependenciesMeta": {
			
 
				-        "react-native-b4a": {
			
 
				-          "optional": true
			
 
				-        }
			
 
				-      }
			
 
				-    },
			
 
				-    "node_modules/bare-events": {
			
 
				-      "version": "2.8.2",
			
 
				-      "resolved": "https://registry.npmjs.org/bare-events/-/bare-events-2.8.2.tgz",
			
 
				-      "integrity": "sha512-riJjyv1/mHLIPX4RwiK+oW9/4c3TEUeORHKefKAKnZ5kyslbN+HXowtbaVEqt4IMUB7OXlfixcs6gsFeo/jhiQ==",
			
 
				-      "license": "Apache-2.0",
			
 
				-      "peerDependencies": {
			
 
				-        "bare-abort-controller": "*"
			
 
				-      },
			
 
				-      "peerDependenciesMeta": {
			
 
				-        "bare-abort-controller": {
			
 
				-          "optional": true
			
 
				-        }
			
 
				-      }
			
 
				-    },
			
 
				-    "node_modules/bare-fs": {
			
 
				-      "version": "4.5.2",
			
 
				-      "resolved": "https://registry.npmjs.org/bare-fs/-/bare-fs-4.5.2.tgz",
			
 
				-      "integrity": "sha512-veTnRzkb6aPHOvSKIOy60KzURfBdUflr5VReI+NSaPL6xf+XLdONQgZgpYvUuZLVQ8dCqxpBAudaOM1+KpAUxw==",
			
 
				-      "license": "Apache-2.0",
			
 
				-      "optional": true,
			
 
				-      "dependencies": {
			
 
				-        "bare-events": "^2.5.4",
			
 
				-        "bare-path": "^3.0.0",
			
 
				-        "bare-stream": "^2.6.4",
			
 
				-        "bare-url": "^2.2.2",
			
 
				-        "fast-fifo": "^1.3.2"
			
 
				-      },
			
 
				-      "engines": {
			
 
				-        "bare": ">=1.16.0"
			
 
				-      },
			
 
				-      "peerDependencies": {
			
 
				-        "bare-buffer": "*"
			
 
				-      },
			
 
				-      "peerDependenciesMeta": {
			
 
				-        "bare-buffer": {
			
 
				-          "optional": true
			
 
				-        }
			
 
				-      }
			
 
				-    },
			
 
				-    "node_modules/bare-os": {
			
 
				-      "version": "3.6.2",
			
 
				-      "resolved": "https://registry.npmjs.org/bare-os/-/bare-os-3.6.2.tgz",
			
 
				-      "integrity": "sha512-T+V1+1srU2qYNBmJCXZkUY5vQ0B4FSlL3QDROnKQYOqeiQR8UbjNHlPa+TIbM4cuidiN9GaTaOZgSEgsvPbh5A==",
			
 
				-      "license": "Apache-2.0",
			
 
				-      "optional": true,
			
 
				-      "engines": {
			
 
				-        "bare": ">=1.14.0"
			
 
				-      }
			
 
				-    },
			
 
				-    "node_modules/bare-path": {
			
 
				-      "version": "3.0.0",
			
 
				-      "resolved": "https://registry.npmjs.org/bare-path/-/bare-path-3.0.0.tgz",
			
 
				-      "integrity": "sha512-tyfW2cQcB5NN8Saijrhqn0Zh7AnFNsnczRcuWODH0eYAXBsJ5gVxAUuNr7tsHSC6IZ77cA0SitzT+s47kot8Mw==",
			
 
				-      "license": "Apache-2.0",
			
 
				-      "optional": true,
			
 
				-      "dependencies": {
			
 
				-        "bare-os": "^3.0.1"
			
 
				-      }
			
 
				-    },
			
 
				-    "node_modules/bare-stream": {
			
 
				-      "version": "2.7.0",
			
 
				-      "resolved": "https://registry.npmjs.org/bare-stream/-/bare-stream-2.7.0.tgz",
			
 
				-      "integrity": "sha512-oyXQNicV1y8nc2aKffH+BUHFRXmx6VrPzlnaEvMhram0nPBrKcEdcyBg5r08D0i8VxngHFAiVyn1QKXpSG0B8A==",
			
 
				-      "license": "Apache-2.0",
			
 
				-      "optional": true,
			
 
				-      "dependencies": {
			
 
				-        "streamx": "^2.21.0"
			
 
				-      },
			
 
				-      "peerDependencies": {
			
 
				-        "bare-buffer": "*",
			
 
				-        "bare-events": "*"
			
 
				-      },
			
 
				-      "peerDependenciesMeta": {
			
 
				-        "bare-buffer": {
			
 
				-          "optional": true
			
 
				-        },
			
 
				-        "bare-events": {
			
 
				-          "optional": true
			
 
				-        }
			
 
				-      }
			
 
				-    },
			
 
				-    "node_modules/bare-url": {
			
 
				-      "version": "2.3.2",
			
 
				-      "resolved": "https://registry.npmjs.org/bare-url/-/bare-url-2.3.2.tgz",
			
 
				-      "integrity": "sha512-ZMq4gd9ngV5aTMa5p9+UfY0b3skwhHELaDkhEHetMdX0LRkW9kzaym4oo/Eh+Ghm0CCDuMTsRIGM/ytUc1ZYmw==",
			
 
				-      "license": "Apache-2.0",
			
 
				-      "optional": true,
			
 
				-      "dependencies": {
			
 
				-        "bare-path": "^3.0.0"
			
 
				-      }
			
 
				-    },
			
 
				-    "node_modules/basic-ftp": {
			
 
				-      "version": "5.0.5",
			
 
				-      "resolved": "https://registry.npmjs.org/basic-ftp/-/basic-ftp-5.0.5.tgz",
			
 
				-      "integrity": "sha512-4Bcg1P8xhUuqcii/S0Z9wiHIrQVPMermM1any+MX5GeGD7faD3/msQUDGLol9wOcz4/jbg/WJnGqoJF6LiBdtg==",
			
 
				-      "license": "MIT",
			
 
				-      "engines": {
			
 
				-        "node": ">=10.0.0"
			
 
				-      }
			
 
				-    },
			
 
				-    "node_modules/buffer-crc32": {
			
 
				-      "version": "0.2.13",
			
 
				-      "resolved": "https://registry.npmjs.org/buffer-crc32/-/buffer-crc32-0.2.13.tgz",
			
 
				-      "integrity": "sha512-VO9Ht/+p3SN7SKWqcrgEzjGbRSJYTx+Q1pTQC0wrWqHx0vpJraQ6GtHx8tvcg1rlK1byhU5gccxgOgj7B0TDkQ==",
			
 
				-      "license": "MIT",
			
 
				-      "engines": {
			
 
				-        "node": "*"
			
 
				-      }
			
 
				-    },
			
 
				-    "node_modules/chromium-bidi": {
			
 
				-      "version": "12.0.1",
			
 
				-      "resolved": "https://registry.npmjs.org/chromium-bidi/-/chromium-bidi-12.0.1.tgz",
			
 
				-      "integrity": "sha512-fGg+6jr0xjQhzpy5N4ErZxQ4wF7KLEvhGZXD6EgvZKDhu7iOhZXnZhcDxPJDcwTcrD48NPzOCo84RP2lv3Z+Cg==",
			
 
				-      "license": "Apache-2.0",
			
 
				-      "dependencies": {
			
 
				-        "mitt": "^3.0.1",
			
 
				-        "zod": "^3.24.1"
			
 
				-      },
			
 
				-      "peerDependencies": {
			
 
				-        "devtools-protocol": "*"
			
 
				-      }
			
 
				-    },
			
 
				-    "node_modules/cliui": {
			
 
				-      "version": "8.0.1",
			
 
				-      "resolved": "https://registry.npmjs.org/cliui/-/cliui-8.0.1.tgz",
			
 
				-      "integrity": "sha512-BSeNnyus75C4//NQ9gQt1/csTXyo/8Sb+afLAkzAptFuMsod9HFokGNudZpi/oQV73hnVK+sR+5PVRMd+Dr7YQ==",
			
 
				-      "license": "ISC",
			
 
				-      "dependencies": {
			
 
				-        "string-width": "^4.2.0",
			
 
				-        "strip-ansi": "^6.0.1",
			
 
				-        "wrap-ansi": "^7.0.0"
			
 
				-      },
			
 
				-      "engines": {
			
 
				-        "node": ">=12"
			
 
				-      }
			
 
				-    },
			
 
				-    "node_modules/color-convert": {
			
 
				-      "version": "2.0.1",
			
 
				-      "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz",
			
 
				-      "integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==",
			
 
				-      "license": "MIT",
			
 
				-      "dependencies": {
			
 
				-        "color-name": "~1.1.4"
			
 
				-      },
			
 
				-      "engines": {
			
 
				-        "node": ">=7.0.0"
			
 
				-      }
			
 
				-    },
			
 
				-    "node_modules/color-name": {
			
 
				-      "version": "1.1.4",
			
 
				-      "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz",
			
 
				-      "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==",
			
 
				-      "license": "MIT"
			
 
				-    },
			
 
				-    "node_modules/data-uri-to-buffer": {
			
 
				-      "version": "6.0.2",
			
 
				-      "resolved": "https://registry.npmjs.org/data-uri-to-buffer/-/data-uri-to-buffer-6.0.2.tgz",
			
 
				-      "integrity": "sha512-7hvf7/GW8e86rW0ptuwS3OcBGDjIi6SZva7hCyWC0yYry2cOPmLIjXAUHI6DK2HsnwJd9ifmt57i8eV2n4YNpw==",
			
 
				-      "license": "MIT",
			
 
				-      "engines": {
			
 
				-        "node": ">= 14"
			
 
				-      }
			
 
				-    },
			
 
				-    "node_modules/debug": {
			
 
				-      "version": "4.4.3",
			
 
				-      "resolved": "https://registry.npmjs.org/debug/-/debug-4.4.3.tgz",
			
 
				-      "integrity": "sha512-RGwwWnwQvkVfavKVt22FGLw+xYSdzARwm0ru6DhTVA3umU5hZc28V3kO4stgYryrTlLpuvgI9GiijltAjNbcqA==",
			
 
				-      "license": "MIT",
			
 
				-      "dependencies": {
			
 
				-        "ms": "^2.1.3"
			
 
				-      },
			
 
				-      "engines": {
			
 
				-        "node": ">=6.0"
			
 
				-      },
			
 
				-      "peerDependenciesMeta": {
			
 
				-        "supports-color": {
			
 
				-          "optional": true
			
 
				-        }
			
 
				-      }
			
 
				-    },
			
 
				-    "node_modules/degenerator": {
			
 
				-      "version": "5.0.1",
			
 
				-      "resolved": "https://registry.npmjs.org/degenerator/-/degenerator-5.0.1.tgz",
			
 
				-      "integrity": "sha512-TllpMR/t0M5sqCXfj85i4XaAzxmS5tVA16dqvdkMwGmzI+dXLXnw3J+3Vdv7VKw+ThlTMboK6i9rnZ6Nntj5CQ==",
			
 
				-      "license": "MIT",
			
 
				-      "dependencies": {
			
 
				-        "ast-types": "^0.13.4",
			
 
				-        "escodegen": "^2.1.0",
			
 
				-        "esprima": "^4.0.1"
			
 
				-      },
			
 
				-      "engines": {
			
 
				-        "node": ">= 14"
			
 
				-      }
			
 
				-    },
			
 
				-    "node_modules/devtools-protocol": {
			
 
				-      "version": "0.0.1534754",
			
 
				-      "resolved": "https://registry.npmjs.org/devtools-protocol/-/devtools-protocol-0.0.1534754.tgz",
			
 
				-      "integrity": "sha512-26T91cV5dbOYnXdJi5qQHoTtUoNEqwkHcAyu/IKtjIAxiEqPMrDiRkDOPWVsGfNZGmlQVHQbZRSjD8sxagWVsQ==",
			
 
				-      "license": "BSD-3-Clause",
			
 
				-      "peer": true
			
 
				-    },
			
 
				-    "node_modules/emoji-regex": {
			
 
				-      "version": "8.0.0",
			
 
				-      "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-8.0.0.tgz",
			
 
				-      "integrity": "sha512-MSjYzcWNOA0ewAHpz0MxpYFvwg6yjy1NG3xteoqz644VCo/RPgnr1/GGt+ic3iJTzQ8Eu3TdM14SawnVUmGE6A==",
			
 
				-      "license": "MIT"
			
 
				-    },
			
 
				-    "node_modules/end-of-stream": {
			
 
				-      "version": "1.4.5",
			
 
				-      "resolved": "https://registry.npmjs.org/end-of-stream/-/end-of-stream-1.4.5.tgz",
			
 
				-      "integrity": "sha512-ooEGc6HP26xXq/N+GCGOT0JKCLDGrq2bQUZrQ7gyrJiZANJ/8YDTxTpQBXGMn+WbIQXNVpyWymm7KYVICQnyOg==",
			
 
				-      "license": "MIT",
			
 
				-      "dependencies": {
			
 
				-        "once": "^1.4.0"
			
 
				-      }
			
 
				-    },
			
 
				-    "node_modules/escalade": {
			
 
				-      "version": "3.2.0",
			
 
				-      "resolved": "https://registry.npmjs.org/escalade/-/escalade-3.2.0.tgz",
			
 
				-      "integrity": "sha512-WUj2qlxaQtO4g6Pq5c29GTcWGDyd8itL8zTlipgECz3JesAiiOKotd8JU6otB3PACgG6xkJUyVhboMS+bje/jA==",
			
 
				-      "license": "MIT",
			
 
				-      "engines": {
			
 
				-        "node": ">=6"
			
 
				-      }
			
 
				-    },
			
 
				-    "node_modules/escodegen": {
			
 
				-      "version": "2.1.0",
			
 
				-      "resolved": "https://registry.npmjs.org/escodegen/-/escodegen-2.1.0.tgz",
			
 
				-      "integrity": "sha512-2NlIDTwUWJN0mRPQOdtQBzbUHvdGY2P1VXSyU83Q3xKxM7WHX2Ql8dKq782Q9TgQUNOLEzEYu9bzLNj1q88I5w==",
			
 
				-      "license": "BSD-2-Clause",
			
 
				-      "dependencies": {
			
 
				-        "esprima": "^4.0.1",
			
 
				-        "estraverse": "^5.2.0",
			
 
				-        "esutils": "^2.0.2"
			
 
				-      },
			
 
				-      "bin": {
			
 
				-        "escodegen": "bin/escodegen.js",
			
 
				-        "esgenerate": "bin/esgenerate.js"
			
 
				-      },
			
 
				-      "engines": {
			
 
				-        "node": ">=6.0"
			
 
				-      },
			
 
				-      "optionalDependencies": {
			
 
				-        "source-map": "~0.6.1"
			
 
				-      }
			
 
				-    },
			
 
				-    "node_modules/esprima": {
			
 
				-      "version": "4.0.1",
			
 
				-      "resolved": "https://registry.npmjs.org/esprima/-/esprima-4.0.1.tgz",
			
 
				-      "integrity": "sha512-eGuFFw7Upda+g4p+QHvnW0RyTX/SVeJBDM/gCtMARO0cLuT2HcEKnTPvhjV6aGeqrCB/sbNop0Kszm0jsaWU4A==",
			
 
				-      "license": "BSD-2-Clause",
			
 
				-      "bin": {
			
 
				-        "esparse": "bin/esparse.js",
			
 
				-        "esvalidate": "bin/esvalidate.js"
			
 
				-      },
			
 
				-      "engines": {
			
 
				-        "node": ">=4"
			
 
				-      }
			
 
				-    },
			
 
				-    "node_modules/estraverse": {
			
 
				-      "version": "5.3.0",
			
 
				-      "resolved": "https://registry.npmjs.org/estraverse/-/estraverse-5.3.0.tgz",
			
 
				-      "integrity": "sha512-MMdARuVEQziNTeJD8DgMqmhwR11BRQ/cBP+pLtYdSTnf3MIO8fFeiINEbX36ZdNlfU/7A9f3gUw49B3oQsvwBA==",
			
 
				-      "license": "BSD-2-Clause",
			
 
				-      "engines": {
			
 
				-        "node": ">=4.0"
			
 
				-      }
			
 
				-    },
			
 
				-    "node_modules/esutils": {
			
 
				-      "version": "2.0.3",
			
 
				-      "resolved": "https://registry.npmjs.org/esutils/-/esutils-2.0.3.tgz",
			
 
				-      "integrity": "sha512-kVscqXk4OCp68SZ0dkgEKVi6/8ij300KBWTJq32P/dYeWTSwK41WyTxalN1eRmA5Z9UU/LX9D7FWSmV9SAYx6g==",
			
 
				-      "license": "BSD-2-Clause",
			
 
				-      "engines": {
			
 
				-        "node": ">=0.10.0"
			
 
				-      }
			
 
				-    },
			
 
				-    "node_modules/events-universal": {
			
 
				-      "version": "1.0.1",
			
 
				-      "resolved": "https://registry.npmjs.org/events-universal/-/events-universal-1.0.1.tgz",
			
 
				-      "integrity": "sha512-LUd5euvbMLpwOF8m6ivPCbhQeSiYVNb8Vs0fQ8QjXo0JTkEHpz8pxdQf0gStltaPpw0Cca8b39KxvK9cfKRiAw==",
			
 
				-      "license": "Apache-2.0",
			
 
				-      "dependencies": {
			
 
				-        "bare-events": "^2.7.0"
			
 
				-      }
			
 
				-    },
			
 
				-    "node_modules/extract-zip": {
			
 
				-      "version": "2.0.1",
			
 
				-      "resolved": "https://registry.npmjs.org/extract-zip/-/extract-zip-2.0.1.tgz",
			
 
				-      "integrity": "sha512-GDhU9ntwuKyGXdZBUgTIe+vXnWj0fppUEtMDL0+idd5Sta8TGpHssn/eusA9mrPr9qNDym6SxAYZjNvCn/9RBg==",
			
 
				-      "license": "BSD-2-Clause",
			
 
				-      "dependencies": {
			
 
				-        "debug": "^4.1.1",
			
 
				-        "get-stream": "^5.1.0",
			
 
				-        "yauzl": "^2.10.0"
			
 
				-      },
			
 
				-      "bin": {
			
 
				-        "extract-zip": "cli.js"
			
 
				-      },
			
 
				-      "engines": {
			
 
				-        "node": ">= 10.17.0"
			
 
				-      },
			
 
				-      "optionalDependencies": {
			
 
				-        "@types/yauzl": "^2.9.1"
			
 
				-      }
			
 
				-    },
			
 
				-    "node_modules/fast-fifo": {
			
 
				-      "version": "1.3.2",
			
 
				-      "resolved": "https://registry.npmjs.org/fast-fifo/-/fast-fifo-1.3.2.tgz",
			
 
				-      "integrity": "sha512-/d9sfos4yxzpwkDkuN7k2SqFKtYNmCTzgfEpz82x34IM9/zc8KGxQoXg1liNC/izpRM/MBdt44Nmx41ZWqk+FQ==",
			
 
				-      "license": "MIT"
			
 
				-    },
			
 
				-    "node_modules/fd-slicer": {
			
 
				-      "version": "1.1.0",
			
 
				-      "resolved": "https://registry.npmjs.org/fd-slicer/-/fd-slicer-1.1.0.tgz",
			
 
				-      "integrity": "sha512-cE1qsB/VwyQozZ+q1dGxR8LBYNZeofhEdUNGSMbQD3Gw2lAzX9Zb3uIU6Ebc/Fmyjo9AWWfnn0AUCHqtevs/8g==",
			
 
				-      "license": "MIT",
			
 
				-      "dependencies": {
			
 
				-        "pend": "~1.2.0"
			
 
				-      }
			
 
				-    },
			
 
				-    "node_modules/get-caller-file": {
			
 
				-      "version": "2.0.5",
			
 
				-      "resolved": "https://registry.npmjs.org/get-caller-file/-/get-caller-file-2.0.5.tgz",
			
 
				-      "integrity": "sha512-DyFP3BM/3YHTQOCUL/w0OZHR0lpKeGrxotcHWcqNEdnltqFwXVfhEBQ94eIo34AfQpo0rGki4cyIiftY06h2Fg==",
			
 
				-      "license": "ISC",
			
 
				-      "engines": {
			
 
				-        "node": "6.* || 8.* || >= 10.*"
			
 
				-      }
			
 
				-    },
			
 
				-    "node_modules/get-stream": {
			
 
				-      "version": "5.2.0",
			
 
				-      "resolved": "https://registry.npmjs.org/get-stream/-/get-stream-5.2.0.tgz",
			
 
				-      "integrity": "sha512-nBF+F1rAZVCu/p7rjzgA+Yb4lfYXrpl7a6VmJrU8wF9I1CKvP/QwPNZHnOlwbTkY6dvtFIzFMSyQXbLoTQPRpA==",
			
 
				-      "license": "MIT",
			
 
				-      "dependencies": {
			
 
				-        "pump": "^3.0.0"
			
 
				-      },
			
 
				-      "engines": {
			
 
				-        "node": ">=8"
			
 
				-      },
			
 
				-      "funding": {
			
 
				-        "url": "https://github.com/sponsors/sindresorhus"
			
 
				-      }
			
 
				-    },
			
 
				-    "node_modules/get-uri": {
			
 
				-      "version": "6.0.5",
			
 
				-      "resolved": "https://registry.npmjs.org/get-uri/-/get-uri-6.0.5.tgz",
			
 
				-      "integrity": "sha512-b1O07XYq8eRuVzBNgJLstU6FYc1tS6wnMtF1I1D9lE8LxZSOGZ7LhxN54yPP6mGw5f2CkXY2BQUL9Fx41qvcIg==",
			
 
				-      "license": "MIT",
			
 
				-      "dependencies": {
			
 
				-        "basic-ftp": "^5.0.2",
			
 
				-        "data-uri-to-buffer": "^6.0.2",
			
 
				-        "debug": "^4.3.4"
			
 
				-      },
			
 
				-      "engines": {
			
 
				-        "node": ">= 14"
			
 
				-      }
			
 
				-    },
			
 
				-    "node_modules/http-proxy-agent": {
			
 
				-      "version": "7.0.2",
			
 
				-      "resolved": "https://registry.npmjs.org/http-proxy-agent/-/http-proxy-agent-7.0.2.tgz",
			
 
				-      "integrity": "sha512-T1gkAiYYDWYx3V5Bmyu7HcfcvL7mUrTWiM6yOfa3PIphViJ/gFPbvidQ+veqSOHci/PxBcDabeUNCzpOODJZig==",
			
 
				-      "license": "MIT",
			
 
				-      "dependencies": {
			
 
				-        "agent-base": "^7.1.0",
			
 
				-        "debug": "^4.3.4"
			
 
				-      },
			
 
				-      "engines": {
			
 
				-        "node": ">= 14"
			
 
				-      }
			
 
				-    },
			
 
				-    "node_modules/https-proxy-agent": {
			
 
				-      "version": "7.0.6",
			
 
				-      "resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-7.0.6.tgz",
			
 
				-      "integrity": "sha512-vK9P5/iUfdl95AI+JVyUuIcVtd4ofvtrOr3HNtM2yxC9bnMbEdp3x01OhQNnjb8IJYi38VlTE3mBXwcfvywuSw==",
			
 
				-      "license": "MIT",
			
 
				-      "dependencies": {
			
 
				-        "agent-base": "^7.1.2",
			
 
				-        "debug": "4"
			
 
				-      },
			
 
				-      "engines": {
			
 
				-        "node": ">= 14"
			
 
				-      }
			
 
				-    },
			
 
				-    "node_modules/ip-address": {
			
 
				-      "version": "10.1.0",
			
 
				-      "resolved": "https://registry.npmjs.org/ip-address/-/ip-address-10.1.0.tgz",
			
 
				-      "integrity": "sha512-XXADHxXmvT9+CRxhXg56LJovE+bmWnEWB78LB83VZTprKTmaC5QfruXocxzTZ2Kl0DNwKuBdlIhjL8LeY8Sf8Q==",
			
 
				-      "license": "MIT",
			
 
				-      "engines": {
			
 
				-        "node": ">= 12"
			
 
				-      }
			
 
				-    },
			
 
				-    "node_modules/is-fullwidth-code-point": {
			
 
				-      "version": "3.0.0",
			
 
				-      "resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-3.0.0.tgz",
			
 
				-      "integrity": "sha512-zymm5+u+sCsSWyD9qNaejV3DFvhCKclKdizYaJUuHA83RLjb7nSuGnddCHGv0hk+KY7BMAlsWeK4Ueg6EV6XQg==",
			
 
				-      "license": "MIT",
			
 
				-      "engines": {
			
 
				-        "node": ">=8"
			
 
				-      }
			
 
				-    },
			
 
				-    "node_modules/lru-cache": {
			
 
				-      "version": "7.18.3",
			
 
				-      "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-7.18.3.tgz",
			
 
				-      "integrity": "sha512-jumlc0BIUrS3qJGgIkWZsyfAM7NCWiBcCDhnd+3NNM5KbBmLTgHVfWBcg6W+rLUsIpzpERPsvwUP7CckAQSOoA==",
			
 
				-      "license": "ISC",
			
 
				-      "engines": {
			
 
				-        "node": ">=12"
			
 
				-      }
			
 
				-    },
			
 
				-    "node_modules/mitt": {
			
 
				-      "version": "3.0.1",
			
 
				-      "resolved": "https://registry.npmjs.org/mitt/-/mitt-3.0.1.tgz",
			
 
				-      "integrity": "sha512-vKivATfr97l2/QBCYAkXYDbrIWPM2IIKEl7YPhjCvKlG3kE2gm+uBo6nEXK3M5/Ffh/FLpKExzOQ3JJoJGFKBw==",
			
 
				-      "license": "MIT"
			
 
				-    },
			
 
				-    "node_modules/ms": {
			
 
				-      "version": "2.1.3",
			
 
				-      "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz",
			
 
				-      "integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==",
			
 
				-      "license": "MIT"
			
 
				-    },
			
 
				-    "node_modules/netmask": {
			
 
				-      "version": "2.0.2",
			
 
				-      "resolved": "https://registry.npmjs.org/netmask/-/netmask-2.0.2.tgz",
			
 
				-      "integrity": "sha512-dBpDMdxv9Irdq66304OLfEmQ9tbNRFnFTuZiLo+bD+r332bBmMJ8GBLXklIXXgxd3+v9+KUnZaUR5PJMa75Gsg==",
			
 
				-      "license": "MIT",
			
 
				-      "engines": {
			
 
				-        "node": ">= 0.4.0"
			
 
				-      }
			
 
				-    },
			
 
				-    "node_modules/once": {
			
 
				-      "version": "1.4.0",
			
 
				-      "resolved": "https://registry.npmjs.org/once/-/once-1.4.0.tgz",
			
 
				-      "integrity": "sha512-lNaJgI+2Q5URQBkccEKHTQOPaXdUxnZZElQTZY0MFUAuaEqe1E+Nyvgdz/aIyNi6Z9MzO5dv1H8n58/GELp3+w==",
			
 
				-      "license": "ISC",
			
 
				-      "dependencies": {
			
 
				-        "wrappy": "1"
			
 
				-      }
			
 
				-    },
			
 
				-    "node_modules/pac-proxy-agent": {
			
 
				-      "version": "7.2.0",
			
 
				-      "resolved": "https://registry.npmjs.org/pac-proxy-agent/-/pac-proxy-agent-7.2.0.tgz",
			
 
				-      "integrity": "sha512-TEB8ESquiLMc0lV8vcd5Ql/JAKAoyzHFXaStwjkzpOpC5Yv+pIzLfHvjTSdf3vpa2bMiUQrg9i6276yn8666aA==",
			
 
				-      "license": "MIT",
			
 
				-      "dependencies": {
			
 
				-        "@tootallnate/quickjs-emscripten": "^0.23.0",
			
 
				-        "agent-base": "^7.1.2",
			
 
				-        "debug": "^4.3.4",
			
 
				-        "get-uri": "^6.0.1",
			
 
				-        "http-proxy-agent": "^7.0.0",
			
 
				-        "https-proxy-agent": "^7.0.6",
			
 
				-        "pac-resolver": "^7.0.1",
			
 
				-        "socks-proxy-agent": "^8.0.5"
			
 
				-      },
			
 
				-      "engines": {
			
 
				-        "node": ">= 14"
			
 
				-      }
			
 
				-    },
			
 
				-    "node_modules/pac-resolver": {
			
 
				-      "version": "7.0.1",
			
 
				-      "resolved": "https://registry.npmjs.org/pac-resolver/-/pac-resolver-7.0.1.tgz",
			
 
				-      "integrity": "sha512-5NPgf87AT2STgwa2ntRMr45jTKrYBGkVU36yT0ig/n/GMAa3oPqhZfIQ2kMEimReg0+t9kZViDVZ83qfVUlckg==",
			
 
				-      "license": "MIT",
			
 
				-      "dependencies": {
			
 
				-        "degenerator": "^5.0.0",
			
 
				-        "netmask": "^2.0.2"
			
 
				-      },
			
 
				-      "engines": {
			
 
				-        "node": ">= 14"
			
 
				-      }
			
 
				-    },
			
 
				-    "node_modules/pend": {
			
 
				-      "version": "1.2.0",
			
 
				-      "resolved": "https://registry.npmjs.org/pend/-/pend-1.2.0.tgz",
			
 
				-      "integrity": "sha512-F3asv42UuXchdzt+xXqfW1OGlVBe+mxa2mqI0pg5yAHZPvFmY3Y6drSf/GQ1A86WgWEN9Kzh/WrgKa6iGcHXLg==",
			
 
				-      "license": "MIT"
			
 
				-    },
			
 
				-    "node_modules/progress": {
			
 
				-      "version": "2.0.3",
			
 
				-      "resolved": "https://registry.npmjs.org/progress/-/progress-2.0.3.tgz",
			
 
				-      "integrity": "sha512-7PiHtLll5LdnKIMw100I+8xJXR5gW2QwWYkT6iJva0bXitZKa/XMrSbdmg3r2Xnaidz9Qumd0VPaMrZlF9V9sA==",
			
 
				-      "license": "MIT",
			
 
				-      "engines": {
			
 
				-        "node": ">=0.4.0"
			
 
				-      }
			
 
				-    },
			
 
				-    "node_modules/proxy-agent": {
			
 
				-      "version": "6.5.0",
			
 
				-      "resolved": "https://registry.npmjs.org/proxy-agent/-/proxy-agent-6.5.0.tgz",
			
 
				-      "integrity": "sha512-TmatMXdr2KlRiA2CyDu8GqR8EjahTG3aY3nXjdzFyoZbmB8hrBsTyMezhULIXKnC0jpfjlmiZ3+EaCzoInSu/A==",
			
 
				-      "license": "MIT",
			
 
				-      "dependencies": {
			
 
				-        "agent-base": "^7.1.2",
			
 
				-        "debug": "^4.3.4",
			
 
				-        "http-proxy-agent": "^7.0.1",
			
 
				-        "https-proxy-agent": "^7.0.6",
			
 
				-        "lru-cache": "^7.14.1",
			
 
				-        "pac-proxy-agent": "^7.1.0",
			
 
				-        "proxy-from-env": "^1.1.0",
			
 
				-        "socks-proxy-agent": "^8.0.5"
			
 
				-      },
			
 
				-      "engines": {
			
 
				-        "node": ">= 14"
			
 
				-      }
			
 
				-    },
			
 
				-    "node_modules/proxy-from-env": {
			
 
				-      "version": "1.1.0",
			
 
				-      "resolved": "https://registry.npmjs.org/proxy-from-env/-/proxy-from-env-1.1.0.tgz",
			
 
				-      "integrity": "sha512-D+zkORCbA9f1tdWRK0RaCR3GPv50cMxcrz4X8k5LTSUD1Dkw47mKJEZQNunItRTkWwgtaUSo1RVFRIG9ZXiFYg==",
			
 
				-      "license": "MIT"
			
 
				-    },
			
 
				-    "node_modules/pump": {
			
 
				-      "version": "3.0.3",
			
 
				-      "resolved": "https://registry.npmjs.org/pump/-/pump-3.0.3.tgz",
			
 
				-      "integrity": "sha512-todwxLMY7/heScKmntwQG8CXVkWUOdYxIvY2s0VWAAMh/nd8SoYiRaKjlr7+iCs984f2P8zvrfWcDDYVb73NfA==",
			
 
				-      "license": "MIT",
			
 
				-      "dependencies": {
			
 
				-        "end-of-stream": "^1.1.0",
			
 
				-        "once": "^1.3.1"
			
 
				-      }
			
 
				-    },
			
 
				-    "node_modules/puppeteer-core": {
			
 
				-      "version": "24.34.0",
			
 
				-      "resolved": "https://registry.npmjs.org/puppeteer-core/-/puppeteer-core-24.34.0.tgz",
			
 
				-      "integrity": "sha512-24evawO+mUGW4mvS2a2ivwLdX3gk8zRLZr9HP+7+VT2vBQnm0oh9jJEZmUE3ePJhRkYlZ93i7OMpdcoi2qNCLg==",
			
 
				-      "license": "Apache-2.0",
			
 
				-      "dependencies": {
			
 
				-        "@puppeteer/browsers": "2.11.0",
			
 
				-        "chromium-bidi": "12.0.1",
			
 
				-        "debug": "^4.4.3",
			
 
				-        "devtools-protocol": "0.0.1534754",
			
 
				-        "typed-query-selector": "^2.12.0",
			
 
				-        "webdriver-bidi-protocol": "0.3.10",
			
 
				-        "ws": "^8.18.3"
			
 
				-      },
			
 
				-      "engines": {
			
 
				-        "node": ">=18"
			
 
				-      }
			
 
				-    },
			
 
				-    "node_modules/require-directory": {
			
 
				-      "version": "2.1.1",
			
 
				-      "resolved": "https://registry.npmjs.org/require-directory/-/require-directory-2.1.1.tgz",
			
 
				-      "integrity": "sha512-fGxEI7+wsG9xrvdjsrlmL22OMTTiHRwAMroiEeMgq8gzoLC/PQr7RsRDSTLUg/bZAZtF+TVIkHc6/4RIKrui+Q==",
			
 
				-      "license": "MIT",
			
 
				-      "engines": {
			
 
				-        "node": ">=0.10.0"
			
 
				-      }
			
 
				-    },
			
 
				-    "node_modules/semver": {
			
 
				-      "version": "7.7.3",
			
 
				-      "resolved": "https://registry.npmjs.org/semver/-/semver-7.7.3.tgz",
			
 
				-      "integrity": "sha512-SdsKMrI9TdgjdweUSR9MweHA4EJ8YxHn8DFaDisvhVlUOe4BF1tLD7GAj0lIqWVl+dPb/rExr0Btby5loQm20Q==",
			
 
				-      "license": "ISC",
			
 
				-      "bin": {
			
 
				-        "semver": "bin/semver.js"
			
 
				-      },
			
 
				-      "engines": {
			
 
				-        "node": ">=10"
			
 
				-      }
			
 
				-    },
			
 
				-    "node_modules/smart-buffer": {
			
 
				-      "version": "4.2.0",
			
 
				-      "resolved": "https://registry.npmjs.org/smart-buffer/-/smart-buffer-4.2.0.tgz",
			
 
				-      "integrity": "sha512-94hK0Hh8rPqQl2xXc3HsaBoOXKV20MToPkcXvwbISWLEs+64sBq5kFgn2kJDHb1Pry9yrP0dxrCI9RRci7RXKg==",
			
 
				-      "license": "MIT",
			
 
				-      "engines": {
			
 
				-        "node": ">= 6.0.0",
			
 
				-        "npm": ">= 3.0.0"
			
 
				-      }
			
 
				-    },
			
 
				-    "node_modules/socks": {
			
 
				-      "version": "2.8.7",
			
 
				-      "resolved": "https://registry.npmjs.org/socks/-/socks-2.8.7.tgz",
			
 
				-      "integrity": "sha512-HLpt+uLy/pxB+bum/9DzAgiKS8CX1EvbWxI4zlmgGCExImLdiad2iCwXT5Z4c9c3Eq8rP2318mPW2c+QbtjK8A==",
			
 
				-      "license": "MIT",
			
 
				-      "dependencies": {
			
 
				-        "ip-address": "^10.0.1",
			
 
				-        "smart-buffer": "^4.2.0"
			
 
				-      },
			
 
				-      "engines": {
			
 
				-        "node": ">= 10.0.0",
			
 
				-        "npm": ">= 3.0.0"
			
 
				-      }
			
 
				-    },
			
 
				-    "node_modules/socks-proxy-agent": {
			
 
				-      "version": "8.0.5",
			
 
				-      "resolved": "https://registry.npmjs.org/socks-proxy-agent/-/socks-proxy-agent-8.0.5.tgz",
			
 
				-      "integrity": "sha512-HehCEsotFqbPW9sJ8WVYB6UbmIMv7kUUORIF2Nncq4VQvBfNBLibW9YZR5dlYCSUhwcD628pRllm7n+E+YTzJw==",
			
 
				-      "license": "MIT",
			
 
				-      "dependencies": {
			
 
				-        "agent-base": "^7.1.2",
			
 
				-        "debug": "^4.3.4",
			
 
				-        "socks": "^2.8.3"
			
 
				-      },
			
 
				-      "engines": {
			
 
				-        "node": ">= 14"
			
 
				-      }
			
 
				-    },
			
 
				-    "node_modules/source-map": {
			
 
				-      "version": "0.6.1",
			
 
				-      "resolved": "https://registry.npmjs.org/source-map/-/source-map-0.6.1.tgz",
			
 
				-      "integrity": "sha512-UjgapumWlbMhkBgzT7Ykc5YXUT46F0iKu8SGXq0bcwP5dz/h0Plj6enJqjz1Zbq2l5WaqYnrVbwWOWMyF3F47g==",
			
 
				-      "license": "BSD-3-Clause",
			
 
				-      "optional": true,
			
 
				-      "engines": {
			
 
				-        "node": ">=0.10.0"
			
 
				-      }
			
 
				-    },
			
 
				-    "node_modules/streamx": {
			
 
				-      "version": "2.23.0",
			
 
				-      "resolved": "https://registry.npmjs.org/streamx/-/streamx-2.23.0.tgz",
			
 
				-      "integrity": "sha512-kn+e44esVfn2Fa/O0CPFcex27fjIL6MkVae0Mm6q+E6f0hWv578YCERbv+4m02cjxvDsPKLnmxral/rR6lBMAg==",
			
 
				-      "license": "MIT",
			
 
				-      "dependencies": {
			
 
				-        "events-universal": "^1.0.0",
			
 
				-        "fast-fifo": "^1.3.2",
			
 
				-        "text-decoder": "^1.1.0"
			
 
				-      }
			
 
				-    },
			
 
				-    "node_modules/string-width": {
			
 
				-      "version": "4.2.3",
			
 
				-      "resolved": "https://registry.npmjs.org/string-width/-/string-width-4.2.3.tgz",
			
 
				-      "integrity": "sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==",
			
 
				-      "license": "MIT",
			
 
				-      "dependencies": {
			
 
				-        "emoji-regex": "^8.0.0",
			
 
				-        "is-fullwidth-code-point": "^3.0.0",
			
 
				-        "strip-ansi": "^6.0.1"
			
 
				-      },
			
 
				-      "engines": {
			
 
				-        "node": ">=8"
			
 
				-      }
			
 
				-    },
			
 
				-    "node_modules/strip-ansi": {
			
 
				-      "version": "6.0.1",
			
 
				-      "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz",
			
 
				-      "integrity": "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==",
			
 
				-      "license": "MIT",
			
 
				-      "dependencies": {
			
 
				-        "ansi-regex": "^5.0.1"
			
 
				-      },
			
 
				-      "engines": {
			
 
				-        "node": ">=8"
			
 
				-      }
			
 
				-    },
			
 
				-    "node_modules/tar-fs": {
			
 
				-      "version": "3.1.1",
			
 
				-      "resolved": "https://registry.npmjs.org/tar-fs/-/tar-fs-3.1.1.tgz",
			
 
				-      "integrity": "sha512-LZA0oaPOc2fVo82Txf3gw+AkEd38szODlptMYejQUhndHMLQ9M059uXR+AfS7DNo0NpINvSqDsvyaCrBVkptWg==",
			
 
				-      "license": "MIT",
			
 
				-      "dependencies": {
			
 
				-        "pump": "^3.0.0",
			
 
				-        "tar-stream": "^3.1.5"
			
 
				-      },
			
 
				-      "optionalDependencies": {
			
 
				-        "bare-fs": "^4.0.1",
			
 
				-        "bare-path": "^3.0.0"
			
 
				-      }
			
 
				-    },
			
 
				-    "node_modules/tar-stream": {
			
 
				-      "version": "3.1.7",
			
 
				-      "resolved": "https://registry.npmjs.org/tar-stream/-/tar-stream-3.1.7.tgz",
			
 
				-      "integrity": "sha512-qJj60CXt7IU1Ffyc3NJMjh6EkuCFej46zUqJ4J7pqYlThyd9bO0XBTmcOIhSzZJVWfsLks0+nle/j538YAW9RQ==",
			
 
				-      "license": "MIT",
			
 
				-      "dependencies": {
			
 
				-        "b4a": "^1.6.4",
			
 
				-        "fast-fifo": "^1.2.0",
			
 
				-        "streamx": "^2.15.0"
			
 
				-      }
			
 
				-    },
			
 
				-    "node_modules/text-decoder": {
			
 
				-      "version": "1.2.3",
			
 
				-      "resolved": "https://registry.npmjs.org/text-decoder/-/text-decoder-1.2.3.tgz",
			
 
				-      "integrity": "sha512-3/o9z3X0X0fTupwsYvR03pJ/DjWuqqrfwBgTQzdWDiQSm9KitAyz/9WqsT2JQW7KV2m+bC2ol/zqpW37NHxLaA==",
			
 
				-      "license": "Apache-2.0",
			
 
				-      "dependencies": {
			
 
				-        "b4a": "^1.6.4"
			
 
				-      }
			
 
				-    },
			
 
				-    "node_modules/tslib": {
			
 
				-      "version": "2.8.1",
			
 
				-      "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.8.1.tgz",
			
 
				-      "integrity": "sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w==",
			
 
				-      "license": "0BSD"
			
 
				-    },
			
 
				-    "node_modules/typed-query-selector": {
			
 
				-      "version": "2.12.0",
			
 
				-      "resolved": "https://registry.npmjs.org/typed-query-selector/-/typed-query-selector-2.12.0.tgz",
			
 
				-      "integrity": "sha512-SbklCd1F0EiZOyPiW192rrHZzZ5sBijB6xM+cpmrwDqObvdtunOHHIk9fCGsoK5JVIYXoyEp4iEdE3upFH3PAg==",
			
 
				-      "license": "MIT"
			
 
				-    },
			
 
				-    "node_modules/undici-types": {
			
 
				-      "version": "7.16.0",
			
 
				-      "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-7.16.0.tgz",
			
 
				-      "integrity": "sha512-Zz+aZWSj8LE6zoxD+xrjh4VfkIG8Ya6LvYkZqtUQGJPZjYl53ypCaUwWqo7eI0x66KBGeRo+mlBEkMSeSZ38Nw==",
			
 
				-      "license": "MIT",
			
 
				-      "optional": true
			
 
				-    },
			
 
				-    "node_modules/webdriver-bidi-protocol": {
			
 
				-      "version": "0.3.10",
			
 
				-      "resolved": "https://registry.npmjs.org/webdriver-bidi-protocol/-/webdriver-bidi-protocol-0.3.10.tgz",
			
 
				-      "integrity": "sha512-5LAE43jAVLOhB/QqX4bwSiv0Hg1HBfMmOuwBSXHdvg4GMGu9Y0lIq7p4R/yySu6w74WmaR4GM4H9t2IwLW7hgw==",
			
 
				-      "license": "Apache-2.0"
			
 
				-    },
			
 
				-    "node_modules/wrap-ansi": {
			
 
				-      "version": "7.0.0",
			
 
				-      "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-7.0.0.tgz",
			
 
				-      "integrity": "sha512-YVGIj2kamLSTxw6NsZjoBxfSwsn0ycdesmc4p+Q21c5zPuZ1pl+NfxVdxPtdHvmNVOQ6XSYG4AUtyt/Fi7D16Q==",
			
 
				-      "license": "MIT",
			
 
				-      "dependencies": {
			
 
				-        "ansi-styles": "^4.0.0",
			
 
				-        "string-width": "^4.1.0",
			
 
				-        "strip-ansi": "^6.0.0"
			
 
				-      },
			
 
				-      "engines": {
			
 
				-        "node": ">=10"
			
 
				-      },
			
 
				-      "funding": {
			
 
				-        "url": "https://github.com/chalk/wrap-ansi?sponsor=1"
			
 
				-      }
			
 
				-    },
			
 
				-    "node_modules/wrappy": {
			
 
				-      "version": "1.0.2",
			
 
				-      "resolved": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz",
			
 
				-      "integrity": "sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ==",
			
 
				-      "license": "ISC"
			
 
				-    },
			
 
				-    "node_modules/ws": {
			
 
				-      "version": "8.18.3",
			
 
				-      "resolved": "https://registry.npmjs.org/ws/-/ws-8.18.3.tgz",
			
 
				-      "integrity": "sha512-PEIGCY5tSlUt50cqyMXfCzX+oOPqN0vuGqWzbcJ2xvnkzkq46oOpz7dQaTDBdfICb4N14+GARUDw2XV2N4tvzg==",
			
 
				-      "license": "MIT",
			
 
				-      "engines": {
			
 
				-        "node": ">=10.0.0"
			
 
				-      },
			
 
				-      "peerDependencies": {
			
 
				-        "bufferutil": "^4.0.1",
			
 
				-        "utf-8-validate": ">=5.0.2"
			
 
				-      },
			
 
				-      "peerDependenciesMeta": {
			
 
				-        "bufferutil": {
			
 
				-          "optional": true
			
 
				-        },
			
 
				-        "utf-8-validate": {
			
 
				-          "optional": true
			
 
				-        }
			
 
				-      }
			
 
				-    },
			
 
				-    "node_modules/y18n": {
			
 
				-      "version": "5.0.8",
			
 
				-      "resolved": "https://registry.npmjs.org/y18n/-/y18n-5.0.8.tgz",
			
 
				-      "integrity": "sha512-0pfFzegeDWJHJIAmTLRP2DwHjdF5s7jo9tuztdQxAhINCdvS+3nGINqPd00AphqJR/0LhANUS6/+7SCb98YOfA==",
			
 
				-      "license": "ISC",
			
 
				-      "engines": {
			
 
				-        "node": ">=10"
			
 
				-      }
			
 
				-    },
			
 
				-    "node_modules/yargs": {
			
 
				-      "version": "17.7.2",
			
 
				-      "resolved": "https://registry.npmjs.org/yargs/-/yargs-17.7.2.tgz",
			
 
				-      "integrity": "sha512-7dSzzRQ++CKnNI/krKnYRV7JKKPUXMEh61soaHKg9mrWEhzFWhFnxPxGl+69cD1Ou63C13NUPCnmIcrvqCuM6w==",
			
 
				-      "license": "MIT",
			
 
				-      "dependencies": {
			
 
				-        "cliui": "^8.0.1",
			
 
				-        "escalade": "^3.1.1",
			
 
				-        "get-caller-file": "^2.0.5",
			
 
				-        "require-directory": "^2.1.1",
			
 
				-        "string-width": "^4.2.3",
			
 
				-        "y18n": "^5.0.5",
			
 
				-        "yargs-parser": "^21.1.1"
			
 
				-      },
			
 
				-      "engines": {
			
 
				-        "node": ">=12"
			
 
				-      }
			
 
				-    },
			
 
				-    "node_modules/yargs-parser": {
			
 
				-      "version": "21.1.1",
			
 
				-      "resolved": "https://registry.npmjs.org/yargs-parser/-/yargs-parser-21.1.1.tgz",
			
 
				-      "integrity": "sha512-tVpsJW7DdjecAiFpbIB1e3qxIQsE6NoPc5/eTdrbbIC4h0LVsWhnoa3g+m2HclBIujHzsxZ4VJVA+GUuc2/LBw==",
			
 
				-      "license": "ISC",
			
 
				-      "engines": {
			
 
				-        "node": ">=12"
			
 
				-      }
			
 
				-    },
			
 
				-    "node_modules/yauzl": {
			
 
				-      "version": "2.10.0",
			
 
				-      "resolved": "https://registry.npmjs.org/yauzl/-/yauzl-2.10.0.tgz",
			
 
				-      "integrity": "sha512-p4a9I6X6nu6IhoGmBqAcbJy1mlC4j27vEPZX9F4L4/vZT3Lyq1VkFHw/V/PUcB9Buo+DG3iHkT0x3Qya58zc3g==",
			
 
				-      "license": "MIT",
			
 
				-      "dependencies": {
			
 
				-        "buffer-crc32": "~0.2.3",
			
 
				-        "fd-slicer": "~1.1.0"
			
 
				-      }
			
 
				-    },
			
 
				-    "node_modules/zod": {
			
 
				-      "version": "3.25.76",
			
 
				-      "resolved": "https://registry.npmjs.org/zod/-/zod-3.25.76.tgz",
			
 
				-      "integrity": "sha512-gzUt/qt81nXsFGKIFcC3YnfEAx5NkunCfnDlvuBSSFS02bcXu4Lmea0AFIUwbLWxWPx3d9p8S5QoaujKcNQxcQ==",
			
 
				-      "license": "MIT",
			
 
				-      "funding": {
			
 
				-        "url": "https://github.com/sponsors/colinhacks"
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-}
			
--- a/archivebox/plugins/package.json
+++ b/archivebox/plugins/package.json
@@ -1 +0,0 @@
 
				-{"name":"archivebox-plugins","private":true,"dependencies":{"puppeteer-core":"^24.34.0"}}
			
--- a/archivebox/plugins/papersdl/config.json
+++ b/archivebox/plugins/papersdl/config.json
@@ -3,9 +3,10 @@
 
				   "type": "object",
			
 
				   "additionalProperties": false,
			
 
				   "properties": {
			
 
				-    "SAVE_PAPERSDL": {
			
 
				+    "PAPERSDL_ENABLED": {
			
 
				       "type": "boolean",
			
 
				       "default": true,
			
 
				+      "x-aliases": ["SAVE_PAPERSDL", "USE_PAPERSDL"],
			
 
				       "description": "Enable paper downloading with papers-dl"
			
 
				     },
			
 
				     "PAPERSDL_BINARY": {
			
--- a/archivebox/plugins/parse_html_urls/on_Snapshot__70_parse_html_urls.py
+++ b/archivebox/plugins/parse_html_urls/on_Snapshot__70_parse_html_urls.py
@@ -170,10 +170,6 @@ def main(url: str, snapshot_id: str = None, crawl_id: str = None, depth: int = 0
 
				             if normalized != url:
			
 
				                 urls_found.add(unescape(normalized))
			
 
				 
			
 
				-    if not urls_found:
			
 
				-        click.echo('No URLs found', err=True)
			
 
				-        sys.exit(1)
			
 
				-
			
 
				     # Emit Snapshot records to stdout (JSONL)
			
 
				     for found_url in sorted(urls_found):
			
 
				         record = {
			
@@ -189,7 +185,17 @@ def main(url: str, snapshot_id: str = None, crawl_id: str = None, depth: int = 0
 
				 
			
 
				         print(json.dumps(record))
			
 
				 
			
 
				-    click.echo(f'Found {len(urls_found)} URLs', err=True)
			
 
				+    # Emit ArchiveResult record to mark completion
			
 
				+    status = 'succeeded' if urls_found else 'skipped'
			
 
				+    output_str = f'Found {len(urls_found)} URLs' if urls_found else 'No URLs found'
			
 
				+    ar_record = {
			
 
				+        'type': 'ArchiveResult',
			
 
				+        'status': status,
			
 
				+        'output_str': output_str,
			
 
				+    }
			
 
				+    print(json.dumps(ar_record))
			
 
				+
			
 
				+    click.echo(output_str, err=True)
			
 
				     sys.exit(0)
			
 
				 
			
 
				 
			
--- a/archivebox/plugins/parse_html_urls/tests/test_parse_html_urls.py
+++ b/archivebox/plugins/parse_html_urls/tests/test_parse_html_urls.py
@@ -27,12 +27,13 @@ class TestParseHtmlUrls:
 
				 
			
 
				         assert result.returncode == 0, f"Failed to parse example.com: {result.stderr}"
			
 
				 
			
 
				-        output_file = tmp_path / 'urls.jsonl'
			
 
				-        assert output_file.exists(), "Output file not created"
			
 
				+        # Verify stdout contains JSONL records for discovered URLs
			
 
				+        # example.com links to iana.org
			
 
				+        assert 'iana.org' in result.stdout or 'example' in result.stdout, "Expected links from example.com not found"
			
 
				 
			
 
				-        # Verify output contains IANA link (example.com links to iana.org)
			
 
				-        content = output_file.read_text()
			
 
				-        assert 'iana.org' in content or 'example' in content, "Expected links from example.com not found"
			
 
				+        # Verify ArchiveResult record is present
			
 
				+        assert '"type": "ArchiveResult"' in result.stdout, "Missing ArchiveResult record"
			
 
				+        assert '"status": "succeeded"' in result.stdout, "Missing success status"
			
 
				 
			
 
				     def test_extracts_href_urls(self, tmp_path):
			
 
				         """Test extracting URLs from anchor tags."""
			
@@ -56,17 +57,16 @@ class TestParseHtmlUrls:
 
				         )
			
 
				 
			
 
				         assert result.returncode == 0
			
 
				-        assert 'Found 3 URLs' in result.stdout
			
 
				+        assert 'Found 3 URLs' in result.stderr
			
 
				 
			
 
				-        output_file = tmp_path / 'urls.jsonl'
			
 
				-        assert output_file.exists()
			
 
				-
			
 
				-        lines = output_file.read_text().strip().split('\n')
			
 
				-        assert len(lines) == 3
			
 
				+        # Parse Snapshot records from stdout
			
 
				+        lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '"type": "Snapshot"' in line]
			
 
				+        assert len(lines) == 3, f"Expected 3 Snapshot records, got {len(lines)}"
			
 
				 
			
 
				         urls = set()
			
 
				         for line in lines:
			
 
				             entry = json.loads(line)
			
 
				+            assert entry['type'] == 'Snapshot'
			
 
				             assert 'url' in entry
			
 
				             urls.add(entry['url'])
			
 
				 
			
@@ -74,6 +74,10 @@ class TestParseHtmlUrls:
 
				         assert 'https://foo.bar/page' in urls
			
 
				         assert 'http://test.org' in urls
			
 
				 
			
 
				+        # Verify ArchiveResult record
			
 
				+        assert '"type": "ArchiveResult"' in result.stdout
			
 
				+        assert '"status": "succeeded"' in result.stdout
			
 
				+
			
 
				     def test_ignores_non_http_schemes(self, tmp_path):
			
 
				         """Test that non-http schemes are ignored."""
			
 
				         input_file = tmp_path / 'page.html'
			
@@ -96,9 +100,10 @@ class TestParseHtmlUrls:
 
				         )
			
 
				 
			
 
				         assert result.returncode == 0
			
 
				-        output_file = tmp_path / 'urls.jsonl'
			
 
				-        lines = output_file.read_text().strip().split('\n')
			
 
				-        assert len(lines) == 1
			
 
				+
			
 
				+        # Parse Snapshot records from stdout
			
 
				+        lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '"type": "Snapshot"' in line]
			
 
				+        assert len(lines) == 1, f"Expected 1 Snapshot record, got {len(lines)}"
			
 
				 
			
 
				         entry = json.loads(lines[0])
			
 
				         assert entry['url'] == 'https://valid.com'
			
@@ -122,8 +127,8 @@ class TestParseHtmlUrls:
 
				         )
			
 
				 
			
 
				         assert result.returncode == 0
			
 
				-        output_file = tmp_path / 'urls.jsonl'
			
 
				-        entry = json.loads(output_file.read_text().strip())
			
 
				+        lines = [line for line in result.stdout.strip().split('\n') if '"type": "Snapshot"' in line]
			
 
				+        entry = json.loads(lines[0])
			
 
				         assert entry['url'] == 'https://example.com/page?a=1&b=2'
			
 
				 
			
 
				     def test_deduplicates_urls(self, tmp_path):
			
@@ -147,8 +152,7 @@ class TestParseHtmlUrls:
 
				         )
			
 
				 
			
 
				         assert result.returncode == 0
			
 
				-        output_file = tmp_path / 'urls.jsonl'
			
 
				-        lines = output_file.read_text().strip().split('\n')
			
 
				+        lines = [line for line in result.stdout.strip().split('\n') if '"type": "Snapshot"' in line]
			
 
				         assert len(lines) == 1
			
 
				 
			
 
				     def test_excludes_source_url(self, tmp_path):
			
@@ -172,14 +176,13 @@ class TestParseHtmlUrls:
 
				         )
			
 
				 
			
 
				         assert result.returncode == 0
			
 
				-        output_file = tmp_path / 'urls.jsonl'
			
 
				-        lines = output_file.read_text().strip().split('\n')
			
 
				+        lines = [line for line in result.stdout.strip().split('\n') if '"type": "Snapshot"' in line]
			
 
				         assert len(lines) == 1
			
 
				         entry = json.loads(lines[0])
			
 
				         assert entry['url'] == 'https://other.com'
			
 
				 
			
 
				-    def test_exits_1_when_no_urls_found(self, tmp_path):
			
 
				-        """Test that script exits with code 1 when no URLs found."""
			
 
				+    def test_skips_when_no_urls_found(self, tmp_path):
			
 
				+        """Test that script returns skipped status when no URLs found."""
			
 
				         input_file = tmp_path / 'page.html'
			
 
				         input_file.write_text('<html><body>No links here</body></html>')
			
 
				 
			
@@ -190,8 +193,9 @@ class TestParseHtmlUrls:
 
				             text=True,
			
 
				         )
			
 
				 
			
 
				-        assert result.returncode == 1
			
 
				+        assert result.returncode == 0
			
 
				         assert 'No URLs found' in result.stderr
			
 
				+        assert '"status": "skipped"' in result.stdout
			
 
				 
			
 
				     def test_handles_malformed_html(self, tmp_path):
			
 
				         """Test handling of malformed HTML."""
			
@@ -212,8 +216,7 @@ class TestParseHtmlUrls:
 
				         )
			
 
				 
			
 
				         assert result.returncode == 0
			
 
				-        output_file = tmp_path / 'urls.jsonl'
			
 
				-        lines = output_file.read_text().strip().split('\n')
			
 
				+        lines = [line for line in result.stdout.strip().split('\n') if '"type": "Snapshot"' in line]
			
 
				         assert len(lines) == 2
			
 
				 
			
 
				     def test_output_is_valid_json(self, tmp_path):
			
@@ -229,11 +232,11 @@ class TestParseHtmlUrls:
 
				         )
			
 
				 
			
 
				         assert result.returncode == 0
			
 
				-        output_file = tmp_path / 'urls.jsonl'
			
 
				-        entry = json.loads(output_file.read_text().strip())
			
 
				+        lines = [line for line in result.stdout.strip().split('\n') if '"type": "Snapshot"' in line]
			
 
				+        entry = json.loads(lines[0])
			
 
				         assert entry['url'] == 'https://example.com'
			
 
				-        assert 'type' in entry
			
 
				-        assert 'plugin' in entry
			
 
				+        assert entry['type'] == 'Snapshot'
			
 
				+        assert entry['plugin'] == 'parse_html_urls'
			
 
				 
			
 
				 
			
 
				 if __name__ == '__main__':
		`@@ -1 +0,0 @@`
		`-{"name":"archivebox-plugins","private":true,"dependencies":{"puppeteer-core":"^24.34.0"}}`