Nick Sweeting · 1 month ago
commit f0aa19fa7d
100 changed files with 4761 additions and 3308 deletions
  1. + 3 - 1  .claude/settings.local.json
  2. + 5 - 3  archivebox/__init__.py
  3. + 1 - 1  archivebox/api/admin.py
  4. + 2 - 2  archivebox/api/apps.py
  5. + 2 - 2  archivebox/api/migrations/0001_squashed.py
  6. + 3 - 3  archivebox/api/migrations/0003_alter_apitoken_created_by_and_more.py
  7. + 3 - 1  archivebox/api/models.py
  8. + 1 - 1  archivebox/api/v1_api.py
  9. + 2 - 2  archivebox/api/v1_auth.py
  10. + 1 - 0  archivebox/api/v1_cli.py
  11. + 8 - 10  archivebox/api/v1_core.py
  12. + 2 - 2  archivebox/api/v1_crawls.py
  13. + 7 - 7  archivebox/api/v1_machine.py
  14. + 6 - 0  archivebox/base_models/models.py
  15. + 3 - 3  archivebox/cli/archivebox_add.py
  16. + 24 - 4  archivebox/cli/archivebox_config.py
  17. + 9 - 8  archivebox/cli/archivebox_crawl.py
  18. + 3 - 4  archivebox/cli/archivebox_extract.py
  19. + 1 - 1  archivebox/cli/archivebox_init.py
  20. + 2 - 2  archivebox/cli/archivebox_install.py
  21. + 1 - 1  archivebox/cli/archivebox_remove.py
  22. + 2 - 2  archivebox/cli/archivebox_search.py
  23. + 8 - 6  archivebox/cli/archivebox_snapshot.py
  24. + 1 - 1  archivebox/cli/archivebox_status.py
  25. + 4 - 4  archivebox/cli/archivebox_update.py
  26. + 1 - 1  archivebox/cli/archivebox_version.py
  27. + 11 - 9  archivebox/cli/tests_piping.py
  28. + 18 - 154  archivebox/config/__init__.py
  29. + 31 - 2  archivebox/config/collection.py
  30. + 0 - 12  archivebox/config/common.py
  31. + 1 - 1  archivebox/config/configset.py
  32. + 1 - 1  archivebox/config/views.py
  33. + 1 - 1  archivebox/core/__init__.py
  34. + 5 - 5  archivebox/core/admin.py
  35. + 5 - 11  archivebox/core/admin_archiveresults.py
  36. + 5 - 5  archivebox/core/admin_site.py
  37. + 5 - 5  archivebox/core/admin_snapshots.py
  38. + 1 - 1  archivebox/core/admin_tags.py
  39. + 2 - 2  archivebox/core/apps.py
  40. + 1 - 1  archivebox/core/asgi.py
  41. + 174 - 29  archivebox/core/forms.py
  42. + 1 - 1  archivebox/core/migrations/0007_archiveresult.py
  43. + 1 - 1  archivebox/core/migrations/0032_alter_archiveresult_binary_and_more.py
  44. + 79 - 0  archivebox/core/migrations/0035_snapshot_crawl_non_nullable_remove_created_by.py
  45. + 19 - 0  archivebox/core/migrations/0036_remove_archiveresult_created_by.py
  46. + 510 - 172  archivebox/core/models.py
  47. + 2638 - 0  archivebox/core/models.py.bak
  48. + 22 - 21  archivebox/core/settings.py
  49. + 0 - 319  archivebox/core/statemachines.py
  50. + 20 - 0  archivebox/core/templatetags/config_tags.py
  51. + 318 - 2  archivebox/core/tests.py
  52. + 3 - 3  archivebox/core/urls.py
  53. + 65 - 41  archivebox/core/views.py
  54. + 2 - 2  archivebox/crawls/admin.py
  55. + 1 - 1  archivebox/crawls/apps.py
  56. + 149 - 11  archivebox/crawls/models.py
  57. + 0 - 114  archivebox/crawls/statemachines.py
  58. + 179 - 305  archivebox/hooks.py
  59. + 1 - 1  archivebox/machine/admin.py
  60. + 3 - 3  archivebox/machine/apps.py
  61. + 5 - 20  archivebox/machine/migrations/0001_squashed.py
  62. + 4 - 26  archivebox/machine/migrations/0002_rename_custom_cmds_to_overrides.py
  63. + 5 - 33  archivebox/machine/migrations/0003_alter_dependency_id_alter_installedbinary_dependency_and_more.py
  64. + 28 - 0  archivebox/machine/migrations/0004_drop_dependency_table.py
  65. + 0 - 56  archivebox/machine/migrations/0004_rename_installedbinary_to_binary.py
  66. + 143 - 5  archivebox/machine/models.py
  67. + 0 - 112  archivebox/machine/statemachines.py
  68. + 6 - 58  archivebox/misc/jsonl.py
  69. + 3 - 3  archivebox/misc/logging_util.py
  70. + 0 - 335  archivebox/misc/tests.py
  71. + 0 - 56  archivebox/misc/util.py
  72. + 1 - 1  archivebox/personas/apps.py
  73. + 1 - 0  archivebox/personas/models.py
  74. + 0 - 0  archivebox/plugins/accessibility/templates/icon.html
  75. + 2 - 2  archivebox/plugins/archive_org/config.json
  76. + 10 - 0  archivebox/plugins/archive_org/templates/embed.html
  77. + 10 - 0  archivebox/plugins/archive_org/templates/fullscreen.html
  78. + 12 - 0  archivebox/plugins/archive_org/templates/thumbnail.html
  79. + 0 - 15  archivebox/plugins/chrome/config.json
  80. + 0 - 0  archivebox/plugins/consolelog/templates/icon.html
  81. + 21 - 0  archivebox/plugins/dom/config.json
  82. + 2 - 1  archivebox/plugins/favicon/config.json
  83. + 9 - 6  archivebox/plugins/favicon/tests/test_favicon.py
  84. + 2 - 1  archivebox/plugins/forumdl/config.json
  85. + 17 - 7  archivebox/plugins/forumdl/tests/test_forumdl.py
  86. + 2 - 1  archivebox/plugins/gallerydl/config.json
  87. + 7 - 1  archivebox/plugins/gallerydl/tests/test_gallerydl.py
  88. + 2 - 1  archivebox/plugins/git/config.json
  89. + 9 - 2  archivebox/plugins/git/tests/test_git.py
  90. + 13 - 8  archivebox/plugins/headers/tests/test_headers.py
  91. + 0 - 279  archivebox/plugins/istilldontcareaboutcookies/tests/test_istilldontcareaboutcookies.js
  92. + 10 - 7  archivebox/plugins/media/config.json
  93. + 5 - 1  archivebox/plugins/media/tests/test_media.py
  94. + 2 - 1  archivebox/plugins/mercury/config.json
  95. + 8 - 1  archivebox/plugins/mercury/tests/test_mercury.py
  96. + 0 - 925  archivebox/plugins/package-lock.json
  97. + 0 - 1  archivebox/plugins/package.json
  98. + 2 - 1  archivebox/plugins/papersdl/config.json
  99. + 11 - 5  archivebox/plugins/parse_html_urls/on_Snapshot__70_parse_html_urls.py
  100. + 32 - 29  archivebox/plugins/parse_html_urls/tests/test_parse_html_urls.py

+ 3 - 1
.claude/settings.local.json

@@ -23,7 +23,9 @@
       "Bash(source .venv/bin/activate)",
       "Bash(mv:*)",
       "Bash(echo:*)",
-      "Bash(grep:*)"
+      "Bash(grep:*)",
+      "WebFetch(domain:python-statemachine.readthedocs.io)",
+      "Bash(./bin/run_plugin_tests.sh:*)"
     ]
   }
 }

+ 5 - 3
archivebox/__init__.py

@@ -24,12 +24,14 @@ ASCII_LOGO = """
 ╚═╝  ╚═╝╚═╝  ╚═╝ ╚═════╝╚═╝  ╚═╝╚═╝  ╚═══╝  ╚══════╝ ╚═════╝  ╚═════╝ ╚═╝  ╚═╝
 """
 
-# make sure PACKAGE_DIR is in sys.path so we can import all subfolders
-# without necessarily waiting for django to load them thorugh INSTALLED_APPS
 PACKAGE_DIR = Path(__file__).resolve().parent
+
+# Add PACKAGE_DIR to sys.path - required for Django migrations to import models
+# Migrations reference models like 'machine.Binary' which need to be importable
 if str(PACKAGE_DIR) not in sys.path:
     sys.path.append(str(PACKAGE_DIR))
-os.environ['DJANGO_SETTINGS_MODULE'] = 'core.settings'
+
+os.environ['DJANGO_SETTINGS_MODULE'] = 'archivebox.core.settings'
 os.environ['TZ'] = 'UTC'
 
 # detect ArchiveBox user's UID/GID based on data dir ownership

+ 1 - 1
archivebox/api/admin.py

@@ -5,7 +5,7 @@ from signal_webhooks.utils import get_webhook_model
 
 from archivebox.base_models.admin import BaseModelAdmin
 
-from api.models import APIToken
+from archivebox.api.models import APIToken
 
 
 class APITokenAdmin(BaseModelAdmin):

+ 2 - 2
archivebox/api/apps.py

@@ -4,9 +4,9 @@ from django.apps import AppConfig
 
 
 class APIConfig(AppConfig):
-    name = 'api'
+    name = 'archivebox.api'
 
 
 def register_admin(admin_site):
-    from api.admin import register_admin
+    from archivebox.api.admin import register_admin
     register_admin(admin_site)

+ 2 - 2
archivebox/api/migrations/0001_squashed.py

@@ -7,7 +7,7 @@ from django.conf import settings
 from django.db import migrations, models
 import django.db.models.deletion
 
-import api.models
+import archivebox.api.models
 
 
 class Migration(migrations.Migration):
@@ -38,7 +38,7 @@ class Migration(migrations.Migration):
                 ('created_by', models.ForeignKey(default=None, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)),
                 ('created_at', models.DateTimeField(auto_now_add=True, db_index=True)),
                 ('modified_at', models.DateTimeField(auto_now=True)),
-                ('token', models.CharField(default=api.models.generate_secret_token, max_length=32, unique=True)),
+                ('token', models.CharField(default=archivebox.api.models.generate_secret_token, max_length=32, unique=True)),
                 ('expires', models.DateTimeField(blank=True, null=True)),
             ],
             options={

+ 3 - 3
archivebox/api/migrations/0003_alter_apitoken_created_by_and_more.py

@@ -1,6 +1,6 @@
 # Generated by Django 6.0 on 2025-12-27 01:40
 
-import base_models.models
+import archivebox.core.models
 import django.db.models.deletion
 from django.conf import settings
 from django.db import migrations, models
@@ -17,11 +17,11 @@ class Migration(migrations.Migration):
         migrations.AlterField(
             model_name='apitoken',
             name='created_by',
-            field=models.ForeignKey(default=base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
+            field=models.ForeignKey(default=archivebox.core.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
         ),
         migrations.AlterField(
             model_name='outboundwebhook',
             name='created_by',
-            field=models.ForeignKey(default=base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
+            field=models.ForeignKey(default=archivebox.core.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
         ),
     ]

+ 3 - 1
archivebox/api/models.py

@@ -10,7 +10,7 @@ from django.utils import timezone
 from django_stubs_ext.db.models import TypedModelMeta
 from signal_webhooks.models import WebhookBase
 
-from base_models.models import get_or_create_system_user_pk
+from archivebox.base_models.models import get_or_create_system_user_pk
 
 
 def generate_secret_token() -> str:
@@ -26,6 +26,7 @@ class APIToken(models.Model):
     expires = models.DateTimeField(null=True, blank=True)
 
     class Meta(TypedModelMeta):
+        app_label = 'api'
         verbose_name = "API Key"
         verbose_name_plural = "API Keys"
 
@@ -47,6 +48,7 @@ class OutboundWebhook(WebhookBase):
     modified_at = models.DateTimeField(auto_now=True)
 
     class Meta(WebhookBase.Meta):
+        app_label = 'api'
         verbose_name = 'API Outbound Webhook'
 
     def __str__(self) -> str:

+ 1 - 1
archivebox/api/v1_api.py

@@ -15,7 +15,7 @@ from ninja import NinjaAPI, Swagger
 from archivebox.config import VERSION
 from archivebox.config.version import get_COMMIT_HASH
 
-from api.auth import API_AUTH_METHODS
+from archivebox.api.auth import API_AUTH_METHODS
 
 
 COMMIT_HASH = get_COMMIT_HASH() or 'unknown'

+ 2 - 2
archivebox/api/v1_auth.py

@@ -6,8 +6,8 @@ from ninja import Router, Schema
 from django.utils import timezone
 from datetime import timedelta
 
-from api.models import APIToken
-from api.auth import auth_using_token, auth_using_password, get_or_create_api_token
+from archivebox.api.models import APIToken
+from archivebox.api.auth import auth_using_token, auth_using_password, get_or_create_api_token
 
 
 router = Router(tags=['Authentication'], auth=None)

+ 1 - 0
archivebox/api/v1_cli.py

@@ -118,6 +118,7 @@ def cli_add(request, args: AddCommandSchema):
         plugins=args.plugins,
         parser=args.parser,
         bg=True,  # Always run in background for API calls
+        created_by_id=request.user.pk,
     )
 
     return {

+ 8 - 10
archivebox/api/v1_core.py

@@ -14,8 +14,8 @@ from ninja import Router, Schema, FilterSchema, Field, Query
 from ninja.pagination import paginate, PaginationBase
 from ninja.errors import HttpError
 
-from core.models import Snapshot, ArchiveResult, Tag
-from api.v1_crawls import CrawlSchema
+from archivebox.core.models import Snapshot, ArchiveResult, Tag
+from archivebox.api.v1_crawls import CrawlSchema
 
 
 router = Router(tags=['Core Models'])
@@ -80,12 +80,11 @@ class MinimalArchiveResultSchema(Schema):
 
     @staticmethod
     def resolve_created_by_id(obj):
-        return str(obj.created_by_id)
+        return str(obj.created_by.pk)
 
     @staticmethod
     def resolve_created_by_username(obj) -> str:
-        User = get_user_model()
-        return User.objects.filter(pk=obj.created_by_id).values_list('username', flat=True)[0]
+        return obj.created_by.username
 
 
 class ArchiveResultSchema(MinimalArchiveResultSchema):
@@ -166,12 +165,11 @@ class SnapshotSchema(Schema):
 
     @staticmethod
     def resolve_created_by_id(obj):
-        return str(obj.created_by_id)
+        return str(obj.created_by.pk)
 
     @staticmethod
     def resolve_created_by_username(obj):
-        User = get_user_model()
-        return User.objects.get(id=obj.created_by_id).username
+        return obj.created_by.username
 
     @staticmethod
     def resolve_tags(obj):
@@ -190,8 +188,8 @@ class SnapshotSchema(Schema):
 
 class SnapshotFilterSchema(FilterSchema):
     id: Optional[str] = Field(None, q=['id__icontains', 'timestamp__startswith'])
-    created_by_id: str = Field(None, q='created_by_id')
-    created_by_username: str = Field(None, q='created_by__username__icontains')
+    created_by_id: str = Field(None, q='crawl__created_by_id')
+    created_by_username: str = Field(None, q='crawl__created_by__username__icontains')
     created_at__gte: datetime = Field(None, q='created_at__gte')
     created_at__lt: datetime = Field(None, q='created_at__lt')
     created_at: datetime = Field(None, q='created_at')
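
Editor's note: the `q` argument on a django-ninja FilterSchema field is an ordinary Django ORM lookup path, so moving `created_by` from Snapshot onto Crawl only requires joining through the relation. A minimal standalone sketch of the pattern (hypothetical class name, same lookup as the diff above):

    from typing import Optional
    from ninja import FilterSchema, Field

    class ExampleSnapshotFilter(FilterSchema):
        # double-underscore syntax joins through the Snapshot -> Crawl FK,
        # since created_by now lives on Crawl rather than on Snapshot:
        created_by_username: Optional[str] = Field(
            None, q='crawl__created_by__username__icontains',
        )

    # inside a view, filters.filter(Snapshot.objects.all()) applies the
    # lookup only when the query parameter was actually provided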

+ 2 - 2
archivebox/api/v1_crawls.py

@@ -9,8 +9,8 @@ from django.contrib.auth import get_user_model
 
 from ninja import Router, Schema
 
-from core.models import Snapshot
-from crawls.models import Crawl
+from archivebox.core.models import Snapshot
+from archivebox.crawls.models import Crawl
 
 from .auth import API_AUTH_METHODS
 

+ 7 - 7
archivebox/api/v1_machine.py

@@ -7,7 +7,7 @@ from datetime import datetime
 from ninja import Router, Schema, FilterSchema, Field, Query
 from ninja.pagination import paginate
 
-from api.v1_core import CustomPagination
+from archivebox.api.v1_core import CustomPagination
 
 
 router = Router(tags=['Machine and Dependencies'])
@@ -102,14 +102,14 @@ class BinaryFilterSchema(FilterSchema):
 @paginate(CustomPagination)
 def get_machines(request, filters: MachineFilterSchema = Query(...)):
     """List all machines."""
-    from machine.models import Machine
+    from archivebox.machine.models import Machine
     return filters.filter(Machine.objects.all()).distinct()
 
 
 @router.get("/machine/{machine_id}", response=MachineSchema, url_name="get_machine")
 def get_machine(request, machine_id: str):
     """Get a specific machine by ID."""
-    from machine.models import Machine
+    from archivebox.machine.models import Machine
     from django.db.models import Q
     return Machine.objects.get(Q(id__startswith=machine_id) | Q(hostname__iexact=machine_id))
 
@@ -117,7 +117,7 @@ def get_machine(request, machine_id: str):
 @router.get("/machine/current", response=MachineSchema, url_name="get_current_machine")
 def get_current_machine(request):
     """Get the current machine."""
-    from machine.models import Machine
+    from archivebox.machine.models import Machine
     return Machine.current()
 
 
@@ -132,19 +132,19 @@ def get_current_machine(request):
 @paginate(CustomPagination)
 def get_binaries(request, filters: BinaryFilterSchema = Query(...)):
     """List all binaries."""
-    from machine.models import Binary
+    from archivebox.machine.models import Binary
     return filters.filter(Binary.objects.all().select_related('machine', 'dependency')).distinct()
 
 
 @router.get("/binary/{binary_id}", response=BinarySchema, url_name="get_binary")
 def get_binary(request, binary_id: str):
     """Get a specific binary by ID."""
-    from machine.models import Binary
+    from archivebox.machine.models import Binary
     return Binary.objects.select_related('machine', 'dependency').get(id__startswith=binary_id)
 
 
 @router.get("/binary/by-name/{name}", response=List[BinarySchema], url_name="get_binaries_by_name")
 def get_binaries_by_name(request, name: str):
     """Get all binaries with the given name."""
-    from machine.models import Binary
+    from archivebox.machine.models import Binary
     return list(Binary.objects.filter(name__iexact=name).select_related('machine', 'dependency'))

+ 6 - 0
archivebox/base_models/models.py

@@ -12,6 +12,7 @@ from pathlib import Path
 
 from django.contrib import admin
 from django.db import models
+from django.db.models import F
 from django.utils import timezone
 from django.contrib.auth import get_user_model
 from django.urls import reverse_lazy
@@ -110,6 +111,11 @@ class ModelWithHealthStats(models.Model):
         total = max(self.num_uses_failed + self.num_uses_succeeded, 1)
         return round((self.num_uses_succeeded / total) * 100)
 
+    def increment_health_stats(self, success: bool):
+        """Atomically increment success or failure counter using F() expression."""
+        field = 'num_uses_succeeded' if success else 'num_uses_failed'
+        type(self).objects.filter(pk=self.pk).update(**{field: F(field) + 1})
+
 
 class ModelWithConfig(models.Model):
     """Mixin for models with a JSON config field."""

+ 3 - 3
archivebox/cli/archivebox_add.py

@@ -19,7 +19,7 @@ from archivebox.config.permissions import USER, HOSTNAME
 
 
 if TYPE_CHECKING:
-    from core.models import Snapshot
+    from archivebox.core.models import Snapshot
 
 
 @enforce_types
@@ -53,8 +53,8 @@ def add(urls: str | list[str],
     assert depth in (0, 1, 2, 3, 4), 'Depth must be 0-4'
 
     # import models once django is set up
-    from core.models import Snapshot
-    from crawls.models import Crawl
+    from archivebox.core.models import Snapshot
+    from archivebox.crawls.models import Crawl
     from archivebox.base_models.models import get_or_create_system_user_pk
     from workers.orchestrator import Orchestrator
 

+ 24 - 4
archivebox/cli/archivebox_config.py

@@ -66,18 +66,38 @@ def config(*keys,
                 raise SystemExit(1)
         else:
             matching_config = FLAT_CONFIG
-        
+
+        # Display core config sections
         for config_section in CONFIGS.values():
             if hasattr(config_section, 'toml_section_header'):
                 print(f'[grey53]\\[{config_section.toml_section_header}][/grey53]')
             else:
                 print('[grey53]\\[CONSTANTS]                                        # (read-only)[/grey53]')
-            
+
             kv_in_section = {key: val for key, val in dict(config_section).items() if key in matching_config}
             print(benedict(kv_in_section).to_toml(encoder=CustomTOMLEncoder()).strip().replace('\n\n', '\n'))
             print('[grey53]################################################################[/grey53]')
-            
-        
+
+        # Display plugin config section
+        from archivebox.hooks import discover_plugin_configs
+
+        plugin_configs = discover_plugin_configs()
+        plugin_keys = {}
+
+        # Collect all plugin config keys
+        for plugin_name, schema in plugin_configs.items():
+            if 'properties' not in schema:
+                continue
+            for key in schema['properties'].keys():
+                if key in matching_config:
+                    plugin_keys[key] = matching_config[key]
+
+        # Display all plugin config in single [PLUGINS] section
+        if plugin_keys:
+            print(f'[grey53]\\[PLUGINS][/grey53]')
+            print(benedict(plugin_keys).to_toml(encoder=CustomTOMLEncoder()).strip().replace('\n\n', '\n'))
+            print('[grey53]################################################################[/grey53]')
+
         raise SystemExit(not matching_config)
 
     elif set:
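
Editor's note: the plugin keys are collected out of each plugin's JSON-Schema `properties` map and rendered as one flat [PLUGINS] TOML section. A rough standalone sketch of that flow (hypothetical plugin data, plain to_toml() instead of CustomTOMLEncoder, assuming python-benedict with TOML support installed):

    from benedict import benedict

    plugin_configs = {   # hypothetical discover_plugin_configs() output
        'wget':  {'properties': {'WGET_ARGS': {}}},
        'media': {'properties': {'MEDIA_MAX_SIZE': {}}},
    }
    matching_config = {'WGET_ARGS': ['--no-verbose'], 'MEDIA_MAX_SIZE': '750m'}

    # keep only declared plugin keys that are present in the resolved config
    plugin_keys = {
        key: matching_config[key]
        for schema in plugin_configs.values()
        for key in schema.get('properties', {})
        if key in matching_config
    }
    print('[PLUGINS]')
    print(benedict(plugin_keys).to_toml().strip())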

+ 9 - 8
archivebox/cli/archivebox_crawl.py

@@ -72,11 +72,11 @@ def discover_outlinks(
 
     from archivebox.misc.jsonl import (
         read_args_or_stdin, write_record,
-        TYPE_SNAPSHOT, get_or_create_snapshot
+        TYPE_SNAPSHOT
     )
     from archivebox.base_models.models import get_or_create_system_user_pk
-    from core.models import Snapshot, ArchiveResult
-    from crawls.models import Crawl
+    from archivebox.core.models import Snapshot, ArchiveResult
+    from archivebox.crawls.models import Crawl
     from archivebox.config import CONSTANTS
     from workers.orchestrator import Orchestrator
 
@@ -130,8 +130,10 @@ def discover_outlinks(
                 record['crawl_id'] = str(crawl.id)
                 record['depth'] = record.get('depth', 0)
 
-                snapshot = get_or_create_snapshot(record, created_by_id=created_by_id)
-                snapshot_ids.append(str(snapshot.id))
+                overrides = {'created_by_id': created_by_id}
+                snapshot = Snapshot.from_jsonl(record, overrides=overrides)
+                if snapshot:
+                    snapshot_ids.append(str(snapshot.id))
 
             except Exception as e:
                 rprint(f'[red]Error creating snapshot: {e}[/red]', file=sys.stderr)
@@ -162,7 +164,6 @@ def discover_outlinks(
                     defaults={
                         'status': ArchiveResult.StatusChoices.QUEUED,
                         'retry_at': timezone.now(),
-                        'created_by_id': snapshot.created_by_id,
                     }
                 )
             else:
@@ -229,7 +230,7 @@ def process_crawl_by_id(crawl_id: str) -> int:
     - Transition from started -> sealed (when all snapshots done)
     """
     from rich import print as rprint
-    from crawls.models import Crawl
+    from archivebox.crawls.models import Crawl
 
     try:
         crawl = Crawl.objects.get(id=crawl_id)
@@ -256,7 +257,7 @@ def is_crawl_id(value: str) -> bool:
     if not uuid_pattern.match(value):
         return False
     # Verify it's actually a Crawl (not a Snapshot or other object)
-    from crawls.models import Crawl
+    from archivebox.crawls.models import Crawl
     return Crawl.objects.filter(id=value).exists()
 
 

+ 3 - 4
archivebox/cli/archivebox_extract.py

@@ -43,7 +43,7 @@ def process_archiveresult_by_id(archiveresult_id: str) -> int:
     Triggers the ArchiveResult's state machine tick() to run the extractor plugin.
     """
     from rich import print as rprint
-    from core.models import ArchiveResult
+    from archivebox.core.models import ArchiveResult
 
     try:
         archiveresult = ArchiveResult.objects.get(id=archiveresult_id)
@@ -95,7 +95,7 @@ def run_plugins(
         read_args_or_stdin, write_record, archiveresult_to_jsonl,
         TYPE_SNAPSHOT, TYPE_ARCHIVERESULT
     )
-    from core.models import Snapshot, ArchiveResult
+    from archivebox.core.models import Snapshot, ArchiveResult
     from workers.orchestrator import Orchestrator
 
     is_tty = sys.stdout.isatty()
@@ -155,7 +155,6 @@ def run_plugins(
                 defaults={
                     'status': ArchiveResult.StatusChoices.QUEUED,
                     'retry_at': timezone.now(),
-                    'created_by_id': snapshot.created_by_id,
                 }
             )
             if not created and result.status in [ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED]:
@@ -218,7 +217,7 @@ def is_archiveresult_id(value: str) -> bool:
     if not uuid_pattern.match(value):
         return False
     # Verify it's actually an ArchiveResult (not a Snapshot or other object)
-    from core.models import ArchiveResult
+    from archivebox.core.models import ArchiveResult
     return ArchiveResult.objects.filter(id=value).exists()
 
 

+ 1 - 1
archivebox/cli/archivebox_init.py

@@ -95,7 +95,7 @@ def init(force: bool=False, quick: bool=False, install: bool=False, setup: bool=
     print()
     print('[dodger_blue3][*] Checking links from indexes and archive folders (safe to Ctrl+C)...[/dodger_blue3]')
 
-    from core.models import Snapshot
+    from archivebox.core.models import Snapshot
 
     all_links = Snapshot.objects.none()
     pending_links: dict[str, SnapshotDict] = {}

+ 2 - 2
archivebox/cli/archivebox_install.py

@@ -42,7 +42,7 @@ def install(dry_run: bool=False) -> None:
     setup_django()
 
     from django.utils import timezone
-    from crawls.models import Crawl
+    from archivebox.crawls.models import Crawl
     from archivebox.base_models.models import get_or_create_system_user_pk
 
     # Create a crawl for dependency detection
@@ -70,7 +70,7 @@ def install(dry_run: bool=False) -> None:
     print(f'[+] Crawl status: {crawl.status}, retry_at: {crawl.retry_at}')
 
     # Verify the crawl is in the queue
-    from crawls.models import Crawl as CrawlModel
+    from archivebox.crawls.models import Crawl as CrawlModel
     queued_crawls = CrawlModel.objects.filter(
         retry_at__lte=timezone.now()
     ).exclude(

+ 1 - 1
archivebox/cli/archivebox_remove.py

@@ -71,7 +71,7 @@ def remove(filter_patterns: Iterable[str]=(),
     to_remove = snapshots.count()
 
     from archivebox.search import flush_search_index
-    from core.models import Snapshot
+    from archivebox.core.models import Snapshot
 
     flush_search_index(snapshots=snapshots)
     snapshots.delete()

+ 2 - 2
archivebox/cli/archivebox_search.py

@@ -36,7 +36,7 @@ def get_snapshots(snapshots: Optional[QuerySet]=None,
                   before: Optional[float]=None,
                   out_dir: Path=DATA_DIR) -> QuerySet:
     """Filter and return Snapshots matching the given criteria."""
-    from core.models import Snapshot
+    from archivebox.core.models import Snapshot
 
     if snapshots:
         result = snapshots
@@ -68,7 +68,7 @@ def search(filter_patterns: list[str] | None=None,
            csv: str | None=None,
            with_headers: bool=False):
     """List, filter, and export information about archive entries"""
-    from core.models import Snapshot
+    from archivebox.core.models import Snapshot
 
     if with_headers and not (json or html or csv):
         stderr('[X] --with-headers requires --json, --html or --csv\n', color='red')

+ 8 - 6
archivebox/cli/archivebox_snapshot.py

@@ -46,7 +46,7 @@ def process_snapshot_by_id(snapshot_id: str) -> int:
     - Transition from started -> sealed (when all ArchiveResults done)
     """
     from rich import print as rprint
-    from core.models import Snapshot
+    from archivebox.core.models import Snapshot
 
     try:
         snapshot = Snapshot.objects.get(id=snapshot_id)
@@ -88,11 +88,11 @@ def create_snapshots(
 
     from archivebox.misc.jsonl import (
         read_args_or_stdin, write_record, snapshot_to_jsonl,
-        TYPE_SNAPSHOT, TYPE_TAG, get_or_create_snapshot
+        TYPE_SNAPSHOT, TYPE_TAG
     )
     from archivebox.base_models.models import get_or_create_system_user_pk
-    from core.models import Snapshot
-    from crawls.models import Crawl
+    from archivebox.core.models import Snapshot
+    from archivebox.crawls.models import Crawl
     from archivebox.config import CONSTANTS
 
     created_by_id = created_by_id or get_or_create_system_user_pk()
@@ -137,8 +137,10 @@ def create_snapshots(
                 record['tags'] = tag
 
             # Get or create the snapshot
-            snapshot = get_or_create_snapshot(record, created_by_id=created_by_id)
-            created_snapshots.append(snapshot)
+            overrides = {'created_by_id': created_by_id}
+            snapshot = Snapshot.from_jsonl(record, overrides=overrides)
+            if snapshot:
+                created_snapshots.append(snapshot)
 
             # Output JSONL record (only when piped)
             if not is_tty:
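
Editor's note: both `archivebox crawl` and `archivebox snapshot` now funnel record dicts through the same `Snapshot.from_jsonl()` classmethod instead of the removed `get_or_create_snapshot()` helper. A hedged sketch of the call shape as used in the hunks above (from_jsonl returns None for records it declines to ingest):

    record = {'url': 'https://example.com', 'depth': 0}  # as parsed from argv/stdin JSONL
    overrides = {'created_by_id': created_by_id}         # e.g. get_or_create_system_user_pk()

    snapshot = Snapshot.from_jsonl(record, overrides=overrides)
    if snapshot:
        print(snapshot.id, snapshot.url)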

+ 1 - 1
archivebox/cli/archivebox_status.py

@@ -21,7 +21,7 @@ def status(out_dir: Path=DATA_DIR) -> None:
 
     from django.contrib.auth import get_user_model
     from archivebox.misc.db import get_admins
-    from core.models import Snapshot
+    from archivebox.core.models import Snapshot
     User = get_user_model()
 
     print('[green]\\[*] Scanning archive main index...[/green]')

+ 4 - 4
archivebox/cli/archivebox_update.py

@@ -36,7 +36,7 @@ def update(filter_patterns: Iterable[str] = (),
     from archivebox.config.django import setup_django
     setup_django()
 
-    from core.models import Snapshot
+    from archivebox.core.models import Snapshot
     from django.utils import timezone
 
     while True:
@@ -83,7 +83,7 @@ def import_orphans_from_archive(resume_from: str = None, batch_size: int = 100)
     Skip symlinks (already migrated).
     Create DB records and trigger migration on save().
     """
-    from core.models import Snapshot
+    from archivebox.core.models import Snapshot
     from archivebox.config import CONSTANTS
     from django.db import transaction
 
@@ -151,7 +151,7 @@ def process_all_db_snapshots(batch_size: int = 100) -> dict:
     Process all snapshots in DB.
     Reconcile index.json and queue for archiving.
     """
-    from core.models import Snapshot
+    from archivebox.core.models import Snapshot
     from django.db import transaction
     from django.utils import timezone
 
@@ -189,7 +189,7 @@ def process_filtered_snapshots(
     batch_size: int
 ) -> dict:
     """Process snapshots matching filters (DB query only)."""
-    from core.models import Snapshot
+    from archivebox.core.models import Snapshot
     from django.db import transaction
     from django.utils import timezone
     from datetime import datetime

+ 1 - 1
archivebox/cli/archivebox_version.py

@@ -107,7 +107,7 @@ def version(quiet: bool=False,
     from archivebox.config.django import setup_django
     setup_django()
 
-    from machine.models import Machine, Binary
+    from archivebox.machine.models import Machine, Binary
 
     machine = Machine.current()
 

+ 11 - 9
archivebox/cli/tests_piping.py

@@ -542,10 +542,10 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
         Test: archivebox snapshot URL
         Should create a Snapshot and output JSONL when piped.
         """
-        from core.models import Snapshot
+        from archivebox.core.models import Snapshot
         from archivebox.misc.jsonl import (
             read_args_or_stdin, write_record, snapshot_to_jsonl,
-            TYPE_SNAPSHOT, get_or_create_snapshot
+            TYPE_SNAPSHOT
         )
         from archivebox.base_models.models import get_or_create_system_user_pk
 
@@ -559,7 +559,8 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
         self.assertEqual(records[0]['url'], url)
 
         # Create snapshot
-        snapshot = get_or_create_snapshot(records[0], created_by_id=created_by_id)
+        overrides = {'created_by_id': created_by_id}
+        snapshot = Snapshot.from_jsonl(records[0], overrides=overrides)
 
         self.assertIsNotNone(snapshot.id)
         self.assertEqual(snapshot.url, url)
@@ -575,9 +576,9 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
         Test: archivebox snapshot URL | archivebox extract
         Extract should accept JSONL output from snapshot command.
         """
-        from core.models import Snapshot, ArchiveResult
+        from archivebox.core.models import Snapshot, ArchiveResult
         from archivebox.misc.jsonl import (
-            snapshot_to_jsonl, read_args_or_stdin, get_or_create_snapshot,
+            snapshot_to_jsonl, read_args_or_stdin,
             TYPE_SNAPSHOT
         )
         from archivebox.base_models.models import get_or_create_system_user_pk
@@ -586,7 +587,8 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
 
         # Step 1: Create snapshot (simulating 'archivebox snapshot')
         url = 'https://test-extract-1.example.com'
-        snapshot = get_or_create_snapshot({'url': url}, created_by_id=created_by_id)
+        overrides = {'created_by_id': created_by_id}
+        snapshot = Snapshot.from_jsonl({'url': url}, overrides=overrides)
         snapshot_output = snapshot_to_jsonl(snapshot)
 
         # Step 2: Parse snapshot output as extract input
@@ -648,7 +650,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
 
         This is equivalent to: archivebox add URL
         """
-        from core.models import Snapshot
+        from archivebox.core.models import Snapshot
         from archivebox.misc.jsonl import (
             get_or_create_snapshot, snapshot_to_jsonl, read_args_or_stdin,
             TYPE_SNAPSHOT
@@ -682,7 +684,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
 
         This is equivalent to: archivebox add --depth=1 URL
         """
-        from core.models import Snapshot
+        from archivebox.core.models import Snapshot
         from archivebox.misc.jsonl import (
             get_or_create_snapshot, snapshot_to_jsonl, read_args_or_stdin,
             TYPE_SNAPSHOT
@@ -772,7 +774,7 @@ class TestDepthWorkflows(unittest.TestCase):
 
         Depth 0: Only archive the specified URL, no crawling.
         """
-        from core.models import Snapshot
+        from archivebox.core.models import Snapshot
         from archivebox.misc.jsonl import get_or_create_snapshot
         from archivebox.base_models.models import get_or_create_system_user_pk
 

+ 18 - 154
archivebox/config/__init__.py

@@ -35,177 +35,41 @@ def _get_config():
 # These are recalculated each time the module attribute is accessed
 
 def __getattr__(name: str):
-    """Module-level __getattr__ for lazy config loading."""
-    
-    # Timeout settings
+    """
+    Module-level __getattr__ for lazy config loading.
+
+    Only provides backwards compatibility for GENERIC/SHARED config.
+    Plugin-specific config (binaries, args, toggles) should come from plugin config.json files.
+    """
+
+    # Generic timeout settings (used by multiple plugins)
     if name == 'TIMEOUT':
         cfg, _ = _get_config()
         return cfg.TIMEOUT
-    if name == 'MEDIA_TIMEOUT':
-        cfg, _ = _get_config()
-        return cfg.MEDIA_TIMEOUT
-    
-    # SSL/Security settings
+
+    # Generic SSL/Security settings (used by multiple plugins)
     if name == 'CHECK_SSL_VALIDITY':
         cfg, _ = _get_config()
         return cfg.CHECK_SSL_VALIDITY
-    
-    # Storage settings  
+
+    # Generic storage settings (used by multiple plugins)
     if name == 'RESTRICT_FILE_NAMES':
         _, storage = _get_config()
         return storage.RESTRICT_FILE_NAMES
-    
-    # User agent / cookies
+
+    # Generic user agent / cookies (used by multiple plugins)
     if name == 'COOKIES_FILE':
         cfg, _ = _get_config()
         return cfg.COOKIES_FILE
     if name == 'USER_AGENT':
         cfg, _ = _get_config()
         return cfg.USER_AGENT
-    if name == 'CURL_USER_AGENT':
-        cfg, _ = _get_config()
-        return cfg.USER_AGENT
-    if name == 'WGET_USER_AGENT':
-        cfg, _ = _get_config()
-        return cfg.USER_AGENT
-    if name == 'CHROME_USER_AGENT':
-        cfg, _ = _get_config()
-        return cfg.USER_AGENT
-    
-    # Archive method toggles (SAVE_*)
-    if name == 'SAVE_TITLE':
-        return True
-    if name == 'SAVE_FAVICON':
-        return True
-    if name == 'SAVE_WGET':
-        return True
-    if name == 'SAVE_WARC':
-        return True
-    if name == 'SAVE_WGET_REQUISITES':
-        return True
-    if name == 'SAVE_SINGLEFILE':
-        return True
-    if name == 'SAVE_READABILITY':
-        return True
-    if name == 'SAVE_MERCURY':
-        return True
-    if name == 'SAVE_HTMLTOTEXT':
-        return True
-    if name == 'SAVE_PDF':
-        return True
-    if name == 'SAVE_SCREENSHOT':
-        return True
-    if name == 'SAVE_DOM':
-        return True
-    if name == 'SAVE_HEADERS':
-        return True
-    if name == 'SAVE_GIT':
-        return True
-    if name == 'SAVE_MEDIA':
-        return True
-    if name == 'SAVE_ARCHIVE_DOT_ORG':
-        return True
-    
-    # Extractor-specific settings
+
+    # Generic resolution settings (used by multiple plugins)
     if name == 'RESOLUTION':
         cfg, _ = _get_config()
         return cfg.RESOLUTION
-    if name == 'GIT_DOMAINS':
-        return 'github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht'
-    if name == 'MEDIA_MAX_SIZE':
-        cfg, _ = _get_config()
-        return cfg.MEDIA_MAX_SIZE
-    if name == 'FAVICON_PROVIDER':
-        return 'https://www.google.com/s2/favicons?domain={}'
-    
-    # Binary paths (use shutil.which for detection)
-    if name == 'CURL_BINARY':
-        return shutil.which('curl') or 'curl'
-    if name == 'WGET_BINARY':
-        return shutil.which('wget') or 'wget'
-    if name == 'GIT_BINARY':
-        return shutil.which('git') or 'git'
-    if name == 'YOUTUBEDL_BINARY':
-        return shutil.which('yt-dlp') or shutil.which('youtube-dl') or 'yt-dlp'
-    if name == 'CHROME_BINARY':
-        for chrome in ['chromium', 'chromium-browser', 'google-chrome', 'google-chrome-stable', 'chrome']:
-            path = shutil.which(chrome)
-            if path:
-                return path
-        return 'chromium'
-    if name == 'NODE_BINARY':
-        return shutil.which('node') or 'node'
-    if name == 'SINGLEFILE_BINARY':
-        return shutil.which('single-file') or shutil.which('singlefile') or 'single-file'
-    if name == 'READABILITY_BINARY':
-        return shutil.which('readability-extractor') or 'readability-extractor'
-    if name == 'MERCURY_BINARY':
-        return shutil.which('mercury-parser') or shutil.which('postlight-parser') or 'mercury-parser'
-    
-    # Binary versions (return placeholder, actual version detection happens elsewhere)
-    if name == 'CURL_VERSION':
-        return 'curl'
-    if name == 'WGET_VERSION':
-        return 'wget'
-    if name == 'GIT_VERSION':
-        return 'git'
-    if name == 'YOUTUBEDL_VERSION':
-        return 'yt-dlp'
-    if name == 'CHROME_VERSION':
-        return 'chromium'
-    if name == 'SINGLEFILE_VERSION':
-        return 'singlefile'
-    if name == 'READABILITY_VERSION':
-        return 'readability'
-    if name == 'MERCURY_VERSION':
-        return 'mercury'
-    
-    # Binary arguments
-    if name == 'CURL_ARGS':
-        return ['--silent', '--location', '--compressed']
-    if name == 'WGET_ARGS':
-        return [
-            '--no-verbose',
-            '--adjust-extension',
-            '--convert-links',
-            '--force-directories',
-            '--backup-converted',
-            '--span-hosts',
-            '--no-parent',
-            '-e', 'robots=off',
-        ]
-    if name == 'GIT_ARGS':
-        return ['--recursive']
-    if name == 'YOUTUBEDL_ARGS':
-        cfg, _ = _get_config()
-        return [
-            '--write-description',
-            '--write-info-json',
-            '--write-annotations',
-            '--write-thumbnail',
-            '--no-call-home',
-            '--write-sub',
-            '--write-auto-subs',
-            '--convert-subs=srt',
-            '--yes-playlist',
-            '--continue',
-            '--no-abort-on-error',
-            '--ignore-errors',
-            '--geo-bypass',
-            '--add-metadata',
-            f'--format=(bv*+ba/b)[filesize<={cfg.MEDIA_MAX_SIZE}][filesize_approx<=?{cfg.MEDIA_MAX_SIZE}]/(bv*+ba/b)',
-        ]
-    if name == 'SINGLEFILE_ARGS':
-        return None  # Uses defaults
-    if name == 'CHROME_ARGS':
-        return []
-    
-    # Other settings
-    if name == 'WGET_AUTO_COMPRESSION':
-        return True
-    if name == 'DEPENDENCIES':
-        return {}  # Legacy, not used anymore
-    
+
     # Allowlist/Denylist patterns (compiled regexes)
     if name == 'SAVE_ALLOWLIST_PTN':
         cfg, _ = _get_config()
@@ -213,7 +77,7 @@ def __getattr__(name: str):
     if name == 'SAVE_DENYLIST_PTN':
         cfg, _ = _get_config()
         return cfg.SAVE_DENYLIST_PTNS
-    
+
     raise AttributeError(f"module 'archivebox.config' has no attribute '{name}'")
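
Editor's note: this slimmed-down shim works via PEP 562 module-level `__getattr__`: any name not found in the module's dict is routed through the function above, so config values are computed on first access instead of at import time. A self-contained demo of the mechanism (toy module, not ArchiveBox code):

    import sys, types

    lazy = types.ModuleType('lazyconf')

    def _module_getattr(name):
        if name == 'TIMEOUT':
            return 60  # in archivebox.config this would call _get_config() lazily
        raise AttributeError(f"module 'lazyconf' has no attribute '{name}'")

    lazy.__getattr__ = _module_getattr  # PEP 562: consulted on failed lookups
    sys.modules['lazyconf'] = lazy

    import lazyconf
    print(lazyconf.TIMEOUT)  # 60, computed on access rather than at import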
 
 

+ 31 - 2
archivebox/config/collection.py

@@ -111,6 +111,24 @@ def load_config_file() -> Optional[benedict]:
     return None
 
 
+class PluginConfigSection:
+    """Pseudo-section for all plugin config keys written to [PLUGINS] section in ArchiveBox.conf"""
+    toml_section_header = "PLUGINS"
+
+    def __init__(self, key: str):
+        self._key = key
+
+    def __getattr__(self, name: str) -> Any:
+        # Allow hasattr checks to pass for the key
+        if name == self._key:
+            return None
+        raise AttributeError(f"PluginConfigSection has no attribute '{name}'")
+
+    def update_in_place(self, warn: bool = True, persist: bool = False, **kwargs):
+        """No-op update since plugins read config dynamically via get_config()."""
+        pass
+
+
 def section_for_key(key: str) -> Any:
     """Find the config section containing a given key."""
     from archivebox.config.common import (
@@ -121,11 +139,22 @@ def section_for_key(key: str) -> Any:
         ARCHIVING_CONFIG,
         SEARCH_BACKEND_CONFIG,
     )
-    
-    for section in [SHELL_CONFIG, STORAGE_CONFIG, GENERAL_CONFIG, 
+
+    # First check core config sections
+    for section in [SHELL_CONFIG, STORAGE_CONFIG, GENERAL_CONFIG,
                     SERVER_CONFIG, ARCHIVING_CONFIG, SEARCH_BACKEND_CONFIG]:
         if hasattr(section, key):
             return section
+
+    # Check if this is a plugin config key
+    from archivebox.hooks import discover_plugin_configs
+
+    plugin_configs = discover_plugin_configs()
+    for plugin_name, schema in plugin_configs.items():
+        if 'properties' in schema and key in schema['properties']:
+            # All plugin config goes to [PLUGINS] section
+            return PluginConfigSection(key)
+
     raise ValueError(f'No config section found for key: {key}')
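
Editor's note: a hedged usage sketch of the new fallback path (the plugin key name is an assumption): a key declared only in a plugin's config.json resolves to a PluginConfigSection duck-typed like a real config section, so callers such as `archivebox config --set` keep working unchanged:

    section = section_for_key('TIMEOUT')        # -> ARCHIVING_CONFIG (a real config section)
    section = section_for_key('WGET_ARGS')      # -> PluginConfigSection('WGET_ARGS'), hypothetical plugin key
    print(section.toml_section_header)          # 'PLUGINS'
    section.update_in_place(WGET_ARGS=['-q'])   # no-op: plugins re-read config via get_config()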
 
 

+ 0 - 12
archivebox/config/common.py

@@ -123,9 +123,7 @@ class ArchivingConfig(BaseConfigSet):
     OVERWRITE: bool = Field(default=False)
 
     TIMEOUT: int = Field(default=60)
-    MEDIA_TIMEOUT: int = Field(default=3600)
 
-    MEDIA_MAX_SIZE: str = Field(default="750m")
     RESOLUTION: str = Field(default="1440,2000")
     CHECK_SSL_VALIDITY: bool = Field(default=True)
     USER_AGENT: str = Field(
@@ -141,15 +139,6 @@ class ArchivingConfig(BaseConfigSet):
 
     DEFAULT_PERSONA: str = Field(default="Default")
 
-    # GIT_DOMAINS: str                    = Field(default='github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht')
-    # WGET_USER_AGENT: str                = Field(default=lambda c: c['USER_AGENT'] + ' wget/{WGET_VERSION}')
-    # CURL_USER_AGENT: str                = Field(default=lambda c: c['USER_AGENT'] + ' curl/{CURL_VERSION}')
-    # CHROME_USER_AGENT: str              = Field(default=lambda c: c['USER_AGENT'])
-    # CHROME_USER_DATA_DIR: str | None    = Field(default=None)
-    # CHROME_TIMEOUT: int                 = Field(default=0)
-    # CHROME_HEADLESS: bool               = Field(default=True)
-    # CHROME_SANDBOX: bool                = Field(default=lambda: not SHELL_CONFIG.IN_DOCKER)
-
     def validate(self):
         if int(self.TIMEOUT) < 5:
             print(f"[red][!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={self.TIMEOUT} seconds)[/red]", file=sys.stderr)
@@ -215,7 +204,6 @@ class SearchBackendConfig(BaseConfigSet):
 
     SEARCH_BACKEND_ENGINE: str = Field(default="ripgrep")
     SEARCH_PROCESS_HTML: bool = Field(default=True)
-    SEARCH_BACKEND_TIMEOUT: int = Field(default=10)
 
 
 SEARCH_BACKEND_CONFIG = SearchBackendConfig()

+ 1 - 1
archivebox/config/configset.py

@@ -174,7 +174,7 @@ def get_config(
     config.update(dict(ARCHIVING_CONFIG))
     config.update(dict(SEARCH_BACKEND_CONFIG))
 
-    # Load from config file
+    # Load values from the ArchiveBox.conf config file
     config_file = CONSTANTS.CONFIG_FILE
     if config_file.exists():
         file_config = BaseConfigSet.load_from_file(config_file)

+ 1 - 1
archivebox/config/views.py

@@ -17,7 +17,7 @@ from admin_data_views.utils import render_with_table_view, render_with_item_view
 from archivebox.config import CONSTANTS
 from archivebox.misc.util import parse_date
 
-from machine.models import Binary
+from archivebox.machine.models import Binary
 
 
 # Common binaries to check for

+ 1 - 1
archivebox/core/__init__.py

@@ -4,7 +4,7 @@ __order__ = 100
 
 def register_admin(admin_site):
     """Register the core.models views (Snapshot, ArchiveResult, Tag, etc.) with the admin site"""
-    from core.admin import register_admin as do_register
+    from archivebox.core.admin import register_admin as do_register
     do_register(admin_site)
 
 

+ 5 - 5
archivebox/core/admin.py

@@ -3,11 +3,11 @@ __package__ = 'archivebox.core'
 from django.contrib.auth import get_user_model
 
 
-from core.models import Snapshot, ArchiveResult, Tag
-from core.admin_tags import TagAdmin
-from core.admin_snapshots import SnapshotAdmin
-from core.admin_archiveresults import ArchiveResultAdmin
-from core.admin_users import UserAdmin
+from archivebox.core.models import Snapshot, ArchiveResult, Tag
+from archivebox.core.admin_tags import TagAdmin
+from archivebox.core.admin_snapshots import SnapshotAdmin
+from archivebox.core.admin_archiveresults import ArchiveResultAdmin
+from archivebox.core.admin_users import UserAdmin
 
 
 def register_admin(admin_site):

+ 5 - 11
archivebox/core/admin_archiveresults.py

@@ -16,7 +16,7 @@ from archivebox.base_models.admin import BaseModelAdmin
 from archivebox.hooks import get_plugin_icon
 
 
-from core.models import ArchiveResult, Snapshot
+from archivebox.core.models import ArchiveResult, Snapshot
 
 
 def render_archiveresults_list(archiveresults_qs, limit=50):
@@ -187,7 +187,7 @@ class ArchiveResultInline(admin.TabularInline):
     extra = 0
     sort_fields = ('end_ts', 'plugin', 'output_str', 'status', 'cmd_version')
     readonly_fields = ('id', 'result_id', 'completed', 'command', 'version')
-    fields = ('start_ts', 'end_ts', *readonly_fields, 'plugin', 'cmd', 'cmd_version', 'pwd', 'created_by', 'status', 'retry_at', 'output_str')
+    fields = ('start_ts', 'end_ts', *readonly_fields, 'plugin', 'cmd', 'cmd_version', 'pwd', 'status', 'retry_at', 'output_str')
     # exclude = ('id',)
     ordering = ('end_ts',)
     show_change_link = True
@@ -229,17 +229,15 @@ class ArchiveResultInline(admin.TabularInline):
         formset.form.base_fields['end_ts'].initial = timezone.now()
         formset.form.base_fields['cmd_version'].initial = '-'
         formset.form.base_fields['pwd'].initial = str(snapshot.output_dir)
-        formset.form.base_fields['created_by'].initial = request.user
         formset.form.base_fields['cmd'].initial = '["-"]'
         formset.form.base_fields['output_str'].initial = 'Manually recorded cmd output...'
-        
+
         if obj is not None:
             # hidden values for existing entries and new entries
             formset.form.base_fields['start_ts'].widget = formset.form.base_fields['start_ts'].hidden_widget()
             formset.form.base_fields['end_ts'].widget = formset.form.base_fields['end_ts'].hidden_widget()
             formset.form.base_fields['cmd'].widget = formset.form.base_fields['cmd'].hidden_widget()
             formset.form.base_fields['pwd'].widget = formset.form.base_fields['pwd'].hidden_widget()
-            formset.form.base_fields['created_by'].widget = formset.form.base_fields['created_by'].hidden_widget()
             formset.form.base_fields['cmd_version'].widget = formset.form.base_fields['cmd_version'].hidden_widget()
         return formset
     
@@ -252,8 +250,8 @@ class ArchiveResultInline(admin.TabularInline):
 
 
 class ArchiveResultAdmin(BaseModelAdmin):
-    list_display = ('id', 'created_by', 'created_at', 'snapshot_info', 'tags_str', 'status', 'plugin_with_icon', 'cmd_str', 'output_str')
-    sort_fields = ('id', 'created_by', 'created_at', 'plugin', 'status')
+    list_display = ('id', 'created_at', 'snapshot_info', 'tags_str', 'status', 'plugin_with_icon', 'cmd_str', 'output_str')
+    sort_fields = ('id', 'created_at', 'plugin', 'status')
     readonly_fields = ('cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'plugin_with_icon', 'iface')
     search_fields = ('id', 'snapshot__url', 'plugin', 'output_str', 'cmd_version', 'cmd', 'snapshot__timestamp')
     autocomplete_fields = ['snapshot']
@@ -279,10 +277,6 @@ class ArchiveResultAdmin(BaseModelAdmin):
             'fields': ('output_str', 'output_json', 'output_files', 'output_size', 'output_mimetypes', 'output_summary'),
             'classes': ('card', 'wide'),
         }),
-        ('Metadata', {
-            'fields': ('created_by',),
-            'classes': ('card',),
-        }),
     )
 
     list_filter = ('status', 'plugin', 'start_ts', 'cmd_version')

+ 5 - 5
archivebox/core/admin_site.py

@@ -38,11 +38,11 @@ def register_admin_site():
 
     # Register admin views for each app
     # (Previously handled by ABX plugin system, now called directly)
-    from core.admin import register_admin as register_core_admin
-    from crawls.admin import register_admin as register_crawls_admin
-    from api.admin import register_admin as register_api_admin
-    from machine.admin import register_admin as register_machine_admin
-    from workers.admin import register_admin as register_workers_admin
+    from archivebox.core.admin import register_admin as register_core_admin
+    from archivebox.crawls.admin import register_admin as register_crawls_admin
+    from archivebox.api.admin import register_admin as register_api_admin
+    from archivebox.machine.admin import register_admin as register_machine_admin
+    from archivebox.workers.admin import register_admin as register_workers_admin
 
     register_core_admin(archivebox_admin)
     register_crawls_admin(archivebox_admin)

+ 5 - 5
archivebox/core/admin_snapshots.py

@@ -23,9 +23,9 @@ from archivebox.search.admin import SearchResultsAdminMixin
 from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin
 from archivebox.workers.tasks import bg_archive_snapshots, bg_add
 
-from core.models import Tag, Snapshot
-from core.admin_tags import TagInline
-from core.admin_archiveresults import ArchiveResultInline, render_archiveresults_list
+from archivebox.core.models import Tag, Snapshot
+from archivebox.core.admin_tags import TagInline
+from archivebox.core.admin_archiveresults import ArchiveResultInline, render_archiveresults_list
 
 
 # GLOBAL_CONTEXT = {'VERSION': VERSION, 'VERSIONS_AVAILABLE': [], 'CAN_UPGRADE': False}
@@ -59,7 +59,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
     sort_fields = ('title_str', 'url_str', 'created_at', 'status', 'crawl')
     readonly_fields = ('admin_actions', 'status_info', 'tags_str', 'imported_timestamp', 'created_at', 'modified_at', 'downloaded_at', 'output_dir', 'archiveresults_list')
     search_fields = ('id', 'url', 'timestamp', 'title', 'tags__name')
-    list_filter = ('created_at', 'downloaded_at', 'archiveresult__status', 'created_by', 'tags__name')
+    list_filter = ('created_at', 'downloaded_at', 'archiveresult__status', 'crawl__created_by', 'tags__name')
 
     fieldsets = (
         ('URL', {
@@ -75,7 +75,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
             'classes': ('card',),
         }),
         ('Relations', {
-            'fields': ('crawl', 'created_by', 'tags_str'),
+            'fields': ('crawl', 'tags_str'),
             'classes': ('card',),
         }),
         ('Config', {

+ 1 - 1
archivebox/core/admin_tags.py

@@ -6,7 +6,7 @@ from django.utils.html import format_html, mark_safe
 from archivebox.misc.paginators import AccelleratedPaginator
 from archivebox.base_models.admin import BaseModelAdmin
 
-from core.models import Tag
+from archivebox.core.models import Tag
 
 
 class TagInline(admin.TabularInline):

+ 2 - 2
archivebox/core/apps.py

@@ -4,9 +4,9 @@ from django.apps import AppConfig
 
 
 class CoreConfig(AppConfig):
-    name = 'core'
+    name = 'archivebox.core'
 
     def ready(self):
         """Register the archivebox.core.admin_site as the main django admin site"""
-        from core.admin_site import register_admin_site
+        from archivebox.core.admin_site import register_admin_site
         register_admin_site()

+ 1 - 1
archivebox/core/asgi.py

@@ -20,7 +20,7 @@ application = get_asgi_application()
 # from channels.routing import ProtocolTypeRouter, URLRouter
 # from channels.auth import AuthMiddlewareStack
 # from channels.security.websocket import AllowedHostsOriginValidator
-# from core.routing import websocket_urlpatterns
+# from archivebox.core.routing import websocket_urlpatterns
 #
 # application = ProtocolTypeRouter({
 #     "http": get_asgi_application(),

+ 174 - 29
archivebox/core/forms.py

@@ -4,10 +4,14 @@ from django import forms
 
 from archivebox.misc.util import URL_REGEX
 from taggit.utils import edit_string_for_tags, parse_tags
+from archivebox.base_models.admin import KeyValueWidget
 
 DEPTH_CHOICES = (
     ('0', 'depth = 0 (archive just these URLs)'),
-    ('1', 'depth = 1 (archive these URLs and all URLs one hop away)'),
+    ('1', 'depth = 1 (+ URLs one hop away)'),
+    ('2', 'depth = 2 (+ URLs two hops away)'),
+    ('3', 'depth = 3 (+ URLs three hops away)'),
+    ('4', 'depth = 4 (+ URLs four hops away)'),
 )
 
 from archivebox.hooks import get_plugins
@@ -18,39 +22,180 @@ def get_plugin_choices():
 
 
 class AddLinkForm(forms.Form):
-    url = forms.RegexField(label="URLs (one per line)", regex=URL_REGEX, min_length='6', strip=True, widget=forms.Textarea, required=True)
-    tag = forms.CharField(label="Tags (comma separated tag1,tag2,tag3)", strip=True, required=False)
-    depth = forms.ChoiceField(label="Archive depth", choices=DEPTH_CHOICES, initial='0', widget=forms.RadioSelect(attrs={"class": "depth-selection"}))
-    plugins = forms.MultipleChoiceField(
-        label="Plugins (select at least 1, otherwise all will be used by default)",
+    # Basic fields
+    url = forms.RegexField(
+        label="URLs (one per line)",
+        regex=URL_REGEX,
+        min_length='6',
+        strip=True,
+        widget=forms.Textarea,
+        required=True
+    )
+    tag = forms.CharField(
+        label="Tags (comma separated tag1,tag2,tag3)",
+        strip=True,
+        required=False,
+        widget=forms.TextInput(attrs={
+            'list': 'tag-datalist',
+            'autocomplete': 'off',
+        })
+    )
+    depth = forms.ChoiceField(
+        label="Archive depth",
+        choices=DEPTH_CHOICES,
+        initial='0',
+        widget=forms.RadioSelect(attrs={"class": "depth-selection"})
+    )
+    notes = forms.CharField(
+        label="Notes",
+        strip=True,
+        required=False,
+        widget=forms.Textarea(attrs={
+            'rows': 3,
+            'placeholder': 'Optional notes about this crawl (e.g., purpose, project name, context...)',
+        })
+    )
+
+    # Plugin groups
+    chrome_plugins = forms.MultipleChoiceField(
+        label="Chrome-dependent plugins",
+        required=False,
+        widget=forms.CheckboxSelectMultiple,
+        choices=[],  # populated in __init__
+    )
+    archiving_plugins = forms.MultipleChoiceField(
+        label="Archiving",
+        required=False,
+        widget=forms.CheckboxSelectMultiple,
+        choices=[],
+    )
+    parsing_plugins = forms.MultipleChoiceField(
+        label="Parsing",
+        required=False,
+        widget=forms.CheckboxSelectMultiple,
+        choices=[],
+    )
+    search_plugins = forms.MultipleChoiceField(
+        label="Search",
+        required=False,
+        widget=forms.CheckboxSelectMultiple,
+        choices=[],
+    )
+    binary_plugins = forms.MultipleChoiceField(
+        label="Binary providers",
+        required=False,
+        widget=forms.CheckboxSelectMultiple,
+        choices=[],
+    )
+    extension_plugins = forms.MultipleChoiceField(
+        label="Browser extensions",
+        required=False,
+        widget=forms.CheckboxSelectMultiple,
+        choices=[],
+    )
+
+    # Advanced options
+    schedule = forms.CharField(
+        label="Repeat schedule",
+        max_length=64,
+        required=False,
+        widget=forms.TextInput(attrs={
+            'placeholder': 'e.g., daily, weekly, 0 */6 * * * (every 6 hours)',
+        })
+    )
+    persona = forms.CharField(
+        label="Persona (authentication profile)",
+        max_length=100,
+        initial='Default',
+        required=False,
+    )
+    overwrite = forms.BooleanField(
+        label="Overwrite existing snapshots",
+        initial=False,
+        required=False,
+    )
+    update = forms.BooleanField(
+        label="Update/retry previously failed URLs",
+        initial=False,
+        required=False,
+    )
+    index_only = forms.BooleanField(
+        label="Index only (don't archive yet)",
+        initial=False,
+        required=False,
+    )
+    config = forms.JSONField(
+        label="Custom config overrides",
+        widget=KeyValueWidget(),
+        initial=dict,
         required=False,
-        widget=forms.SelectMultiple,
-        choices=[],  # populated dynamically in __init__
     )
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        self.fields['plugins'].choices = get_plugin_choices()
-    # TODO: hook these up to the view and put them 
-    # in a collapsible UI section labeled "Advanced"
-    #
-    # exclude_patterns = forms.CharField(
-    #     label="Exclude patterns",
-    #     min_length='1',
-    #     required=False,
-    #     initial=URL_DENYLIST,
-    # )
-    # timeout = forms.IntegerField(
-    #     initial=TIMEOUT,
-    # )
-    # overwrite = forms.BooleanField(
-    #     label="Overwrite any existing Snapshots",
-    #     initial=False,
-    # )
-    # index_only = forms.BooleanField(
-    #     label="Add URLs to index without Snapshotting",
-    #     initial=False,
-    # )
+
+        # Import at runtime to avoid circular imports
+        from archivebox.config.common import ARCHIVING_CONFIG
+
+        # Get all plugins
+        all_plugins = get_plugins()
+
+        # Define plugin groups
+        chrome_dependent = {
+            'accessibility', 'chrome', 'consolelog', 'dom', 'headers',
+            'parse_dom_outlinks', 'pdf', 'redirects', 'responses',
+            'screenshot', 'seo', 'singlefile', 'ssl', 'staticfile', 'title'
+        }
+        archiving = {
+            'archive_org', 'favicon', 'forumdl', 'gallerydl', 'git',
+            'htmltotext', 'media', 'mercury', 'papersdl', 'readability', 'wget'
+        }
+        parsing = {
+            'parse_html_urls', 'parse_jsonl_urls',
+            'parse_netscape_urls', 'parse_rss_urls', 'parse_txt_urls'
+        }
+        search = {
+            'search_backend_ripgrep', 'search_backend_sonic', 'search_backend_sqlite'
+        }
+        binary = {'apt', 'brew', 'custom', 'env', 'npm', 'pip'}
+        extensions = {'captcha2', 'istilldontcareaboutcookies', 'ublock'}
+
+        # Populate plugin field choices
+        self.fields['chrome_plugins'].choices = [
+            (p, p) for p in sorted(all_plugins) if p in chrome_dependent
+        ]
+        self.fields['archiving_plugins'].choices = [
+            (p, p) for p in sorted(all_plugins) if p in archiving
+        ]
+        self.fields['parsing_plugins'].choices = [
+            (p, p) for p in sorted(all_plugins) if p in parsing
+        ]
+        self.fields['search_plugins'].choices = [
+            (p, p) for p in sorted(all_plugins) if p in search
+        ]
+        self.fields['binary_plugins'].choices = [
+            (p, p) for p in sorted(all_plugins) if p in binary
+        ]
+        self.fields['extension_plugins'].choices = [
+            (p, p) for p in sorted(all_plugins) if p in extensions
+        ]
+
+        # Set update default from config
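+        # (ONLY_NEW=True means existing URLs are skipped rather than re-archived, so "update" defaults to its inverse)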
+        self.fields['update'].initial = not ARCHIVING_CONFIG.ONLY_NEW
+
+    def clean(self):
+        cleaned_data = super().clean()
+
+        # Combine all plugin groups into single list
+        all_selected_plugins = []
+        for field in ['chrome_plugins', 'archiving_plugins', 'parsing_plugins',
+                      'search_plugins', 'binary_plugins', 'extension_plugins']:
+            all_selected_plugins.extend(cleaned_data.get(field, []))
+
+        # Store combined list for easy access
+        cleaned_data['plugins'] = all_selected_plugins
+
+        return cleaned_data
 
 class TagWidgetMixin:
     def format_value(self, value):

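Reviewer note: after clean(), callers should read the merged plugin list from
cleaned_data['plugins'] rather than the six per-group fields. A minimal view-side
sketch (the view wiring below is illustrative, not part of this diff):

    form = AddLinkForm(request.POST)
    if form.is_valid():
        urls = form.cleaned_data['url']         # newline-separated URL text
        plugins = form.cleaned_data['plugins']  # merged across all groups by clean()
        depth = int(form.cleaned_data['depth'])
        # an empty plugins list falls back to "all plugins", matching the old single-field default
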
+ 1 - 1
archivebox/core/migrations/0007_archiveresult.py

@@ -12,7 +12,7 @@ try:
     ARCHIVE_DIR = CONSTANTS.ARCHIVE_DIR
 except ImportError:
     try:
-        from config import CONFIG
+        from archivebox.config import CONFIG
         ARCHIVE_DIR = Path(CONFIG.get('ARCHIVE_DIR', './archive'))
     except ImportError:
         ARCHIVE_DIR = Path('./archive')

+ 1 - 1
archivebox/core/migrations/0032_alter_archiveresult_binary_and_more.py

@@ -11,7 +11,7 @@ class Migration(migrations.Migration):
     dependencies = [
         ('core', '0031_snapshot_parent_snapshot'),
         ('crawls', '0004_alter_crawl_output_dir'),
-        ('machine', '0003_alter_dependency_id_alter_installedbinary_dependency_and_more'),
+        ('machine', '0004_drop_dependency_table'),  # Changed from 0003 - wait until Dependency is dropped
         migrations.swappable_dependency(settings.AUTH_USER_MODEL),
     ]
 

+ 79 - 0
archivebox/core/migrations/0035_snapshot_crawl_non_nullable_remove_created_by.py

@@ -0,0 +1,79 @@
+# Generated migration
+
+from django.conf import settings
+from django.db import migrations, models
+import django.db.models.deletion
+
+
+def create_catchall_crawls_and_assign_snapshots(apps, schema_editor):
+    """
+    Create one catchall Crawl per user for all snapshots without a crawl.
+    Assign those snapshots to their user's catchall crawl.
+    """
+    Snapshot = apps.get_model('core', 'Snapshot')
+    Crawl = apps.get_model('crawls', 'Crawl')
+    User = apps.get_model(settings.AUTH_USER_MODEL)
+
+    # Get all snapshots without a crawl
+    snapshots_without_crawl = Snapshot.objects.filter(crawl__isnull=True)
+
+    if not snapshots_without_crawl.exists():
+        return
+
+    # Group by created_by_id
+    snapshots_by_user = {}
+    for snapshot in snapshots_without_crawl:
+        user_id = snapshot.created_by_id
+        if user_id not in snapshots_by_user:
+            snapshots_by_user[user_id] = []
+        snapshots_by_user[user_id].append(snapshot)
+
+    # Create one catchall crawl per user and assign snapshots
+    for user_id, snapshots in snapshots_by_user.items():
+        try:
+            user = User.objects.get(pk=user_id)
+            username = user.username
+        except User.DoesNotExist:
+            username = 'unknown'
+
+        # Create catchall crawl for this user
+        crawl = Crawl.objects.create(
+            urls=f'# Catchall crawl for {len(snapshots)} snapshots without a crawl',
+            max_depth=0,
+            label=f'[migration] catchall for user {username}',
+            created_by_id=user_id,
+        )
+
+        # Assign all snapshots to this crawl
+        for snapshot in snapshots:
+            snapshot.crawl = crawl
+            snapshot.save(update_fields=['crawl'])
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('core', '0034_snapshot_current_step'),
+        ('crawls', '0004_alter_crawl_output_dir'),
+    ]
+
+    operations = [
+        # Step 1: Assign all snapshots without a crawl to catchall crawls
+        migrations.RunPython(
+            create_catchall_crawls_and_assign_snapshots,
+            reverse_code=migrations.RunPython.noop,
+        ),
+
+        # Step 2: Make crawl non-nullable
+        migrations.AlterField(
+            model_name='snapshot',
+            name='crawl',
+            field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='snapshot_set', to='crawls.crawl'),
+        ),
+
+        # Step 3: Remove created_by field
+        migrations.RemoveField(
+            model_name='snapshot',
+            name='created_by',
+        ),
+    ]

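Reviewer note: the loop above saves each snapshot individually (one UPDATE per row).
For large collections the same assignment could be batched; a sketch assuming the
identical migration context (not part of this diff):

    for snapshot in snapshots:
        snapshot.crawl = crawl
    Snapshot.objects.bulk_update(snapshots, ['crawl'], batch_size=500)
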
+ 19 - 0
archivebox/core/migrations/0036_remove_archiveresult_created_by.py

@@ -0,0 +1,19 @@
+# Generated migration
+
+from django.db import migrations
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('core', '0035_snapshot_crawl_non_nullable_remove_created_by'),
+    ]
+
+    operations = [
+        # Remove created_by field from ArchiveResult
+        # No data migration needed - created_by can be accessed via snapshot.crawl.created_by
+        migrations.RemoveField(
+            model_name='archiveresult',
+            name='created_by',
+        ),
+    ]

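Reviewer note: with the created_by column dropped, per-user filtering now traverses
the snapshot -> crawl relation. Equivalent ORM access using the fields in this diff:

    # old: ArchiveResult.objects.filter(created_by=user)
    ArchiveResult.objects.filter(snapshot__crawl__created_by=user)

    # when iterating, prefetch the chain to avoid N+1 queries on the new property:
    ArchiveResult.objects.select_related('snapshot__crawl__created_by')
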
+ 510 - 172
archivebox/core/models.py

@@ -9,6 +9,8 @@ import os
 import json
 from pathlib import Path
 
+from statemachine import State, registry
+
 from django.db import models
 from django.db.models import QuerySet, Value, Case, When, IntegerField
 from django.utils.functional import cached_property
@@ -33,10 +35,10 @@ from archivebox.base_models.models import (
     ModelWithConfig, ModelWithNotes, ModelWithHealthStats,
     get_or_create_system_user_pk,
 )
-from workers.models import ModelWithStateMachine
-from workers.tasks import bg_archive_snapshot
-from crawls.models import Crawl
-from machine.models import NetworkInterface, Binary
+from archivebox.workers.models import ModelWithStateMachine, BaseStateMachine
+from archivebox.workers.tasks import bg_archive_snapshot
+from archivebox.crawls.models import Crawl
+from archivebox.machine.models import NetworkInterface, Binary
 
 
 
@@ -53,6 +55,7 @@ class Tag(ModelWithSerializers):
     snapshot_set: models.Manager['Snapshot']
 
     class Meta(TypedModelMeta):
+        app_label = 'core'
         verbose_name = "Tag"
         verbose_name_plural = "Tags"
 
@@ -122,6 +125,7 @@ class SnapshotTag(models.Model):
     tag = models.ForeignKey(Tag, db_column='tag_id', on_delete=models.CASCADE, to_field='id')
 
     class Meta:
+        app_label = 'core'
         db_table = 'core_snapshot_tags'
         unique_together = [('snapshot', 'tag')]
 
@@ -263,52 +267,6 @@ class SnapshotManager(models.Manager.from_queryset(SnapshotQuerySet)):
     # Import Methods
     # =========================================================================
 
-    def create_or_update_from_dict(self, link_dict: Dict[str, Any], created_by_id: Optional[int] = None) -> 'Snapshot':
-        """Create or update a Snapshot from a SnapshotDict (parser output)"""
-        import re
-        from archivebox.config.common import GENERAL_CONFIG
-
-        url = link_dict['url']
-        timestamp = link_dict.get('timestamp')
-        title = link_dict.get('title')
-        tags_str = link_dict.get('tags')
-
-        tag_list = []
-        if tags_str:
-            tag_list = list(dict.fromkeys(
-                tag.strip() for tag in re.split(GENERAL_CONFIG.TAG_SEPARATOR_PATTERN, tags_str)
-                if tag.strip()
-            ))
-
-        # Get most recent snapshot with this URL (URLs can exist in multiple crawls)
-        snapshot = self.filter(url=url).order_by('-created_at').first()
-        if snapshot:
-            if title and (not snapshot.title or len(title) > len(snapshot.title or '')):
-                snapshot.title = title
-                snapshot.save(update_fields=['title', 'modified_at'])
-        else:
-            if timestamp:
-                while self.filter(timestamp=timestamp).exists():
-                    timestamp = str(float(timestamp) + 1.0)
-
-            snapshot = self.create(
-                url=url,
-                timestamp=timestamp,
-                title=title,
-                created_by_id=created_by_id or get_or_create_system_user_pk(),
-            )
-
-        if tag_list:
-            existing_tags = set(snapshot.tags.values_list('name', flat=True))
-            new_tags = set(tag_list) | existing_tags
-            snapshot.save_tags(new_tags)
-
-        return snapshot
-
-    def create_from_dicts(self, link_dicts: List[Dict[str, Any]], created_by_id: Optional[int] = None) -> List['Snapshot']:
-        """Create or update multiple Snapshots from a list of SnapshotDicts"""
-        return [self.create_or_update_from_dict(d, created_by_id=created_by_id) for d in link_dicts]
-
     def remove(self, atomic: bool = False) -> tuple:
         """Remove snapshots from the database"""
         from django.db import transaction
@@ -320,14 +278,13 @@ class SnapshotManager(models.Manager.from_queryset(SnapshotQuerySet)):
 
 class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, ModelWithStateMachine):
     id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
-    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False, related_name='snapshot_set', db_index=True)
     created_at = models.DateTimeField(default=timezone.now, db_index=True)
     modified_at = models.DateTimeField(auto_now=True)
 
     url = models.URLField(unique=False, db_index=True)  # URLs can appear in multiple crawls
     timestamp = models.CharField(max_length=32, unique=True, db_index=True, editable=False)
     bookmarked_at = models.DateTimeField(default=timezone.now, db_index=True)
-    crawl: Crawl = models.ForeignKey(Crawl, on_delete=models.CASCADE, default=None, null=True, blank=True, related_name='snapshot_set', db_index=True)  # type: ignore
+    crawl: Crawl = models.ForeignKey(Crawl, on_delete=models.CASCADE, null=False, related_name='snapshot_set', db_index=True)  # type: ignore[assignment]
     parent_snapshot = models.ForeignKey('self', on_delete=models.SET_NULL, null=True, blank=True, related_name='child_snapshots', db_index=True, help_text='Parent snapshot that discovered this URL (for recursive crawling)')
 
     title = models.CharField(max_length=512, null=True, blank=True, db_index=True)
@@ -344,7 +301,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
 
     tags = models.ManyToManyField(Tag, blank=True, through=SnapshotTag, related_name='snapshot_set', through_fields=('snapshot', 'tag'))
 
-    state_machine_name = 'core.statemachines.SnapshotMachine'
+    state_machine_name = 'core.models.SnapshotMachine'
     state_field_name = 'status'
     retry_at_field_name = 'retry_at'
     StatusChoices = ModelWithStateMachine.StatusChoices
@@ -354,6 +311,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
     archiveresult_set: models.Manager['ArchiveResult']
 
     class Meta(TypedModelMeta):
+        app_label = 'core'
         verbose_name = "Snapshot"
         verbose_name_plural = "Snapshots"
         constraints = [
@@ -366,6 +324,11 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
     def __str__(self):
         return f'[{self.id}] {self.url[:64]}'
 
+    @property
+    def created_by(self):
+        """Convenience property to access the user who created this snapshot via its crawl."""
+        return self.crawl.created_by
+
     def save(self, *args, **kwargs):
         is_new = self._state.adding
         if not self.bookmarked_at:
@@ -395,7 +358,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
                 self.fs_version = target
 
         super().save(*args, **kwargs)
-        if self.crawl and self.url not in self.crawl.urls:
+        if self.url not in self.crawl.urls:
             self.crawl.urls += f'\n{self.url}'
             self.crawl.save()
 
@@ -408,7 +371,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
                 url=self.url,
                 metadata={
                     'id': str(self.id),
-                    'crawl_id': str(self.crawl_id) if self.crawl_id else None,
+                    'crawl_id': str(self.crawl_id),
                     'depth': self.depth,
                     'status': self.status,
                 },
@@ -437,20 +400,11 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
         return self.fs_version != self._fs_current_version()
 
     def _fs_next_version(self, version: str) -> str:
-        """Get next version in migration chain"""
-        chain = ['0.7.0', '0.8.0', '0.9.0']
-        try:
-            idx = chain.index(version)
-            return chain[idx + 1] if idx + 1 < len(chain) else self._fs_current_version()
-        except ValueError:
-            # Unknown version - skip to current
-            return self._fs_current_version()
-
-    def _fs_migrate_from_0_7_0_to_0_8_0(self):
-        """Migration from 0.7.0 to 0.8.0 layout (no-op)"""
-        # 0.7 and 0.8 both used archive/<timestamp>
-        # Nothing to do!
-        pass
+        """Get next version in migration chain (0.7/0.8 had same layout, only 0.8→0.9 migration needed)"""
+        # Treat 0.7.0 and 0.8.0 as equivalent (both used archive/{timestamp})
+        if version in ('0.7.0', '0.8.0'):
+            return '0.9.0'
+        return self._fs_current_version()
 
     def _fs_migrate_from_0_8_0_to_0_9_0(self):
         """
@@ -578,7 +532,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
             return CONSTANTS.ARCHIVE_DIR / self.timestamp
 
         elif version in ('0.9.0', '1.0.0'):
-            username = self.created_by.username if self.created_by else 'unknown'
+            username = self.created_by.username
 
             # Use created_at for date grouping (fallback to timestamp)
             if self.created_at:
@@ -875,7 +829,6 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
                 pwd=result_data.get('pwd', str(self.output_dir)),
                 start_ts=start_ts,
                 end_ts=end_ts,
-                created_by=self.created_by,
             )
         except:
             pass
@@ -1069,6 +1022,12 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
                 result = archive_results.get(plugin)
                 existing = result and result.status == 'succeeded' and (result.output_files or result.output_str)
                 icon = get_plugin_icon(plugin)
+
+                # Skip plugins with empty icons that have no output
+                # (e.g., staticfile only shows when there's actual output)
+                if not icon.strip() and not existing:
+                    continue
+
                 output += format_html(
                     output_template,
                     path,
@@ -1139,9 +1098,20 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
 
     def run(self) -> list['ArchiveResult']:
         """
-        Execute this Snapshot by creating ArchiveResults for all enabled extractors.
+        Execute snapshot by creating pending ArchiveResults for all enabled hooks.
+
+        Called by: SnapshotMachine.enter_started()
 
-        Called by the state machine when entering the 'started' state.
+        Hook Lifecycle:
+            1. discover_hooks('Snapshot') → finds all plugin hooks
+            2. For each hook:
+               - Create ArchiveResult with status=QUEUED
+               - Store hook_name (e.g., 'on_Snapshot__50_wget.py')
+            3. ArchiveResults execute independently via ArchiveResultMachine
+            4. Hook execution happens in ArchiveResult.run(), NOT here
+
+        Returns:
+            list[ArchiveResult]: Newly created pending results
         """
         return self.create_pending_archiveresults()
 
@@ -1152,28 +1122,20 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
         Called by the state machine when entering the 'sealed' state.
         Kills any background hooks and finalizes their ArchiveResults.
         """
-        from pathlib import Path
         from archivebox.hooks import kill_process
 
         # Kill any background ArchiveResult hooks
         if not self.OUTPUT_DIR.exists():
             return
 
-        for plugin_dir in self.OUTPUT_DIR.iterdir():
-            if not plugin_dir.is_dir():
-                continue
-            pid_file = plugin_dir / 'hook.pid'
-            if pid_file.exists():
-                kill_process(pid_file, validate=True)  # Use validation
-
-                # Update the ArchiveResult from filesystem
-                plugin_name = plugin_dir.name
-                results = self.archiveresult_set.filter(
-                    status=ArchiveResult.StatusChoices.STARTED,
-                    pwd__contains=plugin_name
-                )
-                for ar in results:
-                    ar.update_from_output()
+        # Find all .pid files in this snapshot's output directory
+        for pid_file in self.OUTPUT_DIR.glob('**/*.pid'):
+            kill_process(pid_file, validate=True)
+
+        # Update all STARTED ArchiveResults from filesystem
+        results = self.archiveresult_set.filter(status=ArchiveResult.StatusChoices.STARTED)
+        for ar in results:
+            ar.update_from_output()
 
     def has_running_background_hooks(self) -> bool:
         """
@@ -1196,51 +1158,156 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
         return False
 
     @staticmethod
-    def from_jsonl(record: Dict[str, Any], overrides: Dict[str, Any] = None):
+    def from_jsonl(record: Dict[str, Any], overrides: Optional[Dict[str, Any]] = None, queue_for_extraction: bool = True):
         """
-        Create/update Snapshot from JSONL record.
+        Create/update Snapshot from JSONL record or dict.
+
+        Unified method that handles:
+        - ID-based patching: {"id": "...", "title": "new title"}
+        - URL-based create/update: {"url": "...", "title": "...", "tags": "..."}
+        - Auto-creates Crawl if not provided
+        - Optionally queues for extraction
 
         Args:
-            record: JSONL record with 'url' field and optional metadata
+            record: Dict with 'url' (for create) or 'id' (for patch), plus other fields
             overrides: Dict with 'crawl', 'snapshot' (parent), 'created_by_id'
+            queue_for_extraction: If True, sets status=QUEUED and retry_at (default: True)
 
         Returns:
             Snapshot instance or None
-
-        Note:
-            Filtering (depth, URL allowlist/denylist) should be done by caller
-            BEFORE calling this method. This method just creates the snapshot.
         """
-        from archivebox.misc.jsonl import get_or_create_snapshot
+        import re
         from django.utils import timezone
+        from archivebox.misc.util import parse_date
+        from archivebox.base_models.models import get_or_create_system_user_pk
+        from archivebox.config.common import GENERAL_CONFIG
 
         overrides = overrides or {}
+
+        # If 'id' is provided, lookup and patch that specific snapshot
+        snapshot_id = record.get('id')
+        if snapshot_id:
+            try:
+                snapshot = Snapshot.objects.get(id=snapshot_id)
+
+                # Generically update all fields present in record
+                update_fields = []
+                for field_name, value in record.items():
+                    # Skip internal fields
+                    if field_name in ('id', 'type'):
+                        continue
+
+                    # Skip if field doesn't exist on model
+                    if not hasattr(snapshot, field_name):
+                        continue
+
+                    # Special parsing for date fields
+                    if field_name in ('bookmarked_at', 'retry_at', 'created_at', 'modified_at'):
+                        if value and isinstance(value, str):
+                            value = parse_date(value)
+
+                    # Update field if value is provided and different
+                    if value is not None and getattr(snapshot, field_name) != value:
+                        setattr(snapshot, field_name, value)
+                        update_fields.append(field_name)
+
+                if update_fields:
+                    snapshot.save(update_fields=update_fields + ['modified_at'])
+
+                return snapshot
+            except Snapshot.DoesNotExist:
+                # ID not found, fall through to create-by-URL logic
+                pass
+
         url = record.get('url')
         if not url:
             return None
 
-        # Apply crawl context metadata
+        # Determine or create crawl (every snapshot must have a crawl)
         crawl = overrides.get('crawl')
-        snapshot = overrides.get('snapshot')  # Parent snapshot
+        parent_snapshot = overrides.get('snapshot')  # Parent snapshot
+        created_by_id = overrides.get('created_by_id') or (parent_snapshot.created_by.pk if parent_snapshot else get_or_create_system_user_pk())
+
+        # If no crawl provided, inherit from parent or auto-create one
+        if not crawl:
+            if parent_snapshot:
+                # Inherit crawl from parent snapshot
+                crawl = parent_snapshot.crawl
+            else:
+                # Auto-create a single-URL crawl
+                from archivebox.crawls.models import Crawl
+                from archivebox.config import CONSTANTS
+
+                timestamp_str = timezone.now().strftime("%Y-%m-%d__%H-%M-%S")
+                sources_file = CONSTANTS.SOURCES_DIR / f'{timestamp_str}__auto_crawl.txt'
+                sources_file.parent.mkdir(parents=True, exist_ok=True)
+                sources_file.write_text(url)
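+                # (on-disk record of the auto-created crawl; not referenced again in this method)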
+
+                crawl = Crawl.objects.create(
+                    urls=url,
+                    max_depth=0,
+                    label=f'auto-created for {url[:50]}',
+                    created_by_id=created_by_id,
+                )
 
-        if crawl:
-            record.setdefault('crawl_id', str(crawl.id))
-            record.setdefault('depth', (snapshot.depth + 1 if snapshot else 1))
-            if snapshot:
-                record.setdefault('parent_snapshot_id', str(snapshot.id))
+        # Parse tags
+        tags_str = record.get('tags', '')
+        tag_list = []
+        if tags_str:
+            tag_list = list(dict.fromkeys(
+                tag.strip() for tag in re.split(GENERAL_CONFIG.TAG_SEPARATOR_PATTERN, tags_str)
+                if tag.strip()
+            ))
 
-        try:
-            created_by_id = overrides.get('created_by_id') or (snapshot.created_by_id if snapshot else None)
-            new_snapshot = get_or_create_snapshot(record, created_by_id=created_by_id)
+        # Get most recent snapshot with this URL (URLs can exist in multiple crawls)
+        snapshot = Snapshot.objects.filter(url=url).order_by('-created_at').first()
 
-            # Queue for extraction
-            new_snapshot.status = Snapshot.StatusChoices.QUEUED
-            new_snapshot.retry_at = timezone.now()
-            new_snapshot.save()
+        title = record.get('title')
+        timestamp = record.get('timestamp')
 
-            return new_snapshot
-        except ValueError:
-            return None
+        if snapshot:
+            # Update existing snapshot
+            if title and (not snapshot.title or len(title) > len(snapshot.title or '')):
+                snapshot.title = title
+                snapshot.save(update_fields=['title', 'modified_at'])
+        else:
+            # Create new snapshot
+            if timestamp:
+                while Snapshot.objects.filter(timestamp=timestamp).exists():
+                    timestamp = str(float(timestamp) + 1.0)
+
+            snapshot = Snapshot.objects.create(
+                url=url,
+                timestamp=timestamp,
+                title=title,
+                crawl=crawl,
+            )
+
+        # Update tags
+        if tag_list:
+            existing_tags = set(snapshot.tags.values_list('name', flat=True))
+            new_tags = set(tag_list) | existing_tags
+            snapshot.save_tags(new_tags)
+
+        # Queue for extraction and update additional fields
+        update_fields = []
+
+        if queue_for_extraction:
+            snapshot.status = Snapshot.StatusChoices.QUEUED
+            snapshot.retry_at = timezone.now()
+            update_fields.extend(['status', 'retry_at'])
+
+        # Update additional fields if provided
+        for field_name in ('depth', 'parent_snapshot_id', 'crawl_id', 'bookmarked_at'):
+            value = record.get(field_name)
+            # parse string dates the same way the id-patch branch does above
+            if field_name == 'bookmarked_at' and isinstance(value, str) and value:
+                value = parse_date(value)
+            if value is not None and getattr(snapshot, field_name) != value:
+                setattr(snapshot, field_name, value)
+                update_fields.append(field_name)
+
+        if update_fields:
+            snapshot.save(update_fields=update_fields + ['modified_at'])
+
+        return snapshot
 
     def create_pending_archiveresults(self) -> list['ArchiveResult']:
         """
@@ -1273,7 +1340,6 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
                     'plugin': plugin,
                     'status': ArchiveResult.INITIAL_STATE,
                     'retry_at': timezone.now(),
-                    'created_by_id': self.created_by_id,
                 },
             )
             if archiveresult.status == ArchiveResult.INITIAL_STATE:
@@ -1329,6 +1395,36 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
         self.save(update_fields=['current_step', 'modified_at'])
         return True
 
+    def is_finished_processing(self) -> bool:
+        """
+        Check if this snapshot has finished processing.
+
+        Used by SnapshotMachine.is_finished() to determine if snapshot is complete.
+
+        Returns:
+            True if all archiveresults are finished (or no work to do), False otherwise.
+        """
+        # if no archiveresults exist yet, it's not finished
+        if not self.archiveresult_set.exists():
+            return False
+
+        # Try to advance step if ready (handles step-based hook execution)
+        # This will increment current_step when all foreground hooks in current step are done
+        while self.advance_step_if_ready():
+            pass  # Keep advancing until we can't anymore
+
+        # if archiveresults exist but are still pending, it's not finished
+        if self.pending_archiveresults().exists():
+            return False
+
+        # Don't wait for background hooks - they'll be cleaned up on entering sealed state
+        # Background hooks in STARTED state are excluded by pending_archiveresults()
+        # (STARTED is in FINAL_OR_ACTIVE_STATES) so once all results are FINAL or ACTIVE,
+        # we can transition to sealed and cleanup() will kill the background hooks
+
+        # otherwise archiveresults exist and are all finished, so it's finished
+        return True
+
     def retry_failed_archiveresults(self, retry_at: Optional['timezone.datetime'] = None) -> int:
         """
         Reset failed/skipped ArchiveResults to queued for retry.
@@ -1730,6 +1826,97 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
         return dt.strftime('%Y-%m-%d %H:%M:%S') if dt else None
 
 
+# =============================================================================
+# Snapshot State Machine
+# =============================================================================
+
+class SnapshotMachine(BaseStateMachine, strict_states=True):
+    """
+    State machine for managing Snapshot lifecycle.
+
+    Hook Lifecycle:
+    ┌─────────────────────────────────────────────────────────────┐
+    │ QUEUED State                                                │
+    │  • Waiting for snapshot to be ready                         │
+    └─────────────────────────────────────────────────────────────┘
+                            ↓ tick() when can_start()
+    ┌─────────────────────────────────────────────────────────────┐
+    │ STARTED State → enter_started()                             │
+    │  1. snapshot.run()                                          │
+    │     • discover_hooks('Snapshot') → finds all plugin hooks   │
+    │     • create_pending_archiveresults() → creates ONE         │
+    │       ArchiveResult per hook (NO execution yet)             │
+    │  2. ArchiveResults process independently with their own     │
+    │     state machines (see ArchiveResultMachine)               │
+    │  3. Advance through steps 0-9 as foreground hooks complete  │
+    └─────────────────────────────────────────────────────────────┘
+                            ↓ tick() when is_finished()
+    ┌─────────────────────────────────────────────────────────────┐
+    │ SEALED State → enter_sealed()                               │
+    │  • cleanup() → kills any background hooks still running     │
+    │  • Set retry_at=None (no more processing)                   │
+    └─────────────────────────────────────────────────────────────┘
+
+    https://github.com/ArchiveBox/ArchiveBox/wiki/ArchiveBox-Architecture-Diagrams
+    """
+
+    model_attr_name = 'snapshot'
+
+    # States
+    queued = State(value=Snapshot.StatusChoices.QUEUED, initial=True)
+    started = State(value=Snapshot.StatusChoices.STARTED)
+    sealed = State(value=Snapshot.StatusChoices.SEALED, final=True)
+
+    # Tick Event
+    tick = (
+        queued.to.itself(unless='can_start') |
+        queued.to(started, cond='can_start') |
+        started.to.itself(unless='is_finished') |
+        started.to(sealed, cond='is_finished')
+    )
+
+    def can_start(self) -> bool:
+        return bool(self.snapshot.url)
+
+    def is_finished(self) -> bool:
+        """Check if snapshot processing is complete - delegates to model method."""
+        return self.snapshot.is_finished_processing()
+
+    @queued.enter
+    def enter_queued(self):
+        self.snapshot.update_and_requeue(
+            retry_at=timezone.now(),
+            status=Snapshot.StatusChoices.QUEUED,
+        )
+
+    @started.enter
+    def enter_started(self):
+        # lock the snapshot while we create the pending archiveresults
+        self.snapshot.update_and_requeue(
+            retry_at=timezone.now() + timedelta(seconds=30),  # if failed, wait 30s before retrying
+        )
+
+        # Run the snapshot - creates pending archiveresults for all enabled plugins
+        self.snapshot.run()
+
+        # unlock the snapshot after we're done + set status = started
+        self.snapshot.update_and_requeue(
+            retry_at=timezone.now() + timedelta(seconds=5),  # check again in 5s
+            status=Snapshot.StatusChoices.STARTED,
+        )
+
+    @sealed.enter
+    def enter_sealed(self):
+        # Clean up background hooks
+        self.snapshot.cleanup()
+
+        self.snapshot.update_and_requeue(
+            retry_at=None,
+            status=Snapshot.StatusChoices.SEALED,
+        )
+
+
 class ArchiveResultManager(models.Manager):
     def indexable(self, sorted: bool = True):
         INDEXABLE_METHODS = [r[0] for r in EXTRACTOR_INDEXING_PRECEDENCE]
@@ -1761,7 +1948,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
     # Note: unique constraint is added by migration 0027 - don't set unique=True here
     # or SQLite table recreation in earlier migrations will fail
     uuid = models.UUIDField(default=uuid7, null=True, blank=True, db_index=True)
-    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False, related_name='archiveresult_set', db_index=True)
     created_at = models.DateTimeField(default=timezone.now, db_index=True)
     modified_at = models.DateTimeField(auto_now=True)
 
@@ -1782,7 +1968,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
 
     # Binary FK (optional - set when hook reports cmd)
     binary = models.ForeignKey(
-        'machine.Binary',
+        Binary,
         on_delete=models.SET_NULL,
         null=True, blank=True,
         related_name='archiveresults',
@@ -1798,7 +1984,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
     output_dir = models.CharField(max_length=256, default=None, null=True, blank=True)
     iface = models.ForeignKey(NetworkInterface, on_delete=models.SET_NULL, null=True, blank=True)
 
-    state_machine_name = 'core.statemachines.ArchiveResultMachine'
+    state_machine_name = 'core.models.ArchiveResultMachine'
     retry_at_field_name = 'retry_at'
     state_field_name = 'status'
     active_state = StatusChoices.STARTED
@@ -1806,12 +1992,18 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
     objects = ArchiveResultManager()
 
     class Meta(TypedModelMeta):
+        app_label = 'core'
         verbose_name = 'Archive Result'
         verbose_name_plural = 'Archive Results Log'
 
     def __str__(self):
         return f'[{self.id}] {self.snapshot.url[:64]} -> {self.plugin}'
 
+    @property
+    def created_by(self):
+        """Convenience property to access the user who created this archive result via its snapshot's crawl."""
+        return self.snapshot.crawl.created_by
+
     def save(self, *args, **kwargs):
         is_new = self._state.adding
         # Skip ModelWithOutputDir.save() to avoid creating index.json in plugin directories
@@ -1900,6 +2092,12 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
     def save_search_index(self):
         pass
 
+    def cascade_health_update(self, success: bool):
+        """Update health stats for self, parent Snapshot, and grandparent Crawl."""
+        self.increment_health_stats(success)
+        self.snapshot.increment_health_stats(success)
+        self.snapshot.crawl.increment_health_stats(success)
+
     def run(self):
         """
         Execute this ArchiveResult's hook and update status.
@@ -1911,8 +2109,13 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
         """
         from django.utils import timezone
         from archivebox.hooks import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR, run_hook, is_background_hook
+        from archivebox.config.configset import get_config
 
-        config_objects = [self.snapshot.crawl, self.snapshot] if self.snapshot.crawl else [self.snapshot]
+        # Get merged config with proper context
+        config = get_config(
+            crawl=self.snapshot.crawl,
+            snapshot=self.snapshot,
+        )
 
         # Determine which hook(s) to run
         hooks = []
@@ -1962,10 +2165,10 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
             result = run_hook(
                 hook,
                 output_dir=plugin_dir,
-                config_objects=config_objects,
+                config=config,
                 url=self.snapshot.url,
                 snapshot_id=str(self.snapshot.id),
-                crawl_id=str(self.snapshot.crawl.id) if self.snapshot.crawl else None,
+                crawl_id=str(self.snapshot.crawl.id),
                 depth=self.snapshot.depth,
             )
 
@@ -2112,9 +2315,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
 
             # Filter Snapshot records for depth/URL constraints
             if record_type == 'Snapshot':
-                if not self.snapshot.crawl:
-                    continue
-
                 url = record.get('url')
                 if not url:
                     continue
@@ -2132,19 +2332,10 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
         overrides = {
             'snapshot': self.snapshot,
             'crawl': self.snapshot.crawl,
-            'created_by_id': self.snapshot.created_by_id,
+            'created_by_id': self.created_by.pk,
         }
         process_hook_records(filtered_records, overrides=overrides)
 
-        # Update snapshot title if this is the title plugin
-        plugin_name = get_plugin_name(self.plugin)
-        if self.status == self.StatusChoices.SUCCEEDED and plugin_name == 'title':
-            self._update_snapshot_title(plugin_dir)
-
-        # Trigger search indexing if succeeded
-        if self.status == self.StatusChoices.SUCCEEDED:
-            self.trigger_search_indexing()
-
         # Cleanup PID files and empty logs
         pid_file = plugin_dir / 'hook.pid'
         pid_file.unlink(missing_ok=True)
@@ -2164,7 +2355,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
         if not cmd:
             return
 
-        from machine.models import Machine
+        from archivebox.machine.models import Machine
 
         bin_path_or_name = cmd[0] if isinstance(cmd, list) else cmd
         machine = Machine.current()
@@ -2189,23 +2380,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
         if binary:
             self.binary = binary
 
-    def _update_snapshot_title(self, plugin_dir: Path):
-        """
-        Update snapshot title from title plugin output.
-
-        The title plugin writes title.txt with the extracted page title.
-        This updates the Snapshot.title field if the file exists and has content.
-        """
-        title_file = plugin_dir / 'title.txt'
-        if title_file.exists():
-            try:
-                title = title_file.read_text(encoding='utf-8').strip()
-                if title and (not self.snapshot.title or len(title) > len(self.snapshot.title)):
-                    self.snapshot.title = title[:512]  # Max length from model
-                    self.snapshot.save(update_fields=['title', 'modified_at'])
-            except Exception:
-                pass  # Failed to read title, that's okay
-
     def _url_passes_filters(self, url: str) -> bool:
         """Check if URL passes URL_ALLOWLIST and URL_DENYLIST config filters.
 
@@ -2216,8 +2390,8 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
 
         # Get merged config with proper hierarchy
         config = get_config(
-            user=self.snapshot.created_by if self.snapshot else None,
-            crawl=self.snapshot.crawl if self.snapshot else None,
+            user=self.created_by,
+            crawl=self.snapshot.crawl,
             snapshot=self.snapshot,
         )
 
@@ -2256,23 +2430,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
             return False  # No allowlist patterns matched
 
         return True  # No filters or passed filters
-    
-    def trigger_search_indexing(self):
-        """Run any ArchiveResult__index hooks to update search indexes."""
-        from archivebox.hooks import discover_hooks, run_hook
-
-        # Pass config objects in priority order (later overrides earlier)
-        config_objects = [self.snapshot.crawl, self.snapshot] if self.snapshot.crawl else [self.snapshot]
-
-        for hook in discover_hooks('ArchiveResult__index'):
-            run_hook(
-                hook,
-                output_dir=self.output_dir,
-                config_objects=config_objects,
-                url=self.snapshot.url,
-                snapshot_id=str(self.snapshot.id),
-                plugin=self.plugin,
-            )
 
     @property
     def output_dir(self) -> Path:
@@ -2285,4 +2442,185 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
         if not plugin_dir:
             return False
         pid_file = plugin_dir / 'hook.pid'
-        return pid_file.exists()
+        return pid_file.exists()
+
+
+# =============================================================================
+# ArchiveResult State Machine
+# =============================================================================
+
+class ArchiveResultMachine(BaseStateMachine, strict_states=True):
+    """
+    State machine for managing ArchiveResult (single plugin execution) lifecycle.
+
+    Hook Lifecycle:
+    ┌─────────────────────────────────────────────────────────────┐
+    │ QUEUED State                                                │
+    │  • Waiting for its turn to run                              │
+    └─────────────────────────────────────────────────────────────┘
+                            ↓ tick() when can_start()
+    ┌─────────────────────────────────────────────────────────────┐
+    │ STARTED State → enter_started()                             │
+    │  1. archiveresult.run()                                     │
+    │     • Find specific hook by hook_name                       │
+    │     • run_hook(script, output_dir, ...) → subprocess        │
+    │                                                              │
+    │  2a. FOREGROUND hook (returns HookResult):                  │
+    │      • update_from_output() immediately                     │
+    │        - Read stdout.log                                    │
+    │        - Parse JSONL records                                │
+    │        - Extract 'ArchiveResult' record → update status     │
+    │        - Walk output_dir → populate output_files            │
+    │        - Call process_hook_records() for side effects       │
+    │                                                              │
+    │  2b. BACKGROUND hook (returns None):                        │
+    │      • Status stays STARTED                                 │
+    │      • Continues running in background                      │
+    │      • Killed by Snapshot.cleanup() when sealed             │
+    └─────────────────────────────────────────────────────────────┘
+                            ↓ tick() checks status
+    ┌─────────────────────────────────────────────────────────────┐
+    │ SUCCEEDED / FAILED / SKIPPED / BACKOFF                      │
+    │  • Set by hook's JSONL output during update_from_output()   │
+    │  • Health stats incremented (num_uses_succeeded/failed)     │
+    │  • Parent Snapshot health stats also updated                │
+    └─────────────────────────────────────────────────────────────┘
+
+    https://github.com/ArchiveBox/ArchiveBox/wiki/ArchiveBox-Architecture-Diagrams
+    """
+
+    model_attr_name = 'archiveresult'
+
+    # States
+    queued = State(value=ArchiveResult.StatusChoices.QUEUED, initial=True)
+    started = State(value=ArchiveResult.StatusChoices.STARTED)
+    backoff = State(value=ArchiveResult.StatusChoices.BACKOFF)
+    succeeded = State(value=ArchiveResult.StatusChoices.SUCCEEDED, final=True)
+    failed = State(value=ArchiveResult.StatusChoices.FAILED, final=True)
+    skipped = State(value=ArchiveResult.StatusChoices.SKIPPED, final=True)
+
+    # Tick Event - transitions based on conditions
+    tick = (
+        queued.to.itself(unless='can_start') |
+        queued.to(started, cond='can_start') |
+        started.to.itself(unless='is_finished') |
+        started.to(succeeded, cond='is_succeeded') |
+        started.to(failed, cond='is_failed') |
+        started.to(skipped, cond='is_skipped') |
+        started.to(backoff, cond='is_backoff') |
+        backoff.to.itself(unless='can_start') |
+        backoff.to(started, cond='can_start') |
+        backoff.to(succeeded, cond='is_succeeded') |
+        backoff.to(failed, cond='is_failed') |
+        backoff.to(skipped, cond='is_skipped')
+    )
+
+    def can_start(self) -> bool:
+        return bool(self.archiveresult.snapshot.url)
+
+    def is_succeeded(self) -> bool:
+        """Check if extractor plugin succeeded (status was set by run())."""
+        return self.archiveresult.status == ArchiveResult.StatusChoices.SUCCEEDED
+
+    def is_failed(self) -> bool:
+        """Check if extractor plugin failed (status was set by run())."""
+        return self.archiveresult.status == ArchiveResult.StatusChoices.FAILED
+
+    def is_skipped(self) -> bool:
+        """Check if extractor plugin was skipped (status was set by run())."""
+        return self.archiveresult.status == ArchiveResult.StatusChoices.SKIPPED
+
+    def is_backoff(self) -> bool:
+        """Check if we should backoff and retry later."""
+        # Backoff if status is still started (plugin didn't complete) and output_str is empty
+        return (
+            self.archiveresult.status == ArchiveResult.StatusChoices.STARTED and
+            not self.archiveresult.output_str
+        )
+
+    def is_finished(self) -> bool:
+        """Check if extraction has completed (success, failure, or skipped)."""
+        return self.archiveresult.status in (
+            ArchiveResult.StatusChoices.SUCCEEDED,
+            ArchiveResult.StatusChoices.FAILED,
+            ArchiveResult.StatusChoices.SKIPPED,
+        )
+
+    @queued.enter
+    def enter_queued(self):
+        self.archiveresult.update_and_requeue(
+            retry_at=timezone.now(),
+            status=ArchiveResult.StatusChoices.QUEUED,
+            start_ts=None,
+        )  # reset start_ts and requeue the archiveresult immediately so a worker picks it up
+
+    @started.enter
+    def enter_started(self):
+        from archivebox.machine.models import NetworkInterface
+
+        # Lock the object and mark start time
+        self.archiveresult.update_and_requeue(
+            retry_at=timezone.now() + timedelta(seconds=120),  # 2 min timeout for plugin
+            status=ArchiveResult.StatusChoices.STARTED,
+            start_ts=timezone.now(),
+            iface=NetworkInterface.current(),
+        )
+
+        # Run the plugin - this updates status, output, timestamps, etc.
+        self.archiveresult.run()
+
+        # Save the updated result
+        self.archiveresult.save()
+
+    @backoff.enter
+    def enter_backoff(self):
+        self.archiveresult.update_and_requeue(
+            retry_at=timezone.now() + timedelta(seconds=60),
+            status=ArchiveResult.StatusChoices.BACKOFF,
+            end_ts=None,
+        )
+
+    @succeeded.enter
+    def enter_succeeded(self):
+        self.archiveresult.update_and_requeue(
+            retry_at=None,
+            status=ArchiveResult.StatusChoices.SUCCEEDED,
+            end_ts=timezone.now(),
+        )
+
+        # Update health stats for ArchiveResult, Snapshot, and Crawl cascade
+        self.archiveresult.cascade_health_update(success=True)
+
+    @failed.enter
+    def enter_failed(self):
+        self.archiveresult.update_and_requeue(
+            retry_at=None,
+            status=ArchiveResult.StatusChoices.FAILED,
+            end_ts=timezone.now(),
+        )
+
+        # Update health stats for ArchiveResult, Snapshot, and Crawl cascade
+        self.archiveresult.cascade_health_update(success=False)
+
+    @skipped.enter
+    def enter_skipped(self):
+        self.archiveresult.update_and_requeue(
+            retry_at=None,
+            status=ArchiveResult.StatusChoices.SKIPPED,
+            end_ts=timezone.now(),
+        )
+
+    def after_transition(self, event: str, source: State, target: State):
+        self.archiveresult.snapshot.update_and_requeue()  # bump snapshot retry time so it picks up all the new changes
+
+
+# =============================================================================
+# State Machine Registration
+# =============================================================================
+
+# Manually register state machines with python-statemachine registry
+# (normally auto-discovered from statemachines.py, but we define them here for clarity)
+registry.register(SnapshotMachine)
+registry.register(ArchiveResultMachine)

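Reviewer note: Snapshot.from_jsonl() above now unifies create-by-URL and patch-by-id.
A usage sketch based on the signatures in this diff (URL and values illustrative):

    # create: auto-creates a single-URL Crawl when no overrides are given
    snap = Snapshot.from_jsonl({'url': 'https://example.com', 'tags': 'docs,examples'})

    # patch: the id path updates the given fields and returns early, so
    # queue_for_extraction only affects the create/update-by-URL path
    Snapshot.from_jsonl({'id': str(snap.id), 'title': 'Example Domain'})
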
+ 2638 - 0
archivebox/core/models.py.bak

@@ -0,0 +1,2638 @@
+__package__ = 'archivebox.core'
+
+from typing import Optional, Dict, Iterable, Any, List, TYPE_CHECKING
+from archivebox.uuid_compat import uuid7
+from datetime import datetime, timedelta
+from django_stubs_ext.db.models import TypedModelMeta
+
+import os
+import json
+from pathlib import Path
+
+from statemachine import State, registry
+
+from django.db import models
+from django.db.models import QuerySet, Value, Case, When, IntegerField
+from django.utils.functional import cached_property
+from django.utils.text import slugify
+from django.utils import timezone
+from django.core.cache import cache
+from django.urls import reverse, reverse_lazy
+from django.contrib import admin
+from django.conf import settings
+
+from archivebox.config import CONSTANTS
+from archivebox.misc.system import get_dir_size, atomic_write
+from archivebox.misc.util import parse_date, base_url, domain as url_domain, to_json, ts_to_date_str, urlencode, htmlencode, urldecode
+from archivebox.misc.hashing import get_dir_info
+from archivebox.hooks import (
+    EXTRACTOR_INDEXING_PRECEDENCE,
+    get_plugins, get_plugin_name, get_plugin_icon,
+    DEFAULT_PLUGIN_ICONS,
+)
+from archivebox.base_models.models import (
+    ModelWithUUID, ModelWithSerializers, ModelWithOutputDir,
+    ModelWithConfig, ModelWithNotes, ModelWithHealthStats,
+    get_or_create_system_user_pk,
+)
+from workers.models import ModelWithStateMachine, BaseStateMachine
+from workers.tasks import bg_archive_snapshot
+from archivebox.crawls.models import Crawl
+from archivebox.machine.models import NetworkInterface, Binary
+
+
+
+class Tag(ModelWithSerializers):
+    # Keep AutoField for compatibility with main branch migrations
+    # Don't use UUIDField here - requires complex FK transformation
+    id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID')
+    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False, related_name='tag_set')
+    created_at = models.DateTimeField(default=timezone.now, db_index=True, null=True)
+    modified_at = models.DateTimeField(auto_now=True)
+    name = models.CharField(unique=True, blank=False, max_length=100)
+    slug = models.SlugField(unique=True, blank=False, max_length=100, editable=False)
+
+    snapshot_set: models.Manager['Snapshot']
+
+    class Meta(TypedModelMeta):
+        verbose_name = "Tag"
+        verbose_name_plural = "Tags"
+
+    def __str__(self):
+        return self.name
+
+    def save(self, *args, **kwargs):
+        is_new = self._state.adding
+        if is_new:
+            self.slug = slugify(self.name)
+            existing = set(Tag.objects.filter(slug__startswith=self.slug).values_list("slug", flat=True))
+            i = None
+            while True:
+                slug = f"{slugify(self.name)}_{i}" if i else slugify(self.name)
+                if slug not in existing:
+                    self.slug = slug
+                    break
+                i = (i or 0) + 1
+        super().save(*args, **kwargs)
+
+        if is_new:
+            from archivebox.misc.logging_util import log_worker_event
+            log_worker_event(
+                worker_type='DB',
+                event='Created Tag',
+                indent_level=0,
+                metadata={
+                    'id': self.id,
+                    'name': self.name,
+                    'slug': self.slug,
+                },
+            )
+
+    @property
+    def api_url(self) -> str:
+        return reverse_lazy('api-1:get_tag', args=[self.id])
+
+    @staticmethod
+    def from_jsonl(record: Dict[str, Any], overrides: Dict[str, Any] = None):
+        """
+        Create/update Tag from JSONL record.
+
+        Args:
+            record: JSONL record with 'name' field
+            overrides: Optional dict with 'snapshot' to auto-attach tag
+
+        Returns:
+            Tag instance or None
+        """
+        from archivebox.misc.jsonl import get_or_create_tag
+
+        try:
+            tag = get_or_create_tag(record)
+
+            # Auto-attach to snapshot if in overrides
+            if overrides and 'snapshot' in overrides and tag:
+                overrides['snapshot'].tags.add(tag)
+
+            return tag
+        except ValueError:
+            return None
+
+
+class SnapshotTag(models.Model):
+    id = models.AutoField(primary_key=True)
+    snapshot = models.ForeignKey('Snapshot', db_column='snapshot_id', on_delete=models.CASCADE, to_field='id')
+    tag = models.ForeignKey(Tag, db_column='tag_id', on_delete=models.CASCADE, to_field='id')
+
+    class Meta:
+        db_table = 'core_snapshot_tags'
+        unique_together = [('snapshot', 'tag')]
+
+
+class SnapshotQuerySet(models.QuerySet):
+    """Custom QuerySet for Snapshot model with export methods that persist through .filter() etc."""
+
+    # =========================================================================
+    # Filtering Methods
+    # =========================================================================
+
+    FILTER_TYPES = {
+        'exact': lambda pattern: models.Q(url=pattern),
+        'substring': lambda pattern: models.Q(url__icontains=pattern),
+        'regex': lambda pattern: models.Q(url__iregex=pattern),
+        'domain': lambda pattern: models.Q(url__istartswith=f"http://{pattern}") | models.Q(url__istartswith=f"https://{pattern}") | models.Q(url__istartswith=f"ftp://{pattern}"),
+        'tag': lambda pattern: models.Q(tags__name=pattern),
+        'timestamp': lambda pattern: models.Q(timestamp=pattern),
+    }
+
+    def filter_by_patterns(self, patterns: List[str], filter_type: str = 'exact') -> 'SnapshotQuerySet':
+        """Filter snapshots by URL patterns using specified filter type"""
+        from archivebox.misc.logging import stderr
+
+        q_filter = models.Q()
+        for pattern in patterns:
+            try:
+                q_filter = q_filter | self.FILTER_TYPES[filter_type](pattern)
+            except KeyError:
+                stderr()
+                stderr(f'[X] Got invalid pattern for --filter-type={filter_type}:', color='red')
+                stderr(f'    {pattern}')
+                raise SystemExit(2)
+        return self.filter(q_filter)
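+    # Illustrative usage, assuming some snapshots exist in the DB:
+    #   Snapshot.objects.filter_by_patterns(['example.com'], filter_type='domain')
+    #   matches URLs starting with http(s)://example.com or ftp://example.com.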
+
+    def search(self, patterns: List[str]) -> 'SnapshotQuerySet':
+        """Search snapshots using the configured search backend"""
+        from archivebox.config.common import SEARCH_BACKEND_CONFIG
+        from archivebox.search import query_search_index
+        from archivebox.misc.logging import stderr
+
+        if not SEARCH_BACKEND_CONFIG.USE_SEARCHING_BACKEND:
+            stderr()
+            stderr('[X] The search backend is not enabled, set config.USE_SEARCHING_BACKEND = True', color='red')
+            raise SystemExit(2)
+
+        qsearch = self.none()
+        for pattern in patterns:
+            try:
+                qsearch |= query_search_index(pattern)
+            except Exception as e:
+                stderr()
+                stderr(f'[X] Error querying search backend for {pattern}: {e}', color='red')
+                raise SystemExit(2)
+        return self.all() & qsearch
+
+    # =========================================================================
+    # Export Methods
+    # =========================================================================
+
+    def to_json(self, with_headers: bool = False) -> str:
+        """Generate JSON index from snapshots"""
+        import sys
+        from datetime import datetime, timezone as tz
+        from archivebox.config import VERSION
+        from archivebox.config.common import SERVER_CONFIG
+        from archivebox.config.version import get_COMMIT_HASH
+
+        MAIN_INDEX_HEADER = {
+            'info': 'This is an index of site data archived by ArchiveBox: The self-hosted web archive.',
+            'schema': 'archivebox.index.json',
+            'copyright_info': SERVER_CONFIG.FOOTER_INFO,
+            'meta': {
+                'project': 'ArchiveBox',
+                'version': VERSION,
+                'git_sha': get_COMMIT_HASH() or VERSION,
+                'website': 'https://ArchiveBox.io',
+                'docs': 'https://github.com/ArchiveBox/ArchiveBox/wiki',
+                'source': 'https://github.com/ArchiveBox/ArchiveBox',
+                'issues': 'https://github.com/ArchiveBox/ArchiveBox/issues',
+                'dependencies': {},
+            },
+        } if with_headers else {}
+
+        snapshot_dicts = [s.to_dict(extended=True) for s in self.iterator(chunk_size=500)]
+
+        if with_headers:
+            output = {
+                **MAIN_INDEX_HEADER,
+                'num_links': len(snapshot_dicts),
+                'updated': datetime.now(tz.utc),
+                'last_run_cmd': sys.argv,
+                'links': snapshot_dicts,
+            }
+        else:
+            output = snapshot_dicts
+        return to_json(output, indent=4, sort_keys=True)
+
+    def to_csv(self, cols: Optional[List[str]] = None, header: bool = True, separator: str = ',', ljust: int = 0) -> str:
+        """Generate CSV output from snapshots"""
+        cols = cols or ['timestamp', 'is_archived', 'url']
+        row_strs = (s.to_csv(cols=cols, ljust=ljust, separator=separator) for s in self.iterator(chunk_size=500))
+        if header:
+            header_str = separator.join(col.ljust(ljust) for col in cols)
+            return '\n'.join((header_str, *row_strs))
+        return '\n'.join(row_strs)
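+    # e.g. Snapshot.objects.all().to_csv(cols=['timestamp', 'url']) yields
+    # something like (illustrative values):
+    #   timestamp,url
+    #   1736366400.0,https://example.com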
+
+    def to_html(self, with_headers: bool = True) -> str:
+        """Generate main index HTML from snapshots"""
+        from datetime import datetime, timezone as tz
+        from django.template.loader import render_to_string
+        from archivebox.config import VERSION
+        from archivebox.config.common import SERVER_CONFIG
+        from archivebox.config.version import get_COMMIT_HASH
+
+        template = 'static_index.html' if with_headers else 'minimal_index.html'
+        snapshot_list = list(self.iterator(chunk_size=500))
+
+        return render_to_string(template, {
+            'version': VERSION,
+            'git_sha': get_COMMIT_HASH() or VERSION,
+            'num_links': str(len(snapshot_list)),
+            'date_updated': datetime.now(tz.utc).strftime('%Y-%m-%d'),
+            'time_updated': datetime.now(tz.utc).strftime('%Y-%m-%d %H:%M'),
+            'links': snapshot_list,
+            'FOOTER_INFO': SERVER_CONFIG.FOOTER_INFO,
+        })
+
+
+class SnapshotManager(models.Manager.from_queryset(SnapshotQuerySet)):
+    """Manager for Snapshot model - uses SnapshotQuerySet for chainable methods"""
+
+    def filter(self, *args, **kwargs):
+        domain = kwargs.pop('domain', None)
+        qs = super().filter(*args, **kwargs)
+        if domain:
+            qs = qs.filter(url__icontains=f'://{domain}')
+        return qs
+
+    def get_queryset(self):
+        return super().get_queryset().prefetch_related('tags', 'archiveresult_set')
+
+    # =========================================================================
+    # Bulk Operations
+    # =========================================================================
+
+    def remove(self, atomic: bool = False) -> tuple:
+        """Remove snapshots from the database"""
+        from django.db import transaction
+        if atomic:
+            with transaction.atomic():
+                return self.delete()
+        return self.delete()
+
+
+class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, ModelWithStateMachine):
+    id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
+    created_at = models.DateTimeField(default=timezone.now, db_index=True)
+    modified_at = models.DateTimeField(auto_now=True)
+
+    url = models.URLField(unique=False, db_index=True)  # URLs can appear in multiple crawls
+    timestamp = models.CharField(max_length=32, unique=True, db_index=True, editable=False)
+    bookmarked_at = models.DateTimeField(default=timezone.now, db_index=True)
+    crawl: Crawl = models.ForeignKey(Crawl, on_delete=models.CASCADE, null=False, related_name='snapshot_set', db_index=True)  # type: ignore[assignment]
+    parent_snapshot = models.ForeignKey('self', on_delete=models.SET_NULL, null=True, blank=True, related_name='child_snapshots', db_index=True, help_text='Parent snapshot that discovered this URL (for recursive crawling)')
+
+    title = models.CharField(max_length=512, null=True, blank=True, db_index=True)
+    downloaded_at = models.DateTimeField(default=None, null=True, editable=False, db_index=True, blank=True)
+    depth = models.PositiveSmallIntegerField(default=0, db_index=True)  # 0 for root snapshot, 1+ for discovered URLs
+    fs_version = models.CharField(max_length=10, default='0.9.0', help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). Used to trigger lazy migration on save().')
+    current_step = models.PositiveSmallIntegerField(default=0, db_index=True, help_text='Current hook step being executed (0-9). Used for sequential hook execution.')
+
+    retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)
+    status = ModelWithStateMachine.StatusField(choices=ModelWithStateMachine.StatusChoices, default=ModelWithStateMachine.StatusChoices.QUEUED)
+    config = models.JSONField(default=dict, null=False, blank=False, editable=True)
+    notes = models.TextField(blank=True, null=False, default='')
+    output_dir = models.FilePathField(path=CONSTANTS.ARCHIVE_DIR, recursive=True, match='.*', default=None, null=True, blank=True, editable=True)
+
+    tags = models.ManyToManyField(Tag, blank=True, through=SnapshotTag, related_name='snapshot_set', through_fields=('snapshot', 'tag'))
+
+    state_machine_name = 'core.models.SnapshotMachine'
+    state_field_name = 'status'
+    retry_at_field_name = 'retry_at'
+    StatusChoices = ModelWithStateMachine.StatusChoices
+    active_state = StatusChoices.STARTED
+
+    objects = SnapshotManager()
+    archiveresult_set: models.Manager['ArchiveResult']
+
+    class Meta(TypedModelMeta):
+        verbose_name = "Snapshot"
+        verbose_name_plural = "Snapshots"
+        constraints = [
+            # Allow same URL in different crawls, but not duplicates within same crawl
+            models.UniqueConstraint(fields=['url', 'crawl'], name='unique_url_per_crawl'),
+            # Global timestamp uniqueness for 1:1 symlink mapping
+            models.UniqueConstraint(fields=['timestamp'], name='unique_timestamp'),
+        ]
+
+    def __str__(self):
+        return f'[{self.id}] {self.url[:64]}'
+
+    def save(self, *args, **kwargs):
+        is_new = self._state.adding
+        if not self.bookmarked_at:
+            self.bookmarked_at = self.created_at or timezone.now()
+        if not self.timestamp:
+            self.timestamp = str(self.bookmarked_at.timestamp())
+
+        # Migrate filesystem if needed (happens automatically on save)
+        if self.pk and self.fs_migration_needed:
+            from django.db import transaction
+            with transaction.atomic():
+                # Walk through migration chain automatically
+                current = self.fs_version
+                target = self._fs_current_version()
+
+                while current != target:
+                    next_ver = self._fs_next_version(current)
+                    method = f'_fs_migrate_from_{current.replace(".", "_")}_to_{next_ver.replace(".", "_")}'
+
+                    # Only run if method exists (most are no-ops)
+                    if hasattr(self, method):
+                        getattr(self, method)()
+
+                    current = next_ver
+
+                # Update version (still in transaction)
+                self.fs_version = target
+
+        super().save(*args, **kwargs)
+        if self.crawl and self.url not in self.crawl.urls.splitlines():  # exact line match, not substring
+            self.crawl.urls += f'\n{self.url}'
+            self.crawl.save()
+
+        if is_new:
+            from archivebox.misc.logging_util import log_worker_event
+            log_worker_event(
+                worker_type='DB',
+                event='Created Snapshot',
+                indent_level=2,
+                url=self.url,
+                metadata={
+                    'id': str(self.id),
+                    'crawl_id': str(self.crawl_id) if self.crawl_id else None,
+                    'depth': self.depth,
+                    'status': self.status,
+                },
+            )
+
+    # =========================================================================
+    # Filesystem Migration Methods
+    # =========================================================================
+
+    @staticmethod
+    def _fs_current_version() -> str:
+        """Get current ArchiveBox filesystem version (normalized to x.x.0 format)"""
+        from archivebox.config import VERSION
+        # Normalize version to x.x.0 format (e.g., "0.9.0rc1" -> "0.9.0")
+        parts = VERSION.split('.')
+        if len(parts) >= 2:
+            major, minor = parts[0], parts[1]
+            # Strip any non-numeric suffix from minor version
+            minor = ''.join(c for c in minor if c.isdigit())
+            return f'{major}.{minor}.0'
+        return '0.9.0'  # Fallback if version parsing fails
+
+    @property
+    def fs_migration_needed(self) -> bool:
+        """Check if snapshot needs filesystem migration"""
+        return self.fs_version != self._fs_current_version()
+
+    def _fs_next_version(self, version: str) -> str:
+        """Get next version in migration chain"""
+        chain = ['0.7.0', '0.8.0', '0.9.0']
+        try:
+            idx = chain.index(version)
+            return chain[idx + 1] if idx + 1 < len(chain) else self._fs_current_version()
+        except ValueError:
+            # Unknown version - skip to current
+            return self._fs_current_version()
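+    # e.g. _fs_next_version('0.7.0') -> '0.8.0' and _fs_next_version('0.8.0') -> '0.9.0',
+    # while an unknown version jumps straight to _fs_current_version() instead of
+    # walking the chain step-by-step.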
+
+    def _fs_migrate_from_0_7_0_to_0_8_0(self):
+        """Migration from 0.7.0 to 0.8.0 layout (no-op)"""
+        # 0.7 and 0.8 both used archive/<timestamp>
+        # Nothing to do!
+        pass
+
+    def _fs_migrate_from_0_8_0_to_0_9_0(self):
+        """
+        Migrate from flat to nested structure.
+
+        0.8.x: archive/{timestamp}/
+        0.9.x: users/{user}/snapshots/YYYYMMDD/{domain}/{uuid}/
+
+        Transaction handling:
+        1. Copy files INSIDE transaction
+        2. Update fs_version INSIDE transaction (done by save())
+        3. Exit transaction (DB commit)
+        4. Delete old files and swap in the backwards-compat symlink OUTSIDE
+           the transaction (after commit), since archive/<timestamp> is still
+           occupied by the old flat dir until it is deleted
+        """
+        import shutil
+        from django.db import transaction
+
+        old_dir = self.get_storage_path_for_version('0.8.0')
+        new_dir = self.get_storage_path_for_version('0.9.0')
+
+        if not old_dir.exists() or old_dir == new_dir or new_dir.exists():
+            return
+
+        new_dir.mkdir(parents=True, exist_ok=True)
+
+        # Copy all files (idempotent)
+        for old_file in old_dir.rglob('*'):
+            if not old_file.is_file():
+                continue
+
+            rel_path = old_file.relative_to(old_dir)
+            new_file = new_dir / rel_path
+
+            # Skip if already copied
+            if new_file.exists() and new_file.stat().st_size == old_file.stat().st_size:
+                continue
+
+            new_file.parent.mkdir(parents=True, exist_ok=True)
+            shutil.copy2(old_file, new_file)
+
+        # Verify all copied
+        old_files = {f.relative_to(old_dir): f.stat().st_size
+                     for f in old_dir.rglob('*') if f.is_file()}
+        new_files = {f.relative_to(new_dir): f.stat().st_size
+                     for f in new_dir.rglob('*') if f.is_file()}
+
+        if old_files.keys() != new_files.keys():
+            missing = old_files.keys() - new_files.keys()
+            raise Exception(f"Migration incomplete: missing {missing}")
+
+        # The backwards-compat symlink at archive/<timestamp> can't be created yet:
+        # that path is still occupied by the old flat dir (old_dir == symlink path),
+        # so symlink_to() here would raise FileExistsError. Both the deletion of the
+        # old dir and the symlink swap are deferred until AFTER the DB commit.
+        transaction.on_commit(lambda: self._cleanup_old_migration_dir(old_dir, new_dir))
+
+    def _cleanup_old_migration_dir(self, old_dir: Path, new_dir: Optional[Path] = None):
+        """
+        Delete old directory after successful migration, then replace it with a
+        backwards-compat symlink pointing at the new location (if given).
+        Called via transaction.on_commit() after DB commit succeeds.
+        """
+        import shutil
+        import logging
+
+        if old_dir.exists() and not old_dir.is_symlink():
+            try:
+                shutil.rmtree(old_dir)
+                if new_dir is not None:
+                    old_dir.symlink_to(new_dir, target_is_directory=True)
+            except Exception as e:
+                # Log but don't raise - migration succeeded, this is just cleanup
+                logging.getLogger('archivebox.migration').warning(
+                    f"Could not clean up old migration directory {old_dir}: {e}"
+                )
+
+    # =========================================================================
+    # Path Calculation and Migration Helpers
+    # =========================================================================
+
+    @staticmethod
+    def extract_domain_from_url(url: str) -> str:
+        """
+        Extract domain from URL for 0.9.x path structure.
+        Uses full hostname with sanitized special chars.
+
+        Examples:
+            https://example.com:8080 → example.com_8080
+            https://sub.example.com → sub.example.com
+            file:///path → localhost
+            data:text/html → data
+        """
+        from urllib.parse import urlparse
+
+        try:
+            parsed = urlparse(url)
+
+            if parsed.scheme in ('http', 'https'):
+                if parsed.port:
+                    return f"{parsed.hostname}_{parsed.port}".replace(':', '_')
+                return parsed.hostname or 'unknown'
+            elif parsed.scheme == 'file':
+                return 'localhost'
+            elif parsed.scheme:
+                return parsed.scheme
+            else:
+                return 'unknown'
+        except Exception:
+            return 'unknown'
+
+    def get_storage_path_for_version(self, version: str) -> Path:
+        """
+        Calculate storage path for specific filesystem version.
+        Centralizes path logic so it's reusable.
+
+        0.7.x/0.8.x: archive/{timestamp}
+        0.9.x: users/{username}/snapshots/YYYYMMDD/{domain}/{uuid}/
+        """
+        from datetime import datetime
+
+        if version in ('0.7.0', '0.8.0'):
+            return CONSTANTS.ARCHIVE_DIR / self.timestamp
+
+        elif version in ('0.9.0', '1.0.0'):
+            username = self.crawl.created_by.username
+
+            # Use created_at for date grouping (fallback to timestamp)
+            if self.created_at:
+                date_str = self.created_at.strftime('%Y%m%d')
+            else:
+                date_str = datetime.fromtimestamp(float(self.timestamp)).strftime('%Y%m%d')
+
+            domain = self.extract_domain_from_url(self.url)
+
+            return (
+                CONSTANTS.DATA_DIR / 'users' / username / 'snapshots' /
+                date_str / domain / str(self.id)
+            )
+        else:
+            # Unknown version - use current
+            return self.get_storage_path_for_version(self._fs_current_version())
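+    # e.g. a snapshot of https://example.com created 2024-01-15 by user 'admin'
+    # (illustrative values) maps to:
+    #   '0.8.0' -> archive/1705312800.0
+    #   '0.9.0' -> users/admin/snapshots/20240115/example.com/<snapshot uuid>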
+
+    # =========================================================================
+    # Loading and Creation from Filesystem (Used by archivebox update ONLY)
+    # =========================================================================
+
+    @classmethod
+    def load_from_directory(cls, snapshot_dir: Path) -> Optional['Snapshot']:
+        """
+        Load existing Snapshot from DB by reading index.json.
+
+        Reads index.json, extracts url+timestamp, queries DB.
+        Returns existing Snapshot or None if not found/invalid.
+        Does NOT create new snapshots.
+
+        ONLY used by: archivebox update (for orphan detection)
+        """
+        import json
+
+        index_path = snapshot_dir / 'index.json'
+        if not index_path.exists():
+            return None
+
+        try:
+            with open(index_path) as f:
+                data = json.load(f)
+        except (OSError, json.JSONDecodeError):
+            return None
+
+        url = data.get('url')
+        if not url:
+            return None
+
+        # Get timestamp - prefer index.json, fallback to folder name
+        timestamp = cls._select_best_timestamp(
+            index_timestamp=data.get('timestamp'),
+            folder_name=snapshot_dir.name
+        )
+
+        if not timestamp:
+            return None
+
+        # Look up existing
+        try:
+            return cls.objects.get(url=url, timestamp=timestamp)
+        except cls.DoesNotExist:
+            return None
+        except cls.MultipleObjectsReturned:
+            # Should not happen with unique constraint
+            return cls.objects.filter(url=url, timestamp=timestamp).first()
+
+    @classmethod
+    def create_from_directory(cls, snapshot_dir: Path) -> Optional['Snapshot']:
+        """
+        Create new Snapshot from orphaned directory.
+
+        Validates timestamp, ensures uniqueness.
+        Returns new UNSAVED Snapshot or None if invalid.
+
+        ONLY used by: archivebox update (for orphan import)
+        """
+        import json
+
+        index_path = snapshot_dir / 'index.json'
+        if not index_path.exists():
+            return None
+
+        try:
+            with open(index_path) as f:
+                data = json.load(f)
+        except (OSError, json.JSONDecodeError):
+            return None
+
+        url = data.get('url')
+        if not url:
+            return None
+
+        # Get and validate timestamp
+        timestamp = cls._select_best_timestamp(
+            index_timestamp=data.get('timestamp'),
+            folder_name=snapshot_dir.name
+        )
+
+        if not timestamp:
+            return None
+
+        # Ensure uniqueness (reuses existing logic from create_or_update_from_dict)
+        timestamp = cls._ensure_unique_timestamp(url, timestamp)
+
+        # Detect version
+        fs_version = cls._detect_fs_version_from_index(data)
+
+        # NOTE: Snapshot has no created_by field (ownership lives on the Crawl),
+        # and crawl is non-nullable - the caller must attach a crawl before saving.
+        return cls(
+            url=url,
+            timestamp=timestamp,
+            title=data.get('title', ''),
+            fs_version=fs_version,
+        )
+
+    @staticmethod
+    def _select_best_timestamp(index_timestamp: Optional[str], folder_name: str) -> Optional[str]:
+        """
+        Select best timestamp from index.json vs folder name.
+
+        Validates range (1995-2035).
+        Prefers index.json if valid.
+        """
+        def is_valid_timestamp(ts):
+            try:
+                ts_int = int(float(ts))
+                # 1995-01-01 to 2035-12-31
+                return 788918400 <= ts_int <= 2082758400
+            except (TypeError, ValueError):
+                return False
+
+        index_valid = is_valid_timestamp(index_timestamp) if index_timestamp else False
+        folder_valid = is_valid_timestamp(folder_name)
+
+        if index_valid:
+            return str(int(float(index_timestamp)))
+        elif folder_valid:
+            return str(int(float(folder_name)))
+        else:
+            return None
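+    # e.g. _select_best_timestamp('1736366400.5', '9999') -> '1736366400':
+    # the folder name fails the 1995-2035 range check, so index.json wins
+    # (and fractional seconds are truncated).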
+
+    @classmethod
+    def _ensure_unique_timestamp(cls, url: str, timestamp: str) -> str:
+        """
+        Ensure timestamp is globally unique.
+        If collision with different URL, increment by 1 until unique.
+
+        NOTE: Mirrors the collision handling in create_or_update_from_dict;
+        this is just an extracted, reusable version.
+        """
+        while cls.objects.filter(timestamp=timestamp).exclude(url=url).exists():
+            timestamp = str(int(float(timestamp)) + 1)
+        return timestamp
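+    # e.g. if '1736366400' is already taken by a different URL, this returns
+    # '1736366401', '1736366402', ... until a free timestamp is found.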
+
+    @staticmethod
+    def _detect_fs_version_from_index(data: dict) -> str:
+        """
+        Detect fs_version from index.json structure.
+
+        - Has fs_version field: use it
+        - Has history dict: 0.7.0
+        - Has archive_results list: 0.8.0
+        - Default: 0.7.0
+        """
+        if 'fs_version' in data:
+            return data['fs_version']
+        if 'history' in data and 'archive_results' not in data:
+            return '0.7.0'
+        if 'archive_results' in data:
+            return '0.8.0'
+        return '0.7.0'
+
+    # =========================================================================
+    # Index.json Reconciliation
+    # =========================================================================
+
+    def reconcile_with_index_json(self):
+        """
+        Merge index.json with DB. DB is source of truth.
+
+        - Title: longest non-URL
+        - Tags: union
+        - ArchiveResults: keep both (by plugin+start_ts)
+
+        Writes back in 0.9.x format.
+
+        Used by: archivebox update (to sync index.json with DB)
+        """
+        import json
+
+        index_path = Path(self.output_dir) / 'index.json'
+
+        index_data = {}
+        if index_path.exists():
+            try:
+                with open(index_path) as f:
+                    index_data = json.load(f)
+            except (OSError, json.JSONDecodeError):
+                pass
+
+        # Merge title
+        self._merge_title_from_index(index_data)
+
+        # Merge tags
+        self._merge_tags_from_index(index_data)
+
+        # Merge ArchiveResults
+        self._merge_archive_results_from_index(index_data)
+
+        # Write back
+        self.write_index_json()
+
+    def _merge_title_from_index(self, index_data: dict):
+        """Merge title - prefer longest non-URL title."""
+        index_title = (index_data.get('title') or '').strip()  # tolerate explicit null titles
+        db_title = self.title or ''
+
+        candidates = [t for t in [index_title, db_title] if t and t != self.url]
+        if candidates:
+            best_title = max(candidates, key=len)
+            if self.title != best_title:
+                self.title = best_title
+
+    def _merge_tags_from_index(self, index_data: dict):
+        """Merge tags - union of both sources."""
+        from django.db import transaction
+
+        index_tags = set(index_data.get('tags', '').split(',')) if index_data.get('tags') else set()
+        index_tags = {t.strip() for t in index_tags if t.strip()}
+
+        db_tags = set(self.tags.values_list('name', flat=True))
+
+        new_tags = index_tags - db_tags
+        if new_tags:
+            with transaction.atomic():
+                for tag_name in new_tags:
+                    tag, _ = Tag.objects.get_or_create(name=tag_name)
+                    self.tags.add(tag)
+
+    def _merge_archive_results_from_index(self, index_data: dict):
+        """Merge ArchiveResults - keep both (by plugin+start_ts)."""
+        existing = {
+            (ar.plugin, ar.start_ts): ar
+            for ar in ArchiveResult.objects.filter(snapshot=self)
+        }
+
+        # Handle 0.8.x format (archive_results list)
+        for result_data in index_data.get('archive_results', []):
+            self._create_archive_result_if_missing(result_data, existing)
+
+        # Handle 0.7.x format (history dict)
+        if 'history' in index_data and isinstance(index_data['history'], dict):
+            for plugin, result_list in index_data['history'].items():
+                if isinstance(result_list, list):
+                    for result_data in result_list:
+                        # Support both old 'extractor' and new 'plugin' keys for backwards compat
+                        result_data['plugin'] = result_data.get('plugin') or result_data.get('extractor') or plugin
+                        self._create_archive_result_if_missing(result_data, existing)
+
+    def _create_archive_result_if_missing(self, result_data: dict, existing: dict):
+        """Create ArchiveResult if not already in DB."""
+        from dateutil import parser
+
+        # Support both old 'extractor' and new 'plugin' keys for backwards compat
+        plugin = result_data.get('plugin') or result_data.get('extractor', '')
+        if not plugin:
+            return
+
+        start_ts = None
+        if result_data.get('start_ts'):
+            try:
+                start_ts = parser.parse(result_data['start_ts'])
+            except (ValueError, OverflowError, TypeError):
+                pass
+
+        if (plugin, start_ts) in existing:
+            return
+
+        try:
+            end_ts = None
+            if result_data.get('end_ts'):
+                try:
+                    end_ts = parser.parse(result_data['end_ts'])
+                except (ValueError, OverflowError, TypeError):
+                    pass
+
+            ArchiveResult.objects.create(
+                snapshot=self,
+                plugin=plugin,
+                hook_name=result_data.get('hook_name', ''),
+                status=result_data.get('status', 'failed'),
+                output_str=result_data.get('output', ''),
+                cmd=result_data.get('cmd', []),
+                pwd=result_data.get('pwd', str(self.output_dir)),
+                start_ts=start_ts,
+                end_ts=end_ts,
+                # NOTE: no created_by kwarg - the field was removed from ArchiveResult,
+                # ownership is tracked via snapshot.crawl.created_by instead
+            )
+        except Exception:
+            # Skip malformed legacy records rather than aborting the whole merge
+            pass
+
+    def write_index_json(self):
+        """Write index.json in 0.9.x format."""
+        import json
+
+        index_path = Path(self.output_dir) / 'index.json'
+
+        data = {
+            'url': self.url,
+            'timestamp': self.timestamp,
+            'title': self.title or '',
+            'tags': ','.join(sorted(self.tags.values_list('name', flat=True))),
+            'fs_version': self.fs_version,
+            'bookmarked_at': self.bookmarked_at.isoformat() if self.bookmarked_at else None,
+            'created_at': self.created_at.isoformat() if self.created_at else None,
+            'archive_results': [
+                {
+                    'plugin': ar.plugin,
+                    'status': ar.status,
+                    'start_ts': ar.start_ts.isoformat() if ar.start_ts else None,
+                    'end_ts': ar.end_ts.isoformat() if ar.end_ts else None,
+                    'output': ar.output_str or '',
+                    'cmd': ar.cmd if isinstance(ar.cmd, list) else [],
+                    'pwd': ar.pwd,
+                }
+                for ar in ArchiveResult.objects.filter(snapshot=self).order_by('start_ts')
+            ],
+        }
+
+        index_path.parent.mkdir(parents=True, exist_ok=True)
+        with open(index_path, 'w') as f:
+            json.dump(data, f, indent=2, sort_keys=True)
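+    # Resulting file shape (illustrative):
+    #   {"url": "https://example.com", "timestamp": "1736366400.0",
+    #    "title": "Example", "tags": "news,tech", "fs_version": "0.9.0",
+    #    "archive_results": [{"plugin": "wget", "status": "succeeded", ...}]}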
+
+    # =========================================================================
+    # Snapshot Utilities
+    # =========================================================================
+
+    @staticmethod
+    def move_directory_to_invalid(snapshot_dir: Path):
+        """
+        Move invalid directory to data/invalid/YYYYMMDD/.
+
+        Used by: archivebox update (when encountering invalid directories)
+        """
+        from datetime import datetime
+        import shutil
+
+        invalid_dir = CONSTANTS.DATA_DIR / 'invalid' / datetime.now().strftime('%Y%m%d')
+        invalid_dir.mkdir(parents=True, exist_ok=True)
+
+        dest = invalid_dir / snapshot_dir.name
+        counter = 1
+        while dest.exists():
+            dest = invalid_dir / f"{snapshot_dir.name}_{counter}"
+            counter += 1
+
+        try:
+            shutil.move(str(snapshot_dir), str(dest))
+        except (OSError, shutil.Error):
+            pass
+
+    @classmethod
+    def find_and_merge_duplicates(cls) -> int:
+        """
+        Find and merge snapshots with same url:timestamp.
+        Returns count of duplicate sets merged.
+
+        Used by: archivebox update (Phase 3: deduplication)
+        """
+        from django.db.models import Count
+
+        duplicates = (
+            cls.objects
+            .values('url', 'timestamp')
+            .annotate(count=Count('id'))
+            .filter(count__gt=1)
+        )
+
+        merged = 0
+        for dup in duplicates.iterator():
+            snapshots = list(
+                cls.objects
+                .filter(url=dup['url'], timestamp=dup['timestamp'])
+                .order_by('created_at')  # Keep oldest
+            )
+
+            if len(snapshots) > 1:
+                try:
+                    cls._merge_snapshots(snapshots)
+                    merged += 1
+                except Exception:
+                    pass
+
+        return merged
+
+    @classmethod
+    def _merge_snapshots(cls, snapshots: list['Snapshot']):
+        """
+        Merge exact duplicates.
+        Keep oldest, union files + ArchiveResults.
+        """
+        import shutil
+
+        keeper = snapshots[0]
+        duplicates = snapshots[1:]
+
+        keeper_dir = Path(keeper.output_dir)
+
+        for dup in duplicates:
+            dup_dir = Path(dup.output_dir)
+
+            # Merge files
+            if dup_dir.exists() and dup_dir != keeper_dir:
+                for dup_file in dup_dir.rglob('*'):
+                    if not dup_file.is_file():
+                        continue
+
+                    rel = dup_file.relative_to(dup_dir)
+                    keeper_file = keeper_dir / rel
+
+                    if not keeper_file.exists():
+                        keeper_file.parent.mkdir(parents=True, exist_ok=True)
+                        shutil.copy2(dup_file, keeper_file)
+
+                try:
+                    shutil.rmtree(dup_dir)
+                except OSError:
+                    pass
+
+            # Merge tags
+            for tag in dup.tags.all():
+                keeper.tags.add(tag)
+
+            # Move ArchiveResults
+            ArchiveResult.objects.filter(snapshot=dup).update(snapshot=keeper)
+
+            # Delete
+            dup.delete()
+
+    # =========================================================================
+    # Output Directory Properties
+    # =========================================================================
+
+    @property
+    def output_dir_parent(self) -> str:
+        return 'archive'
+
+    @property
+    def output_dir_name(self) -> str:
+        return str(self.timestamp)
+
+    def archive(self, overwrite=False, methods=None):
+        return bg_archive_snapshot(self, overwrite=overwrite, methods=methods)
+
+    @admin.display(description='Tags')
+    def tags_str(self, nocache=True) -> str | None:
+        calc_tags_str = lambda: ','.join(sorted(tag.name for tag in self.tags.all()))
+        if hasattr(self, '_prefetched_objects_cache') and 'tags' in self._prefetched_objects_cache:
+            return calc_tags_str()
+        cache_key = f'{self.pk}-tags'
+        return cache.get_or_set(cache_key, calc_tags_str) if not nocache else calc_tags_str()
+
+    def icons(self) -> str:
+        """Generate HTML icons showing which extractor plugins have succeeded for this snapshot"""
+        from django.utils.html import format_html, mark_safe
+
+        cache_key = f'result_icons:{self.pk}:{(self.downloaded_at or self.modified_at or self.created_at or self.bookmarked_at).timestamp()}'
+
+        def calc_icons():
+            if hasattr(self, '_prefetched_objects_cache') and 'archiveresult_set' in self._prefetched_objects_cache:
+                archive_results = {r.plugin: r for r in self.archiveresult_set.all() if r.status == "succeeded" and (r.output_files or r.output_str)}
+            else:
+                # Filter for results that have either output_files or output_str
+                from django.db.models import Q
+                archive_results = {r.plugin: r for r in self.archiveresult_set.filter(
+                    Q(status="succeeded") & (Q(output_files__isnull=False) | ~Q(output_str=''))
+                )}
+
+            path = self.archive_path
+            canon = self.canonical_outputs()
+            output = ""
+            output_template = '<a href="/{}/{}" class="exists-{}" title="{}">{}</a> &nbsp;'
+
+            # Get all plugins from hooks system (sorted by numeric prefix)
+            all_plugins = [get_plugin_name(e) for e in get_plugins()]
+
+            for plugin in all_plugins:
+                result = archive_results.get(plugin)
+                existing = result and result.status == 'succeeded' and (result.output_files or result.output_str)
+                icon = get_plugin_icon(plugin)
+                output += format_html(
+                    output_template,
+                    path,
+                    canon.get(plugin, plugin + '/'),
+                    str(bool(existing)),
+                    plugin,
+                    icon
+                )
+
+            return format_html('<span class="files-icons" style="font-size: 1.1em; opacity: 0.8; min-width: 240px; display: inline-block">{}</span>', mark_safe(output))
+
+        cache_result = cache.get(cache_key)
+        if cache_result:
+            return cache_result
+
+        fresh_result = calc_icons()
+        cache.set(cache_key, fresh_result, timeout=60 * 60 * 24)
+        return fresh_result
+
+    @property
+    def api_url(self) -> str:
+        return reverse_lazy('api-1:get_snapshot', args=[self.id])
+
+    def get_absolute_url(self):
+        return f'/{self.archive_path}'
+
+    @cached_property
+    def domain(self) -> str:
+        return url_domain(self.url)
+
+    @cached_property
+    def output_dir(self):
+        """The filesystem path to the snapshot's output directory."""
+
+        current_path = self.get_storage_path_for_version(self.fs_version)
+
+        if current_path.exists():
+            return str(current_path)
+
+        # Check for backwards-compat symlink
+        old_path = CONSTANTS.ARCHIVE_DIR / self.timestamp
+        if old_path.is_symlink():
+            return str(old_path.resolve())  # resolves relative symlink targets correctly
+        elif old_path.exists():
+            return str(old_path)
+
+        return str(current_path)
+
+    @cached_property
+    def archive_path(self):
+        return f'{CONSTANTS.ARCHIVE_DIR_NAME}/{self.timestamp}'
+
+    @cached_property
+    def archive_size(self):
+        try:
+            return get_dir_size(self.output_dir)[0]
+        except Exception:
+            return 0
+
+    def save_tags(self, tags: Iterable[str] = ()) -> None:
+        tags_id = [Tag.objects.get_or_create(name=tag)[0].pk for tag in tags if tag.strip()]
+        self.tags.clear()
+        self.tags.add(*tags_id)
+
+    def pending_archiveresults(self) -> QuerySet['ArchiveResult']:
+        return self.archiveresult_set.exclude(status__in=ArchiveResult.FINAL_OR_ACTIVE_STATES)
+
+    def run(self) -> list['ArchiveResult']:
+        """
+        Execute snapshot by creating pending ArchiveResults for all enabled hooks.
+
+        Called by: SnapshotMachine.enter_started()
+
+        Hook Lifecycle:
+            1. discover_hooks('Snapshot') → finds all plugin hooks
+            2. For each hook:
+               - Create ArchiveResult with status=QUEUED
+               - Store hook_name (e.g., 'on_Snapshot__50_wget.py')
+            3. ArchiveResults execute independently via ArchiveResultMachine
+            4. Hook execution happens in ArchiveResult.run(), NOT here
+
+        Returns:
+            list[ArchiveResult]: Newly created pending results
+        """
+        return self.create_pending_archiveresults()
+
+    def cleanup(self):
+        """
+        Clean up background ArchiveResult hooks.
+
+        Called by the state machine when entering the 'sealed' state.
+        Kills any background hooks and finalizes their ArchiveResults.
+        """
+        from archivebox.hooks import kill_process
+
+        # Kill any background ArchiveResult hooks
+        if not self.OUTPUT_DIR.exists():
+            return
+
+        # Find all .pid files in this snapshot's output directory
+        for pid_file in self.OUTPUT_DIR.glob('**/*.pid'):
+            kill_process(pid_file, validate=True)
+
+        # Update all STARTED ArchiveResults from filesystem
+        results = self.archiveresult_set.filter(status=ArchiveResult.StatusChoices.STARTED)
+        for ar in results:
+            ar.update_from_output()
+
+    def has_running_background_hooks(self) -> bool:
+        """
+        Check if any ArchiveResult background hooks are still running.
+
+        Used by state machine to determine if snapshot is finished.
+        """
+        from archivebox.hooks import process_is_alive
+
+        if not self.OUTPUT_DIR.exists():
+            return False
+
+        for plugin_dir in self.OUTPUT_DIR.iterdir():
+            if not plugin_dir.is_dir():
+                continue
+            pid_file = plugin_dir / 'hook.pid'
+            if process_is_alive(pid_file):
+                return True
+
+        return False
+
+    @staticmethod
+    def from_jsonl(record: Dict[str, Any], overrides: Optional[Dict[str, Any]] = None, queue_for_extraction: bool = True):
+        """
+        Create/update Snapshot from JSONL record or dict.
+
+        Unified method that handles:
+        - ID-based patching: {"id": "...", "title": "new title"}
+        - URL-based create/update: {"url": "...", "title": "...", "tags": "..."}
+        - Auto-creates Crawl if not provided
+        - Optionally queues for extraction
+
+        Args:
+            record: Dict with 'url' (for create) or 'id' (for patch), plus other fields
+            overrides: Dict with 'crawl', 'snapshot' (parent), 'created_by_id'
+            queue_for_extraction: If True, sets status=QUEUED and retry_at (default: True)
+
+        Returns:
+            Snapshot instance or None
+        """
+        import re
+        from django.utils import timezone
+        from archivebox.misc.util import parse_date
+        from archivebox.base_models.models import get_or_create_system_user_pk
+        from archivebox.config.common import GENERAL_CONFIG
+
+        overrides = overrides or {}
+
+        # If 'id' is provided, lookup and patch that specific snapshot
+        snapshot_id = record.get('id')
+        if snapshot_id:
+            try:
+                snapshot = Snapshot.objects.get(id=snapshot_id)
+
+                # Generically update all fields present in record
+                update_fields = []
+                for field_name, value in record.items():
+                    # Skip internal fields
+                    if field_name in ('id', 'type'):
+                        continue
+
+                    # Skip if field doesn't exist on model
+                    if not hasattr(snapshot, field_name):
+                        continue
+
+                    # Special parsing for date fields
+                    if field_name in ('bookmarked_at', 'retry_at', 'created_at', 'modified_at'):
+                        if value and isinstance(value, str):
+                            value = parse_date(value)
+
+                    # Update field if value is provided and different
+                    if value is not None and getattr(snapshot, field_name) != value:
+                        setattr(snapshot, field_name, value)
+                        update_fields.append(field_name)
+
+                if update_fields:
+                    snapshot.save(update_fields=update_fields + ['modified_at'])
+
+                return snapshot
+            except Snapshot.DoesNotExist:
+                # ID not found, fall through to create-by-URL logic
+                pass
+
+        url = record.get('url')
+        if not url:
+            return None
+
+        # Determine or create crawl (every snapshot must have a crawl)
+        crawl = overrides.get('crawl')
+        parent_snapshot = overrides.get('snapshot')  # Parent snapshot
+        created_by_id = overrides.get('created_by_id') or (parent_snapshot.crawl.created_by_id if parent_snapshot else None) or get_or_create_system_user_pk()
+
+        # If no crawl provided, inherit from parent or auto-create one
+        if not crawl:
+            if parent_snapshot:
+                # Inherit crawl from parent snapshot
+                crawl = parent_snapshot.crawl
+            else:
+                # Auto-create a single-URL crawl
+                from archivebox.crawls.models import Crawl
+                from archivebox.config import CONSTANTS
+
+                timestamp_str = timezone.now().strftime("%Y-%m-%d__%H-%M-%S")
+                sources_file = CONSTANTS.SOURCES_DIR / f'{timestamp_str}__auto_crawl.txt'
+                sources_file.parent.mkdir(parents=True, exist_ok=True)
+                sources_file.write_text(url)
+
+                crawl = Crawl.objects.create(
+                    urls=url,
+                    max_depth=0,
+                    label=f'auto-created for {url[:50]}',
+                    created_by_id=created_by_id,
+                )
+
+        # Parse tags
+        tags_str = record.get('tags', '')
+        tag_list = []
+        if tags_str:
+            tag_list = list(dict.fromkeys(
+                tag.strip() for tag in re.split(GENERAL_CONFIG.TAG_SEPARATOR_PATTERN, tags_str)
+                if tag.strip()
+            ))
+
+        # Get most recent snapshot with this URL (URLs can exist in multiple crawls)
+        snapshot = Snapshot.objects.filter(url=url).order_by('-created_at').first()
+
+        title = record.get('title')
+        timestamp = record.get('timestamp')
+
+        if snapshot:
+            # Update existing snapshot
+            if title and (not snapshot.title or len(title) > len(snapshot.title or '')):
+                snapshot.title = title
+                snapshot.save(update_fields=['title', 'modified_at'])
+        else:
+            # Create new snapshot
+            if timestamp:
+                while Snapshot.objects.filter(timestamp=timestamp).exists():
+                    timestamp = str(float(timestamp) + 1.0)
+
+            snapshot = Snapshot.objects.create(
+                url=url,
+                timestamp=timestamp,
+                title=title,
+                crawl=crawl,
+            )
+
+        # Update tags
+        if tag_list:
+            existing_tags = set(snapshot.tags.values_list('name', flat=True))
+            new_tags = set(tag_list) | existing_tags
+            snapshot.save_tags(new_tags)
+
+        # Queue for extraction and update additional fields
+        update_fields = []
+
+        if queue_for_extraction:
+            snapshot.status = Snapshot.StatusChoices.QUEUED
+            snapshot.retry_at = timezone.now()
+            update_fields.extend(['status', 'retry_at'])
+
+        # Update additional fields if provided
+        for field_name in ('depth', 'parent_snapshot_id', 'crawl_id', 'bookmarked_at'):
+            value = record.get(field_name)
+            if value is not None and getattr(snapshot, field_name) != value:
+                setattr(snapshot, field_name, value)
+                update_fields.append(field_name)
+
+        if update_fields:
+            snapshot.save(update_fields=update_fields + ['modified_at'])
+
+        return snapshot
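+    # Illustrative records (values are examples):
+    #   {"url": "https://example.com", "title": "Example", "tags": "news,tech"}
+    #     -> creates (or updates) a Snapshot, auto-creating a single-URL Crawl
+    #   {"id": "<existing snapshot uuid>", "title": "Patched title"}
+    #     -> patches only the provided fields on that Snapshot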
+
+    def create_pending_archiveresults(self) -> list['ArchiveResult']:
+        """
+        Create ArchiveResult records for all enabled hooks.
+
+        Uses the hooks system to discover available hooks from:
+        - archivebox/plugins/*/on_Snapshot__*.{py,sh,js}
+        - data/plugins/*/on_Snapshot__*.{py,sh,js}
+
+        Creates one ArchiveResult per hook (not per plugin), with hook_name set.
+        This enables step-based execution where all hooks in a step can run in parallel.
+        """
+        from archivebox.hooks import discover_hooks
+
+        hooks = discover_hooks('Snapshot')
+        archiveresults = []
+
+        for hook_path in hooks:
+            hook_name = hook_path.name  # e.g., 'on_Snapshot__50_wget.py'
+            plugin = hook_path.parent.name  # e.g., 'wget'
+
+            # Check if AR already exists for this specific hook
+            if ArchiveResult.objects.filter(snapshot=self, hook_name=hook_name).exists():
+                continue
+
+            archiveresult, created = ArchiveResult.objects.get_or_create(
+                snapshot=self,
+                hook_name=hook_name,
+                defaults={
+                    'plugin': plugin,
+                    'status': ArchiveResult.INITIAL_STATE,
+                    'retry_at': timezone.now(),
+                    # NOTE: no created_by_id here - the field was removed from
+                    # ArchiveResult; ownership is tracked via snapshot.crawl
+                },
+            )
+            if archiveresult.status == ArchiveResult.INITIAL_STATE:
+                archiveresults.append(archiveresult)
+
+        return archiveresults
+
+    def advance_step_if_ready(self) -> bool:
+        """
+        Advance current_step if all foreground hooks in current step are finished.
+
+        Called by the state machine to check if step can advance.
+        Background hooks (.bg) don't block step advancement.
+
+        Step advancement rules:
+        - All foreground ARs in current step must be finished (SUCCEEDED/FAILED/SKIPPED)
+        - Background ARs (hook_name contains '.bg.') are ignored for advancement
+        - When ready, increments current_step by 1 (up to 9)
+
+        Returns:
+            True if step was advanced, False if not ready or already at step 9.
+        """
+        from archivebox.hooks import extract_step, is_background_hook
+
+        if self.current_step >= 9:
+            return False  # Already at final step
+
+        # Get all ARs for current step that are foreground
+        current_step_ars = self.archiveresult_set.filter(
+            hook_name__isnull=False
+        ).exclude(hook_name='')
+
+        # Check each AR in current step
+        for ar in current_step_ars:
+            ar_step = extract_step(ar.hook_name)
+            if ar_step != self.current_step:
+                continue  # Not in current step
+
+            if is_background_hook(ar.hook_name):
+                continue  # Background hooks don't block
+
+            # Foreground hook in current step - check if finished
+            if ar.status not in ArchiveResult.FINAL_OR_ACTIVE_STATES:
+                # Still pending/queued - can't advance
+                return False
+
+            if ar.status == ArchiveResult.StatusChoices.STARTED:
+                # Still running - can't advance
+                return False
+
+        # All foreground hooks in current step are finished - advance!
+        self.current_step += 1
+        self.save(update_fields=['current_step', 'modified_at'])
+        return True
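+    # Illustrative walkthrough (assuming extract_step() maps a hook's numeric
+    # prefix to a 0-9 step, e.g. 'on_Snapshot__50_wget.py' -> step 5): once every
+    # foreground step-5 hook reaches a final state, current_step advances to 6;
+    # a background hook like 'on_Snapshot__51_media.bg.py' never blocks the advance.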
+
+    def is_finished_processing(self) -> bool:
+        """
+        Check if this snapshot has finished processing.
+
+        Used by SnapshotMachine.is_finished() to determine if snapshot is complete.
+
+        Returns:
+            True if all archiveresults are finished (or no work to do), False otherwise.
+        """
+        # if no archiveresults exist yet, it's not finished
+        if not self.archiveresult_set.exists():
+            return False
+
+        # Try to advance step if ready (handles step-based hook execution)
+        # This will increment current_step when all foreground hooks in current step are done
+        while self.advance_step_if_ready():
+            pass  # Keep advancing until we can't anymore
+
+        # if archiveresults exist but are still pending, it's not finished
+        if self.pending_archiveresults().exists():
+            return False
+
+        # Don't wait for background hooks - they'll be cleaned up on entering sealed state
+        # Background hooks in STARTED state are excluded by pending_archiveresults()
+        # (STARTED is in FINAL_OR_ACTIVE_STATES) so once all results are FINAL or ACTIVE,
+        # we can transition to sealed and cleanup() will kill the background hooks
+
+        # otherwise archiveresults exist and are all finished, so it's finished
+        return True
+
+    def retry_failed_archiveresults(self, retry_at: Optional['timezone.datetime'] = None) -> int:
+        """
+        Reset failed/skipped ArchiveResults to queued for retry.
+
+        This enables seamless retry of the entire extraction pipeline:
+        - Resets FAILED and SKIPPED results to QUEUED
+        - Sets retry_at so workers pick them up
+        - Plugins run in order (numeric prefix)
+        - Each plugin checks its dependencies at runtime
+
+        Dependency handling (e.g., chrome_session → screenshot):
+        - Plugins check if required outputs exist before running
+        - If dependency output missing → plugin returns 'skipped'
+        - On retry, if dependency now succeeds → dependent can run
+
+        Returns count of ArchiveResults reset.
+        """
+        retry_at = retry_at or timezone.now()
+
+        count = self.archiveresult_set.filter(
+            status__in=[
+                ArchiveResult.StatusChoices.FAILED,
+                ArchiveResult.StatusChoices.SKIPPED,
+            ]
+        ).update(
+            status=ArchiveResult.StatusChoices.QUEUED,
+            retry_at=retry_at,
+            output=None,
+            start_ts=None,
+            end_ts=None,
+        )
+
+        # Also reset the snapshot and current_step so it gets re-checked from the beginning
+        if count > 0:
+            self.status = self.StatusChoices.STARTED
+            self.retry_at = retry_at
+            self.current_step = 0  # Reset to step 0 for retry
+            self.save(update_fields=['status', 'retry_at', 'current_step', 'modified_at'])
+
+        return count
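+    # e.g. snapshot.retry_failed_archiveresults() re-queues every FAILED/SKIPPED
+    # result and rewinds current_step to 0, so a dependent plugin that was skipped
+    # gets another chance once its dependency succeeds on the retry pass.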
+
+    # =========================================================================
+    # URL Helper Properties (migrated from Link schema)
+    # =========================================================================
+
+    @cached_property
+    def url_hash(self) -> str:
+        from hashlib import sha256
+        return sha256(self.url.encode()).hexdigest()[:8]
+
+    @cached_property
+    def scheme(self) -> str:
+        return self.url.split('://')[0]
+
+    @cached_property
+    def path(self) -> str:
+        parts = self.url.split('://', 1)
+        return '/' + parts[1].split('/', 1)[1] if len(parts) > 1 and '/' in parts[1] else '/'
+
+    @cached_property
+    def basename(self) -> str:
+        return self.path.split('/')[-1]
+
+    @cached_property
+    def extension(self) -> str:
+        basename = self.basename
+        return basename.split('.')[-1] if '.' in basename else ''
+
+    @cached_property
+    def base_url(self) -> str:
+        return f'{self.scheme}://{self.domain}'
+
+    @cached_property
+    def is_static(self) -> bool:
+        static_extensions = {'.pdf', '.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg', '.mp4', '.mp3', '.wav', '.webm'}
+        return any(self.url.lower().endswith(ext) for ext in static_extensions)
+
+    @cached_property
+    def is_archived(self) -> bool:
+        output_paths = (
+            self.domain,
+            'output.html',
+            'output.pdf',
+            'screenshot.png',
+            'singlefile.html',
+            'readability/content.html',
+            'mercury/content.html',
+            'htmltotext.txt',
+            'media',
+            'git',
+        )
+        return any((Path(self.output_dir) / path).exists() for path in output_paths)
+
+    # =========================================================================
+    # Date/Time Properties (migrated from Link schema)
+    # =========================================================================
+
+    @cached_property
+    def bookmarked_date(self) -> Optional[str]:
+        max_ts = (timezone.now() + timedelta(days=30)).timestamp()
+        if self.timestamp and self.timestamp.replace('.', '').isdigit():
+            if 0 < float(self.timestamp) < max_ts:
+                return self._ts_to_date_str(datetime.fromtimestamp(float(self.timestamp)))
+            return str(self.timestamp)
+        return None
+
+    @cached_property
+    def downloaded_datestr(self) -> Optional[str]:
+        return self._ts_to_date_str(self.downloaded_at) if self.downloaded_at else None
+
+    @cached_property
+    def archive_dates(self) -> List[datetime]:
+        return [
+            result.start_ts
+            for result in self.archiveresult_set.all()
+            if result.start_ts
+        ]
+
+    @cached_property
+    def oldest_archive_date(self) -> Optional[datetime]:
+        dates = self.archive_dates
+        return min(dates) if dates else None
+
+    @cached_property
+    def newest_archive_date(self) -> Optional[datetime]:
+        dates = self.archive_dates
+        return max(dates) if dates else None
+
+    @cached_property
+    def num_outputs(self) -> int:
+        return self.archiveresult_set.filter(status='succeeded').count()
+
+    @cached_property
+    def num_failures(self) -> int:
+        return self.archiveresult_set.filter(status='failed').count()
+
+    # =========================================================================
+    # Output Path Methods (migrated from Link schema)
+    # =========================================================================
+
+    def canonical_outputs(self) -> Dict[str, Optional[str]]:
+        """
+        Intelligently discover the best output file for each plugin.
+        Uses actual ArchiveResult data and filesystem scanning with smart heuristics.
+        """
+        FAVICON_PROVIDER = 'https://www.google.com/s2/favicons?domain={}'
+
+        # Mimetypes that can be embedded/previewed in an iframe
+        IFRAME_EMBEDDABLE_EXTENSIONS = {
+            'html', 'htm', 'pdf', 'txt', 'md', 'json', 'jsonl',
+            'png', 'jpg', 'jpeg', 'gif', 'webp', 'svg', 'ico',
+            'mp4', 'webm', 'mp3', 'opus', 'ogg', 'wav',
+        }
+
+        MIN_DISPLAY_SIZE = 15_000  # 15KB - filter out tiny files
+        MAX_SCAN_FILES = 50  # Don't scan massive directories
+
+        def find_best_output_in_dir(dir_path: Path, plugin_name: str) -> Optional[str]:
+            """Find the best representative file in a plugin's output directory"""
+            if not dir_path.exists() or not dir_path.is_dir():
+                return None
+
+            candidates = []
+            file_count = 0
+
+            # Special handling for media plugin - look for thumbnails
+            is_media_dir = plugin_name == 'media'
+
+            # Scan for suitable files
+            for file_path in dir_path.rglob('*'):
+                file_count += 1
+                if file_count > MAX_SCAN_FILES:
+                    break
+
+                if file_path.is_dir() or file_path.name.startswith('.'):
+                    continue
+
+                ext = file_path.suffix.lstrip('.').lower()
+                if ext not in IFRAME_EMBEDDABLE_EXTENSIONS:
+                    continue
+
+                try:
+                    size = file_path.stat().st_size
+                except OSError:
+                    continue
+
+                # For media dir, allow smaller image files (thumbnails are often < 15KB)
+                min_size = 5_000 if (is_media_dir and ext in ('png', 'jpg', 'jpeg', 'webp', 'gif')) else MIN_DISPLAY_SIZE
+                if size < min_size:
+                    continue
+
+                # Prefer main files: index.html, output.*, content.*, etc.
+                priority = 0
+                name_lower = file_path.name.lower()
+
+                if is_media_dir:
+                    # Special prioritization for media directories
+                    if any(keyword in name_lower for keyword in ('thumb', 'thumbnail', 'cover', 'poster')):
+                        priority = 200  # Highest priority for thumbnails
+                    elif ext in ('png', 'jpg', 'jpeg', 'webp', 'gif'):
+                        priority = 150  # High priority for any image
+                    elif ext in ('mp4', 'webm', 'mp3', 'opus', 'ogg'):
+                        priority = 100  # Lower priority for actual media files
+                    else:
+                        priority = 50
+                elif 'index' in name_lower:
+                    priority = 100
+                elif name_lower.startswith(('output', 'content', plugin_name)):
+                    priority = 50
+                elif ext in ('html', 'htm', 'pdf'):
+                    priority = 30
+                elif ext in ('png', 'jpg', 'jpeg', 'webp'):
+                    priority = 20
+                else:
+                    priority = 10
+
+                candidates.append((priority, size, file_path))
+
+            if not candidates:
+                return None
+
+            # Sort by priority (desc), then size (desc)
+            candidates.sort(key=lambda x: (x[0], x[1]), reverse=True)
+            best_file = candidates[0][2]
+            return str(best_file.relative_to(Path(self.output_dir)))
+
+        canonical = {
+            'index_path': 'index.html',
+            'google_favicon_path': FAVICON_PROVIDER.format(self.domain),
+            'archive_org_path': f'https://web.archive.org/web/{self.base_url}',
+        }
+
+        # Scan each ArchiveResult's output directory for the best file
+        snap_dir = Path(self.output_dir)
+        for result in self.archiveresult_set.filter(status='succeeded'):
+            if not result.output_files and not result.output_str:
+                continue
+
+            # Try to find the best output file for this plugin
+            plugin_dir = snap_dir / result.plugin
+            best_output = None
+
+            # Check output_files first (new field)
+            if result.output_files:
+                first_file = next(iter(result.output_files.keys()), None)
+                if first_file and (plugin_dir / first_file).exists():
+                    best_output = f'{result.plugin}/{first_file}'
+
+            # Fallback to output_str if it looks like a path
+            if not best_output and result.output_str and (snap_dir / result.output_str).exists():
+                best_output = result.output_str
+
+            if not best_output and plugin_dir.exists():
+                # Scan the plugin's directory for the best representative file
+                best_output = find_best_output_in_dir(plugin_dir, result.plugin)
+
+            if best_output:
+                canonical[f'{result.plugin}_path'] = best_output
+
+        # Also scan top-level for legacy outputs (backwards compatibility)
+        for file_path in snap_dir.glob('*'):
+            if file_path.is_dir() or file_path.name in ('index.html', 'index.json'):
+                continue
+
+            ext = file_path.suffix.lstrip('.').lower()
+            if ext not in IFRAME_EMBEDDABLE_EXTENSIONS:
+                continue
+
+            try:
+                size = file_path.stat().st_size
+                if size >= MIN_DISPLAY_SIZE:
+                    # Add as generic output with stem as key
+                    key = f'{file_path.stem}_path'
+                    if key not in canonical:
+                        canonical[key] = file_path.name
+            except OSError:
+                continue
+
+        if self.is_static:
+            static_path = f'warc/{self.timestamp}'
+            canonical.update({
+                'title': self.basename,
+                'wget_path': static_path,
+            })
+
+        return canonical
+
+    def latest_outputs(self, status: Optional[str] = None) -> Dict[str, Any]:
+        """Get the latest output that each plugin produced"""
+        from archivebox.hooks import get_plugins
+        from django.db.models import Q
+
+        latest: Dict[str, Any] = {}
+        for plugin in get_plugins():
+            results = self.archiveresult_set.filter(plugin=plugin)
+            if status is not None:
+                results = results.filter(status=status)
+            # Filter for results with output_files or output_str
+            results = results.filter(Q(output_files__isnull=False) | ~Q(output_str='')).order_by('-start_ts')
+            result = results.first()
+            # Return embed_path() for backwards compatibility
+            latest[plugin] = result.embed_path() if result else None
+        return latest
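+
+    # Illustrative usage (hypothetical plugins and paths): latest_outputs(status='succeeded')
+    # might return {'wget': 'wget/example.com/index.html', 'screenshot': 'screenshot/screenshot.png',
+    # 'media': None} - one entry per discovered plugin, None where no matching result exists.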
+
+    # =========================================================================
+    # Serialization Methods
+    # =========================================================================
+
+    def to_dict(self, extended: bool = False) -> Dict[str, Any]:
+        """Convert Snapshot to a dictionary (replacement for Link._asdict())"""
+
+        result = {
+            'TYPE': 'core.models.Snapshot',
+            'id': str(self.id),
+            'url': self.url,
+            'timestamp': self.timestamp,
+            'title': self.title,
+            'tags': self.tags_str(),
+            'downloaded_at': self.downloaded_at.isoformat() if self.downloaded_at else None,
+            'bookmarked_at': self.bookmarked_at.isoformat() if self.bookmarked_at else None,
+            'created_at': self.created_at.isoformat() if self.created_at else None,
+            # Computed properties
+            'domain': self.domain,
+            'scheme': self.scheme,
+            'base_url': self.base_url,
+            'path': self.path,
+            'basename': self.basename,
+            'extension': self.extension,
+            'is_static': self.is_static,
+            'is_archived': self.is_archived,
+            'archive_path': self.archive_path,
+            'output_dir': self.output_dir,
+            'link_dir': self.output_dir,  # backwards compatibility alias
+            'archive_size': self.archive_size,
+            'bookmarked_date': self.bookmarked_date,
+            'downloaded_datestr': self.downloaded_datestr,
+            'num_outputs': self.num_outputs,
+            'num_failures': self.num_failures,
+        }
+        if extended:
+            result['canonical'] = self.canonical_outputs()
+        return result
+
+    def to_json(self, indent: int = 4) -> str:
+        """Convert to JSON string"""
+        return to_json(self.to_dict(extended=True), indent=indent)
+
+    def to_csv(self, cols: Optional[List[str]] = None, separator: str = ',', ljust: int = 0) -> str:
+        """Convert to CSV string"""
+        data = self.to_dict()
+        cols = cols or ['timestamp', 'is_archived', 'url']
+        return separator.join(to_json(data.get(col, ''), indent=None).ljust(ljust) for col in cols)
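+
+    # Illustrative (hypothetical values): to_csv(cols=['timestamp', 'url']) returns
+    # '"1544666239.0","https://example.com"' - each value JSON-encoded, joined by the separator.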
+
+    def write_json_details(self, out_dir: Optional[str] = None) -> None:
+        """Write JSON index file for this snapshot to its output directory"""
+        out_dir = out_dir or self.output_dir
+        path = Path(out_dir) / CONSTANTS.JSON_INDEX_FILENAME
+        atomic_write(str(path), self.to_dict(extended=True))
+
+    def write_html_details(self, out_dir: Optional[str] = None) -> None:
+        """Write HTML detail page for this snapshot to its output directory"""
+        from django.template.loader import render_to_string
+        from archivebox.config.common import SERVER_CONFIG
+        from archivebox.config.configset import get_config
+        from archivebox.misc.logging_util import printable_filesize
+
+        out_dir = out_dir or self.output_dir
+        config = get_config()
+        SAVE_ARCHIVE_DOT_ORG = config.get('SAVE_ARCHIVE_DOT_ORG', True)
+        TITLE_LOADING_MSG = 'Not yet archived...'
+
+        canonical = self.canonical_outputs()
+        context = {
+            **self.to_dict(extended=True),
+            **canonical,
+            'canonical': canonical,
+            'title': htmlencode(self.title or (self.base_url if self.is_archived else TITLE_LOADING_MSG)),
+            'url_str': htmlencode(urldecode(self.base_url)),
+            'archive_url': urlencode(f'warc/{self.timestamp}'),
+            'extension': self.extension or 'html',
+            'tags': self.tags_str() or 'untagged',
+            'size': printable_filesize(self.archive_size) if self.archive_size else 'pending',
+            'status': 'archived' if self.is_archived else 'not yet archived',
+            'status_color': 'success' if self.is_archived else 'danger',
+            'oldest_archive_date': self._ts_to_date_str(self.oldest_archive_date),
+            'SAVE_ARCHIVE_DOT_ORG': SAVE_ARCHIVE_DOT_ORG,
+            'PREVIEW_ORIGINALS': SERVER_CONFIG.PREVIEW_ORIGINALS,
+        }
+        rendered_html = render_to_string('snapshot.html', context)
+        atomic_write(str(Path(out_dir) / CONSTANTS.HTML_INDEX_FILENAME), rendered_html)
+
+    # =========================================================================
+    # Helper Methods
+    # =========================================================================
+
+    @staticmethod
+    def _ts_to_date_str(dt: Optional[datetime]) -> Optional[str]:
+        return dt.strftime('%Y-%m-%d %H:%M:%S') if dt else None
+
+
+# =============================================================================
+# Snapshot State Machine
+# =============================================================================
+
+class SnapshotMachine(BaseStateMachine, strict_states=True):
+    """
+    State machine for managing Snapshot lifecycle.
+
+    Hook Lifecycle:
+    ┌─────────────────────────────────────────────────────────────┐
+    │ QUEUED State                                                │
+    │  • Waiting for snapshot to be ready                         │
+    └─────────────────────────────────────────────────────────────┘
+                            ↓ tick() when can_start()
+    ┌─────────────────────────────────────────────────────────────┐
+    │ STARTED State → enter_started()                             │
+    │  1. snapshot.run()                                          │
+    │     • discover_hooks('Snapshot') → finds all plugin hooks   │
+    │     • create_pending_archiveresults() → creates ONE         │
+    │       ArchiveResult per hook (NO execution yet)             │
+    │  2. ArchiveResults process independently with their own     │
+    │     state machines (see ArchiveResultMachine)               │
+    │  3. Advance through steps 0-9 as foreground hooks complete  │
+    └─────────────────────────────────────────────────────────────┘
+                            ↓ tick() when is_finished()
+    ┌─────────────────────────────────────────────────────────────┐
+    │ SEALED State → enter_sealed()                               │
+    │  • cleanup() → kills any background hooks still running     │
+    │  • Set retry_at=None (no more processing)                   │
+    └─────────────────────────────────────────────────────────────┘
+
+    https://github.com/ArchiveBox/ArchiveBox/wiki/ArchiveBox-Architecture-Diagrams
+    """
+
+    model_attr_name = 'snapshot'
+
+    # States
+    queued = State(value=Snapshot.StatusChoices.QUEUED, initial=True)
+    started = State(value=Snapshot.StatusChoices.STARTED)
+    sealed = State(value=Snapshot.StatusChoices.SEALED, final=True)
+
+    # Tick Event
+    tick = (
+        queued.to.itself(unless='can_start') |
+        queued.to(started, cond='can_start') |
+        started.to.itself(unless='is_finished') |
+        started.to(sealed, cond='is_finished')
+    )
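+
+    # Illustrative driver (hypothetical worker pseudocode, assuming BaseStateMachine
+    # takes the model instance):
+    #   machine = SnapshotMachine(snapshot)
+    #   machine.tick()  # stays queued until can_start(), runs started until is_finished(), then seals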
+
+    def can_start(self) -> bool:
+        can_start = bool(self.snapshot.url)
+        # Suppressed: queue waiting logs
+        return can_start
+
+    def is_finished(self) -> bool:
+        """Check if snapshot processing is complete - delegates to model method."""
+        return self.snapshot.is_finished_processing()
+
+    @queued.enter
+    def enter_queued(self):
+        # Suppressed: state transition logs
+        self.snapshot.update_and_requeue(
+            retry_at=timezone.now(),
+            status=Snapshot.StatusChoices.QUEUED,
+        )
+
+    @started.enter
+    def enter_started(self):
+        # Suppressed: state transition logs
+        # lock the snapshot while we create the pending archiveresults
+        self.snapshot.update_and_requeue(
+            retry_at=timezone.now() + timedelta(seconds=30),  # if failed, wait 30s before retrying
+        )
+
+        # Run the snapshot - creates pending archiveresults for all enabled plugins
+        self.snapshot.run()
+
+        # unlock the snapshot after we're done + set status = started
+        self.snapshot.update_and_requeue(
+            retry_at=timezone.now() + timedelta(seconds=5),  # check again in 5s
+            status=Snapshot.StatusChoices.STARTED,
+        )
+
+    @sealed.enter
+    def enter_sealed(self):
+        # Clean up background hooks
+        self.snapshot.cleanup()
+
+        # Suppressed: state transition logs
+        self.snapshot.update_and_requeue(
+            retry_at=None,
+            status=Snapshot.StatusChoices.SEALED,
+        )
+
+
+class ArchiveResultManager(models.Manager):
+    def indexable(self, sorted: bool = True):
+        INDEXABLE_METHODS = [r[0] for r in EXTRACTOR_INDEXING_PRECEDENCE]
+        qs = self.get_queryset().filter(plugin__in=INDEXABLE_METHODS, status='succeeded')
+        if sorted:
+            precedence = [When(plugin=method, then=Value(p)) for method, p in EXTRACTOR_INDEXING_PRECEDENCE]
+            qs = qs.annotate(indexing_precedence=Case(*precedence, default=Value(1000), output_field=IntegerField())).order_by('indexing_precedence')
+        return qs
+
+
+class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, ModelWithStateMachine):
+    class StatusChoices(models.TextChoices):
+        QUEUED = 'queued', 'Queued'
+        STARTED = 'started', 'Started'
+        BACKOFF = 'backoff', 'Waiting to retry'
+        SUCCEEDED = 'succeeded', 'Succeeded'
+        FAILED = 'failed', 'Failed'
+        SKIPPED = 'skipped', 'Skipped'
+
+    @classmethod
+    def get_plugin_choices(cls):
+        """Get plugin choices from discovered hooks (for forms/admin)."""
+        plugins = [get_plugin_name(e) for e in get_plugins()]
+        return tuple((e, e) for e in plugins)
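+
+    # Illustrative: with hypothetical 'wget' and 'screenshot' plugins discovered,
+    # get_plugin_choices() returns (('wget', 'wget'), ('screenshot', 'screenshot')).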
+
+    # Keep AutoField for backward compatibility with 0.7.x databases
+    # UUID field is added separately by migration for new records
+    id = models.AutoField(primary_key=True, editable=False)
+    # Note: unique constraint is added by migration 0027 - don't set unique=True here
+    # or SQLite table recreation in earlier migrations will fail
+    uuid = models.UUIDField(default=uuid7, null=True, blank=True, db_index=True)
+    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False, related_name='archiveresult_set', db_index=True)
+    created_at = models.DateTimeField(default=timezone.now, db_index=True)
+    modified_at = models.DateTimeField(auto_now=True)
+
+    snapshot: Snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE)  # type: ignore
+    # No choices= constraint - plugin names come from plugin system and can be any string
+    plugin = models.CharField(max_length=32, blank=False, null=False, db_index=True)
+    hook_name = models.CharField(max_length=255, blank=True, default='', db_index=True, help_text='Full filename of the hook that executed (e.g., on_Snapshot__50_wget.py)')
+    pwd = models.CharField(max_length=256, default=None, null=True, blank=True)
+    cmd = models.JSONField(default=None, null=True, blank=True)
+    cmd_version = models.CharField(max_length=128, default=None, null=True, blank=True)
+
+    # New output fields (replacing old 'output' field)
+    output_str = models.TextField(blank=True, default='', help_text='Human-readable output summary')
+    output_json = models.JSONField(null=True, blank=True, default=None, help_text='Structured metadata (headers, redirects, etc.)')
+    output_files = models.JSONField(default=dict, help_text='Dict of {relative_path: {metadata}}')
+    output_size = models.BigIntegerField(default=0, help_text='Total bytes of all output files')
+    output_mimetypes = models.CharField(max_length=512, blank=True, default='', help_text='CSV of mimetypes sorted by size')
+
+    # Binary FK (optional - set when hook reports cmd)
+    binary = models.ForeignKey(
+        'machine.Binary',
+        on_delete=models.SET_NULL,
+        null=True, blank=True,
+        related_name='archiveresults',
+        help_text='Primary binary used by this hook'
+    )
+
+    start_ts = models.DateTimeField(default=None, null=True, blank=True)
+    end_ts = models.DateTimeField(default=None, null=True, blank=True)
+
+    status = ModelWithStateMachine.StatusField(choices=StatusChoices.choices, default=StatusChoices.QUEUED)
+    retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)
+    notes = models.TextField(blank=True, null=False, default='')
+    iface = models.ForeignKey(NetworkInterface, on_delete=models.SET_NULL, null=True, blank=True)
+
+    state_machine_name = 'core.models.ArchiveResultMachine'
+    retry_at_field_name = 'retry_at'
+    state_field_name = 'status'
+    active_state = StatusChoices.STARTED
+
+    objects = ArchiveResultManager()
+
+    class Meta(TypedModelMeta):
+        verbose_name = 'Archive Result'
+        verbose_name_plural = 'Archive Results Log'
+
+    def __str__(self):
+        return f'[{self.id}] {self.snapshot.url[:64]} -> {self.plugin}'
+
+    def save(self, *args, **kwargs):
+        is_new = self._state.adding
+        # Skip ModelWithOutputDir.save() to avoid creating index.json in plugin directories
+        # Call the Django Model.save() directly instead
+        models.Model.save(self, *args, **kwargs)
+
+        if is_new:
+            from archivebox.misc.logging_util import log_worker_event
+            log_worker_event(
+                worker_type='DB',
+                event='Created ArchiveResult',
+                indent_level=3,
+                plugin=self.plugin,
+                metadata={
+                    'id': str(self.id),
+                    'snapshot_id': str(self.snapshot_id),
+                    'snapshot_url': str(self.snapshot.url)[:64],
+                    'status': self.status,
+                },
+            )
+
+    @cached_property
+    def snapshot_dir(self):
+        return Path(self.snapshot.output_dir)
+
+    @cached_property
+    def url(self):
+        return self.snapshot.url
+
+    @property
+    def api_url(self) -> str:
+        return reverse_lazy('api-1:get_archiveresult', args=[self.id])
+
+    def get_absolute_url(self):
+        return f'/{self.snapshot.archive_path}/{self.plugin}'
+
+    @property
+    def plugin_module(self) -> Any | None:
+        # Hook scripts are now used instead of Python plugin modules
+        # The plugin name maps to hooks in archivebox/plugins/{plugin}/
+        return None
+
+    def output_exists(self) -> bool:
+        return (self.snapshot_dir / self.plugin).exists()
+
+    def embed_path(self) -> Optional[str]:
+        """
+        Get the relative path to the embeddable output file for this result.
+
+        Returns the first file from output_files if set, otherwise tries to
+        find a reasonable default based on the plugin type.
+        """
+        # Check output_files dict for primary output
+        if self.output_files:
+            # Return first file from output_files (dict preserves insertion order)
+            first_file = next(iter(self.output_files.keys()), None)
+            if first_file:
+                return f'{self.plugin}/{first_file}'
+
+        # Fallback: check output_str if it looks like a file path
+        if self.output_str and ('/' in self.output_str or '.' in self.output_str):
+            return self.output_str
+
+        # Try to find output file based on plugin's canonical output path
+        canonical = self.snapshot.canonical_outputs()
+        plugin_key = f'{self.plugin}_path'
+        if plugin_key in canonical:
+            return canonical[plugin_key]
+
+        # Fallback to plugin directory
+        return f'{self.plugin}/'
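+
+    # Illustrative (hypothetical values): a wget result with
+    # output_files={'example.com/index.html': {}} embeds 'wget/example.com/index.html';
+    # with nothing recorded it falls back to 'wget/'.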
+
+    def create_output_dir(self):
+        output_dir = Path(self.snapshot_dir) / self.plugin
+        output_dir.mkdir(parents=True, exist_ok=True)
+        return output_dir
+
+    @property
+    def output_dir_name(self) -> str:
+        return self.plugin
+
+    @property
+    def output_dir_parent(self) -> str:
+        return str(self.snapshot.OUTPUT_DIR.relative_to(CONSTANTS.DATA_DIR))
+
+    def save_search_index(self):
+        pass
+
+    def cascade_health_update(self, success: bool):
+        """Update health stats for self, parent Snapshot, and grandparent Crawl (if present)."""
+        self.increment_health_stats(success)
+        self.snapshot.increment_health_stats(success)
+        if self.snapshot.crawl_id:
+            self.snapshot.crawl.increment_health_stats(success)
+
+    def run(self):
+        """
+        Execute this ArchiveResult's hook and update status.
+
+        If self.hook_name is set, runs only that specific hook.
+        If self.hook_name is empty, discovers and runs all hooks for self.plugin (backwards compat).
+
+        Updates status/output fields, queues discovered URLs, and triggers indexing.
+        """
+        from django.utils import timezone
+        from archivebox.hooks import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR, run_hook, is_background_hook
+        from archivebox.config.configset import get_config
+
+        # Get merged config with proper context
+        config = get_config(
+            crawl=self.snapshot.crawl,
+            snapshot=self.snapshot,
+        )
+
+        # Determine which hook(s) to run
+        hooks = []
+
+        if self.hook_name:
+            # SPECIFIC HOOK MODE: Find the specific hook by name
+            for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
+                if not base_dir.exists():
+                    continue
+                plugin_dir = base_dir / self.plugin
+                if plugin_dir.exists():
+                    hook_path = plugin_dir / self.hook_name
+                    if hook_path.exists():
+                        hooks.append(hook_path)
+                        break
+        else:
+            # LEGACY MODE: Discover all hooks for this plugin (backwards compatibility)
+            for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
+                if not base_dir.exists():
+                    continue
+                plugin_dir = base_dir / self.plugin
+                if plugin_dir.exists():
+                    matches = list(plugin_dir.glob('on_Snapshot__*.*'))
+                    if matches:
+                        hooks.extend(sorted(matches))
+
+        if not hooks:
+            self.status = self.StatusChoices.FAILED
+            if self.hook_name:
+                self.output_str = f'Hook not found: {self.plugin}/{self.hook_name}'
+            else:
+                self.output_str = f'No hooks found for plugin: {self.plugin}'
+            self.retry_at = None
+            self.save()
+            return
+
+        # Output directory is plugin_dir for the hook output
+        plugin_dir = Path(self.snapshot.output_dir) / self.plugin
+
+        start_ts = timezone.now()
+        is_bg_hook = False
+
+        for hook in hooks:
+            # Check if this is a background hook (any background hook marks the whole result as background)
+            is_bg_hook = is_bg_hook or is_background_hook(hook.name)
+
+            result = run_hook(
+                hook,
+                output_dir=plugin_dir,
+                config=config,
+                url=self.snapshot.url,
+                snapshot_id=str(self.snapshot.id),
+                crawl_id=str(self.snapshot.crawl.id) if self.snapshot.crawl else None,
+                depth=self.snapshot.depth,
+            )
+
+            # Background hooks return None
+            if result is None:
+                is_bg_hook = True
+
+        # Update status based on hook execution
+        if is_bg_hook:
+            # BACKGROUND HOOK - still running, return immediately
+            # Status stays STARTED, will be finalized by Snapshot.cleanup()
+            self.status = self.StatusChoices.STARTED
+            self.start_ts = start_ts
+            self.pwd = str(plugin_dir)
+            self.save()
+            return
+
+        # FOREGROUND HOOK - completed, update from filesystem
+        self.start_ts = start_ts
+        self.pwd = str(plugin_dir)
+        self.update_from_output()
+
+        # Clean up empty output directory if no files were created
+        if plugin_dir.exists() and not self.output_files:
+            try:
+                if not any(plugin_dir.iterdir()):
+                    plugin_dir.rmdir()
+            except (OSError, RuntimeError):
+                pass
+
+    def update_from_output(self):
+        """
+        Update this ArchiveResult from filesystem logs and output files.
+
+        Used for:
+        - Foreground hooks that completed (called from ArchiveResult.run())
+        - Background hooks that completed (called from Snapshot.cleanup())
+
+        Updates:
+        - status, output_str, output_json from ArchiveResult JSONL record
+        - output_files, output_size, output_mimetypes by walking filesystem
+        - end_ts, retry_at, cmd, cmd_version, binary FK
+        - Processes side-effect records (Snapshot, Tag, etc.) via process_hook_records()
+        """
+        import json
+        import mimetypes
+        from collections import defaultdict
+        from django.utils import timezone
+        from archivebox.hooks import process_hook_records
+
+        plugin_dir = Path(self.pwd) if self.pwd else None
+        if not plugin_dir or not plugin_dir.exists():
+            self.status = self.StatusChoices.FAILED
+            self.output_str = 'Output directory not found'
+            self.end_ts = timezone.now()
+            self.retry_at = None
+            self.save()
+            return
+
+        # Read and parse JSONL output from stdout.log
+        stdout_file = plugin_dir / 'stdout.log'
+        stdout = stdout_file.read_text() if stdout_file.exists() else ''
+
+        records = []
+        for line in stdout.splitlines():
+            if line.strip().startswith('{'):
+                try:
+                    records.append(json.loads(line))
+                except json.JSONDecodeError:
+                    continue
+
+        # Find ArchiveResult record and update status/output from it
+        ar_records = [r for r in records if r.get('type') == 'ArchiveResult']
+        if ar_records:
+            hook_data = ar_records[0]
+
+            # Update status
+            status_map = {
+                'succeeded': self.StatusChoices.SUCCEEDED,
+                'failed': self.StatusChoices.FAILED,
+                'skipped': self.StatusChoices.SKIPPED,
+            }
+            self.status = status_map.get(hook_data.get('status', 'failed'), self.StatusChoices.FAILED)
+
+            # Update output fields
+            self.output_str = hook_data.get('output_str') or hook_data.get('output') or ''
+            self.output_json = hook_data.get('output_json')
+
+            # Update cmd fields
+            if hook_data.get('cmd'):
+                self.cmd = hook_data['cmd']
+                self._set_binary_from_cmd(hook_data['cmd'])
+            if hook_data.get('cmd_version'):
+                self.cmd_version = hook_data['cmd_version'][:128]
+        else:
+            # No ArchiveResult record = failed
+            self.status = self.StatusChoices.FAILED
+            self.output_str = 'Hook did not output ArchiveResult record'
+
+        # Walk filesystem and populate output_files, output_size, output_mimetypes
+        exclude_names = {'stdout.log', 'stderr.log', 'hook.pid', 'listener.pid'}
+        mime_sizes = defaultdict(int)
+        total_size = 0
+        output_files = {}
+
+        for file_path in plugin_dir.rglob('*'):
+            if not file_path.is_file():
+                continue
+            if file_path.name in exclude_names:
+                continue
+
+            try:
+                stat = file_path.stat()
+                mime_type, _ = mimetypes.guess_type(str(file_path))
+                mime_type = mime_type or 'application/octet-stream'
+
+                relative_path = str(file_path.relative_to(plugin_dir))
+                output_files[relative_path] = {}
+                mime_sizes[mime_type] += stat.st_size
+                total_size += stat.st_size
+            except OSError:
+                continue
+
+        self.output_files = output_files
+        self.output_size = total_size
+        sorted_mimes = sorted(mime_sizes.items(), key=lambda x: x[1], reverse=True)
+        self.output_mimetypes = ','.join(mime for mime, _ in sorted_mimes)
+
+        # Update timestamps
+        self.end_ts = timezone.now()
+        self.retry_at = None
+
+        self.save()
+
+        # Process side-effect records (filter Snapshots for depth/URL)
+        filtered_records = []
+        for record in records:
+            record_type = record.get('type')
+
+            # Skip ArchiveResult records (already processed above)
+            if record_type == 'ArchiveResult':
+                continue
+
+            # Filter Snapshot records for depth/URL constraints
+            if record_type == 'Snapshot':
+                if not self.snapshot.crawl:
+                    continue
+
+                url = record.get('url')
+                if not url:
+                    continue
+
+                depth = record.get('depth', self.snapshot.depth + 1)
+                if depth > self.snapshot.crawl.max_depth:
+                    continue
+
+                if not self._url_passes_filters(url):
+                    continue
+
+            filtered_records.append(record)
+
+        # Process filtered records with unified dispatcher
+        overrides = {
+            'snapshot': self.snapshot,
+            'crawl': self.snapshot.crawl,
+            'created_by_id': self.snapshot.crawl.created_by_id,
+        }
+        process_hook_records(filtered_records, overrides=overrides)
+
+        # Cleanup PID files and empty logs
+        pid_file = plugin_dir / 'hook.pid'
+        pid_file.unlink(missing_ok=True)
+        stderr_file = plugin_dir / 'stderr.log'
+        if stdout_file.exists() and stdout_file.stat().st_size == 0:
+            stdout_file.unlink()
+        if stderr_file.exists() and stderr_file.stat().st_size == 0:
+            stderr_file.unlink()
+
+    def _set_binary_from_cmd(self, cmd: list) -> None:
+        """
+        Find Binary for command and set binary FK.
+
+        Tries matching by absolute path first, then by binary name.
+        Only matches binaries on the current machine.
+        """
+        if not cmd:
+            return
+
+        from archivebox.machine.models import Machine
+
+        bin_path_or_name = cmd[0] if isinstance(cmd, list) else cmd
+        machine = Machine.current()
+
+        # Try matching by absolute path first
+        binary = Binary.objects.filter(
+            abspath=bin_path_or_name,
+            machine=machine
+        ).first()
+
+        if binary:
+            self.binary = binary
+            return
+
+        # Fallback: match by binary name
+        bin_name = Path(bin_path_or_name).name
+        binary = Binary.objects.filter(
+            name=bin_name,
+            machine=machine
+        ).first()
+
+        if binary:
+            self.binary = binary
+
+    def _url_passes_filters(self, url: str) -> bool:
+        """Check if URL passes URL_ALLOWLIST and URL_DENYLIST config filters.
+
+        Uses proper config hierarchy: defaults -> file -> env -> machine -> user -> crawl -> snapshot
+        """
+        import re
+        from archivebox.config.configset import get_config
+
+        # Get merged config with proper hierarchy
+        config = get_config(
+            user=self.snapshot.crawl.created_by,
+            crawl=self.snapshot.crawl,
+            snapshot=self.snapshot,
+        )
+
+        # Get allowlist/denylist (can be string or list)
+        allowlist_raw = config.get('URL_ALLOWLIST', '')
+        denylist_raw = config.get('URL_DENYLIST', '')
+
+        # Normalize to list of patterns
+        def to_pattern_list(value):
+            if isinstance(value, list):
+                return value
+            if isinstance(value, str):
+                return [p.strip() for p in value.split(',') if p.strip()]
+            return []
+
+        allowlist = to_pattern_list(allowlist_raw)
+        denylist = to_pattern_list(denylist_raw)
+
+        # Denylist takes precedence
+        if denylist:
+            for pattern in denylist:
+                try:
+                    if re.search(pattern, url):
+                        return False
+                except re.error:
+                    continue  # Skip invalid regex patterns
+
+        # If allowlist exists, URL must match at least one pattern
+        if allowlist:
+            for pattern in allowlist:
+                try:
+                    if re.search(pattern, url):
+                        return True
+                except re.error:
+                    continue  # Skip invalid regex patterns
+            return False  # No allowlist patterns matched
+
+        return True  # No filters or passed filters
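+
+    # Illustrative (hypothetical config): URL_DENYLIST=r'.*\.exe$' rejects
+    # 'https://example.com/setup.exe'; when URL_ALLOWLIST is non-empty, a URL must also
+    # match at least one allowlist pattern to pass.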
+
+    @property
+    def output_dir(self) -> Path:
+        """Get the output directory for this plugin's results."""
+        return Path(self.snapshot.output_dir) / self.plugin
+
+    def is_background_hook(self) -> bool:
+        """Check if this ArchiveResult is for a background hook."""
+        plugin_dir = Path(self.pwd) if self.pwd else None
+        if not plugin_dir:
+            return False
+        pid_file = plugin_dir / 'hook.pid'
+        return pid_file.exists()
+
+
+# =============================================================================
+# ArchiveResult State Machine
+# =============================================================================
+
+class ArchiveResultMachine(BaseStateMachine, strict_states=True):
+    """
+    State machine for managing ArchiveResult (single plugin execution) lifecycle.
+
+    Hook Lifecycle:
+    ┌─────────────────────────────────────────────────────────────┐
+    │ QUEUED State                                                │
+    │  • Waiting for its turn to run                              │
+    └─────────────────────────────────────────────────────────────┘
+                            ↓ tick() when can_start()
+    ┌─────────────────────────────────────────────────────────────┐
+    │ STARTED State → enter_started()                             │
+    │  1. archiveresult.run()                                     │
+    │     • Find specific hook by hook_name                       │
+    │     • run_hook(script, output_dir, ...) → subprocess        │
+    │                                                              │
+    │  2a. FOREGROUND hook (returns HookResult):                  │
+    │      • update_from_output() immediately                     │
+    │        - Read stdout.log                                    │
+    │        - Parse JSONL records                                │
+    │        - Extract 'ArchiveResult' record → update status     │
+    │        - Walk output_dir → populate output_files            │
+    │        - Call process_hook_records() for side effects       │
+    │                                                              │
+    │  2b. BACKGROUND hook (returns None):                        │
+    │      • Status stays STARTED                                 │
+    │      • Continues running in background                      │
+    │      • Killed by Snapshot.cleanup() when sealed             │
+    └─────────────────────────────────────────────────────────────┘
+                            ↓ tick() checks status
+    ┌─────────────────────────────────────────────────────────────┐
+    │ SUCCEEDED / FAILED / SKIPPED / BACKOFF                      │
+    │  • Set by hook's JSONL output during update_from_output()   │
+    │  • Health stats incremented (num_uses_succeeded/failed)     │
+    │  • Parent Snapshot health stats also updated                │
+    └─────────────────────────────────────────────────────────────┘
+
+    https://github.com/ArchiveBox/ArchiveBox/wiki/ArchiveBox-Architecture-Diagrams
+    """
+
+    model_attr_name = 'archiveresult'
+
+    # States
+    queued = State(value=ArchiveResult.StatusChoices.QUEUED, initial=True)
+    started = State(value=ArchiveResult.StatusChoices.STARTED)
+    backoff = State(value=ArchiveResult.StatusChoices.BACKOFF)
+    succeeded = State(value=ArchiveResult.StatusChoices.SUCCEEDED, final=True)
+    failed = State(value=ArchiveResult.StatusChoices.FAILED, final=True)
+    skipped = State(value=ArchiveResult.StatusChoices.SKIPPED, final=True)
+
+    # Tick Event - transitions based on conditions
+    tick = (
+        queued.to.itself(unless='can_start') |
+        queued.to(started, cond='can_start') |
+        started.to.itself(unless='is_finished') |
+        started.to(succeeded, cond='is_succeeded') |
+        started.to(failed, cond='is_failed') |
+        started.to(skipped, cond='is_skipped') |
+        started.to(backoff, cond='is_backoff') |
+        backoff.to.itself(unless='can_start') |
+        backoff.to(started, cond='can_start') |
+        backoff.to(succeeded, cond='is_succeeded') |
+        backoff.to(failed, cond='is_failed') |
+        backoff.to(skipped, cond='is_skipped')
+    )
+
+    def can_start(self) -> bool:
+        can_start = bool(self.archiveresult.snapshot.url)
+        # Suppressed: queue waiting logs
+        return can_start
+
+    def is_succeeded(self) -> bool:
+        """Check if extractor plugin succeeded (status was set by run())."""
+        return self.archiveresult.status == ArchiveResult.StatusChoices.SUCCEEDED
+
+    def is_failed(self) -> bool:
+        """Check if extractor plugin failed (status was set by run())."""
+        return self.archiveresult.status == ArchiveResult.StatusChoices.FAILED
+
+    def is_skipped(self) -> bool:
+        """Check if extractor plugin was skipped (status was set by run())."""
+        return self.archiveresult.status == ArchiveResult.StatusChoices.SKIPPED
+
+    def is_backoff(self) -> bool:
+        """Check if we should backoff and retry later."""
+        # Backoff if status is still started (plugin didn't complete) and output_str is empty
+        return (
+            self.archiveresult.status == ArchiveResult.StatusChoices.STARTED and
+            not self.archiveresult.output_str
+        )
+
+    def is_finished(self) -> bool:
+        """Check if extraction has completed (success, failure, or skipped)."""
+        return self.archiveresult.status in (
+            ArchiveResult.StatusChoices.SUCCEEDED,
+            ArchiveResult.StatusChoices.FAILED,
+            ArchiveResult.StatusChoices.SKIPPED,
+        )
+
+    @queued.enter
+    def enter_queued(self):
+        # Suppressed: state transition logs
+        self.archiveresult.update_and_requeue(
+            retry_at=timezone.now(),
+            status=ArchiveResult.StatusChoices.QUEUED,
+            start_ts=None,
+        )  # reset start_ts and make this result eligible to run again immediately
+
+    @started.enter
+    def enter_started(self):
+        from archivebox.machine.models import NetworkInterface
+
+        # Suppressed: state transition logs
+        # Lock the object and mark start time
+        self.archiveresult.update_and_requeue(
+            retry_at=timezone.now() + timedelta(seconds=120),  # 2 min timeout for plugin
+            status=ArchiveResult.StatusChoices.STARTED,
+            start_ts=timezone.now(),
+            iface=NetworkInterface.current(),
+        )
+
+        # Run the plugin - this updates status, output, timestamps, etc.
+        self.archiveresult.run()
+
+        # Save the updated result
+        self.archiveresult.save()
+
+        # Suppressed: plugin result logs (already logged by worker)
+
+    @backoff.enter
+    def enter_backoff(self):
+        # Suppressed: state transition logs
+        self.archiveresult.update_and_requeue(
+            retry_at=timezone.now() + timedelta(seconds=60),
+            status=ArchiveResult.StatusChoices.BACKOFF,
+            end_ts=None,
+            # retries=F('retries') + 1,               # F() equivalent to getattr(self.archiveresult, 'retries', 0) + 1,
+        )
+
+    @succeeded.enter
+    def enter_succeeded(self):
+        # Suppressed: state transition logs
+        self.archiveresult.update_and_requeue(
+            retry_at=None,
+            status=ArchiveResult.StatusChoices.SUCCEEDED,
+            end_ts=timezone.now(),
+            # **self.archiveresult.get_output_dict(),     # {output, output_json, stderr, stdout, returncode, errors, cmd_version, pwd, cmd, machine}
+        )
+        self.archiveresult.save()
+
+        # Update health stats for ArchiveResult, Snapshot, and Crawl cascade
+        self.archiveresult.cascade_health_update(success=True)
+
+    @failed.enter
+    def enter_failed(self):
+        # Suppressed: state transition logs
+        self.archiveresult.update_and_requeue(
+            retry_at=None,
+            status=ArchiveResult.StatusChoices.FAILED,
+            end_ts=timezone.now(),
+        )
+
+        # Update health stats for ArchiveResult, Snapshot, and Crawl cascade
+        self.archiveresult.cascade_health_update(success=False)
+
+    @skipped.enter
+    def enter_skipped(self):
+        # Suppressed: state transition logs
+        self.archiveresult.update_and_requeue(
+            retry_at=None,
+            status=ArchiveResult.StatusChoices.SKIPPED,
+            end_ts=timezone.now(),
+        )
+
+    def after_transition(self, event: str, source: State, target: State):
+        # print(f"after '{event}' from '{source.id}' to '{target.id}'")
+        self.archiveresult.snapshot.update_and_requeue()  # bump snapshot retry time so it picks up all the new changes
+
+
+# =============================================================================
+# State Machine Registration
+# =============================================================================
+
+# Manually register state machines with python-statemachine registry
+# (normally auto-discovered from statemachines.py, but we define them here for clarity)
+registry.register(SnapshotMachine)
+registry.register(ArchiveResultMachine)

+ 22 - 21
archivebox/core/settings.py

@@ -30,9 +30,9 @@ LOADED_PLUGINS = archivebox.LOADED_PLUGINS
 ### Django Core Settings
 ################################################################################
 
-WSGI_APPLICATION = "core.wsgi.application"
-ASGI_APPLICATION = "core.asgi.application"
-ROOT_URLCONF = "core.urls"
+WSGI_APPLICATION = "archivebox.core.wsgi.application"
+ASGI_APPLICATION = "archivebox.core.asgi.application"
+ROOT_URLCONF = "archivebox.core.urls"
 
 LOGIN_URL = "/accounts/login/"
 LOGOUT_REDIRECT_URL = os.environ.get("LOGOUT_REDIRECT_URL", "/")
@@ -55,14 +55,15 @@ INSTALLED_APPS = [
     # 3rd-party apps from PyPI
     "signal_webhooks",  # handles REST API outbound webhooks                              https://github.com/MrThearMan/django-signal-webhooks
     "django_object_actions",  # provides easy Django Admin action buttons on change views       https://github.com/crccheck/django-object-actions
-    # Our ArchiveBox-provided apps
-    "config",  # ArchiveBox config settings (loaded as a plugin, don't need to add it here)
-    "machine",  # handles collecting and storing information about the host machine, network interfaces, binaries, etc.
-    "workers",  # handles starting and managing background workers and processes (orchestrators and actors)
-    "crawls",  # handles Crawl and CrawlSchedule models and management
-    "personas",  # handles Persona and session management
-    "core",  # core django model with Snapshot, ArchiveResult, etc.
-    "api",  # Django-Ninja-based Rest API interfaces, config, APIToken model, etc.
+    # Our ArchiveBox-provided apps (use fully qualified names)
+    # NOTE: Order matters! Apps with migrations that depend on other apps must come AFTER their dependencies
+    # "archivebox.config",  # ArchiveBox config settings (no models, not a real Django app)
+    "archivebox.machine",  # handles collecting and storing information about the host machine, network interfaces, binaries, etc.
+    "archivebox.workers",  # handles starting and managing background workers and processes (orchestrators and actors)
+    "archivebox.personas",  # handles Persona and session management
+    "archivebox.core",  # core django model with Snapshot, ArchiveResult, etc. (crawls depends on this)
+    "archivebox.crawls",  # handles Crawl and CrawlSchedule models and management (depends on core)
+    "archivebox.api",  # Django-Ninja-based Rest API interfaces, config, APIToken model, etc.
     # ArchiveBox plugins (hook-based plugins no longer add Django apps)
     # Use hooks.py discover_hooks() for plugin functionality
     # 3rd-party apps from PyPI that need to be loaded last
@@ -72,15 +73,15 @@ INSTALLED_APPS = [
 
 
 MIDDLEWARE = [
-    "core.middleware.TimezoneMiddleware",
+    "archivebox.core.middleware.TimezoneMiddleware",
     "django.middleware.security.SecurityMiddleware",
     "django.contrib.sessions.middleware.SessionMiddleware",
     "django.middleware.common.CommonMiddleware",
     "django.middleware.csrf.CsrfViewMiddleware",
     "django.contrib.auth.middleware.AuthenticationMiddleware",
-    "core.middleware.ReverseProxyAuthMiddleware",
+    "archivebox.core.middleware.ReverseProxyAuthMiddleware",
     "django.contrib.messages.middleware.MessageMiddleware",
-    "core.middleware.CacheControlMiddleware",
+    "archivebox.core.middleware.CacheControlMiddleware",
     # Additional middlewares from plugins (if any)
 ]
 
@@ -370,15 +371,15 @@ LOGGING = SETTINGS_LOGGING
 ################################################################################
 
 # Add default webhook configuration to the User model
-SIGNAL_WEBHOOKS_CUSTOM_MODEL = "api.models.OutboundWebhook"
+SIGNAL_WEBHOOKS_CUSTOM_MODEL = "archivebox.api.models.OutboundWebhook"
 SIGNAL_WEBHOOKS = {
     "HOOKS": {
         # ... is a special sigil value that means "use the default autogenerated hooks"
         "django.contrib.auth.models.User": ...,
-        "core.models.Snapshot": ...,
-        "core.models.ArchiveResult": ...,
-        "core.models.Tag": ...,
-        "api.models.APIToken": ...,
+        "archivebox.core.models.Snapshot": ...,
+        "archivebox.core.models.ArchiveResult": ...,
+        "archivebox.core.models.Tag": ...,
+        "archivebox.api.models.APIToken": ...,
     },
 }
 
@@ -391,11 +392,11 @@ ADMIN_DATA_VIEWS = {
     "URLS": [
         {
             "route": "config/",
-            "view": "core.views.live_config_list_view",
+            "view": "archivebox.core.views.live_config_list_view",
             "name": "Configuration",
             "items": {
                 "route": "<str:key>/",
-                "view": "core.views.live_config_value_view",
+                "view": "archivebox.core.views.live_config_value_view",
                 "name": "config_val",
             },
         },

+ 0 - 319
archivebox/core/statemachines.py

@@ -1,319 +0,0 @@
-__package__ = 'archivebox.core'
-
-import time
-import os
-from datetime import timedelta
-from typing import ClassVar
-
-from django.db.models import F
-from django.utils import timezone
-
-from rich import print
-
-from statemachine import State, StateMachine
-
-# from workers.actor import ActorType
-
-from core.models import Snapshot, ArchiveResult
-from crawls.models import Crawl
-
-
-class SnapshotMachine(StateMachine, strict_states=True):
-    """
-    State machine for managing Snapshot lifecycle.
-    
-    https://github.com/ArchiveBox/ArchiveBox/wiki/ArchiveBox-Architecture-Diagrams
-    """
-    
-    model: Snapshot
-    
-    # States
-    queued = State(value=Snapshot.StatusChoices.QUEUED, initial=True)
-    started = State(value=Snapshot.StatusChoices.STARTED)
-    sealed = State(value=Snapshot.StatusChoices.SEALED, final=True)
-    
-    # Tick Event
-    tick = (
-        queued.to.itself(unless='can_start') |
-        queued.to(started, cond='can_start') |
-        started.to.itself(unless='is_finished') |
-        started.to(sealed, cond='is_finished')
-    )
-    
-    def __init__(self, snapshot, *args, **kwargs):
-        self.snapshot = snapshot
-        super().__init__(snapshot, *args, **kwargs)
-        
-    def __repr__(self) -> str:
-        return f'Snapshot[{self.snapshot.id}]'
-
-    def __str__(self) -> str:
-        return self.__repr__()
-
-    def can_start(self) -> bool:
-        can_start = bool(self.snapshot.url)
-        # Suppressed: queue waiting logs
-        return can_start
-        
-    def is_finished(self) -> bool:
-        # if no archiveresults exist yet, it's not finished
-        if not self.snapshot.archiveresult_set.exists():
-            return False
-
-        # Try to advance step if ready (handles step-based hook execution)
-        # This will increment current_step when all foreground hooks in current step are done
-        while self.snapshot.advance_step_if_ready():
-            pass  # Keep advancing until we can't anymore
-
-        # if archiveresults exist but are still pending, it's not finished
-        if self.snapshot.pending_archiveresults().exists():
-            return False
-
-        # Don't wait for background hooks - they'll be cleaned up on entering sealed state
-        # Background hooks in STARTED state are excluded by pending_archiveresults()
-        # (STARTED is in FINAL_OR_ACTIVE_STATES) so once all results are FINAL or ACTIVE,
-        # we can transition to sealed and cleanup() will kill the background hooks
-
-        # otherwise archiveresults exist and are all finished, so it's finished
-        return True
-        
-    # def on_transition(self, event, state):
-    #     print(f'{self}.on_transition() [blue]{str(state).upper()}[/blue] ➡️ ...')
-        
-    @queued.enter
-    def enter_queued(self):
-        # Suppressed: state transition logs
-        self.snapshot.update_for_workers(
-            retry_at=timezone.now(),
-            status=Snapshot.StatusChoices.QUEUED,
-        )
-
-    @started.enter
-    def enter_started(self):
-        # Suppressed: state transition logs
-        # lock the snapshot while we create the pending archiveresults
-        self.snapshot.update_for_workers(
-            retry_at=timezone.now() + timedelta(seconds=30),  # if failed, wait 30s before retrying
-        )
-
-        # Run the snapshot - creates pending archiveresults for all enabled plugins
-        self.snapshot.run()
-
-        # unlock the snapshot after we're done + set status = started
-        self.snapshot.update_for_workers(
-            retry_at=timezone.now() + timedelta(seconds=5),  # check again in 5s
-            status=Snapshot.StatusChoices.STARTED,
-        )
-
-    @sealed.enter
-    def enter_sealed(self):
-        # Clean up background hooks
-        self.snapshot.cleanup()
-
-        # Suppressed: state transition logs
-        self.snapshot.update_for_workers(
-            retry_at=None,
-            status=Snapshot.StatusChoices.SEALED,
-        )
-
-
-# class SnapshotWorker(ActorType[Snapshot]):
-#     """
-#     The primary actor for progressing Snapshot objects
-#     through their lifecycle using the SnapshotMachine.
-#     """
-#     Model = Snapshot
-#     StateMachineClass = SnapshotMachine
-    
-#     ACTIVE_STATE: ClassVar[State] = SnapshotMachine.started                    # 'started'
-    
-#     MAX_CONCURRENT_ACTORS: ClassVar[int] = 3
-#     MAX_TICK_TIME: ClassVar[int] = 10
-#     CLAIM_FROM_TOP_N: ClassVar[int] = MAX_CONCURRENT_ACTORS * 10
-
-
-
-
-
-class ArchiveResultMachine(StateMachine, strict_states=True):
-    """
-    State machine for managing ArchiveResult lifecycle.
-    
-    https://github.com/ArchiveBox/ArchiveBox/wiki/ArchiveBox-Architecture-Diagrams
-    """
-    
-    model: ArchiveResult
-    
-    # States
-    queued = State(value=ArchiveResult.StatusChoices.QUEUED, initial=True)
-    started = State(value=ArchiveResult.StatusChoices.STARTED)
-    backoff = State(value=ArchiveResult.StatusChoices.BACKOFF)
-    succeeded = State(value=ArchiveResult.StatusChoices.SUCCEEDED, final=True)
-    failed = State(value=ArchiveResult.StatusChoices.FAILED, final=True)
-    skipped = State(value=ArchiveResult.StatusChoices.SKIPPED, final=True)
-    
-    # Tick Event - transitions based on conditions
-    tick = (
-        queued.to.itself(unless='can_start') |
-        queued.to(started, cond='can_start') |
-        started.to.itself(unless='is_finished') |
-        started.to(succeeded, cond='is_succeeded') |
-        started.to(failed, cond='is_failed') |
-        started.to(skipped, cond='is_skipped') |
-        started.to(backoff, cond='is_backoff') |
-        backoff.to.itself(unless='can_start') |
-        backoff.to(started, cond='can_start') |
-        backoff.to(succeeded, cond='is_succeeded') |
-        backoff.to(failed, cond='is_failed') |
-        backoff.to(skipped, cond='is_skipped')
-    )
-
-    def __init__(self, archiveresult, *args, **kwargs):
-        self.archiveresult = archiveresult
-        super().__init__(archiveresult, *args, **kwargs)
-    
-    def __repr__(self) -> str:
-        return f'ArchiveResult[{self.archiveresult.id}]'
-
-    def __str__(self) -> str:
-        return self.__repr__()
-
-    def can_start(self) -> bool:
-        can_start = bool(self.archiveresult.snapshot.url)
-        # Suppressed: queue waiting logs
-        return can_start
-    
-    def is_succeeded(self) -> bool:
-        """Check if extractor plugin succeeded (status was set by run())."""
-        return self.archiveresult.status == ArchiveResult.StatusChoices.SUCCEEDED
-
-    def is_failed(self) -> bool:
-        """Check if extractor plugin failed (status was set by run())."""
-        return self.archiveresult.status == ArchiveResult.StatusChoices.FAILED
-
-    def is_skipped(self) -> bool:
-        """Check if extractor plugin was skipped (status was set by run())."""
-        return self.archiveresult.status == ArchiveResult.StatusChoices.SKIPPED
-    
-    def is_backoff(self) -> bool:
-        """Check if we should backoff and retry later."""
-        # Backoff if status is still started (plugin didn't complete) and output_str is empty
-        return (
-            self.archiveresult.status == ArchiveResult.StatusChoices.STARTED and
-            not self.archiveresult.output_str
-        )
-    
-    def is_finished(self) -> bool:
-        """Check if extraction has completed (success, failure, or skipped)."""
-        return self.archiveresult.status in (
-            ArchiveResult.StatusChoices.SUCCEEDED,
-            ArchiveResult.StatusChoices.FAILED,
-            ArchiveResult.StatusChoices.SKIPPED,
-        )
-
-    @queued.enter
-    def enter_queued(self):
-        # Suppressed: state transition logs
-        self.archiveresult.update_for_workers(
-            retry_at=timezone.now(),
-            status=ArchiveResult.StatusChoices.QUEUED,
-            start_ts=None,
-        )  # bump the snapshot's retry_at so they pickup any new changes
-
-    @started.enter
-    def enter_started(self):
-        from machine.models import NetworkInterface
-
-        # Suppressed: state transition logs
-        # Lock the object and mark start time
-        self.archiveresult.update_for_workers(
-            retry_at=timezone.now() + timedelta(seconds=120),  # 2 min timeout for plugin
-            status=ArchiveResult.StatusChoices.STARTED,
-            start_ts=timezone.now(),
-            iface=NetworkInterface.current(),
-        )
-
-        # Run the plugin - this updates status, output, timestamps, etc.
-        self.archiveresult.run()
-
-        # Save the updated result
-        self.archiveresult.save()
-
-        # Suppressed: plugin result logs (already logged by worker)
-
-    @backoff.enter
-    def enter_backoff(self):
-        # Suppressed: state transition logs
-        self.archiveresult.update_for_workers(
-            retry_at=timezone.now() + timedelta(seconds=60),
-            status=ArchiveResult.StatusChoices.BACKOFF,
-            end_ts=None,
-            # retries=F('retries') + 1,               # F() equivalent to getattr(self.archiveresult, 'retries', 0) + 1,
-        )
-        self.archiveresult.save()
-
-    @succeeded.enter
-    def enter_succeeded(self):
-        # Suppressed: state transition logs
-        self.archiveresult.update_for_workers(
-            retry_at=None,
-            status=ArchiveResult.StatusChoices.SUCCEEDED,
-            end_ts=timezone.now(),
-            # **self.archiveresult.get_output_dict(),     # {output, output_json, stderr, stdout, returncode, errors, cmd_version, pwd, cmd, machine}
-        )
-        self.archiveresult.save()
-
-        # Increment health stats on ArchiveResult, Snapshot, and optionally Crawl
-        ArchiveResult.objects.filter(pk=self.archiveresult.pk).update(num_uses_succeeded=F('num_uses_succeeded') + 1)
-        Snapshot.objects.filter(pk=self.archiveresult.snapshot_id).update(num_uses_succeeded=F('num_uses_succeeded') + 1)
-
-        # Also update Crawl health stats if snapshot has a crawl
-        snapshot = self.archiveresult.snapshot
-        if snapshot.crawl_id:
-            Crawl.objects.filter(pk=snapshot.crawl_id).update(num_uses_succeeded=F('num_uses_succeeded') + 1)
-
-    @failed.enter
-    def enter_failed(self):
-        # Suppressed: state transition logs
-        self.archiveresult.update_for_workers(
-            retry_at=None,
-            status=ArchiveResult.StatusChoices.FAILED,
-            end_ts=timezone.now(),
-        )
-
-        # Increment health stats on ArchiveResult, Snapshot, and optionally Crawl
-        ArchiveResult.objects.filter(pk=self.archiveresult.pk).update(num_uses_failed=F('num_uses_failed') + 1)
-        Snapshot.objects.filter(pk=self.archiveresult.snapshot_id).update(num_uses_failed=F('num_uses_failed') + 1)
-
-        # Also update Crawl health stats if snapshot has a crawl
-        snapshot = self.archiveresult.snapshot
-        if snapshot.crawl_id:
-            Crawl.objects.filter(pk=snapshot.crawl_id).update(num_uses_failed=F('num_uses_failed') + 1)
-
-    @skipped.enter
-    def enter_skipped(self):
-        # Suppressed: state transition logs
-        self.archiveresult.update_for_workers(
-            retry_at=None,
-            status=ArchiveResult.StatusChoices.SKIPPED,
-            end_ts=timezone.now(),
-        )
-        
-    def after_transition(self, event: str, source: State, target: State):
-        # print(f"after '{event}' from '{source.id}' to '{target.id}'")
-        self.archiveresult.snapshot.update_for_workers()  # bump snapshot retry time so it picks up all the new changes
-
-
-# class ArchiveResultWorker(ActorType[ArchiveResult]):
-#     """
-#     The primary actor for progressing ArchiveResult objects
-#     through their lifecycle using the ArchiveResultMachine.
-#     """
-#     Model = ArchiveResult
-#     StateMachineClass = ArchiveResultMachine
-    
-#     ACTIVE_STATE: ClassVar[State] = ArchiveResultMachine.started                # 'started'
-    
-#     MAX_CONCURRENT_ACTORS: ClassVar[int] = 6
-#     MAX_TICK_TIME: ClassVar[int] = 60
-#     CLAIM_FROM_TOP_N: ClassVar[int] = MAX_CONCURRENT_ACTORS * 10

+ 20 - 0
archivebox/core/templatetags/config_tags.py

@@ -0,0 +1,20 @@
+"""Template tags for accessing config values in templates."""
+
+from typing import Any
+
+from django import template
+
+from archivebox.config.configset import get_config as _get_config
+
+register = template.Library()
+
+
+@register.simple_tag
+def get_config(key: str) -> Any:
+    """
+    Get a config value by key.
+
+    Usage: {% get_config "ARCHIVEDOTORG_ENABLED" as enabled %}
+    """
+    try:
+        return _get_config(key)
+    except (KeyError, AttributeError):
+        return None
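For reference, a minimal sketch of exercising the new tag from Python (assuming an initialized Django/ArchiveBox environment; the key name comes from the docstring above):

    from django.template import Engine, Context

    # Register the new tag library as an engine builtin so no {% load %} is needed
    engine = Engine(builtins=['archivebox.core.templatetags.config_tags'])
    tpl = engine.from_string(
        '{% get_config "ARCHIVEDOTORG_ENABLED" as enabled %}'
        '{% if enabled %}archive.org submission is enabled{% endif %}'
    )
    print(tpl.render(Context({})))  # renders the message only if the flag is truthy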

+ 318 - 2
archivebox/core/tests.py

@@ -1,3 +1,319 @@
-#from django.test import TestCase
+"""Tests for the core views, especially AddView."""
 
-# Create your tests here.
+import os
+import django
+
+# Set up Django before importing any Django-dependent modules
+os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')
+django.setup()
+
+from django.test import TestCase, Client
+from django.contrib.auth.models import User
+from django.urls import reverse
+
+from archivebox.crawls.models import Crawl, CrawlSchedule
+from archivebox.core.models import Tag
+
+
+class AddViewTests(TestCase):
+    """Tests for the AddView (crawl creation form)."""
+
+    def setUp(self):
+        """Set up test user and client."""
+        self.client = Client()
+        self.user = User.objects.create_user(
+            username='testuser',
+            password='testpass123',
+            email='[email protected]'
+        )
+        self.client.login(username='testuser', password='testpass123')
+        self.add_url = reverse('add')
+
+    def test_add_view_get_requires_auth(self):
+        """Test that GET /add requires authentication."""
+        self.client.logout()
+        response = self.client.get(self.add_url)
+        # Should redirect to login or show 403/404
+        self.assertIn(response.status_code, [302, 403, 404])
+
+    def test_add_view_get_shows_form(self):
+        """Test that GET /add shows the form with all fields."""
+        response = self.client.get(self.add_url)
+        self.assertEqual(response.status_code, 200)
+
+        # Check that form fields are present
+        self.assertContains(response, 'name="url"')
+        self.assertContains(response, 'name="tag"')
+        self.assertContains(response, 'name="depth"')
+        self.assertContains(response, 'name="notes"')
+        self.assertContains(response, 'name="schedule"')
+        self.assertContains(response, 'name="persona"')
+        self.assertContains(response, 'name="overwrite"')
+        self.assertContains(response, 'name="update"')
+        self.assertContains(response, 'name="index_only"')
+
+        # Check for plugin groups
+        self.assertContains(response, 'name="chrome_plugins"')
+        self.assertContains(response, 'name="archiving_plugins"')
+        self.assertContains(response, 'name="parsing_plugins"')
+
+    def test_add_view_shows_tag_autocomplete(self):
+        """Test that tag autocomplete datalist is rendered."""
+        # Create some tags
+        Tag.objects.create(name='test-tag-1')
+        Tag.objects.create(name='test-tag-2')
+
+        response = self.client.get(self.add_url)
+        self.assertEqual(response.status_code, 200)
+
+        # Check for datalist with tags
+        self.assertContains(response, 'id="tag-datalist"')
+        self.assertContains(response, 'test-tag-1')
+        self.assertContains(response, 'test-tag-2')
+
+    def test_add_view_shows_plugin_presets(self):
+        """Test that plugin preset buttons are rendered."""
+        response = self.client.get(self.add_url)
+        self.assertEqual(response.status_code, 200)
+
+        self.assertContains(response, 'Quick Archive')
+        self.assertContains(response, 'Full Chrome')
+        self.assertContains(response, 'Text Only')
+        self.assertContains(response, 'Select All')
+        self.assertContains(response, 'Clear All')
+
+    def test_add_view_shows_links_to_resources(self):
+        """Test that helpful links are present."""
+        response = self.client.get(self.add_url)
+        self.assertEqual(response.status_code, 200)
+
+        # Link to plugin documentation
+        self.assertContains(response, '/admin/environment/plugins/')
+
+        # Link to create new persona
+        self.assertContains(response, '/admin/personas/persona/add/')
+
+    def test_add_basic_crawl_without_schedule(self):
+        """Test creating a basic crawl without a schedule."""
+        response = self.client.post(self.add_url, {
+            'url': 'https://example.com\nhttps://example.org',
+            'tag': 'test-tag',
+            'depth': '0',
+            'notes': 'Test crawl notes',
+        })
+
+        # Should redirect to crawl admin page
+        self.assertEqual(response.status_code, 302)
+
+        # Check that crawl was created
+        self.assertEqual(Crawl.objects.count(), 1)
+        crawl = Crawl.objects.first()
+
+        self.assertIn('https://example.com', crawl.urls)
+        self.assertIn('https://example.org', crawl.urls)
+        self.assertEqual(crawl.tags_str, 'test-tag')
+        self.assertEqual(crawl.max_depth, 0)
+        self.assertEqual(crawl.notes, 'Test crawl notes')
+        self.assertEqual(crawl.created_by, self.user)
+
+        # No schedule should be created
+        self.assertIsNone(crawl.schedule)
+        self.assertEqual(CrawlSchedule.objects.count(), 0)
+
+    def test_add_crawl_with_schedule(self):
+        """Test creating a crawl with a repeat schedule."""
+        response = self.client.post(self.add_url, {
+            'url': 'https://example.com',
+            'tag': 'scheduled',
+            'depth': '1',
+            'notes': 'Daily crawl',
+            'schedule': 'daily',
+        })
+
+        self.assertEqual(response.status_code, 302)
+
+        # Check that crawl and schedule were created
+        self.assertEqual(Crawl.objects.count(), 1)
+        self.assertEqual(CrawlSchedule.objects.count(), 1)
+
+        crawl = Crawl.objects.first()
+        schedule = CrawlSchedule.objects.first()
+
+        self.assertEqual(crawl.schedule, schedule)
+        self.assertEqual(schedule.template, crawl)
+        self.assertEqual(schedule.schedule, 'daily')
+        self.assertTrue(schedule.is_enabled)
+        self.assertEqual(schedule.created_by, self.user)
+
+    def test_add_crawl_with_cron_schedule(self):
+        """Test creating a crawl with a cron format schedule."""
+        response = self.client.post(self.add_url, {
+            'url': 'https://example.com',
+            'depth': '0',
+            'schedule': '0 */6 * * *',  # Every 6 hours
+        })
+
+        self.assertEqual(response.status_code, 302)
+
+        schedule = CrawlSchedule.objects.first()
+        self.assertEqual(schedule.schedule, '0 */6 * * *')
+
+    def test_add_crawl_with_plugins(self):
+        """Test creating a crawl with specific plugins selected."""
+        response = self.client.post(self.add_url, {
+            'url': 'https://example.com',
+            'depth': '0',
+            'chrome_plugins': ['screenshot', 'dom'],
+            'archiving_plugins': ['wget'],
+        })
+
+        self.assertEqual(response.status_code, 302)
+
+        crawl = Crawl.objects.first()
+        plugins = crawl.config.get('PLUGINS', '')
+
+        # Should contain the selected plugins
+        self.assertIn('screenshot', plugins)
+        self.assertIn('dom', plugins)
+        self.assertIn('wget', plugins)
+
+    def test_add_crawl_with_depth_range(self):
+        """Test creating crawls with different depth values (0-4)."""
+        for depth in range(5):
+            response = self.client.post(self.add_url, {
+                'url': f'https://example{depth}.com',
+                'depth': str(depth),
+            })
+
+            self.assertEqual(response.status_code, 302)
+
+        self.assertEqual(Crawl.objects.count(), 5)
+
+        for i, crawl in enumerate(Crawl.objects.order_by('created_at')):
+            self.assertEqual(crawl.max_depth, i)
+
+    def test_add_crawl_with_advanced_options(self):
+        """Test creating a crawl with advanced options."""
+        response = self.client.post(self.add_url, {
+            'url': 'https://example.com',
+            'depth': '0',
+            'persona': 'CustomPersona',
+            'overwrite': True,
+            'update': True,
+            'index_only': True,
+        })
+
+        self.assertEqual(response.status_code, 302)
+
+        crawl = Crawl.objects.first()
+        config = crawl.config
+
+        self.assertEqual(config.get('DEFAULT_PERSONA'), 'CustomPersona')
+        self.assertEqual(config.get('OVERWRITE'), True)
+        self.assertEqual(config.get('ONLY_NEW'), False)  # opposite of update
+        self.assertEqual(config.get('INDEX_ONLY'), True)
+
+    def test_add_crawl_with_custom_config(self):
+        """Test creating a crawl with custom config overrides."""
+        # Django's test client can't easily POST the KeyValueWidget format,
+        # so this test would need to use the form directly or mock cleaned_data.
+        self.skipTest('TODO: POST KeyValueWidget config overrides via the form directly')
+
+    def test_add_empty_urls_fails(self):
+        """Test that submitting without URLs fails validation."""
+        response = self.client.post(self.add_url, {
+            'url': '',
+            'depth': '0',
+        })
+
+        # Should show form again with errors, not redirect
+        self.assertEqual(response.status_code, 200)
+        self.assertFormError(response, 'form', 'url', 'This field is required.')
+
+    def test_add_invalid_urls_fails(self):
+        """Test that invalid URLs fail validation."""
+        response = self.client.post(self.add_url, {
+            'url': 'not-a-url',
+            'depth': '0',
+        })
+
+        # Should show form again with errors
+        self.assertEqual(response.status_code, 200)
+        # Check for validation error (URL regex should fail)
+        self.assertContains(response, 'error')
+
+    def test_add_success_message_without_schedule(self):
+        """Test that success message is shown without schedule link."""
+        response = self.client.post(self.add_url, {
+            'url': 'https://example.com\nhttps://example.org',
+            'depth': '0',
+        }, follow=True)
+
+        # Check success message mentions crawl creation
+        messages = list(response.context['messages'])
+        self.assertEqual(len(messages), 1)
+        message_text = str(messages[0])
+
+        self.assertIn('Created crawl with 2 starting URL', message_text)
+        self.assertIn('View Crawl', message_text)
+        self.assertNotIn('scheduled to repeat', message_text)
+
+    def test_add_success_message_with_schedule(self):
+        """Test that success message includes schedule link."""
+        response = self.client.post(self.add_url, {
+            'url': 'https://example.com',
+            'depth': '0',
+            'schedule': 'weekly',
+        }, follow=True)
+
+        # Check success message mentions schedule
+        messages = list(response.context['messages'])
+        self.assertEqual(len(messages), 1)
+        message_text = str(messages[0])
+
+        self.assertIn('Created crawl', message_text)
+        self.assertIn('scheduled to repeat weekly', message_text)
+        self.assertIn('View Crawl', message_text)
+
+    def test_add_crawl_creates_source_file(self):
+        """Test that crawl creation saves URLs to sources file."""
+        response = self.client.post(self.add_url, {
+            'url': 'https://example.com',
+            'depth': '0',
+        })
+
+        self.assertEqual(response.status_code, 302)
+
+        # Check that source file was created in sources/ directory
+        from archivebox.config import CONSTANTS
+        sources_dir = CONSTANTS.SOURCES_DIR
+
+        # Should have created a source file
+        source_files = list(sources_dir.glob('*__web_ui_add_by_user_*.txt'))
+        self.assertGreater(len(source_files), 0)
+
+    def test_multiple_tags_are_saved(self):
+        """Test that multiple comma-separated tags are saved."""
+        response = self.client.post(self.add_url, {
+            'url': 'https://example.com',
+            'depth': '0',
+            'tag': 'tag1,tag2,tag3',
+        })
+
+        self.assertEqual(response.status_code, 302)
+
+        crawl = Crawl.objects.first()
+        self.assertEqual(crawl.tags_str, 'tag1,tag2,tag3')
+
+    def test_crawl_redirects_to_admin_change_page(self):
+        """Test that successful submission redirects to crawl admin page."""
+        response = self.client.post(self.add_url, {
+            'url': 'https://example.com',
+            'depth': '0',
+        })
+
+        crawl = Crawl.objects.first()
+        expected_redirect = f'/admin/crawls/crawl/{crawl.id}/change/'
+
+        self.assertRedirects(response, expected_redirect, fetch_redirect_response=False)
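A hedged sketch of running just these tests programmatically (the settings module path is an assumption mirroring the os.environ.setdefault() call at the top of the file):

    # Roughly equivalent to: python -m django test archivebox.core.tests.AddViewTests
    import os, django
    os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')
    django.setup()
    from django.conf import settings
    from django.test.utils import get_runner
    TestRunner = get_runner(settings)
    failures = TestRunner(verbosity=2).run_tests(['archivebox.core.tests.AddViewTests'])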

+ 3 - 3
archivebox/core/urls.py

@@ -7,10 +7,10 @@ from django.views.generic.base import RedirectView
 
 from archivebox.misc.serve_static import serve_static
 
-from core.admin_site import archivebox_admin
-from core.views import HomepageView, SnapshotView, PublicIndexView, AddView, HealthCheckView, live_progress_view
+from archivebox.core.admin_site import archivebox_admin
+from archivebox.core.views import HomepageView, SnapshotView, PublicIndexView, AddView, HealthCheckView, live_progress_view
 
-from workers.views import JobsDashboardView
+from archivebox.workers.views import JobsDashboardView
 
 # GLOBAL_CONTEXT doesn't work as-is, disabled for now: https://github.com/ArchiveBox/ArchiveBox/discussions/1306
 # from archivebox.config import VERSION, VERSIONS_AVAILABLE, CAN_UPGRADE

+ 65 - 41
archivebox/core/views.py

@@ -23,7 +23,7 @@ from admin_data_views.typing import TableContext, ItemContext
 from admin_data_views.utils import render_with_table_view, render_with_item_view, ItemLink
 
 import archivebox
-from archivebox.config import CONSTANTS, CONSTANTS_CONFIG, DATA_DIR, VERSION, SAVE_ARCHIVE_DOT_ORG
+from archivebox.config import CONSTANTS, CONSTANTS_CONFIG, DATA_DIR, VERSION
 from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG, ARCHIVING_CONFIG
 from archivebox.config.configset import get_flat_config, get_config, get_all_configs
 from archivebox.misc.util import base_url, htmlencode, ts_to_date_str
@@ -31,9 +31,9 @@ from archivebox.misc.serve_static import serve_static_with_byterange_support
 from archivebox.misc.logging_util import printable_filesize
 from archivebox.search import query_search_index
 
-from core.models import Snapshot
-from core.forms import AddLinkForm
-from crawls.models import Crawl
+from archivebox.core.models import Snapshot
+from archivebox.core.forms import AddLinkForm
+from archivebox.crawls.models import Crawl
 from archivebox.hooks import get_extractors, get_extractor_name
 
 
@@ -150,7 +150,6 @@ class SnapshotView(View):
             'status_color': 'success' if snapshot.is_archived else 'danger',
             'oldest_archive_date': ts_to_date_str(snapshot.oldest_archive_date),
             'warc_path': warc_path,
-            'SAVE_ARCHIVE_DOT_ORG': SAVE_ARCHIVE_DOT_ORG,
             'PREVIEW_ORIGINALS': SERVER_CONFIG.PREVIEW_ORIGINALS,
             'archiveresults': sorted(archiveresults.values(), key=lambda r: all_types.index(r['name']) if r['name'] in all_types else -r['size']),
             'best_result': best_result,
@@ -421,35 +420,34 @@ class AddView(UserPassesTestMixin, FormView):
         return SERVER_CONFIG.PUBLIC_ADD_VIEW or self.request.user.is_authenticated
 
     def get_context_data(self, **kwargs):
+        from archivebox.core.models import Tag
+
         return {
             **super().get_context_data(**kwargs),
-            'title': "Add URLs",
+            'title': "Create Crawl",
             # We can't just call request.build_absolute_uri in the template, because it would include query parameters
             'absolute_add_path': self.request.build_absolute_uri(self.request.path),
             'VERSION': VERSION,
             'FOOTER_INFO': SERVER_CONFIG.FOOTER_INFO,
             'stdout': '',
+            'available_tags': list(Tag.objects.all().order_by('name').values_list('name', flat=True)),
         }
 
     def form_valid(self, form):
         urls = form.cleaned_data["url"]
         print(f'[+] Adding URL: {urls}')
-        parser = form.cleaned_data.get("parser", "auto")  # default to auto-detect parser
-        tag = form.cleaned_data["tag"]
-        depth = 0 if form.cleaned_data["depth"] == "0" else 1
-        plugins = ','.join(form.cleaned_data["archive_methods"])
-        input_kwargs = {
-            "urls": urls,
-            "tag": tag,
-            "depth": depth,
-            "parser": parser,
-            "update_all": False,
-            "out_dir": DATA_DIR,
-            "created_by_id": self.request.user.pk,
-        }
-        if plugins:
-            input_kwargs.update({"plugins": plugins})
 
+        # Extract all form fields
+        tag = form.cleaned_data["tag"]
+        depth = int(form.cleaned_data["depth"])
+        plugins = ','.join(form.cleaned_data.get("plugins", []))
+        schedule = form.cleaned_data.get("schedule", "").strip()
+        persona = form.cleaned_data.get("persona", "Default")
+        overwrite = form.cleaned_data.get("overwrite", False)
+        update = form.cleaned_data.get("update", False)
+        index_only = form.cleaned_data.get("index_only", False)
+        notes = form.cleaned_data.get("notes", "")
+        custom_config = form.cleaned_data.get("config", {})
 
         from archivebox.config.permissions import HOSTNAME
 
@@ -461,33 +459,59 @@ class AddView(UserPassesTestMixin, FormView):
         # 2. create a new Crawl with the URLs from the file
         timestamp = timezone.now().strftime("%Y-%m-%d__%H-%M-%S")
         urls_content = sources_file.read_text()
+        # Build complete config
+        config = {
+            'ONLY_NEW': not update,
+            'INDEX_ONLY': index_only,
+            'OVERWRITE': overwrite,
+            'DEPTH': depth,
+            'PLUGINS': plugins or '',
+            'DEFAULT_PERSONA': persona or 'Default',
+        }
+
+        # Merge custom config overrides
+        config.update(custom_config)
+
         crawl = Crawl.objects.create(
             urls=urls_content,
             max_depth=depth,
             tags_str=tag,
+            notes=notes,
             label=f'{self.request.user.username}@{HOSTNAME}{self.request.path} {timestamp}',
             created_by_id=self.request.user.pk,
-            config={
-                # 'ONLY_NEW': not update,
-                # 'INDEX_ONLY': index_only,
-                # 'OVERWRITE': False,
-                'DEPTH': depth,
-                'PLUGINS': plugins or '',
-                # 'DEFAULT_PERSONA': persona or 'Default',
-            }
+            config=config
         )
-        
+
+        # 3. create a CrawlSchedule if schedule is provided
+        if schedule:
+            from archivebox.crawls.models import CrawlSchedule
+            crawl_schedule = CrawlSchedule.objects.create(
+                template=crawl,
+                schedule=schedule,
+                is_enabled=True,
+                label=crawl.label,
+                notes=f"Auto-created from add page. {notes}".strip(),
+                created_by_id=self.request.user.pk,
+            )
+            crawl.schedule = crawl_schedule
+            crawl.save(update_fields=['schedule'])
+
         # 4. start the Orchestrator & wait until it completes
         #    ... orchestrator will create the root Snapshot, which creates pending ArchiveResults, which gets run by the ArchiveResultActors ...
-        # from crawls.actors import CrawlActor
-        # from core.actors import SnapshotActor, ArchiveResultActor
-    
+        # from archivebox.crawls.actors import CrawlActor
+        # from archivebox.core.actors import SnapshotActor, ArchiveResultActor
+
 
         rough_url_count = urls.count('://')
 
+        # Build success message with schedule link if created
+        schedule_msg = ""
+        if schedule:
+            schedule_msg = f" and <a href='{crawl.schedule.admin_change_url}'>scheduled to repeat {schedule}</a>"
+
         messages.success(
             self.request,
-            mark_safe(f"Adding {rough_url_count} URLs in the background. (refresh in a minute start seeing results) {crawl.admin_change_url}"),
+            mark_safe(f"Created crawl with {rough_url_count} starting URL(s){schedule_msg}. Snapshots will be created and archived in the background. <a href='{crawl.admin_change_url}'>View Crawl →</a>"),
         )
 
         # Orchestrator (managed by supervisord) will pick up the queued crawl
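To make the flag-to-config mapping in form_valid() concrete, a worked example of the dict handed to Crawl.objects.create() for one hypothetical submission (update=False, index_only=True, overwrite=False, depth=1, plugins=['wget'], persona left at the default):

    config = {
        'ONLY_NEW': True,             # inverse of the 'update' checkbox
        'INDEX_ONLY': True,
        'OVERWRITE': False,
        'DEPTH': 1,
        'PLUGINS': 'wget',            # ','.join(selected plugins)
        'DEFAULT_PERSONA': 'Default',
    }
    config.update(custom_config)      # per-crawl overrides from the config editor win last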
@@ -516,8 +540,8 @@ def live_progress_view(request):
     """Simple JSON endpoint for live progress status - used by admin progress monitor."""
     try:
         from workers.orchestrator import Orchestrator
-        from crawls.models import Crawl
-        from core.models import Snapshot, ArchiveResult
+        from archivebox.crawls.models import Crawl
+        from archivebox.core.models import Snapshot, ArchiveResult
         from django.db.models import Case, When, Value, IntegerField
 
         # Get orchestrator status
@@ -764,9 +788,9 @@ def key_is_safe(key: str) -> bool:
 def find_config_source(key: str, merged_config: dict) -> str:
     """Determine where a config value comes from."""
     import os
-    from machine.models import Machine
+    from archivebox.machine.models import Machine
 
-    # Check if it's from machine config
+    # Check if the value comes from Machine.config
     try:
         machine = Machine.current()
         if machine.config and key in machine.config:
@@ -778,7 +802,7 @@ def find_config_source(key: str, merged_config: dict) -> str:
     if key in os.environ:
         return 'Environment'
 
-    # Check if it's from config file
+    # Check if the value comes from the config file
     from archivebox.config.configset import BaseConfigSet
     file_config = BaseConfigSet.load_from_file(CONSTANTS.CONFIG_FILE)
     if key in file_config:
@@ -796,7 +820,7 @@ def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext:
 
     # Get merged config that includes Machine.config overrides
     try:
-        from machine.models import Machine
+        from archivebox.machine.models import Machine
         machine = Machine.current()
         merged_config = get_config()
     except Exception as e:
@@ -859,7 +883,7 @@ def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext:
 @render_with_item_view
 def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
     import os
-    from machine.models import Machine
+    from archivebox.machine.models import Machine
     from archivebox.config.configset import BaseConfigSet
 
     CONFIGS = get_all_configs()

+ 2 - 2
archivebox/crawls/admin.py

@@ -17,8 +17,8 @@ from django_object_actions import action
 
 from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin
 
-from core.models import Snapshot
-from crawls.models import Crawl, CrawlSchedule
+from archivebox.core.models import Snapshot
+from archivebox.crawls.models import Crawl, CrawlSchedule
 
 
 def render_snapshots_list(snapshots_qs, limit=20):

+ 1 - 1
archivebox/crawls/apps.py

@@ -3,4 +3,4 @@ from django.apps import AppConfig
 
 class CrawlsConfig(AppConfig):
     default_auto_field = "django.db.models.BigAutoField"
-    name = "crawls"
+    name = "archivebox.crawls"

+ 149 - 11
archivebox/crawls/models.py

@@ -1,6 +1,7 @@
 __package__ = 'archivebox.crawls'
 
 from typing import TYPE_CHECKING, Iterable
+from datetime import timedelta
 from archivebox.uuid_compat import uuid7
 from pathlib import Path
 
@@ -11,13 +12,15 @@ from django.conf import settings
 from django.urls import reverse_lazy
 from django.utils import timezone
 from django_stubs_ext.db.models import TypedModelMeta
+from statemachine import State, registry
+from rich import print
 
 from archivebox.config import CONSTANTS
 from archivebox.base_models.models import ModelWithSerializers, ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, get_or_create_system_user_pk
-from workers.models import ModelWithStateMachine
+from archivebox.workers.models import ModelWithStateMachine, BaseStateMachine
 
 if TYPE_CHECKING:
-    from core.models import Snapshot, ArchiveResult
+    from archivebox.core.models import Snapshot, ArchiveResult
 
 
 class CrawlSchedule(ModelWithSerializers, ModelWithNotes, ModelWithHealthStats):
@@ -35,6 +38,7 @@ class CrawlSchedule(ModelWithSerializers, ModelWithNotes, ModelWithHealthStats):
     crawl_set: models.Manager['Crawl']
 
     class Meta(TypedModelMeta):
+        app_label = 'crawls'
         verbose_name = 'Scheduled Crawl'
         verbose_name_plural = 'Scheduled Crawls'
 
@@ -73,7 +77,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
     status = ModelWithStateMachine.StatusField(choices=ModelWithStateMachine.StatusChoices, default=ModelWithStateMachine.StatusChoices.QUEUED)
     retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)
 
-    state_machine_name = 'crawls.statemachines.CrawlMachine'
+    state_machine_name = 'crawls.models.CrawlMachine'
     retry_at_field_name = 'retry_at'
     state_field_name = 'status'
     StatusChoices = ModelWithStateMachine.StatusChoices
@@ -82,6 +86,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
     snapshot_set: models.Manager['Snapshot']
 
     class Meta(TypedModelMeta):
+        app_label = 'crawls'
         verbose_name = 'Crawl'
         verbose_name_plural = 'Crawls'
 
@@ -168,7 +173,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
         return Path(path_str)
 
     def create_root_snapshot(self) -> 'Snapshot':
-        from core.models import Snapshot
+        from archivebox.core.models import Snapshot
 
         first_url = self.get_urls_list()[0] if self.get_urls_list() else None
         if not first_url:
@@ -245,7 +250,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
             List of newly created Snapshot objects
         """
         import json
-        from core.models import Snapshot
+        from archivebox.core.models import Snapshot
 
         created_snapshots = []
 
@@ -309,9 +314,13 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
         import time
         from pathlib import Path
         from archivebox.hooks import run_hook, discover_hooks, process_hook_records
+        from archivebox.config.configset import get_config
+
+        # Get merged config with crawl context
+        config = get_config(crawl=self)
 
         # Discover and run on_Crawl hooks
-        hooks = discover_hooks('Crawl')
+        hooks = discover_hooks('Crawl', config=config)
         first_url = self.get_urls_list()[0] if self.get_urls_list() else ''
 
         for hook in hooks:
@@ -323,8 +332,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
             result = run_hook(
                 hook,
                 output_dir=output_dir,
-                timeout=60,
-                config_objects=[self],
+                config=config,
                 crawl_id=str(self.id),
                 source_url=first_url,
             )
@@ -380,7 +388,10 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
                     pass
 
         # Run on_CrawlEnd hooks
-        hooks = discover_hooks('CrawlEnd')
+        from archivebox.config.configset import get_config
+        config = get_config(crawl=self)
+
+        hooks = discover_hooks('CrawlEnd', config=config)
         first_url = self.get_urls_list()[0] if self.get_urls_list() else ''
 
         for hook in hooks:
@@ -391,8 +402,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
             result = run_hook(
                 hook,
                 output_dir=output_dir,
-                timeout=30,
-                config_objects=[self],
+                config=config,
                 crawl_id=str(self.id),
                 source_url=first_url,
             )
@@ -400,3 +410,131 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
             # Log failures but don't block
             if result and result['returncode'] != 0:
                 print(f'[yellow]⚠️ CrawlEnd hook failed: {hook.name}[/yellow]')
+
+
+# =============================================================================
+# State Machines
+# =============================================================================
+
+class CrawlMachine(BaseStateMachine, strict_states=True):
+    """
+    State machine for managing Crawl lifecycle.
+
+    Hook Lifecycle:
+    ┌─────────────────────────────────────────────────────────────┐
+    │ QUEUED State                                                │
+    │  • Waiting for crawl to be ready (has URLs)                 │
+    └─────────────────────────────────────────────────────────────┘
+                            ↓ tick() when can_start()
+    ┌─────────────────────────────────────────────────────────────┐
+    │ STARTED State → enter_started()                             │
+    │  1. crawl.run()                                             │
+    │     • discover_hooks('Crawl') → finds all crawl hooks       │
+    │     • For each hook:                                        │
+    │       - run_hook(script, output_dir, ...)                   │
+    │       - Parse JSONL from hook output                        │
+    │       - process_hook_records() → creates Snapshots          │
+    │     • create_root_snapshot() → root snapshot for crawl      │
+    │     • create_snapshots_from_urls() → from self.urls field   │
+    │                                                              │
+    │  2. Snapshots process independently with their own          │
+    │     state machines (see SnapshotMachine)                    │
+    └─────────────────────────────────────────────────────────────┘
+                            ↓ tick() when is_finished()
+    ┌─────────────────────────────────────────────────────────────┐
+    │ SEALED State → enter_sealed()                               │
+    │  • cleanup() → runs on_CrawlEnd hooks, kills background     │
+    │  • Set retry_at=None (no more processing)                   │
+    └─────────────────────────────────────────────────────────────┘
+    """
+
+    model_attr_name = 'crawl'
+
+    # States
+    queued = State(value=Crawl.StatusChoices.QUEUED, initial=True)
+    started = State(value=Crawl.StatusChoices.STARTED)
+    sealed = State(value=Crawl.StatusChoices.SEALED, final=True)
+
+    # Tick Event
+    tick = (
+        queued.to.itself(unless='can_start') |
+        queued.to(started, cond='can_start') |
+        started.to.itself(unless='is_finished') |
+        started.to(sealed, cond='is_finished')
+    )
+
+    def can_start(self) -> bool:
+        if not self.crawl.urls:
+            print(f'[red]⚠️ Crawl {self.crawl.id} cannot start: no URLs[/red]')
+            return False
+        urls_list = self.crawl.get_urls_list()
+        if not urls_list:
+            print(f'[red]⚠️ Crawl {self.crawl.id} cannot start: no valid URLs in urls field[/red]')
+            return False
+        return True
+
+    def is_finished(self) -> bool:
+        from archivebox.core.models import Snapshot
+
+        # check that at least one snapshot exists for this crawl
+        snapshots = Snapshot.objects.filter(crawl=self.crawl)
+        if not snapshots.exists():
+            return False
+
+        # check if all snapshots are sealed
+        # Snapshots handle their own background hooks via the step system,
+        # so we just need to wait for all snapshots to reach sealed state
+        if snapshots.filter(status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED]).exists():
+            return False
+
+        return True
+
+    @started.enter
+    def enter_started(self):
+        # Lock the crawl by bumping retry_at so other workers don't pick it up while we create snapshots
+        self.crawl.update_and_requeue(
+            retry_at=timezone.now() + timedelta(seconds=30),  # Lock for 30 seconds
+        )
+
+        try:
+            # Run the crawl - runs hooks, processes JSONL, creates snapshots
+            self.crawl.run()
+
+            # Update status to STARTED once snapshots are created
+            # Set retry_at to future so we don't busy-loop - wait for snapshots to process
+            self.crawl.update_and_requeue(
+                retry_at=timezone.now() + timedelta(seconds=5),  # Check again in 5s
+                status=Crawl.StatusChoices.STARTED,
+            )
+        except Exception as e:
+            print(f'[red]⚠️ Crawl {self.crawl.id} failed to start: {e}[/red]')
+            import traceback
+            traceback.print_exc()
+            # Re-raise so the worker knows it failed
+            raise
+
+    def on_started_to_started(self):
+        """Called when Crawl stays in started state (snapshots not sealed yet)."""
+        # Bump retry_at so we check again in a few seconds
+        self.crawl.update_and_requeue(
+            retry_at=timezone.now() + timedelta(seconds=5),
+        )
+
+    @sealed.enter
+    def enter_sealed(self):
+        # Clean up background hooks and run on_CrawlEnd hooks
+        self.crawl.cleanup()
+
+        self.crawl.update_and_requeue(
+            retry_at=None,
+            status=Crawl.StatusChoices.SEALED,
+        )
+
+
+# =============================================================================
+# Register State Machines
+# =============================================================================
+
+# Manually register state machines with python-statemachine registry
+# (normally auto-discovered from statemachines.py, but we define them here for clarity)
+registry.register(CrawlMachine)
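A hedged sketch of how a worker might drive this machine (assuming BaseStateMachine, like the StateMachine subclass it replaces, is constructed with the model instance):

    from archivebox.crawls.models import Crawl, CrawlMachine

    crawl = Crawl.objects.filter(status=Crawl.StatusChoices.QUEUED).first()  # hypothetical lookup
    machine = CrawlMachine(crawl)
    machine.tick()  # queued -> started: runs on_Crawl hooks, creates snapshots
    # workers keep ticking; started -> started just bumps retry_at by ~5s
    machine.tick()  # started -> sealed once is_finished(): cleanup() + on_CrawlEnd hooks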

+ 0 - 114
archivebox/crawls/statemachines.py

@@ -1,114 +0,0 @@
-__package__ = 'archivebox.crawls'
-
-import os
-from typing import ClassVar
-from datetime import timedelta
-from django.utils import timezone
-
-from rich import print
-
-from statemachine import State, StateMachine
-
-# from workers.actor import ActorType
-from crawls.models import Crawl
-
-
-class CrawlMachine(StateMachine, strict_states=True):
-    """State machine for managing Crawl lifecycle."""
-    
-    model: Crawl
-    
-    # States
-    queued = State(value=Crawl.StatusChoices.QUEUED, initial=True)
-    started = State(value=Crawl.StatusChoices.STARTED)
-    sealed = State(value=Crawl.StatusChoices.SEALED, final=True)
-    
-    # Tick Event
-    tick = (
-        queued.to.itself(unless='can_start') |
-        queued.to(started, cond='can_start') |
-        started.to.itself(unless='is_finished') |
-        started.to(sealed, cond='is_finished')
-    )
-    
-    def __init__(self, crawl, *args, **kwargs):
-        self.crawl = crawl
-        super().__init__(crawl, *args, **kwargs)
-    
-    def __repr__(self) -> str:
-        return f'Crawl[{self.crawl.id}]'
-
-    def __str__(self) -> str:
-        return self.__repr__()
-        
-    def can_start(self) -> bool:
-        if not self.crawl.urls:
-            print(f'[red]⚠️ Crawl {self.crawl.id} cannot start: no URLs[/red]')
-            return False
-        urls_list = self.crawl.get_urls_list()
-        if not urls_list:
-            print(f'[red]⚠️ Crawl {self.crawl.id} cannot start: no valid URLs in urls field[/red]')
-            return False
-        return True
-        
-    def is_finished(self) -> bool:
-        from core.models import Snapshot, ArchiveResult
-        
-        # check that at least one snapshot exists for this crawl
-        snapshots = Snapshot.objects.filter(crawl=self.crawl)
-        if not snapshots.exists():
-            return False
-        
-        # check to make sure no snapshots are in non-final states
-        if snapshots.filter(status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED]).exists():
-            return False
-        
-        # check that some archiveresults exist for this crawl
-        results = ArchiveResult.objects.filter(snapshot__crawl=self.crawl)
-        if not results.exists():
-            return False
-        
-        # check if all archiveresults are finished
-        if results.filter(status__in=[ArchiveResult.StatusChoices.QUEUED, ArchiveResult.StatusChoices.STARTED]).exists():
-            return False
-        
-        return True
-        
-    # def before_transition(self, event, state):
-    #     print(f"Before '{event}', on the '{state.id}' state.")
-    #     return "before_transition_return"
-
-    @started.enter
-    def enter_started(self):
-        # Suppressed: state transition logs
-        # Lock the crawl by bumping retry_at so other workers don't pick it up while we create snapshots
-        self.crawl.update_for_workers(
-            retry_at=timezone.now() + timedelta(seconds=30),  # Lock for 30 seconds
-        )
-
-        try:
-            # Run the crawl - runs hooks, processes JSONL, creates snapshots
-            self.crawl.run()
-
-            # Update status to STARTED once snapshots are created
-            self.crawl.update_for_workers(
-                retry_at=timezone.now(),  # Process immediately
-                status=Crawl.StatusChoices.STARTED,
-            )
-        except Exception as e:
-            print(f'[red]⚠️ Crawl {self.crawl.id} failed to start: {e}[/red]')
-            import traceback
-            traceback.print_exc()
-            # Re-raise so the worker knows it failed
-            raise
-
-    @sealed.enter
-    def enter_sealed(self):
-        # Clean up background hooks and run on_CrawlEnd hooks
-        self.crawl.cleanup()
-
-        # Suppressed: state transition logs
-        self.crawl.update_for_workers(
-            retry_at=None,
-            status=Crawl.StatusChoices.SEALED,
-        )

+ 179 - 305
archivebox/hooks.py

@@ -146,11 +146,16 @@ class HookResult(TypedDict, total=False):
     records: List[Dict[str, Any]]  # Parsed JSONL records with 'type' field
 
 
-def discover_hooks(event_name: str) -> List[Path]:
+def discover_hooks(
+    event_name: str,
+    filter_disabled: bool = True,
+    config: Optional[Dict[str, Any]] = None
+) -> List[Path]:
     """
     Find all hook scripts matching on_{event_name}__*.{sh,py,js} pattern.
 
     Searches both built-in and user plugin directories.
+    Filters out hooks from disabled plugins by default (respects USE_/SAVE_ flags).
     Returns scripts sorted alphabetically by filename for deterministic execution order.
 
     Hook naming convention uses numeric prefixes to control order:
@@ -158,9 +163,29 @@ def discover_hooks(event_name: str) -> List[Path]:
         on_Snapshot__15_singlefile.py   # runs second
         on_Snapshot__26_readability.py  # runs later (depends on singlefile)
 
-    Example:
+    Args:
+        event_name: Event name (e.g., 'Snapshot', 'Binary', 'Crawl')
+        filter_disabled: If True, skip hooks from disabled plugins (default: True)
+        config: Optional config dict from get_config() (merges file, env, machine, crawl, snapshot)
+                If None, will call get_config() with global scope
+
+    Returns:
+        Sorted list of hook script paths from enabled plugins only.
+
+    Examples:
+        # With proper config context (recommended):
+        from archivebox.config.configset import get_config
+        config = get_config(crawl=my_crawl, snapshot=my_snapshot)
+        discover_hooks('Snapshot', config=config)
+        # Returns: [Path('.../on_Snapshot__10_title.py'), ...] (wget excluded if SAVE_WGET=False)
+
+        # Without config (uses global defaults):
         discover_hooks('Snapshot')
-        # Returns: [Path('.../on_Snapshot__10_title.py'), Path('.../on_Snapshot__15_singlefile.py'), ...]
+        # Returns: [Path('.../on_Snapshot__10_title.py'), ...]
+
+        # Show all plugins regardless of enabled status:
+        discover_hooks('Snapshot', filter_disabled=False)
+        # Returns: [Path('.../on_Snapshot__10_title.py'), ..., Path('.../on_Snapshot__50_wget.py')]
     """
     hooks = []
 
@@ -177,45 +202,44 @@ def discover_hooks(event_name: str) -> List[Path]:
             pattern_direct = f'on_{event_name}__*.{ext}'
             hooks.extend(base_dir.glob(pattern_direct))
 
-    # Sort by filename (not full path) to ensure numeric prefix ordering works
-    # e.g., on_Snapshot__10_title.py sorts before on_Snapshot__26_readability.py
-    return sorted(set(hooks), key=lambda p: p.name)
-
-
-def discover_all_hooks() -> Dict[str, List[Path]]:
-    """
-    Discover all hooks organized by event name.
-
-    Returns a dict mapping event names to lists of hook script paths.
-    """
-    hooks_by_event: Dict[str, List[Path]] = {}
-
-    for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
-        if not base_dir.exists():
-            continue
+    # Filter by enabled plugins
+    if filter_disabled:
+        # Get merged config if not provided (lazy import to avoid circular dependency)
+        if config is None:
+            from archivebox.config.configset import get_config
+            config = get_config(scope='global')
+
+        enabled_hooks = []
+
+        for hook in hooks:
+            # Get plugin name from parent directory
+            # e.g., archivebox/plugins/wget/on_Snapshot__50_wget.py -> 'wget'
+            plugin_name = hook.parent.name
+
+            # Check if this is a plugin directory (not the root plugins dir)
+            if plugin_name in ('plugins', '.'):
+                # Hook is in root plugins directory, not a plugin subdir
+                # Include it by default (no filtering for non-plugin hooks)
+                enabled_hooks.append(hook)
+                continue
 
-        for ext in ('sh', 'py', 'js'):
-            for hook_path in base_dir.glob(f'*/on_*__*.{ext}'):
-                # Extract event name from filename: on_EventName__hook_name.ext
-                filename = hook_path.stem  # on_EventName__hook_name
-                if filename.startswith('on_') and '__' in filename:
-                    event_name = filename[3:].split('__')[0]  # EventName
-                    if event_name not in hooks_by_event:
-                        hooks_by_event[event_name] = []
-                    hooks_by_event[event_name].append(hook_path)
+            # Check if plugin is enabled
+            plugin_config = get_plugin_special_config(plugin_name, config)
+            if plugin_config['enabled']:
+                enabled_hooks.append(hook)
 
-    # Sort hooks within each event
-    for event_name in hooks_by_event:
-        hooks_by_event[event_name] = sorted(set(hooks_by_event[event_name]), key=lambda p: p.name)
+        hooks = enabled_hooks
 
-    return hooks_by_event
+    # Sort by filename (not full path) to ensure numeric prefix ordering works
+    # e.g., on_Snapshot__10_title.py sorts before on_Snapshot__26_readability.py
+    return sorted(set(hooks), key=lambda p: p.name)
 
 
 def run_hook(
     script: Path,
     output_dir: Path,
-    timeout: int = 300,
-    config_objects: Optional[List[Any]] = None,
+    config: Dict[str, Any],
+    timeout: Optional[int] = None,
     **kwargs: Any
 ) -> HookResult:
     """
@@ -224,31 +248,33 @@ def run_hook(
     This is the low-level hook executor. For running extractors with proper
     metadata handling, use call_extractor() instead.
 
-    Config is passed to hooks via environment variables with this priority:
-    1. Plugin schema defaults (config.json)
-    2. Config file (ArchiveBox.conf)
-    3. Environment variables
-    4. Machine.config (auto-included, lowest override priority)
-    5. config_objects (in order - later objects override earlier ones)
+    Config is passed to hooks via environment variables. Caller MUST use
+    get_config() to merge all sources (file, env, machine, crawl, snapshot).
 
     Args:
         script: Path to the hook script (.sh, .py, or .js)
         output_dir: Working directory for the script (where output files go)
+        config: Merged config dict from get_config(crawl=..., snapshot=...) - REQUIRED
         timeout: Maximum execution time in seconds
-        config_objects: Optional list of objects with .config JSON fields
-                       (e.g., [crawl, snapshot] - later items have higher priority)
+                 If None, auto-detects from PLUGINNAME_TIMEOUT config (fallback to TIMEOUT, default 300)
         **kwargs: Arguments passed to the script as --key=value
 
     Returns:
         HookResult with 'returncode', 'stdout', 'stderr', 'output_json', 'output_files', 'duration_ms'
+
+    Example:
+        from archivebox.config.configset import get_config
+        config = get_config(crawl=my_crawl, snapshot=my_snapshot)
+        result = run_hook(hook_path, output_dir, config=config, url=url, snapshot_id=id)
     """
     import time
     start_time = time.time()
 
-    # Auto-include Machine.config at the start (lowest priority among config_objects)
-    from machine.models import Machine
-    machine = Machine.current()
-    all_config_objects = [machine] + list(config_objects or [])
+    # Auto-detect timeout from plugin config if not explicitly provided
+    if timeout is None:
+        plugin_name = script.parent.name
+        plugin_config = get_plugin_special_config(plugin_name, config)
+        timeout = plugin_config['timeout']
 
     if not script.exists():
         return HookResult(
@@ -302,51 +328,16 @@ def run_hook(
     env['ARCHIVE_DIR'] = str(getattr(settings, 'ARCHIVE_DIR', Path.cwd() / 'archive'))
     env.setdefault('MACHINE_ID', getattr(settings, 'MACHINE_ID', '') or os.environ.get('MACHINE_ID', ''))
 
-    # If a Crawl is in config_objects, pass its OUTPUT_DIR for hooks that need to find crawl-level resources
-    for obj in all_config_objects:
-        if hasattr(obj, 'OUTPUT_DIR') and hasattr(obj, 'get_urls_list'):  # Duck-type check for Crawl
-            env['CRAWL_OUTPUT_DIR'] = str(obj.OUTPUT_DIR)
-            break
-
-    # Build overrides from any objects with .config fields (in order, later overrides earlier)
-    # all_config_objects includes Machine at the start, then any passed config_objects
-    overrides = {}
-    for obj in all_config_objects:
-        if obj and hasattr(obj, 'config') and obj.config:
-            # Strip 'config/' prefix from Machine.config keys (e.g., 'config/CHROME_BINARY' -> 'CHROME_BINARY')
-            for key, value in obj.config.items():
-                clean_key = key.removeprefix('config/')
-                overrides[clean_key] = value
-
-    # Get plugin config from JSON schemas with hierarchy resolution
-    # This merges: schema defaults -> config file -> env vars -> object config overrides
-    plugin_config = get_flat_plugin_config(overrides=overrides if overrides else None)
-    export_plugin_config_to_env(plugin_config, env)
-
-    # Also pass core config values that aren't in plugin schemas yet
-    # These are legacy values that may still be needed
-    from archivebox import config
-    env.setdefault('CHROME_BINARY', str(getattr(config, 'CHROME_BINARY', '')))
-    env.setdefault('WGET_BINARY', str(getattr(config, 'WGET_BINARY', '')))
-    env.setdefault('CURL_BINARY', str(getattr(config, 'CURL_BINARY', '')))
-    env.setdefault('GIT_BINARY', str(getattr(config, 'GIT_BINARY', '')))
-    env.setdefault('YOUTUBEDL_BINARY', str(getattr(config, 'YOUTUBEDL_BINARY', '')))
-    env.setdefault('SINGLEFILE_BINARY', str(getattr(config, 'SINGLEFILE_BINARY', '')))
-    env.setdefault('READABILITY_BINARY', str(getattr(config, 'READABILITY_BINARY', '')))
-    env.setdefault('MERCURY_BINARY', str(getattr(config, 'MERCURY_BINARY', '')))
-    env.setdefault('NODE_BINARY', str(getattr(config, 'NODE_BINARY', '')))
-    env.setdefault('TIMEOUT', str(getattr(config, 'TIMEOUT', 60)))
-    env.setdefault('CHECK_SSL_VALIDITY', str(getattr(config, 'CHECK_SSL_VALIDITY', True)))
-    env.setdefault('USER_AGENT', str(getattr(config, 'USER_AGENT', '')))
-    env.setdefault('RESOLUTION', str(getattr(config, 'RESOLUTION', '')))
-
-    # Pass SEARCH_BACKEND_ENGINE from new-style config
-    try:
-        from archivebox.config.configset import get_config
-        search_config = get_config()
-        env.setdefault('SEARCH_BACKEND_ENGINE', str(search_config.get('SEARCH_BACKEND_ENGINE', 'ripgrep')))
-    except Exception:
-        env.setdefault('SEARCH_BACKEND_ENGINE', 'ripgrep')
+    # Export all config values to environment (already merged by get_config())
+    for key, value in config.items():
+        if value is None:
+            continue
+        elif isinstance(value, bool):
+            env[key] = 'true' if value else 'false'
+        elif isinstance(value, (list, dict)):
+            env[key] = json.dumps(value)
+        else:
+            env[key] = str(value)
 
     # Create output directory if needed
     output_dir.mkdir(parents=True, exist_ok=True)
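The new export loop's serialization rules, shown on a tiny merged-config example (a standalone sketch; the key names are illustrative):

    import json

    config = {'SAVE_WGET': True, 'TIMEOUT': 60, 'ALLOWED_HOSTS': ['example.com'], 'UNSET': None}
    env = {}
    for key, value in config.items():
        if value is None:
            continue                                  # None values are never exported
        elif isinstance(value, bool):
            env[key] = 'true' if value else 'false'   # shell-friendly booleans
        elif isinstance(value, (list, dict)):
            env[key] = json.dumps(value)              # structured values as JSON strings
        else:
            env[key] = str(value)
    assert env == {'SAVE_WGET': 'true', 'TIMEOUT': '60', 'ALLOWED_HOSTS': '["example.com"]'}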
@@ -525,31 +516,35 @@ def collect_urls_from_plugins(snapshot_dir: Path) -> List[Dict[str, Any]]:
 def run_hooks(
     event_name: str,
     output_dir: Path,
-    timeout: int = 300,
+    config: Dict[str, Any],
+    timeout: Optional[int] = None,
     stop_on_failure: bool = False,
-    config_objects: Optional[List[Any]] = None,
     **kwargs: Any
 ) -> List[HookResult]:
     """
     Run all hooks for a given event.
 
     Args:
-        event_name: The event name to trigger (e.g., 'Snapshot__wget')
+        event_name: The event name to trigger (e.g., 'Snapshot', 'Crawl', 'Binary')
         output_dir: Working directory for hook scripts
-        timeout: Maximum execution time per hook
+        config: Merged config dict from get_config(crawl=..., snapshot=...) - REQUIRED
+        timeout: Maximum execution time per hook (None = auto-detect from plugin config)
         stop_on_failure: If True, stop executing hooks after first failure
-        config_objects: Optional list of objects with .config JSON fields
-                       (e.g., [crawl, snapshot] - later items have higher priority)
         **kwargs: Arguments passed to each hook script
 
     Returns:
         List of results from each hook execution
+
+    Example:
+        from archivebox.config.configset import get_config
+        config = get_config(crawl=my_crawl, snapshot=my_snapshot)
+        results = run_hooks('Snapshot', output_dir, config=config, url=url, snapshot_id=id)
     """
-    hooks = discover_hooks(event_name)
+    hooks = discover_hooks(event_name, config=config)
     results = []
 
     for hook in hooks:
-        result = run_hook(hook, output_dir, timeout=timeout, config_objects=config_objects, **kwargs)
+        result = run_hook(hook, output_dir, config=config, timeout=timeout, **kwargs)
 
         # Background hooks return None - skip adding to results
         if result is None:
@@ -638,24 +633,44 @@ EXTRACTOR_INDEXING_PRECEDENCE = [
 ]
 
 
-def get_enabled_plugins(config: Optional[Dict] = None) -> List[str]:
+def get_enabled_plugins(config: Optional[Dict[str, Any]] = None) -> List[str]:
     """
     Get the list of enabled plugins based on config and available hooks.
 
-    Checks for ENABLED_PLUGINS (or legacy ENABLED_EXTRACTORS) in config,
-    falls back to discovering available hooks from the plugins directory.
+    Filters plugins by USE_/SAVE_ flags. Only returns plugins that are enabled.
+
+    Args:
+        config: Merged config dict from get_config() - if None, uses global config
 
-    Returns plugin names sorted alphabetically (numeric prefix controls order).
+    Returns:
+        Plugin names sorted alphabetically (numeric prefix controls order).
+
+    Example:
+        from archivebox.config.configset import get_config
+        config = get_config(crawl=my_crawl, snapshot=my_snapshot)
+        enabled = get_enabled_plugins(config)  # ['wget', 'media', 'chrome', ...]
     """
-    if config:
-        # Support both new and legacy config keys
-        if 'ENABLED_PLUGINS' in config:
-            return config['ENABLED_PLUGINS']
-        if 'ENABLED_EXTRACTORS' in config:
-            return config['ENABLED_EXTRACTORS']
+    # Get merged config if not provided
+    if config is None:
+        from archivebox.config.configset import get_config
+        config = get_config(scope='global')
+
+    # Support explicit ENABLED_PLUGINS override (legacy)
+    if 'ENABLED_PLUGINS' in config:
+        return config['ENABLED_PLUGINS']
+    if 'ENABLED_EXTRACTORS' in config:
+        return config['ENABLED_EXTRACTORS']
+
+    # Filter all plugins by enabled status
+    all_plugins = get_plugins()
+    enabled = []
+
+    for plugin in all_plugins:
+        plugin_config = get_plugin_special_config(plugin, config)
+        if plugin_config['enabled']:
+            enabled.append(plugin)
 
-    # Discover from hooks - this is the source of truth
-    return get_plugins()
+    return enabled
 
 
 def discover_plugins_that_provide_interface(
@@ -822,37 +837,6 @@ def discover_plugin_configs() -> Dict[str, Dict[str, Any]]:
     return configs
 
 
-def get_merged_config_schema() -> Dict[str, Any]:
-    """
-    Get a merged JSONSchema combining all plugin config schemas.
-
-    This creates a single schema that can validate all plugin config keys.
-    Useful for validating the complete configuration at startup.
-
-    Returns:
-        Combined JSONSchema with all plugin properties merged.
-    """
-    plugin_configs = discover_plugin_configs()
-
-    merged_properties = {}
-    for plugin_name, schema in plugin_configs.items():
-        properties = schema.get('properties', {})
-        for key, prop_schema in properties.items():
-            if key in merged_properties:
-                # Key already exists from another plugin - log warning but keep first
-                import sys
-                print(f"Warning: Config key '{key}' defined in multiple plugins, using first definition", file=sys.stderr)
-                continue
-            merged_properties[key] = prop_schema
-
-    return {
-        "$schema": "http://json-schema.org/draft-07/schema#",
-        "type": "object",
-        "additionalProperties": True,  # Allow unknown keys (core config, etc.)
-        "properties": merged_properties,
-    }
-
-
 def get_config_defaults_from_plugins() -> Dict[str, Any]:
     """
     Get default values for all plugin config options.
@@ -873,173 +857,63 @@ def get_config_defaults_from_plugins() -> Dict[str, Any]:
     return defaults
 
 
-def resolve_config_value(
-    key: str,
-    prop_schema: Dict[str, Any],
-    env_vars: Dict[str, str],
-    config_file: Dict[str, str],
-    overrides: Optional[Dict[str, Any]] = None,
-) -> Any:
+def get_plugin_special_config(plugin_name: str, config: Dict[str, Any]) -> Dict[str, Any]:
     """
-    Resolve a single config value following the hierarchy and schema rules.
-
-    Resolution order (later overrides earlier):
-        1. Schema default
-        2. x-fallback (global config key)
-        3. Config file (ArchiveBox.conf)
-        4. Environment variables (including x-aliases)
-        5. Explicit overrides (User/Crawl/Snapshot config)
-
-    Args:
-        key: Config key name (e.g., 'WGET_TIMEOUT')
-        prop_schema: JSONSchema property definition for this key
-        env_vars: Environment variables dict
-        config_file: Config file values dict
-        overrides: Optional override values (from User/Crawl/Snapshot)
-
-    Returns:
-        Resolved value with appropriate type coercion.
-    """
-    value = None
-    prop_type = prop_schema.get('type', 'string')
-
-    # 1. Start with schema default
-    if 'default' in prop_schema:
-        value = prop_schema['default']
-
-    # 2. Check x-fallback (global config key)
-    fallback_key = prop_schema.get('x-fallback')
-    if fallback_key:
-        if fallback_key in env_vars:
-            value = env_vars[fallback_key]
-        elif fallback_key in config_file:
-            value = config_file[fallback_key]
-
-    # 3. Check config file for main key
-    if key in config_file:
-        value = config_file[key]
-
-    # 4. Check environment variables (main key and aliases)
-    keys_to_check = [key] + prop_schema.get('x-aliases', [])
-    for check_key in keys_to_check:
-        if check_key in env_vars:
-            value = env_vars[check_key]
-            break
-
-    # 5. Apply explicit overrides
-    if overrides and key in overrides:
-        value = overrides[key]
+    Extract special config keys for a plugin following naming conventions.
 
-    # Type coercion for env var strings
-    if value is not None and isinstance(value, str):
-        value = coerce_config_value(value, prop_type, prop_schema)
-
-    return value
-
-
-def coerce_config_value(value: str, prop_type: str, prop_schema: Dict[str, Any]) -> Any:
-    """
-    Coerce a string value to the appropriate type based on schema.
-
-    Args:
-        value: String value to coerce
-        prop_type: JSONSchema type ('boolean', 'integer', 'number', 'array', 'string')
-        prop_schema: Full property schema (for array item types, etc.)
-
-    Returns:
-        Coerced value of appropriate type.
-    """
-    if prop_type == 'boolean':
-        return value.lower() in ('true', '1', 'yes', 'on')
-    elif prop_type == 'integer':
-        try:
-            return int(value)
-        except ValueError:
-            return prop_schema.get('default', 0)
-    elif prop_type == 'number':
-        try:
-            return float(value)
-        except ValueError:
-            return prop_schema.get('default', 0.0)
-    elif prop_type == 'array':
-        # Try JSON parse first, fall back to comma-separated
-        try:
-            return json.loads(value)
-        except json.JSONDecodeError:
-            return [v.strip() for v in value.split(',') if v.strip()]
-    else:
-        return value
-
-
-def get_flat_plugin_config(
-    env_vars: Optional[Dict[str, str]] = None,
-    config_file: Optional[Dict[str, str]] = None,
-    overrides: Optional[Dict[str, Any]] = None,
-) -> Dict[str, Any]:
-    """
-    Get all plugin config values resolved according to hierarchy.
+    ArchiveBox recognizes 3 special config key patterns per plugin:
+        - {PLUGIN}_ENABLED: Enable/disable toggle (default True)
+        - {PLUGIN}_TIMEOUT: Plugin-specific timeout (falls back to TIMEOUT, default 300)
+        - {PLUGIN}_BINARY: Primary binary path (defaults to plugin_name)
 
-    This is the main function for getting plugin configuration.
-    It discovers all plugin schemas and resolves each config key.
+    These allow ArchiveBox to:
+        - Skip disabled plugins (optimization)
+        - Enforce plugin-specific timeouts automatically
+        - Discover plugin binaries for validation
 
     Args:
-        env_vars: Environment variables (defaults to os.environ)
-        config_file: Config file values (from ArchiveBox.conf)
-        overrides: Override values (from User/Crawl/Snapshot config fields)
+        plugin_name: Plugin name (e.g., 'wget', 'media', 'chrome')
+        config: Merged config dict from get_config() (merges file, env, machine, crawl, and snapshot scopes)
 
     Returns:
-        Flat dict of all resolved config values.
-        e.g., {'SAVE_WGET': True, 'WGET_TIMEOUT': 60, ...}
-    """
-    if env_vars is None:
-        env_vars = dict(os.environ)
-    if config_file is None:
-        config_file = {}
-
-    plugin_configs = discover_plugin_configs()
-    flat_config = {}
-
-    for plugin_name, schema in plugin_configs.items():
-        properties = schema.get('properties', {})
-        for key, prop_schema in properties.items():
-            flat_config[key] = resolve_config_value(
-                key, prop_schema, env_vars, config_file, overrides
-            )
-
-    return flat_config
-
-
-def export_plugin_config_to_env(
-    config: Dict[str, Any],
-    env: Optional[Dict[str, str]] = None,
-) -> Dict[str, str]:
-    """
-    Export plugin config values to environment variable format.
-
-    Converts all values to strings suitable for subprocess environment.
-    Arrays are JSON-encoded.
-
-    Args:
-        config: Flat config dict from get_flat_plugin_config()
-        env: Optional existing env dict to update (creates new if None)
+        Dict with standardized keys:
+            {
+                'enabled': True,         # bool
+                'timeout': 60,           # int, seconds
+                'binary': 'wget',        # str, path or name
+            }
 
-    Returns:
-        Environment dict with config values as strings.
+    Examples:
+        >>> from archivebox.config.configset import get_config
+        >>> config = get_config(crawl=my_crawl, snapshot=my_snapshot)
+        >>> get_plugin_special_config('wget', config)
+        {'enabled': True, 'timeout': 120, 'binary': '/usr/bin/wget'}
     """
-    if env is None:
-        env = {}
-
-    for key, value in config.items():
-        if value is None:
-            continue
-        elif isinstance(value, bool):
-            env[key] = 'true' if value else 'false'
-        elif isinstance(value, (list, dict)):
-            env[key] = json.dumps(value)
-        else:
-            env[key] = str(value)
+    plugin_upper = plugin_name.upper()
+
+    # 1. Enabled: PLUGINNAME_ENABLED (default True)
+    # Old names (USE_*, SAVE_*) are aliased in config.json via x-aliases
+    enabled_key = f'{plugin_upper}_ENABLED'
+    enabled = config.get(enabled_key)
+    if enabled is None:
+        enabled = True
+    elif isinstance(enabled, str):
+        # Handle string values from config file ("true"/"false")
+        enabled = enabled.lower() not in ('false', '0', 'no', '')
+
+    # 2. Timeout: PLUGINNAME_TIMEOUT (fallback to TIMEOUT, default 300)
+    timeout_key = f'{plugin_upper}_TIMEOUT'
+    timeout = config.get(timeout_key) or config.get('TIMEOUT', 300)
+
+    # 3. Binary: PLUGINNAME_BINARY (default to plugin_name)
+    binary_key = f'{plugin_upper}_BINARY'
+    binary = config.get(binary_key, plugin_name)
 
-    return env
+    return {
+        'enabled': bool(enabled),
+        'timeout': int(timeout),
+        'binary': str(binary),
+    }
 
 
 # =============================================================================
@@ -1233,7 +1107,7 @@ def find_binary_for_cmd(cmd: List[str], machine_id: str) -> Optional[str]:
     if not cmd:
         return None
 
-    from machine.models import Binary
+    from archivebox.machine.models import Binary
 
     bin_path_or_name = cmd[0] if isinstance(cmd, list) else cmd
 
@@ -1266,7 +1140,7 @@ def create_model_record(record: Dict[str, Any]) -> Any:
     Returns:
         Created/updated model instance, or None if type unknown
     """
-    from machine.models import Binary, Machine
+    from archivebox.machine.models import Binary, Machine
 
     record_type = record.pop('type', None)
     if not record_type:
@@ -1349,25 +1223,25 @@ def process_hook_records(records: List[Dict[str, Any]], overrides: Dict[str, Any
         try:
             # Dispatch to appropriate model's from_jsonl() method
             if record_type == 'Snapshot':
-                from core.models import Snapshot
+                from archivebox.core.models import Snapshot
                 obj = Snapshot.from_jsonl(record.copy(), overrides)
                 if obj:
                     stats['Snapshot'] = stats.get('Snapshot', 0) + 1
 
             elif record_type == 'Tag':
-                from core.models import Tag
+                from archivebox.core.models import Tag
                 obj = Tag.from_jsonl(record.copy(), overrides)
                 if obj:
                     stats['Tag'] = stats.get('Tag', 0) + 1
 
             elif record_type == 'Binary':
-                from machine.models import Binary
+                from archivebox.machine.models import Binary
                 obj = Binary.from_jsonl(record.copy(), overrides)
                 if obj:
                     stats['Binary'] = stats.get('Binary', 0) + 1
 
             elif record_type == 'Machine':
-                from machine.models import Machine
+                from archivebox.machine.models import Machine
                 obj = Machine.from_jsonl(record.copy(), overrides)
                 if obj:
                     stats['Machine'] = stats.get('Machine', 0) + 1

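Taken together, the reworked hooks API replaces config_objects lists with a single merged config dict. A minimal usage sketch under the signatures above (my_crawl, my_snapshot, and the output paths are hypothetical placeholders):

    from pathlib import Path
    from archivebox.config.configset import get_config
    from archivebox.hooks import run_hooks, get_enabled_plugins, get_plugin_special_config

    # merge file/env/machine/crawl/snapshot scopes once, then pass the dict everywhere
    config = get_config(crawl=my_crawl, snapshot=my_snapshot)   # hypothetical objects

    enabled = get_enabled_plugins(config)              # e.g. ['chrome', 'dom', 'wget', ...]
    wget = get_plugin_special_config('wget', config)   # {'enabled': ..., 'timeout': ..., 'binary': ...}

    results = run_hooks(
        'Snapshot',                        # event name
        output_dir=Path('archive/123'),    # hypothetical working dir
        config=config,                     # now required
        timeout=None,                      # None = per-plugin {PLUGIN}_TIMEOUT
        url='https://example.com',
        snapshot_id='123',
    )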
+ 1 - 1
archivebox/machine/admin.py

@@ -4,7 +4,7 @@ from django.contrib import admin
 from django.utils.html import format_html
 
 from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin
-from machine.models import Machine, NetworkInterface, Binary
+from archivebox.machine.models import Machine, NetworkInterface, Binary
 
 
 class MachineAdmin(ConfigEditorMixin, BaseModelAdmin):

+ 3 - 3
archivebox/machine/apps.py

@@ -5,11 +5,11 @@ from django.apps import AppConfig
 
 class MachineConfig(AppConfig):
     default_auto_field = 'django.db.models.BigAutoField'
-    
-    name = 'machine'
+
+    name = 'archivebox.machine'
     verbose_name = 'Machine Info'
 
 
 def register_admin(admin_site):
-    from machine.admin import register_admin
+    from archivebox.machine.admin import register_admin
     register_admin(admin_site)

+ 5 - 20
archivebox/machine/migrations/0001_squashed.py

@@ -14,9 +14,9 @@ class Migration(migrations.Migration):
 
     replaces = [
         ('machine', '0001_initial'),
-        ('machine', '0002_alter_machine_stats_binary'),
-        ('machine', '0003_alter_binary_options_and_more'),
-        ('machine', '0004_alter_binary_abspath_and_more'),
+        ('machine', '0002_alter_machine_stats_installedbinary'),
+        ('machine', '0003_alter_installedbinary_options_and_more'),
+        ('machine', '0004_alter_installedbinary_abspath_and_more'),
     ]
 
     dependencies = []
@@ -70,22 +70,7 @@ class Migration(migrations.Migration):
                 'unique_together': {('machine', 'ip_public', 'ip_local', 'mac_address', 'dns_server')},
             },
         ),
-        migrations.CreateModel(
-            name='Dependency',
-            fields=[
-                ('id', models.UUIDField(default=uuid4, editable=False, primary_key=True, serialize=False, unique=True)),
-                ('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
-                ('modified_at', models.DateTimeField(auto_now=True)),
-                ('bin_name', models.CharField(db_index=True, max_length=63, unique=True)),
-                ('bin_providers', models.CharField(default='*', max_length=127)),
-                ('custom_cmds', models.JSONField(blank=True, default=dict)),
-                ('config', models.JSONField(blank=True, default=dict)),
-            ],
-            options={
-                'verbose_name': 'Dependency',
-                'verbose_name_plural': 'Dependencies',
-            },
-        ),
+        # Dependency model removed - not needed anymore
         migrations.CreateModel(
             name='Binary',
             fields=[
@@ -100,7 +85,7 @@ class Migration(migrations.Migration):
                 ('version', models.CharField(blank=True, default=None, max_length=32)),
                 ('sha256', models.CharField(blank=True, default=None, max_length=64)),
                 ('machine', models.ForeignKey(blank=True, default=None, on_delete=django.db.models.deletion.CASCADE, to='machine.machine')),
-                ('dependency', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='binary_set', to='machine.dependency')),
+                # dependency FK removed - Dependency model deleted
             ],
             options={
                 'verbose_name': 'Binary',

+ 4 - 26
archivebox/machine/migrations/0002_rename_custom_cmds_to_overrides.py

@@ -1,6 +1,8 @@
 # Generated manually on 2025-12-26
+# NOTE: This migration is intentionally empty but kept to preserve the migration dependency chain.
+# The Dependency model was removed in 0004, so all operations have been stripped.
 
-from django.db import migrations, models
+from django.db import migrations
 
 
 class Migration(migrations.Migration):
@@ -10,29 +12,5 @@ class Migration(migrations.Migration):
     ]
 
     operations = [
-        migrations.RenameField(
-            model_name='dependency',
-            old_name='custom_cmds',
-            new_name='overrides',
-        ),
-        migrations.AlterField(
-            model_name='dependency',
-            name='bin_name',
-            field=models.CharField(db_index=True, help_text='Binary executable name (e.g., wget, yt-dlp, chromium)', max_length=63, unique=True),
-        ),
-        migrations.AlterField(
-            model_name='dependency',
-            name='bin_providers',
-            field=models.CharField(default='*', help_text='Comma-separated list of allowed providers: apt,brew,pip,npm,gem,nix,custom or * for any', max_length=127),
-        ),
-        migrations.AlterField(
-            model_name='dependency',
-            name='overrides',
-            field=models.JSONField(blank=True, default=dict, help_text="JSON map matching abx-pkg Binary.overrides format: {'pip': {'packages': ['pkg']}, 'apt': {'packages': ['pkg']}}"),
-        ),
-        migrations.AlterField(
-            model_name='dependency',
-            name='config',
-            field=models.JSONField(blank=True, default=dict, help_text='JSON map of env var config to use during install'),
-        ),
+        # All Dependency operations removed - model deleted in 0004
     ]

+ 5 - 33
archivebox/machine/migrations/0003_alter_dependency_id_alter_installedbinary_dependency_and_more.py

@@ -1,8 +1,8 @@
 # Generated by Django 6.0 on 2025-12-28 05:12
+# NOTE: This migration is intentionally empty but kept to preserve the migration dependency chain.
+# The Dependency model was removed in 0004, so all operations have been stripped.
 
-import django.db.models.deletion
-from archivebox import uuid_compat
-from django.db import migrations, models
+from django.db import migrations
 
 
 class Migration(migrations.Migration):
@@ -12,34 +12,6 @@ class Migration(migrations.Migration):
     ]
 
     operations = [
-        migrations.AlterField(
-            model_name='dependency',
-            name='id',
-            field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
-        ),
-        migrations.AlterField(
-            model_name='binary',
-            name='dependency',
-            field=models.ForeignKey(blank=True, help_text='The Dependency this binary satisfies', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='binary_set', to='machine.dependency'),
-        ),
-        migrations.AlterField(
-            model_name='binary',
-            name='id',
-            field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
-        ),
-        migrations.AlterField(
-            model_name='machine',
-            name='config',
-            field=models.JSONField(blank=True, default=dict, help_text='Machine-specific config overrides (e.g., resolved binary paths like WGET_BINARY)'),
-        ),
-        migrations.AlterField(
-            model_name='machine',
-            name='id',
-            field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
-        ),
-        migrations.AlterField(
-            model_name='networkinterface',
-            name='id',
-            field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
-        ),
+        # All operations removed - Dependency model deleted in 0004
+        # This is a stub migration for users upgrading from old dev versions
     ]

+ 28 - 0
archivebox/machine/migrations/0004_drop_dependency_table.py
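Given the naming conventions in get_plugin_special_config() above, the renamed key resolves with no plugin-specific code; a small hedged sketch (assuming the plugin name is 'archive_org', matching its directory):

    from archivebox.hooks import get_plugin_special_config

    config = {'ARCHIVE_ORG_ENABLED': 'false'}   # legacy SAVE_ARCHIVE_DOT_ORG maps here via x-aliases
    get_plugin_special_config('archive_org', config)
    # -> {'enabled': False, 'timeout': 300, 'binary': 'archive_org'}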

@@ -0,0 +1,28 @@
+# Generated migration - removes Dependency model entirely
+# NOTE: This is a cleanup migration for users upgrading from old dev versions
+# that had the Dependency model. Fresh installs never create this table.
+
+from django.db import migrations
+
+
+def drop_dependency_table(apps, schema_editor):
+    """
+    Drop old Dependency table if it exists (from dev versions that had it).
+    Safe to run multiple times, safe if table doesn't exist.
+
+    Does NOT touch machine_binary - that's our current Binary model table!
+    """
+    schema_editor.execute('DROP TABLE IF EXISTS machine_dependency')
+    # Also drop old InstalledBinary table if it somehow still exists
+    schema_editor.execute('DROP TABLE IF EXISTS machine_installedbinary')
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('machine', '0003_alter_dependency_id_alter_installedbinary_dependency_and_more'),
+    ]
+
+    operations = [
+        migrations.RunPython(drop_dependency_table, migrations.RunPython.noop),
+    ]

+ 0 - 56
archivebox/machine/migrations/0004_rename_installedbinary_to_binary.py

@@ -1,56 +0,0 @@
-# Generated migration - Clean slate for Binary model
-# Drops old InstalledBinary and Dependency tables, creates new Binary table
-
-from django.db import migrations, models
-import django.utils.timezone
-import archivebox.uuid_compat
-
-
-def drop_old_tables(apps, schema_editor):
-    """Drop old tables using raw SQL"""
-    schema_editor.execute('DROP TABLE IF EXISTS machine_installedbinary')
-    schema_editor.execute('DROP TABLE IF EXISTS machine_dependency')
-    schema_editor.execute('DROP TABLE IF EXISTS machine_binary')  # In case rename happened
-
-
-class Migration(migrations.Migration):
-
-    dependencies = [
-        ('machine', '0003_alter_dependency_id_alter_installedbinary_dependency_and_more'),
-    ]
-
-    operations = [
-        # Drop old tables using raw SQL
-        migrations.RunPython(drop_old_tables, migrations.RunPython.noop),
-
-        # Create new Binary model from scratch
-        migrations.CreateModel(
-            name='Binary',
-            fields=[
-                ('id', models.UUIDField(default=archivebox.uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True)),
-                ('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
-                ('modified_at', models.DateTimeField(auto_now=True)),
-                ('name', models.CharField(blank=True, db_index=True, default=None, max_length=63)),
-                ('binproviders', models.CharField(blank=True, default='env', help_text='Comma-separated list of allowed providers: apt,brew,pip,npm,env', max_length=127)),
-                ('overrides', models.JSONField(blank=True, default=dict, help_text="Provider-specific overrides: {'apt': {'packages': ['pkg']}, ...}")),
-                ('binprovider', models.CharField(blank=True, default=None, help_text='Provider that successfully installed this binary', max_length=31)),
-                ('abspath', models.CharField(blank=True, default=None, max_length=255)),
-                ('version', models.CharField(blank=True, default=None, max_length=32)),
-                ('sha256', models.CharField(blank=True, default=None, max_length=64)),
-                ('status', models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('succeeded', 'Succeeded'), ('failed', 'Failed')], db_index=True, default='queued', max_length=16)),
-                ('retry_at', models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, help_text='When to retry this binary installation', null=True)),
-                ('output_dir', models.CharField(blank=True, default='', help_text='Directory where installation hook logs are stored', max_length=255)),
-                ('num_uses_failed', models.PositiveIntegerField(default=0)),
-                ('num_uses_succeeded', models.PositiveIntegerField(default=0)),
-                ('machine', models.ForeignKey(blank=True, default=None, on_delete=models.deletion.CASCADE, to='machine.machine')),
-            ],
-            options={
-                'verbose_name': 'Binary',
-                'verbose_name_plural': 'Binaries',
-            },
-        ),
-        migrations.AddIndex(
-            model_name='binary',
-            index=models.Index(fields=['machine', 'name', 'abspath', 'version', 'sha256'], name='machine_bin_machine_idx'),
-        ),
-    ]

+ 143 - 5
archivebox/machine/models.py

@@ -4,11 +4,14 @@ import socket
 from archivebox.uuid_compat import uuid7
 from datetime import timedelta
 
+from statemachine import State, registry
+
 from django.db import models
 from django.utils import timezone
 from django.utils.functional import cached_property
 
 from archivebox.base_models.models import ModelWithHealthStats
+from archivebox.workers.models import BaseStateMachine
 from .detect import get_host_guid, get_os_info, get_vm_info, get_host_network, get_host_stats
 
 _CURRENT_MACHINE = None
@@ -50,6 +53,9 @@ class Machine(ModelWithHealthStats):
     objects: MachineManager = MachineManager()
     networkinterface_set: models.Manager['NetworkInterface']
 
+    class Meta:
+        app_label = 'machine'
+
     @classmethod
     def current(cls) -> 'Machine':
         global _CURRENT_MACHINE
@@ -115,6 +121,7 @@ class NetworkInterface(ModelWithHealthStats):
     objects: NetworkInterfaceManager = NetworkInterfaceManager()
 
     class Meta:
+        app_label = 'machine'
         unique_together = (('machine', 'ip_public', 'ip_local', 'mac_address', 'dns_server'),)
 
     @classmethod
@@ -206,11 +213,12 @@ class Binary(ModelWithHealthStats):
     num_uses_failed = models.PositiveIntegerField(default=0)
     num_uses_succeeded = models.PositiveIntegerField(default=0)
 
-    state_machine_name: str = 'machine.statemachines.BinaryMachine'
+    state_machine_name: str = 'machine.models.BinaryMachine'
 
     objects: BinaryManager = BinaryManager()
 
     class Meta:
+        app_label = 'machine'
         verbose_name = 'Binary'
         verbose_name_plural = 'Binaries'
         unique_together = (('machine', 'name', 'abspath', 'version', 'sha256'),)
@@ -302,9 +310,9 @@ class Binary(ModelWithHealthStats):
         DATA_DIR = getattr(settings, 'DATA_DIR', Path.cwd())
         return Path(DATA_DIR) / 'machines' / str(self.machine_id) / 'binaries' / self.name / str(self.id)
 
-    def update_for_workers(self, **kwargs):
+    def update_and_requeue(self, **kwargs):
         """
-        Update binary fields for worker state machine.
+        Update binary fields and requeue for worker state machine.
 
         Sets modified_at to ensure workers pick up changes.
         Always saves the model after updating.
@@ -325,6 +333,10 @@ class Binary(ModelWithHealthStats):
         """
         import json
         from archivebox.hooks import discover_hooks, run_hook
+        from archivebox.config.configset import get_config
+
+        # Get merged config (Binary doesn't have crawl/snapshot context)
+        config = get_config(scope='global')
 
         # Create output directory
         output_dir = self.OUTPUT_DIR
@@ -333,7 +345,7 @@ class Binary(ModelWithHealthStats):
         self.save()
 
         # Discover ALL on_Binary__install_* hooks
-        hooks = discover_hooks('Binary')
+        hooks = discover_hooks('Binary', config=config)
         if not hooks:
             self.status = self.StatusChoices.FAILED
             self.save()
@@ -361,7 +373,8 @@ class Binary(ModelWithHealthStats):
             result = run_hook(
                 hook,
                 output_dir=plugin_output_dir,
-                timeout=600,  # 10 min timeout
+                config=config,
+                timeout=600,  # 10 min timeout for binary installation
                 **hook_kwargs
             )
 
@@ -420,3 +433,128 @@ class Binary(ModelWithHealthStats):
                 kill_process(pid_file)
 
 
+# =============================================================================
+# Binary State Machine
+# =============================================================================
+
+class BinaryMachine(BaseStateMachine, strict_states=True):
+    """
+    State machine for managing Binary installation lifecycle.
+
+    Hook Lifecycle:
+    ┌─────────────────────────────────────────────────────────────┐
+    │ QUEUED State                                                │
+    │  • Binary needs to be installed                             │
+    └─────────────────────────────────────────────────────────────┘
+                            ↓ tick() when can_start()
+    ┌─────────────────────────────────────────────────────────────┐
+    │ STARTED State → enter_started()                             │
+    │  1. binary.run()                                            │
+    │     • discover_hooks('Binary') → all on_Binary__install_*   │
+    │     • Try each provider hook in sequence:                   │
+    │       - run_hook(script, output_dir, ...)                   │
+    │       - If returncode == 0:                                 │
+    │         * Read stdout.log                                   │
+    │         * Parse JSONL for 'Binary' record with abspath      │
+    │         * Update self: abspath, version, sha256, provider   │
+    │         * Set status=SUCCEEDED, RETURN                      │
+    │     • If no hook succeeds: set status=FAILED                │
+    └─────────────────────────────────────────────────────────────┘
+                            ↓ tick() checks status
+    ┌─────────────────────────────────────────────────────────────┐
+    │ SUCCEEDED / FAILED                                          │
+    │  • Set by binary.run() based on hook results                │
+    │  • Health stats incremented (num_uses_succeeded/failed)     │
+    └─────────────────────────────────────────────────────────────┘
+    """
+
+    model_attr_name = 'binary'
+
+    # States
+    queued = State(value=Binary.StatusChoices.QUEUED, initial=True)
+    started = State(value=Binary.StatusChoices.STARTED)
+    succeeded = State(value=Binary.StatusChoices.SUCCEEDED, final=True)
+    failed = State(value=Binary.StatusChoices.FAILED, final=True)
+
+    # Tick Event - transitions based on conditions
+    tick = (
+        queued.to.itself(unless='can_start') |
+        queued.to(started, cond='can_start') |
+        started.to.itself(unless='is_finished') |
+        started.to(succeeded, cond='is_succeeded') |
+        started.to(failed, cond='is_failed')
+    )
+
+    def can_start(self) -> bool:
+        """Check if binary installation can start."""
+        return bool(self.binary.name and self.binary.binproviders)
+
+    def is_succeeded(self) -> bool:
+        """Check if installation succeeded (status was set by run())."""
+        return self.binary.status == Binary.StatusChoices.SUCCEEDED
+
+    def is_failed(self) -> bool:
+        """Check if installation failed (status was set by run())."""
+        return self.binary.status == Binary.StatusChoices.FAILED
+
+    def is_finished(self) -> bool:
+        """Check if installation has completed (success or failure)."""
+        return self.binary.status in (
+            Binary.StatusChoices.SUCCEEDED,
+            Binary.StatusChoices.FAILED,
+        )
+
+    @queued.enter
+    def enter_queued(self):
+        """Binary is queued for installation."""
+        self.binary.update_and_requeue(
+            retry_at=timezone.now(),
+            status=Binary.StatusChoices.QUEUED,
+        )
+
+    @started.enter
+    def enter_started(self):
+        """Start binary installation."""
+        # Lock the binary while installation runs
+        self.binary.update_and_requeue(
+            retry_at=timezone.now() + timedelta(seconds=300),  # 5 min timeout for installation
+            status=Binary.StatusChoices.STARTED,
+        )
+
+        # Run installation hooks
+        self.binary.run()
+
+        # Save updated status (run() updates status to succeeded/failed)
+        self.binary.save()
+
+    @succeeded.enter
+    def enter_succeeded(self):
+        """Binary installed successfully."""
+        self.binary.update_and_requeue(
+            retry_at=None,
+            status=Binary.StatusChoices.SUCCEEDED,
+        )
+
+        # Increment health stats
+        self.binary.increment_health_stats(success=True)
+
+    @failed.enter
+    def enter_failed(self):
+        """Binary installation failed."""
+        self.binary.update_and_requeue(
+            retry_at=None,
+            status=Binary.StatusChoices.FAILED,
+        )
+
+        # Increment health stats
+        self.binary.increment_health_stats(success=False)
+
+
+# =============================================================================
+# State Machine Registration
+# =============================================================================
+
+# Manually register state machines with python-statemachine registry
+registry.register(BinaryMachine)
+
+

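A hedged sketch of how a worker might drive the relocated state machine (assuming BaseStateMachine, like the StateMachine subclass it replaces below, is constructed with the model instance; the real worker loop lives in archivebox.workers):

    from archivebox.machine.models import Binary, BinaryMachine

    binary = Binary.objects.create(name='wget', binproviders='apt,brew,env')
    sm = BinaryMachine(binary)

    sm.tick()   # queued -> started: enter_started() calls binary.run(), which runs install hooks
    sm.tick()   # started -> succeeded/failed, based on the status binary.run() recorded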
+ 0 - 112
archivebox/machine/statemachines.py

@@ -1,112 +0,0 @@
-__package__ = 'archivebox.machine'
-
-from datetime import timedelta
-from django.utils import timezone
-from django.db.models import F
-
-from statemachine import State, StateMachine
-
-from machine.models import Binary
-
-
-class BinaryMachine(StateMachine, strict_states=True):
-    """
-    State machine for managing Binary installation lifecycle.
-
-    Follows the unified pattern used by Crawl, Snapshot, and ArchiveResult:
-    - queued: Binary needs to be installed
-    - started: Installation hooks are running
-    - succeeded: Binary installed successfully (abspath, version, sha256 populated)
-    - failed: Installation failed permanently
-    """
-
-    model: Binary
-
-    # States
-    queued = State(value=Binary.StatusChoices.QUEUED, initial=True)
-    started = State(value=Binary.StatusChoices.STARTED)
-    succeeded = State(value=Binary.StatusChoices.SUCCEEDED, final=True)
-    failed = State(value=Binary.StatusChoices.FAILED, final=True)
-
-    # Tick Event - transitions based on conditions
-    tick = (
-        queued.to.itself(unless='can_start') |
-        queued.to(started, cond='can_start') |
-        started.to.itself(unless='is_finished') |
-        started.to(succeeded, cond='is_succeeded') |
-        started.to(failed, cond='is_failed')
-    )
-
-    def __init__(self, binary, *args, **kwargs):
-        self.binary = binary
-        super().__init__(binary, *args, **kwargs)
-
-    def __repr__(self) -> str:
-        return f'Binary[{self.binary.id}]'
-
-    def __str__(self) -> str:
-        return self.__repr__()
-
-    def can_start(self) -> bool:
-        """Check if binary installation can start."""
-        return bool(self.binary.name and self.binary.binproviders)
-
-    def is_succeeded(self) -> bool:
-        """Check if installation succeeded (status was set by run())."""
-        return self.binary.status == Binary.StatusChoices.SUCCEEDED
-
-    def is_failed(self) -> bool:
-        """Check if installation failed (status was set by run())."""
-        return self.binary.status == Binary.StatusChoices.FAILED
-
-    def is_finished(self) -> bool:
-        """Check if installation has completed (success or failure)."""
-        return self.binary.status in (
-            Binary.StatusChoices.SUCCEEDED,
-            Binary.StatusChoices.FAILED,
-        )
-
-    @queued.enter
-    def enter_queued(self):
-        """Binary is queued for installation."""
-        self.binary.update_for_workers(
-            retry_at=timezone.now(),
-            status=Binary.StatusChoices.QUEUED,
-        )
-
-    @started.enter
-    def enter_started(self):
-        """Start binary installation."""
-        # Lock the binary while installation runs
-        self.binary.update_for_workers(
-            retry_at=timezone.now() + timedelta(seconds=300),  # 5 min timeout for installation
-            status=Binary.StatusChoices.STARTED,
-        )
-
-        # Run installation hooks
-        self.binary.run()
-
-        # Save updated status (run() updates status to succeeded/failed)
-        self.binary.save()
-
-    @succeeded.enter
-    def enter_succeeded(self):
-        """Binary installed successfully."""
-        self.binary.update_for_workers(
-            retry_at=None,
-            status=Binary.StatusChoices.SUCCEEDED,
-        )
-
-        # Increment health stats
-        Binary.objects.filter(pk=self.binary.pk).update(num_uses_succeeded=F('num_uses_succeeded') + 1)
-
-    @failed.enter
-    def enter_failed(self):
-        """Binary installation failed."""
-        self.binary.update_for_workers(
-            retry_at=None,
-            status=Binary.StatusChoices.FAILED,
-        )
-
-        # Increment health stats
-        Binary.objects.filter(pk=self.binary.pk).update(num_uses_failed=F('num_uses_failed') + 1)

+ 6 - 58
archivebox/misc/jsonl.py

@@ -250,68 +250,13 @@ def process_records(
                 yield result
 
 
-def get_or_create_snapshot(record: Dict[str, Any], created_by_id: Optional[int] = None):
-    """
-    Get or create a Snapshot from a JSONL record.
-
-    Returns the Snapshot instance.
-    """
-    from core.models import Snapshot
-    from archivebox.base_models.models import get_or_create_system_user_pk
-    from archivebox.misc.util import parse_date
-
-    created_by_id = created_by_id or get_or_create_system_user_pk()
-
-    # Extract fields from record
-    url = record.get('url')
-    if not url:
-        raise ValueError("Record missing required 'url' field")
-
-    title = record.get('title')
-    tags_str = record.get('tags', '')
-    bookmarked_at = record.get('bookmarked_at')
-    depth = record.get('depth', 0)
-    crawl_id = record.get('crawl_id')
-    parent_snapshot_id = record.get('parent_snapshot_id')
-
-    # Parse bookmarked_at if string
-    if bookmarked_at and isinstance(bookmarked_at, str):
-        bookmarked_at = parse_date(bookmarked_at)
-
-    # Use the manager's create_or_update_from_dict method
-    snapshot = Snapshot.objects.create_or_update_from_dict(
-        {'url': url, 'title': title, 'tags': tags_str},
-        created_by_id=created_by_id
-    )
-
-    # Update additional fields if provided
-    update_fields = []
-    if depth is not None and snapshot.depth != depth:
-        snapshot.depth = depth
-        update_fields.append('depth')
-    if parent_snapshot_id and str(snapshot.parent_snapshot_id) != str(parent_snapshot_id):
-        snapshot.parent_snapshot_id = parent_snapshot_id
-        update_fields.append('parent_snapshot_id')
-    if bookmarked_at and snapshot.bookmarked_at != bookmarked_at:
-        snapshot.bookmarked_at = bookmarked_at
-        update_fields.append('bookmarked_at')
-    if crawl_id and str(snapshot.crawl_id) != str(crawl_id):
-        snapshot.crawl_id = crawl_id
-        update_fields.append('crawl_id')
-
-    if update_fields:
-        snapshot.save(update_fields=update_fields + ['modified_at'])
-
-    return snapshot
-
-
 def get_or_create_tag(record: Dict[str, Any]):
     """
     Get or create a Tag from a JSONL record.
 
     Returns the Tag instance.
     """
-    from core.models import Tag
+    from archivebox.core.models import Tag
 
     name = record.get('name')
     if not name:
@@ -353,8 +298,11 @@ def process_jsonl_records(records: Iterator[Dict[str, Any]], created_by_id: Opti
 
         elif record_type == TYPE_SNAPSHOT or 'url' in record:
             try:
-                snapshot = get_or_create_snapshot(record, created_by_id=created_by_id)
-                results['snapshots'].append(snapshot)
+                from archivebox.core.models import Snapshot
+                overrides = {'created_by_id': created_by_id} if created_by_id else {}
+                snapshot = Snapshot.from_jsonl(record, overrides=overrides)
+                if snapshot:
+                    results['snapshots'].append(snapshot)
             except ValueError:
                 continue
 

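The dispatch is now uniform: every supported record type funnels through the model's own from_jsonl(record, overrides) classmethod. A minimal hedged example (record fields beyond type/url/title follow the formats used elsewhere in this diff):

    from archivebox.core.models import Snapshot

    record = {'type': 'Snapshot', 'url': 'https://example.com', 'title': 'Example'}
    snapshot = Snapshot.from_jsonl(record, overrides={'created_by_id': 1})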
+ 3 - 3
archivebox/misc/logging_util.py

@@ -17,7 +17,7 @@ from dataclasses import dataclass
 from typing import Any, Optional, List, Dict, Union, Iterable, IO, TYPE_CHECKING
 
 if TYPE_CHECKING:
-    from core.models import Snapshot
+    from archivebox.core.models import Snapshot
 
 from rich import print
 from rich.panel import Panel
@@ -257,7 +257,7 @@ def log_archiving_paused(num_links: int, idx: int, timestamp: str):
 
 def log_archiving_finished(num_links: int):
 
-    from core.models import Snapshot
+    from archivebox.core.models import Snapshot
 
     end_ts = datetime.now(timezone.utc)
     _LAST_RUN_STATS.archiving_end_ts = end_ts
@@ -395,7 +395,7 @@ def log_list_started(filter_patterns: Optional[List[str]], filter_type: str):
     print('    {}'.format(' '.join(filter_patterns or ())))
 
 def log_list_finished(snapshots):
-    from core.models import Snapshot
+    from archivebox.core.models import Snapshot
     print()
     print('---------------------------------------------------------------------------------------------------')
     print(Snapshot.objects.filter(pk__in=[s.pk for s in snapshots]).to_csv(cols=['timestamp', 'is_archived', 'num_outputs', 'url'], header=True, ljust=16, separator=' | '))

+ 0 - 335
archivebox/misc/tests.py

@@ -1,335 +0,0 @@
-__package__ = 'abx.archivebox'
-
-# from django.test import TestCase
-
-# from .toml_util import convert, TOML_HEADER
-
-# TEST_INPUT = """
-# [SERVER_CONFIG]
-# IS_TTY=False
-# USE_COLOR=False
-# SHOW_PROGRESS=False
-# IN_DOCKER=False
-# IN_QEMU=False
-# PUID=501
-# PGID=20
-# CONFIG_FILE=/opt/archivebox/data/ArchiveBox.conf
-# ONLY_NEW=True
-# TIMEOUT=60
-# MEDIA_TIMEOUT=3600
-# OUTPUT_PERMISSIONS=644
-# RESTRICT_FILE_NAMES=windows
-# URL_DENYLIST=\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$
-# URL_ALLOWLIST=None
-# ADMIN_USERNAME=None
-# ADMIN_PASSWORD=None
-# ENFORCE_ATOMIC_WRITES=True
-# TAG_SEPARATOR_PATTERN=[,]
-# SECRET_KEY=xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
-# BIND_ADDR=127.0.0.1:8000
-# ALLOWED_HOSTS=*
-# DEBUG=False
-# PUBLIC_INDEX=True
-# PUBLIC_SNAPSHOTS=True
-# PUBLIC_ADD_VIEW=False
-# FOOTER_INFO=Content is hosted for personal archiving purposes only.  Contact server owner for any takedown requests.
-# SNAPSHOTS_PER_PAGE=40
-# CUSTOM_TEMPLATES_DIR=None
-# TIME_ZONE=UTC
-# TIMEZONE=UTC
-# REVERSE_PROXY_USER_HEADER=Remote-User
-# REVERSE_PROXY_WHITELIST=
-# LOGOUT_REDIRECT_URL=/
-# PREVIEW_ORIGINALS=True
-# LDAP=False
-# LDAP_SERVER_URI=None
-# LDAP_BIND_DN=None
-# LDAP_BIND_PASSWORD=None
-# LDAP_USER_BASE=None
-# LDAP_USER_FILTER=None
-# LDAP_USERNAME_ATTR=None
-# LDAP_FIRSTNAME_ATTR=None
-# LDAP_LASTNAME_ATTR=None
-# LDAP_EMAIL_ATTR=None
-# LDAP_CREATE_SUPERUSER=False
-# SAVE_TITLE=True
-# SAVE_FAVICON=True
-# SAVE_WGET=True
-# SAVE_WGET_REQUISITES=True
-# SAVE_SINGLEFILE=True
-# SAVE_READABILITY=True
-# SAVE_MERCURY=True
-# SAVE_HTMLTOTEXT=True
-# SAVE_PDF=True
-# SAVE_SCREENSHOT=True
-# SAVE_DOM=True
-# SAVE_HEADERS=True
-# SAVE_WARC=True
-# SAVE_GIT=True
-# SAVE_MEDIA=True
-# SAVE_ARCHIVE_DOT_ORG=True
-# RESOLUTION=1440,2000
-# GIT_DOMAINS=github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht
-# CHECK_SSL_VALIDITY=True
-# MEDIA_MAX_SIZE=750m
-# USER_AGENT=None
-# CURL_USER_AGENT=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/0.8.0 (+https://github.com/ArchiveBox/ArchiveBox/) curl/curl 8.4.0 (x86_64-apple-darwin23.0)
-# WGET_USER_AGENT=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/0.8.0 (+https://github.com/ArchiveBox/ArchiveBox/) wget/GNU Wget 1.24.5
-# CHROME_USER_AGENT=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/0.8.0 (+https://github.com/ArchiveBox/ArchiveBox/)
-# COOKIES_FILE=None
-# CHROME_USER_DATA_DIR=None
-# CHROME_TIMEOUT=0
-# CHROME_HEADLESS=True
-# CHROME_SANDBOX=True
-# CHROME_EXTRA_ARGS=[]
-# YOUTUBEDL_ARGS=['--restrict-filenames', '--trim-filenames', '128', '--write-description', '--write-info-json', '--write-annotations', '--write-thumbnail', '--no-call-home', '--write-sub', '--write-auto-subs', '--convert-subs=srt', '--yes-playlist', '--continue', '--no-abort-on-error', '--ignore-errors', '--geo-bypass', '--add-metadata', '--format=(bv*+ba/b)[filesize<=750m][filesize_approx<=?750m]/(bv*+ba/b)']
-# YOUTUBEDL_EXTRA_ARGS=[]
-# WGET_ARGS=['--no-verbose', '--adjust-extension', '--convert-links', '--force-directories', '--backup-converted', '--span-hosts', '--no-parent', '-e', 'robots=off']
-# WGET_EXTRA_ARGS=[]
-# CURL_ARGS=['--silent', '--location', '--compressed']
-# CURL_EXTRA_ARGS=[]
-# GIT_ARGS=['--recursive']
-# SINGLEFILE_ARGS=[]
-# SINGLEFILE_EXTRA_ARGS=[]
-# MERCURY_ARGS=['--format=text']
-# MERCURY_EXTRA_ARGS=[]
-# FAVICON_PROVIDER=https://www.google.com/s2/favicons?domain={}
-# USE_INDEXING_BACKEND=True
-# USE_SEARCHING_BACKEND=True
-# SEARCH_BACKEND_ENGINE=ripgrep
-# SEARCH_BACKEND_HOST_NAME=localhost
-# SEARCH_BACKEND_PORT=1491
-# SEARCH_BACKEND_PASSWORD=SecretPassword
-# SEARCH_PROCESS_HTML=True
-# SONIC_COLLECTION=archivebox
-# SONIC_BUCKET=snapshots
-# SEARCH_BACKEND_TIMEOUT=90
-# FTS_SEPARATE_DATABASE=True
-# FTS_TOKENIZERS=porter unicode61 remove_diacritics 2
-# FTS_SQLITE_MAX_LENGTH=1000000000
-# USE_CURL=True
-# USE_WGET=True
-# USE_SINGLEFILE=True
-# USE_READABILITY=True
-# USE_MERCURY=True
-# USE_GIT=True
-# USE_CHROME=True
-# USE_NODE=True
-# USE_YOUTUBEDL=True
-# USE_RIPGREP=True
-# CURL_BINARY=curl
-# GIT_BINARY=git
-# WGET_BINARY=wget
-# SINGLEFILE_BINARY=single-file
-# READABILITY_BINARY=readability-extractor
-# MERCURY_BINARY=postlight-parser
-# YOUTUBEDL_BINARY=yt-dlp
-# NODE_BINARY=node
-# RIPGREP_BINARY=rg
-# CHROME_BINARY=chrome
-# POCKET_CONSUMER_KEY=None
-# USER=squash
-# PACKAGE_DIR=/opt/archivebox/archivebox
-# TEMPLATES_DIR=/opt/archivebox/archivebox/templates
-# ARCHIVE_DIR=/opt/archivebox/data/archive
-# SOURCES_DIR=/opt/archivebox/data/sources
-# LOGS_DIR=/opt/archivebox/data/logs
-# PERSONAS_DIR=/opt/archivebox/data/personas
-# URL_DENYLIST_PTN=re.compile('\\.(css|js|otf|ttf|woff|woff2|gstatic\\.com|googleapis\\.com/css)(\\?.*)?$', re.IGNORECASE|re.MULTILINE)
-# URL_ALLOWLIST_PTN=None
-# DIR_OUTPUT_PERMISSIONS=755
-# ARCHIVEBOX_BINARY=/opt/archivebox/.venv/bin/archivebox
-# VERSION=0.8.0
-# COMMIT_HASH=102e87578c6036bb0132dd1ebd17f8f05ffc880f
-# BUILD_TIME=2024-05-15 03:28:05 1715768885
-# VERSIONS_AVAILABLE=None
-# CAN_UPGRADE=False
-# PYTHON_BINARY=/opt/archivebox/.venv/bin/python3.10
-# PYTHON_VERSION=3.10.14
-# DJANGO_BINARY=/opt/archivebox/.venv/lib/python3.10/site-packages/django/__init__.py
-# DJANGO_VERSION=5.0.6 final (0)
-# SQLITE_BINARY=/opt/homebrew/Cellar/[email protected]/3.10.14/Frameworks/Python.framework/Versions/3.10/lib/python3.10/sqlite3/dbapi2.py
-# SQLITE_VERSION=2.6.0
-# CURL_VERSION=curl 8.4.0 (x86_64-apple-darwin23.0)
-# WGET_VERSION=GNU Wget 1.24.5
-# WGET_AUTO_COMPRESSION=True
-# RIPGREP_VERSION=ripgrep 14.1.0
-# SINGLEFILE_VERSION=None
-# READABILITY_VERSION=None
-# MERCURY_VERSION=None
-# GIT_VERSION=git version 2.44.0
-# YOUTUBEDL_VERSION=2024.04.09
-# CHROME_VERSION=Google Chrome 124.0.6367.207
-# NODE_VERSION=v21.7.3
-# """
-
-
-# EXPECTED_OUTPUT = TOML_HEADER + '''[SERVER_CONFIG]
-# IS_TTY = false
-# USE_COLOR = false
-# SHOW_PROGRESS = false
-# IN_DOCKER = false
-# IN_QEMU = false
-# PUID = 501
-# PGID = 20
-# CONFIG_FILE = "/opt/archivebox/data/ArchiveBox.conf"
-# ONLY_NEW = true
-# TIMEOUT = 60
-# MEDIA_TIMEOUT = 3600
-# OUTPUT_PERMISSIONS = 644
-# RESTRICT_FILE_NAMES = "windows"
-# URL_DENYLIST = "\\\\.(css|js|otf|ttf|woff|woff2|gstatic\\\\.com|googleapis\\\\.com/css)(\\\\?.*)?$"
-# URL_ALLOWLIST = null
-# ADMIN_USERNAME = null
-# ADMIN_PASSWORD = null
-# ENFORCE_ATOMIC_WRITES = true
-# TAG_SEPARATOR_PATTERN = "[,]"
-# SECRET_KEY = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
-# BIND_ADDR = "127.0.0.1:8000"
-# ALLOWED_HOSTS = "*"
-# DEBUG = false
-# PUBLIC_INDEX = true
-# PUBLIC_SNAPSHOTS = true
-# PUBLIC_ADD_VIEW = false
-# FOOTER_INFO = "Content is hosted for personal archiving purposes only.  Contact server owner for any takedown requests."
-# SNAPSHOTS_PER_PAGE = 40
-# CUSTOM_TEMPLATES_DIR = null
-# TIME_ZONE = "UTC"
-# TIMEZONE = "UTC"
-# REVERSE_PROXY_USER_HEADER = "Remote-User"
-# REVERSE_PROXY_WHITELIST = ""
-# LOGOUT_REDIRECT_URL = "/"
-# PREVIEW_ORIGINALS = true
-# LDAP = false
-# LDAP_SERVER_URI = null
-# LDAP_BIND_DN = null
-# LDAP_BIND_PASSWORD = null
-# LDAP_USER_BASE = null
-# LDAP_USER_FILTER = null
-# LDAP_USERNAME_ATTR = null
-# LDAP_FIRSTNAME_ATTR = null
-# LDAP_LASTNAME_ATTR = null
-# LDAP_EMAIL_ATTR = null
-# LDAP_CREATE_SUPERUSER = false
-# SAVE_TITLE = true
-# SAVE_FAVICON = true
-# SAVE_WGET = true
-# SAVE_WGET_REQUISITES = true
-# SAVE_SINGLEFILE = true
-# SAVE_READABILITY = true
-# SAVE_MERCURY = true
-# SAVE_HTMLTOTEXT = true
-# SAVE_PDF = true
-# SAVE_SCREENSHOT = true
-# SAVE_DOM = true
-# SAVE_HEADERS = true
-# SAVE_WARC = true
-# SAVE_GIT = true
-# SAVE_MEDIA = true
-# SAVE_ARCHIVE_DOT_ORG = true
-# RESOLUTION = [1440, 2000]
-# GIT_DOMAINS = "github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht"
-# CHECK_SSL_VALIDITY = true
-# MEDIA_MAX_SIZE = "750m"
-# USER_AGENT = null
-# CURL_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/0.8.0 (+https://github.com/ArchiveBox/ArchiveBox/) curl/curl 8.4.0 (x86_64-apple-darwin23.0)"
-# WGET_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/0.8.0 (+https://github.com/ArchiveBox/ArchiveBox/) wget/GNU Wget 1.24.5"
-# CHROME_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/0.8.0 (+https://github.com/ArchiveBox/ArchiveBox/)"
-# COOKIES_FILE = null
-# CHROME_USER_DATA_DIR = null
-# CHROME_TIMEOUT = false
-# CHROME_HEADLESS = true
-# CHROME_SANDBOX = true
-# CHROME_EXTRA_ARGS = []
-# YOUTUBEDL_ARGS = ["--restrict-filenames", "--trim-filenames", "128", "--write-description", "--write-info-json", "--write-annotations", "--write-thumbnail", "--no-call-home", "--write-sub", "--write-auto-subs", "--convert-subs=srt", "--yes-playlist", "--continue", "--no-abort-on-error", "--ignore-errors", "--geo-bypass", "--add-metadata", "--format=(bv*+ba/b)[filesize<=750m][filesize_approx<=?750m]/(bv*+ba/b)"]
-# YOUTUBEDL_EXTRA_ARGS = []
-# WGET_ARGS = ["--no-verbose", "--adjust-extension", "--convert-links", "--force-directories", "--backup-converted", "--span-hosts", "--no-parent", "-e", "robots=off"]
-# WGET_EXTRA_ARGS = []
-# CURL_ARGS = ["--silent", "--location", "--compressed"]
-# CURL_EXTRA_ARGS = []
-# GIT_ARGS = ["--recursive"]
-# SINGLEFILE_ARGS = []
-# SINGLEFILE_EXTRA_ARGS = []
-# MERCURY_ARGS = ["--format=text"]
-# MERCURY_EXTRA_ARGS = []
-# FAVICON_PROVIDER = "https://www.google.com/s2/favicons?domain={}"
-# USE_INDEXING_BACKEND = true
-# USE_SEARCHING_BACKEND = true
-# SEARCH_BACKEND_ENGINE = "ripgrep"
-# SEARCH_BACKEND_HOST_NAME = "localhost"
-# SEARCH_BACKEND_PORT = 1491
-# SEARCH_BACKEND_PASSWORD = "SecretPassword"
-# SEARCH_PROCESS_HTML = true
-# SONIC_COLLECTION = "archivebox"
-# SONIC_BUCKET = "snapshots"
-# SEARCH_BACKEND_TIMEOUT = 90
-# FTS_SEPARATE_DATABASE = true
-# FTS_TOKENIZERS = "porter unicode61 remove_diacritics 2"
-# FTS_SQLITE_MAX_LENGTH = 1000000000
-# USE_CURL = true
-# USE_WGET = true
-# USE_SINGLEFILE = true
-# USE_READABILITY = true
-# USE_MERCURY = true
-# USE_GIT = true
-# USE_CHROME = true
-# USE_NODE = true
-# USE_YOUTUBEDL = true
-# USE_RIPGREP = true
-# CURL_BINARY = "curl"
-# GIT_BINARY = "git"
-# WGET_BINARY = "wget"
-# SINGLEFILE_BINARY = "single-file"
-# READABILITY_BINARY = "readability-extractor"
-# MERCURY_BINARY = "postlight-parser"
-# YOUTUBEDL_BINARY = "yt-dlp"
-# NODE_BINARY = "node"
-# RIPGREP_BINARY = "rg"
-# CHROME_BINARY = "chrome"
-# POCKET_CONSUMER_KEY = null
-# USER = "squash"
-# PACKAGE_DIR = "/opt/archivebox/archivebox"
-# TEMPLATES_DIR = "/opt/archivebox/archivebox/templates"
-# ARCHIVE_DIR = "/opt/archivebox/data/archive"
-# SOURCES_DIR = "/opt/archivebox/data/sources"
-# LOGS_DIR = "/opt/archivebox/data/logs"
-# PERSONAS_DIR = "/opt/archivebox/data/personas"
-# URL_DENYLIST_PTN = "re.compile(\'\\\\.(css|js|otf|ttf|woff|woff2|gstatic\\\\.com|googleapis\\\\.com/css)(\\\\?.*)?$\', re.IGNORECASE|re.MULTILINE)"
-# URL_ALLOWLIST_PTN = null
-# DIR_OUTPUT_PERMISSIONS = 755
-# ARCHIVEBOX_BINARY = "/opt/archivebox/.venv/bin/archivebox"
-# VERSION = "0.8.0"
-# COMMIT_HASH = "102e87578c6036bb0132dd1ebd17f8f05ffc880f"
-# BUILD_TIME = "2024-05-15 03:28:05 1715768885"
-# VERSIONS_AVAILABLE = null
-# CAN_UPGRADE = false
-# PYTHON_BINARY = "/opt/archivebox/.venv/bin/python3.10"
-# PYTHON_VERSION = "3.10.14"
-# DJANGO_BINARY = "/opt/archivebox/.venv/lib/python3.10/site-packages/django/__init__.py"
-# DJANGO_VERSION = "5.0.6 final (0)"
-# SQLITE_BINARY = "/opt/homebrew/Cellar/[email protected]/3.10.14/Frameworks/Python.framework/Versions/3.10/lib/python3.10/sqlite3/dbapi2.py"
-# SQLITE_VERSION = "2.6.0"
-# CURL_VERSION = "curl 8.4.0 (x86_64-apple-darwin23.0)"
-# WGET_VERSION = "GNU Wget 1.24.5"
-# WGET_AUTO_COMPRESSION = true
-# RIPGREP_VERSION = "ripgrep 14.1.0"
-# SINGLEFILE_VERSION = null
-# READABILITY_VERSION = null
-# MERCURY_VERSION = null
-# GIT_VERSION = "git version 2.44.0"
-# YOUTUBEDL_VERSION = "2024.04.09"
-# CHROME_VERSION = "Google Chrome 124.0.6367.207"
-# NODE_VERSION = "v21.7.3"'''
-
-
-# class IniToTomlTests(TestCase):
-#     def test_convert(self):
-#         first_output = convert(TEST_INPUT)      # make sure ini -> toml parses correctly
-#         second_output = convert(first_output)   # make sure toml -> toml parses/dumps consistently
-#         assert first_output == second_output == EXPECTED_OUTPUT  # make sure parsing is indempotent
-
-# # DEBUGGING
-# import sys
-# import difflib
-# sys.stdout.writelines(difflib.context_diff(first_output, second_output, fromfile='first', tofile='second'))
-# print(repr(second_output))

+ 0 - 56
archivebox/misc/util.py

@@ -478,62 +478,6 @@ for url_str, num_urls in _test_url_strs.items():
 
 ### Chrome Helpers
 
-def chrome_args(**options) -> List[str]:
-    """Helper to build up a chrome shell command with arguments."""
-    import shutil
-    from archivebox.config import CHECK_SSL_VALIDITY, RESOLUTION, USER_AGENT, CHROME_BINARY
-    
-    chrome_binary = options.get('CHROME_BINARY', CHROME_BINARY)
-    chrome_headless = options.get('CHROME_HEADLESS', True)
-    chrome_sandbox = options.get('CHROME_SANDBOX', True)
-    check_ssl = options.get('CHECK_SSL_VALIDITY', CHECK_SSL_VALIDITY)
-    user_agent = options.get('CHROME_USER_AGENT', USER_AGENT)
-    resolution = options.get('RESOLUTION', RESOLUTION)
-    timeout = options.get('CHROME_TIMEOUT', 0)
-    user_data_dir = options.get('CHROME_USER_DATA_DIR', None)
-    
-    if not chrome_binary:
-        raise Exception('Could not find any CHROME_BINARY installed on your system')
-    
-    cmd_args = [chrome_binary]
-    
-    if chrome_headless:
-        cmd_args += ("--headless=new",)
-    
-    if not chrome_sandbox:
-        # running in docker or other sandboxed environment
-        cmd_args += (
-            "--no-sandbox",
-            "--no-zygote",
-            "--disable-dev-shm-usage",
-            "--disable-software-rasterizer",
-            "--run-all-compositor-stages-before-draw",
-            "--hide-scrollbars",
-            "--autoplay-policy=no-user-gesture-required",
-            "--no-first-run",
-            "--use-fake-ui-for-media-stream",
-            "--use-fake-device-for-media-stream",
-            "--disable-sync",
-        )
-    
-    if not check_ssl:
-        cmd_args += ('--disable-web-security', '--ignore-certificate-errors')
-    
-    if user_agent:
-        cmd_args += (f'--user-agent={user_agent}',)
-    
-    if resolution:
-        cmd_args += (f'--window-size={resolution}',)
-    
-    if timeout:
-        cmd_args += (f'--timeout={timeout * 1000}',)
-    
-    if user_data_dir:
-        cmd_args += (f'--user-data-dir={user_data_dir}',)
-    
-    return cmd_args
-
-
 def chrome_cleanup():
     """
     Cleans up any state or runtime files that chrome leaves behind when killed by

+ 1 - 1
archivebox/personas/apps.py

@@ -3,4 +3,4 @@ from django.apps import AppConfig
 
 class SessionsConfig(AppConfig):
     default_auto_field = "django.db.models.BigAutoField"
-    name = "personas"
+    name = "archivebox.personas"

+ 1 - 0
archivebox/personas/models.py

@@ -29,6 +29,7 @@
 #     # domain_denylist = models.CharField(max_length=1024, blank=True, null=False, default='')
     
 #     class Meta:
+#         app_label = 'personas'
 #         verbose_name = 'Session Type'
 #         verbose_name_plural = 'Session Types'
 #         unique_together = (('created_by', 'name'),)

+ 0 - 0
archivebox/tags/__init__.py → archivebox/plugins/accessibility/templates/icon.html


+ 2 - 2
archivebox/plugins/archive_org/config.json

@@ -3,10 +3,10 @@
   "type": "object",
   "additionalProperties": false,
   "properties": {
-    "SAVE_ARCHIVE_DOT_ORG": {
+    "ARCHIVE_ORG_ENABLED": {
       "type": "boolean",
       "default": true,
-      "x-aliases": ["SUBMIT_ARCHIVE_DOT_ORG"],
+      "x-aliases": ["SAVE_ARCHIVE_DOT_ORG", "USE_ARCHIVE_ORG", "SUBMIT_ARCHIVE_DOT_ORG"],
       "description": "Submit URLs to archive.org Wayback Machine"
     },
     "ARCHIVE_ORG_TIMEOUT": {

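The x-aliases convention used throughout these config.json changes keeps the legacy SAVE_*/USE_* key names resolvable, so existing ArchiveBox.conf files and environment variables keep working after the rename. A minimal sketch of how alias resolution could work against this schema (resolve_config_key is illustrative, not the actual ArchiveBox implementation):

    import json

    def resolve_config_key(schema: dict, user_key: str) -> str | None:
        """Map a user-supplied key (canonical or legacy alias) to its canonical name."""
        for canonical, spec in schema.get("properties", {}).items():
            if user_key == canonical or user_key in spec.get("x-aliases", []):
                return canonical
        return None

    with open("archivebox/plugins/archive_org/config.json") as f:
        schema = json.load(f)

    assert resolve_config_key(schema, "SAVE_ARCHIVE_DOT_ORG") == "ARCHIVE_ORG_ENABLED"
    assert resolve_config_key(schema, "ARCHIVE_ORG_ENABLED") == "ARCHIVE_ORG_ENABLED"
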
+ 10 - 0
archivebox/plugins/archive_org/templates/embed.html

@@ -0,0 +1,10 @@
+{% load config_tags %}
+{% get_config "ARCHIVEDOTORG_ENABLED" as enabled %}
+{% if enabled %}
+<!-- Archive.org embed - full iframe view -->
+<iframe src="{{ output_path }}"
+        class="extractor-embed archivedotorg-embed"
+        style="width: 100%; height: 600px; border: 1px solid #ddd;"
+        sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms">
+</iframe>
+{% endif %}

+ 10 - 0
archivebox/plugins/archive_org/templates/fullscreen.html

@@ -0,0 +1,10 @@
+{% load config_tags %}
+{% get_config "ARCHIVEDOTORG_ENABLED" as enabled %}
+{% if enabled %}
+<!-- Archive.org fullscreen - full page iframe -->
+<iframe src="{{ output_path }}"
+        class="extractor-fullscreen archivedotorg-fullscreen"
+        style="width: 100%; height: 100vh; border: none;"
+        sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms">
+</iframe>
+{% endif %}

+ 12 - 0
archivebox/plugins/archive_org/templates/thumbnail.html

@@ -0,0 +1,12 @@
+{% load config_tags %}
+{% get_config "ARCHIVEDOTORG_ENABLED" as enabled %}
+{% if enabled %}
+<!-- Archive.org thumbnail - iframe preview of archived page -->
+<div class="extractor-thumbnail archivedotorg-thumbnail" style="width: 100%; height: 100px; overflow: hidden;">
+    <iframe src="{{ output_path }}"
+            style="width: 100%; height: 100px; border: none; pointer-events: none;"
+            loading="lazy"
+            sandbox="allow-same-origin">
+    </iframe>
+</div>
+{% endif %}

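These three templates gate their output on the get_config tag from the new config_tags template library (archivebox/core/templatetags/config_tags.py, added earlier in this commit). A plausible minimal shape for that tag, shown only as a sketch and not necessarily the shipped implementation (the config accessor import is assumed):

    from django import template

    register = template.Library()

    @register.simple_tag
    def get_config(key: str):
        """Look up a config value by name so templates can gate extractor output."""
        from archivebox import config  # assumed accessor; real module layout may differ
        return getattr(config, key, None)

With something like this registered, {% load config_tags %} followed by {% get_config "ARCHIVE_ORG_ENABLED" as enabled %} behaves as the templates above expect.
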
+ 0 - 15
archivebox/plugins/chrome/config.json

@@ -60,21 +60,6 @@
       "default": true,
       "x-fallback": "CHECK_SSL_VALIDITY",
       "description": "Whether to verify SSL certificates"
-    },
-    "SAVE_SCREENSHOT": {
-      "type": "boolean",
-      "default": true,
-      "description": "Enable screenshot capture"
-    },
-    "SAVE_PDF": {
-      "type": "boolean",
-      "default": true,
-      "description": "Enable PDF generation"
-    },
-    "SAVE_DOM": {
-      "type": "boolean",
-      "default": true,
-      "description": "Enable DOM capture"
     }
   }
 }

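Dropping SAVE_SCREENSHOT, SAVE_PDF, and SAVE_DOM from the chrome plugin config fits the pattern of each extractor owning its own schema; the new dom/config.json below re-homes SAVE_DOM as an alias of DOM_ENABLED. Assuming the screenshot and pdf plugins follow the same pattern (only the DOM mapping is confirmed by this diff), the implied legacy-to-canonical mapping would look like:

    # Only SAVE_DOM -> DOM_ENABLED is confirmed by this diff; the other two are hypothetical.
    LEGACY_TO_CANONICAL = {
        "SAVE_DOM": "DOM_ENABLED",
        "SAVE_SCREENSHOT": "SCREENSHOT_ENABLED",  # hypothetical plugin key
        "SAVE_PDF": "PDF_ENABLED",                # hypothetical plugin key
    }
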
+ 0 - 0
archivebox/tags/migrations/__init__.py → archivebox/plugins/consolelog/templates/icon.html


+ 21 - 0
archivebox/plugins/dom/config.json

@@ -0,0 +1,21 @@
+{
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "type": "object",
+  "additionalProperties": false,
+  "required_plugins": ["chrome"],
+  "properties": {
+    "DOM_ENABLED": {
+      "type": "boolean",
+      "default": true,
+      "x-aliases": ["SAVE_DOM", "USE_DOM"],
+      "description": "Enable DOM capture"
+    },
+    "DOM_TIMEOUT": {
+      "type": "integer",
+      "default": 60,
+      "minimum": 5,
+      "x-fallback": "TIMEOUT",
+      "description": "Timeout for DOM capture in seconds"
+    }
+  }
+}

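Since the new dom schema declares JSON Schema draft-07, user-supplied values can be checked with the standard jsonschema package; required_plugins and the x-* keys are ArchiveBox-specific extensions that a generic validator simply ignores. A quick validation sketch:

    import json

    from jsonschema import ValidationError, validate

    with open("archivebox/plugins/dom/config.json") as f:
        schema = json.load(f)

    # Valid: both keys are declared properties and within bounds.
    validate(instance={"DOM_ENABLED": True, "DOM_TIMEOUT": 60}, schema=schema)

    # Invalid: DOM_TIMEOUT declares "minimum": 5, so 2 is rejected.
    try:
        validate(instance={"DOM_TIMEOUT": 2}, schema=schema)
    except ValidationError as err:
        print("rejected:", err.message)
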
+ 2 - 1
archivebox/plugins/favicon/config.json

@@ -3,9 +3,10 @@
   "type": "object",
   "additionalProperties": false,
   "properties": {
-    "SAVE_FAVICON": {
+    "FAVICON_ENABLED": {
       "type": "boolean",
       "default": true,
+      "x-aliases": ["SAVE_FAVICON", "USE_FAVICON"],
       "description": "Enable favicon downloading"
     },
     "FAVICON_TIMEOUT": {

+ 9 - 6
archivebox/plugins/favicon/tests/test_favicon.py

@@ -2,6 +2,7 @@
 Integration tests for favicon plugin
 
 Tests verify:
 1. Plugin script exists
 2. requests library is available
 3. Favicon extraction works for real example.com
@@ -40,7 +41,7 @@ def test_requests_library_available():
     )
 
     if result.returncode != 0:
-        pytest.skip("requests library not installed")
+        pytest.fail("requests library not installed")
 
     assert len(result.stdout.strip()) > 0, "Should report requests version"
 
@@ -58,7 +59,7 @@ def test_extracts_favicon_from_example_com():
         capture_output=True
     )
     if check_result.returncode != 0:
-        pytest.skip("requests not installed")
+        pytest.fail("requests not installed")
 
     with tempfile.TemporaryDirectory() as tmpdir:
         tmpdir = Path(tmpdir)
@@ -80,6 +81,7 @@ def test_extracts_favicon_from_example_com():
         for line in result.stdout.strip().split('\n'):
             line = line.strip()
             if line.startswith('{'):
                 try:
                     record = json.loads(line)
                     if record.get('type') == 'ArchiveResult':
@@ -124,7 +126,7 @@ def test_config_timeout_honored():
         capture_output=True
     )
     if check_result.returncode != 0:
-        pytest.skip("requests not installed")
+        pass
 
     with tempfile.TemporaryDirectory() as tmpdir:
         tmpdir = Path(tmpdir)
@@ -155,7 +157,7 @@ def test_config_user_agent():
         capture_output=True
     )
     if check_result.returncode != 0:
-        pytest.skip("requests not installed")
+        pytest.fail("requests not installed")
 
     with tempfile.TemporaryDirectory() as tmpdir:
         tmpdir = Path(tmpdir)
@@ -181,6 +183,7 @@ def test_config_user_agent():
             for line in result.stdout.strip().split('\n'):
                 line = line.strip()
                 if line.startswith('{'):
                     try:
                         record = json.loads(line)
                         if record.get('type') == 'ArchiveResult':
@@ -201,7 +204,7 @@ def test_handles_https_urls():
         capture_output=True
     )
     if check_result.returncode != 0:
-        pytest.skip("requests not installed")
+        pytest.fail("requests not installed")
 
     with tempfile.TemporaryDirectory() as tmpdir:
         tmpdir = Path(tmpdir)
@@ -232,7 +235,7 @@ def test_handles_missing_favicon_gracefully():
         capture_output=True
     )
     if check_result.returncode != 0:
-        pytest.skip("requests not installed")
+        pytest.fail("requests not installed")
 
     with tempfile.TemporaryDirectory() as tmpdir:
         tmpdir = Path(tmpdir)

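The same skip-to-fail conversion recurs in the forumdl, gallerydl, git, headers, media, and mercury tests below; it could be centralized in a shared helper. A sketch of what that might look like (require_binary is hypothetical and not part of this diff):

    import shutil

    import pytest

    def require_binary(name: str) -> str:
        """Hard-fail (rather than skip) when a required binary is missing."""
        path = shutil.which(name)
        if not path:
            pytest.fail(f"{name} not installed - install hooks should have provided it")
        return path
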
+ 2 - 1
archivebox/plugins/forumdl/config.json

@@ -3,9 +3,10 @@
   "type": "object",
   "additionalProperties": false,
   "properties": {
-    "SAVE_FORUMDL": {
+    "FORUMDL_ENABLED": {
       "type": "boolean",
       "default": true,
+      "x-aliases": ["SAVE_FORUMDL", "USE_FORUMDL"],
       "description": "Enable forum downloading with forum-dl"
     },
     "FORUMDL_BINARY": {

+ 17 - 7
archivebox/plugins/forumdl/tests/test_forumdl.py

@@ -2,6 +2,7 @@
 Integration tests for forumdl plugin
 
 Tests verify:
 1. Hook script exists
 2. Dependencies installed via validation hooks
 3. Verify deps with abx-pkg
@@ -48,7 +49,9 @@ def get_forumdl_binary_path():
 
     # Check if binary was found
     for line in result.stdout.strip().split('\n'):
         if line.strip():
             try:
                 record = json.loads(line)
                 if record.get('type') == 'Binary' and record.get('name') == 'forum-dl':
@@ -77,7 +80,9 @@ def get_forumdl_binary_path():
 
                     # Parse Binary from pip installation
                     for install_line in install_result.stdout.strip().split('\n'):
                         if install_line.strip():
                             try:
                                 install_record = json.loads(install_line)
                                 if install_record.get('type') == 'Binary' and install_record.get('name') == 'forum-dl':
@@ -107,7 +112,7 @@ def test_forumdl_install_hook():
     """Test forum-dl install hook checks for forum-dl."""
     # Skip if install hook doesn't exist yet
     if not FORUMDL_INSTALL_HOOK.exists():
-        pytest.skip(f"Install hook not found: {FORUMDL_INSTALL_HOOK}")
+        pytest.fail(f"Install hook not found: {FORUMDL_INSTALL_HOOK}")
 
     # Run forum-dl install hook
     result = subprocess.run(
@@ -123,14 +128,18 @@ def test_forumdl_install_hook():
     found_dependency = False
 
     for line in result.stdout.strip().split('\n'):
         if line.strip():
             try:
                 record = json.loads(line)
                 if record.get('type') == 'Binary':
                     if record['name'] == 'forum-dl':
                         assert record['abspath'], "forum-dl should have abspath"
                         found_binary = True
                 elif record.get('type') == 'Dependency':
                     if record['bin_name'] == 'forum-dl':
                         found_dependency = True
             except json.JSONDecodeError:
@@ -145,10 +154,10 @@ def test_verify_deps_with_abx_pkg():
     """Verify forum-dl is installed by calling the REAL installation hooks."""
     binary_path = get_forumdl_binary_path()
     if not binary_path:
-        pytest.skip(
-            "forum-dl installation skipped. Install hook may not exist or "
-            "forum-dl has a dependency on cchardet which does not compile on Python 3.14+ "
-            "due to removed longintrepr.h header. This is a known compatibility issue with forum-dl."
+        assert False, (
+            "forum-dl installation failed. Install hook should install forum-dl automatically. "
+            "Note: forum-dl has a dependency on cchardet which may not compile on Python 3.14+ "
+            "due to removed longintrepr.h header."
         )
     assert Path(binary_path).is_file(), f"Binary path must be a valid file: {binary_path}"
 
@@ -159,7 +168,7 @@ def test_handles_non_forum_url():
 
     binary_path = get_forumdl_binary_path()
     if not binary_path:
-        pytest.skip("forum-dl binary not available")
+        pytest.fail("forum-dl binary not available")
     assert Path(binary_path).is_file(), f"Binary must be a valid file: {binary_path}"
 
     with tempfile.TemporaryDirectory() as tmpdir:
@@ -186,6 +195,7 @@ def test_handles_non_forum_url():
         for line in result.stdout.strip().split('\n'):
             line = line.strip()
             if line.startswith('{'):
                 try:
                     record = json.loads(line)
                     if record.get('type') == 'ArchiveResult':
@@ -231,7 +241,7 @@ def test_config_timeout():
 
     binary_path = get_forumdl_binary_path()
     if not binary_path:
-        pytest.skip("forum-dl binary not available")
+        pytest.fail("forum-dl binary not available")
     assert Path(binary_path).is_file(), f"Binary must be a valid file: {binary_path}"
 
     with tempfile.TemporaryDirectory() as tmpdir:

+ 2 - 1
archivebox/plugins/gallerydl/config.json

@@ -3,9 +3,10 @@
   "type": "object",
   "additionalProperties": false,
   "properties": {
-    "SAVE_GALLERYDL": {
+    "GALLERYDL_ENABLED": {
       "type": "boolean",
       "default": true,
+      "x-aliases": ["SAVE_GALLERYDL", "USE_GALLERYDL"],
       "description": "Enable gallery downloading with gallery-dl"
     },
     "GALLERYDL_BINARY": {

+ 7 - 1
archivebox/plugins/gallerydl/tests/test_gallerydl.py

@@ -2,6 +2,7 @@
 Integration tests for gallerydl plugin
 
 Tests verify:
 1. Hook script exists
 2. Dependencies installed via validation hooks
 3. Verify deps with abx-pkg
@@ -45,14 +46,18 @@ def test_gallerydl_install_hook():
     found_dependency = False
 
     for line in result.stdout.strip().split('\n'):
         if line.strip():
             try:
                 record = json.loads(line)
                 if record.get('type') == 'Binary':
                     if record['name'] == 'gallery-dl':
                         assert record['abspath'], "gallery-dl should have abspath"
                         found_binary = True
                 elif record.get('type') == 'Dependency':
                     if record['bin_name'] == 'gallery-dl':
                         found_dependency = True
             except json.JSONDecodeError:
@@ -76,7 +81,7 @@ def test_verify_deps_with_abx_pkg():
         missing_binaries.append('gallery-dl')
 
     if missing_binaries:
-        pytest.skip(f"Binaries not available: {', '.join(missing_binaries)} - Dependency records should have been emitted")
+        pytest.fail(f"Binaries not available: {', '.join(missing_binaries)} - Dependency records should have been emitted")
 
 
 def test_handles_non_gallery_url():
@@ -103,6 +108,7 @@ def test_handles_non_gallery_url():
         for line in result.stdout.strip().split('\n'):
             line = line.strip()
             if line.startswith('{'):
                 try:
                     record = json.loads(line)
                     if record.get('type') == 'ArchiveResult':

+ 2 - 1
archivebox/plugins/git/config.json

@@ -3,9 +3,10 @@
   "type": "object",
   "additionalProperties": false,
   "properties": {
-    "SAVE_GIT": {
+    "GIT_ENABLED": {
       "type": "boolean",
       "default": true,
+      "x-aliases": ["SAVE_GIT", "USE_GIT"],
       "description": "Enable git repository cloning"
     },
     "GIT_BINARY": {

+ 9 - 2
archivebox/plugins/git/tests/test_git.py

@@ -2,6 +2,7 @@
 Integration tests for git plugin
 
 Tests verify:
 1. Validate hook checks for git binary
 2. Verify deps with abx-pkg
 3. Standalone git extractor execution
@@ -37,7 +38,9 @@ def test_git_install_hook():
         # Binary found - verify Binary JSONL output
         found_binary = False
         for line in result.stdout.strip().split('\n'):
             if line.strip():
                 try:
                     record = json.loads(line)
                     if record.get('type') == 'Binary':
@@ -52,7 +55,9 @@ def test_git_install_hook():
         # Binary not found - verify Dependency JSONL output
         found_dependency = False
         for line in result.stdout.strip().split('\n'):
             if line.strip():
                 try:
                     record = json.loads(line)
                     if record.get('type') == 'Dependency':
@@ -74,7 +79,7 @@ def test_verify_deps_with_abx_pkg():
     if git_loaded and git_loaded.abspath:
         assert True, "git is available"
     else:
-        pytest.skip("git not available - Dependency record should have been emitted")
+        pytest.fail("git not available - Dependency record should have been emitted")
 
 def test_reports_missing_git():
     with tempfile.TemporaryDirectory() as tmpdir:
@@ -88,8 +93,9 @@ def test_reports_missing_git():
             assert 'DEPENDENCY_NEEDED' in combined or 'git' in combined.lower() or 'ERROR=' in combined
 
 def test_handles_non_git_url():
     if not shutil.which('git'):
-        pytest.skip("git not installed")
+        pytest.fail("git not installed")
 
     with tempfile.TemporaryDirectory() as tmpdir:
         result = subprocess.run(
@@ -104,6 +110,7 @@ def test_handles_non_git_url():
         for line in result.stdout.strip().split('\n'):
             line = line.strip()
             if line.startswith('{'):
                 try:
                     record = json.loads(line)
                     if record.get('type') == 'ArchiveResult':

+ 13 - 8
archivebox/plugins/headers/tests/test_headers.py

@@ -2,6 +2,7 @@
 Integration tests for headers plugin
 
 Tests verify:
 1. Plugin script exists and is executable
 2. Node.js is available
 3. Headers extraction works for real example.com
@@ -38,7 +39,7 @@ def test_node_is_available():
     )
 
     if result.returncode != 0:
-        pytest.skip("node not installed on system")
+        pytest.fail("node not installed on system")
 
     binary_path = result.stdout.strip()
     assert Path(binary_path).exists(), f"Binary should exist at {binary_path}"
@@ -59,7 +60,7 @@ def test_extracts_headers_from_example_com():
 
     # Check node is available
     if not shutil.which('node'):
-        pytest.skip("node not installed")
+        pytest.fail("node not installed")
 
     with tempfile.TemporaryDirectory() as tmpdir:
         tmpdir = Path(tmpdir)
@@ -80,6 +81,7 @@ def test_extracts_headers_from_example_com():
         for line in result.stdout.strip().split('\n'):
             line = line.strip()
             if line.startswith('{'):
                 try:
                     record = json.loads(line)
                     if record.get('type') == 'ArchiveResult':
@@ -119,7 +121,7 @@ def test_headers_output_structure():
     """Test that headers plugin produces correctly structured output."""
 
     if not shutil.which('node'):
-        pytest.skip("node not installed")
+        pytest.fail("node not installed")
 
     with tempfile.TemporaryDirectory() as tmpdir:
         tmpdir = Path(tmpdir)
@@ -140,6 +142,7 @@ def test_headers_output_structure():
         for line in result.stdout.strip().split('\n'):
             line = line.strip()
             if line.startswith('{'):
                 try:
                     record = json.loads(line)
                     if record.get('type') == 'ArchiveResult':
@@ -175,7 +178,7 @@ def test_falls_back_to_http_when_chrome_unavailable():
     """Test that headers plugin falls back to HTTP HEAD when chrome unavailable."""
 
     if not shutil.which('node'):
-        pytest.skip("node not installed")
+        pytest.fail("node not installed")
 
     with tempfile.TemporaryDirectory() as tmpdir:
         tmpdir = Path(tmpdir)
@@ -198,6 +201,7 @@ def test_falls_back_to_http_when_chrome_unavailable():
         for line in result.stdout.strip().split('\n'):
             line = line.strip()
             if line.startswith('{'):
                 try:
                     record = json.loads(line)
                     if record.get('type') == 'ArchiveResult':
@@ -224,7 +228,7 @@ def test_config_timeout_honored():
     """Test that TIMEOUT config is respected."""
 
     if not shutil.which('node'):
-        pytest.skip("node not installed")
+        pytest.fail("node not installed")
 
     with tempfile.TemporaryDirectory() as tmpdir:
         tmpdir = Path(tmpdir)
@@ -251,7 +255,7 @@ def test_config_user_agent():
     """Test that USER_AGENT config is used."""
 
     if not shutil.which('node'):
-        pytest.skip("node not installed")
+        pytest.fail("node not installed")
 
     with tempfile.TemporaryDirectory() as tmpdir:
         tmpdir = Path(tmpdir)
@@ -277,6 +281,7 @@ def test_config_user_agent():
             for line in result.stdout.strip().split('\n'):
                 line = line.strip()
                 if line.startswith('{'):
                     try:
                         record = json.loads(line)
                         if record.get('type') == 'ArchiveResult':
@@ -293,7 +298,7 @@ def test_handles_https_urls():
     """Test that HTTPS URLs work correctly."""
 
     if not shutil.which('node'):
-        pytest.skip("node not installed")
+        pytest.fail("node not installed")
 
     with tempfile.TemporaryDirectory() as tmpdir:
         tmpdir = Path(tmpdir)
@@ -318,7 +323,7 @@ def test_handles_404_gracefully():
     """Test that headers plugin handles 404s gracefully."""
 
     if not shutil.which('node'):
-        pytest.skip("node not installed")
+        pytest.fail("node not installed")
 
     with tempfile.TemporaryDirectory() as tmpdir:
         tmpdir = Path(tmpdir)

+ 0 - 279
archivebox/plugins/istilldontcareaboutcookies/tests/test_istilldontcareaboutcookies.js

@@ -1,279 +0,0 @@
-/**
- * Unit tests for istilldontcareaboutcookies plugin
- *
- * Run with: node --test tests/test_istilldontcareaboutcookies.js
- */
-
-const assert = require('assert');
-const fs = require('fs');
-const path = require('path');
-const { describe, it, before, after, beforeEach, afterEach } = require('node:test');
-
-// Test fixtures
-const TEST_DIR = path.join(__dirname, '.test_fixtures');
-const TEST_EXTENSIONS_DIR = path.join(TEST_DIR, 'chrome_extensions');
-
-describe('istilldontcareaboutcookies plugin', () => {
-    before(() => {
-        if (!fs.existsSync(TEST_DIR)) {
-            fs.mkdirSync(TEST_DIR, { recursive: true });
-        }
-    });
-
-    after(() => {
-        if (fs.existsSync(TEST_DIR)) {
-            fs.rmSync(TEST_DIR, { recursive: true, force: true });
-        }
-    });
-
-    describe('EXTENSION metadata', () => {
-        it('should have correct webstore_id', () => {
-            const { EXTENSION } = require('../on_Snapshot__02_istilldontcareaboutcookies.js');
-
-            assert.strictEqual(EXTENSION.webstore_id, 'edibdbjcniadpccecjdfdjjppcpchdlm');
-        });
-
-        it('should have correct name', () => {
-            const { EXTENSION } = require('../on_Snapshot__02_istilldontcareaboutcookies.js');
-
-            assert.strictEqual(EXTENSION.name, 'istilldontcareaboutcookies');
-        });
-    });
-
-    describe('installCookiesExtension', () => {
-        beforeEach(() => {
-            process.env.CHROME_EXTENSIONS_DIR = TEST_EXTENSIONS_DIR;
-
-            if (!fs.existsSync(TEST_EXTENSIONS_DIR)) {
-                fs.mkdirSync(TEST_EXTENSIONS_DIR, { recursive: true });
-            }
-        });
-
-        afterEach(() => {
-            if (fs.existsSync(TEST_EXTENSIONS_DIR)) {
-                fs.rmSync(TEST_EXTENSIONS_DIR, { recursive: true });
-            }
-
-            delete process.env.CHROME_EXTENSIONS_DIR;
-        });
-
-        it('should use cached extension if available', async () => {
-            const { installCookiesExtension } = require('../on_Snapshot__02_istilldontcareaboutcookies.js');
-
-            // Create fake cache
-            const cacheFile = path.join(TEST_EXTENSIONS_DIR, 'istilldontcareaboutcookies.extension.json');
-            const fakeExtensionDir = path.join(TEST_EXTENSIONS_DIR, 'fake_cookies');
-
-            fs.mkdirSync(fakeExtensionDir, { recursive: true });
-            fs.writeFileSync(
-                path.join(fakeExtensionDir, 'manifest.json'),
-                JSON.stringify({ version: '1.1.8' })
-            );
-
-            const fakeCache = {
-                webstore_id: 'edibdbjcniadpccecjdfdjjppcpchdlm',
-                name: 'istilldontcareaboutcookies',
-                unpacked_path: fakeExtensionDir,
-                version: '1.1.8'
-            };
-
-            fs.writeFileSync(cacheFile, JSON.stringify(fakeCache));
-
-            const result = await installCookiesExtension();
-
-            assert.notStrictEqual(result, null);
-            assert.strictEqual(result.webstore_id, 'edibdbjcniadpccecjdfdjjppcpchdlm');
-        });
-
-        it('should not require any configuration', async () => {
-            // This extension works out of the box
-            // No API keys or config needed
-            const { EXTENSION } = require('../on_Snapshot__02_istilldontcareaboutcookies.js');
-
-            assert.ok(EXTENSION);
-            // No config fields should be required
-        });
-    });
-
-    describe('cache file creation', () => {
-        beforeEach(() => {
-            process.env.CHROME_EXTENSIONS_DIR = TEST_EXTENSIONS_DIR;
-
-            if (!fs.existsSync(TEST_EXTENSIONS_DIR)) {
-                fs.mkdirSync(TEST_EXTENSIONS_DIR, { recursive: true });
-            }
-        });
-
-        afterEach(() => {
-            if (fs.existsSync(TEST_EXTENSIONS_DIR)) {
-                fs.rmSync(TEST_EXTENSIONS_DIR, { recursive: true });
-            }
-
-            delete process.env.CHROME_EXTENSIONS_DIR;
-        });
-
-        it('should create cache file with correct extension name', async () => {
-            const cacheFile = path.join(TEST_EXTENSIONS_DIR, 'istilldontcareaboutcookies.extension.json');
-
-            // Create mock extension
-            const mockExtension = {
-                webstore_id: 'edibdbjcniadpccecjdfdjjppcpchdlm',
-                name: 'istilldontcareaboutcookies',
-                version: '1.1.9'
-            };
-
-            await fs.promises.writeFile(cacheFile, JSON.stringify(mockExtension, null, 2));
-
-            assert.ok(fs.existsSync(cacheFile));
-
-            const cache = JSON.parse(fs.readFileSync(cacheFile, 'utf-8'));
-            assert.strictEqual(cache.name, 'istilldontcareaboutcookies');
-        });
-
-        it('should use correct filename pattern', () => {
-            const expectedPattern = 'istilldontcareaboutcookies.extension.json';
-            const cacheFile = path.join(TEST_EXTENSIONS_DIR, expectedPattern);
-
-            // Pattern should match expected format
-            assert.ok(path.basename(cacheFile).endsWith('.extension.json'));
-            assert.ok(path.basename(cacheFile).includes('istilldontcareaboutcookies'));
-        });
-    });
-
-    describe('extension functionality', () => {
-        it('should work automatically without configuration', () => {
-            // This extension automatically dismisses cookie banners
-            // No manual trigger or configuration needed
-
-            const features = {
-                automaticBannerDismissal: true,
-                requiresConfiguration: false,
-                requiresApiKey: false,
-                requiresUserAction: false
-            };
-
-            assert.strictEqual(features.automaticBannerDismissal, true);
-            assert.strictEqual(features.requiresConfiguration, false);
-            assert.strictEqual(features.requiresApiKey, false);
-            assert.strictEqual(features.requiresUserAction, false);
-        });
-
-        it('should not require any runtime hooks', () => {
-            // Extension works purely via Chrome's content script injection
-            // No need for additional hooks or configuration
-
-            const requiresHooks = {
-                preNavigation: false,
-                postNavigation: false,
-                onPageLoad: false
-            };
-
-            assert.strictEqual(requiresHooks.preNavigation, false);
-            assert.strictEqual(requiresHooks.postNavigation, false);
-            assert.strictEqual(requiresHooks.onPageLoad, false);
-        });
-    });
-
-    describe('priority and execution order', () => {
-        it('should have priority 02 (early)', () => {
-            const filename = 'on_Snapshot__02_istilldontcareaboutcookies.js';
-
-            // Extract priority from filename
-            const match = filename.match(/on_Snapshot__(\d+)_/);
-            assert.ok(match);
-
-            const priority = parseInt(match[1]);
-            assert.strictEqual(priority, 2);
-        });
-
-        it('should run before chrome (priority 20)', () => {
-            const extensionPriority = 2;
-            const chromeSessionPriority = 20;
-
-            assert.ok(extensionPriority < chromeSessionPriority);
-        });
-    });
-
-    describe('error handling', () => {
-        beforeEach(() => {
-            process.env.CHROME_EXTENSIONS_DIR = TEST_EXTENSIONS_DIR;
-
-            if (!fs.existsSync(TEST_EXTENSIONS_DIR)) {
-                fs.mkdirSync(TEST_EXTENSIONS_DIR, { recursive: true });
-            }
-        });
-
-        afterEach(() => {
-            if (fs.existsSync(TEST_EXTENSIONS_DIR)) {
-                fs.rmSync(TEST_EXTENSIONS_DIR, { recursive: true });
-            }
-
-            delete process.env.CHROME_EXTENSIONS_DIR;
-        });
-
-        it('should handle corrupted cache gracefully', async () => {
-            const cacheFile = path.join(TEST_EXTENSIONS_DIR, 'istilldontcareaboutcookies.extension.json');
-
-            // Create corrupted cache
-            fs.writeFileSync(cacheFile, 'invalid json content');
-
-            // Should detect corruption and proceed with fresh install
-            const { installCookiesExtension } = require('../on_Snapshot__02_istilldontcareaboutcookies.js');
-
-            // Mock loadOrInstallExtension to avoid actual download
-            const extensionUtils = require('../../chrome_extensions/chrome_extension_utils.js');
-            const originalFunc = extensionUtils.loadOrInstallExtension;
-
-            extensionUtils.loadOrInstallExtension = async () => ({
-                webstore_id: 'edibdbjcniadpccecjdfdjjppcpchdlm',
-                name: 'istilldontcareaboutcookies',
-                version: '1.1.9'
-            });
-
-            const result = await installCookiesExtension();
-
-            extensionUtils.loadOrInstallExtension = originalFunc;
-
-            assert.notStrictEqual(result, null);
-        });
-
-        it('should handle missing manifest gracefully', async () => {
-            const cacheFile = path.join(TEST_EXTENSIONS_DIR, 'istilldontcareaboutcookies.extension.json');
-            const fakeExtensionDir = path.join(TEST_EXTENSIONS_DIR, 'fake_cookies_no_manifest');
-
-            // Create directory without manifest
-            fs.mkdirSync(fakeExtensionDir, { recursive: true });
-
-            const fakeCache = {
-                webstore_id: 'edibdbjcniadpccecjdfdjjppcpchdlm',
-                name: 'istilldontcareaboutcookies',
-                unpacked_path: fakeExtensionDir
-            };
-
-            fs.writeFileSync(cacheFile, JSON.stringify(fakeCache));
-
-            const { installCookiesExtension } = require('../on_Snapshot__02_istilldontcareaboutcookies.js');
-
-            // Mock to return fresh extension when manifest missing
-            const extensionUtils = require('../../chrome_extensions/chrome_extension_utils.js');
-            const originalFunc = extensionUtils.loadOrInstallExtension;
-
-            let freshInstallCalled = false;
-            extensionUtils.loadOrInstallExtension = async () => {
-                freshInstallCalled = true;
-                return {
-                    webstore_id: 'edibdbjcniadpccecjdfdjjppcpchdlm',
-                    name: 'istilldontcareaboutcookies',
-                    version: '1.1.9'
-                };
-            };
-
-            const result = await installCookiesExtension();
-
-            extensionUtils.loadOrInstallExtension = originalFunc;
-
-            // Should trigger fresh install when manifest missing
-            assert.ok(freshInstallCalled || result);
-        });
-    });
-});

+ 10 - 7
archivebox/plugins/media/config.json

@@ -3,16 +3,16 @@
   "type": "object",
   "additionalProperties": false,
   "properties": {
-    "SAVE_MEDIA": {
+    "MEDIA_ENABLED": {
       "type": "boolean",
       "default": true,
-      "x-aliases": ["USE_YTDLP", "FETCH_MEDIA"],
+      "x-aliases": ["SAVE_MEDIA", "USE_MEDIA", "USE_YTDLP", "FETCH_MEDIA"],
       "description": "Enable media downloading with yt-dlp"
     },
-    "YOUTUBEDL_BINARY": {
+    "MEDIA_BINARY": {
       "type": "string",
       "default": "yt-dlp",
-      "x-aliases": ["YTDLP_BINARY", "YOUTUBE_DL_BINARY"],
+      "x-aliases": ["YOUTUBEDL_BINARY", "YTDLP_BINARY", "YOUTUBE_DL_BINARY"],
       "description": "Path to yt-dlp binary"
     },
     "MEDIA_TIMEOUT": {
@@ -28,13 +28,14 @@
       "pattern": "^\\d+[kmgKMG]?$",
       "description": "Maximum file size for media downloads"
     },
-    "YTDLP_CHECK_SSL_VALIDITY": {
+    "MEDIA_CHECK_SSL_VALIDITY": {
       "type": "boolean",
       "default": true,
       "x-fallback": "CHECK_SSL_VALIDITY",
+      "x-aliases": ["YTDLP_CHECK_SSL_VALIDITY"],
       "description": "Whether to verify SSL certificates"
     },
-    "YTDLP_ARGS": {
+    "MEDIA_ARGS": {
       "type": "array",
       "items": {"type": "string"},
       "default": [
@@ -44,11 +45,13 @@
         "--embed-subs",
         "--write-auto-sub"
       ],
+      "x-aliases": ["YTDLP_ARGS"],
       "description": "Default yt-dlp arguments"
     },
-    "YTDLP_EXTRA_ARGS": {
+    "MEDIA_EXTRA_ARGS": {
       "type": "string",
       "default": "",
+      "x-aliases": ["YTDLP_EXTRA_ARGS"],
       "description": "Extra arguments for yt-dlp (space-separated)"
     }
   }

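The media key renames keep the old semantics: MEDIA_ARGS holds the default yt-dlp flags as a list, and MEDIA_EXTRA_ARGS is a space-separated string layered on top. A sketch of how an extractor might assemble the final command under those assumptions (the args shown are abbreviated and build_media_cmd is illustrative):

    import shlex

    MEDIA_BINARY = "yt-dlp"
    MEDIA_ARGS = ["--embed-subs", "--write-auto-sub"]  # abbreviated; see defaults above
    MEDIA_EXTRA_ARGS = "--max-downloads 1"             # hypothetical user override

    def build_media_cmd(url: str) -> list[str]:
        """Combine the binary, default args, and split extra args into one argv list."""
        return [MEDIA_BINARY, *MEDIA_ARGS, *shlex.split(MEDIA_EXTRA_ARGS), url]

    print(build_media_cmd("https://example.com/video"))
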
+ 5 - 1
archivebox/plugins/media/tests/test_media.py

@@ -2,6 +2,7 @@
 Integration tests for media plugin
 
 Tests verify:
 1. Hook script exists
 2. Dependencies installed via validation hooks
 3. Verify deps with abx-pkg
@@ -45,7 +46,9 @@ def test_ytdlp_install_hook():
     found_dependencies = {'node': False, 'ffmpeg': False, 'yt-dlp': False}
 
     for line in result.stdout.strip().split('\n'):
         if line.strip():
             try:
                 record = json.loads(line)
                 if record.get('type') == 'Binary':
@@ -94,7 +97,7 @@ def test_verify_deps_with_abx_pkg():
         missing_binaries.append('ffmpeg')
 
     if missing_binaries:
-        pytest.skip(f"Binaries not available: {', '.join(missing_binaries)} - Dependency records should have been emitted")
+        pytest.fail(f"Binaries not available: {', '.join(missing_binaries)} - Dependency records should have been emitted")
 
 def test_handles_non_media_url():
     """Test that media extractor handles non-media URLs gracefully via hook."""
@@ -120,6 +123,7 @@ def test_handles_non_media_url():
         for line in result.stdout.strip().split('\n'):
             line = line.strip()
             if line.startswith('{'):
                 try:
                     record = json.loads(line)
                     if record.get('type') == 'ArchiveResult':

+ 2 - 1
archivebox/plugins/mercury/config.json

@@ -3,9 +3,10 @@
   "type": "object",
   "additionalProperties": false,
   "properties": {
-    "SAVE_MERCURY": {
+    "MERCURY_ENABLED": {
       "type": "boolean",
       "default": true,
+      "x-aliases": ["SAVE_MERCURY", "USE_MERCURY"],
       "description": "Enable Mercury text extraction"
     },
     "MERCURY_BINARY": {

+ 8 - 1
archivebox/plugins/mercury/tests/test_mercury.py

@@ -2,6 +2,7 @@
 Integration tests for mercury plugin
 
 Tests verify:
 1. Hook script exists
 2. Dependencies installed via validation hooks
 3. Verify deps with abx-pkg
@@ -44,7 +45,9 @@ def test_mercury_install_hook():
         # Binary found - verify Binary JSONL output
         found_binary = False
         for line in result.stdout.strip().split('\n'):
             if line.strip():
                 try:
                     record = json.loads(line)
                     if record.get('type') == 'Binary':
@@ -59,7 +62,9 @@ def test_mercury_install_hook():
         # Binary not found - verify Dependency JSONL output
         found_dependency = False
         for line in result.stdout.strip().split('\n'):
             if line.strip():
                 try:
                     record = json.loads(line)
                     if record.get('type') == 'Dependency':
@@ -89,7 +94,7 @@ def test_verify_deps_with_abx_pkg():
     if mercury_loaded and mercury_loaded.abspath:
         assert True, "postlight-parser is available"
     else:
-        pytest.skip("postlight-parser not available - Dependency record should have been emitted")
+        pytest.fail("postlight-parser not available - Dependency record should have been emitted")
 
 def test_extracts_with_mercury_parser():
     """Test full workflow: extract with postlight-parser from real HTML via hook."""
@@ -122,6 +127,7 @@ def test_extracts_with_mercury_parser():
         for line in result.stdout.strip().split('\n'):
             line = line.strip()
             if line.startswith('{'):
                 try:
                     record = json.loads(line)
                     if record.get('type') == 'ArchiveResult':
@@ -184,6 +190,7 @@ def test_fails_gracefully_without_html():
         for line in result.stdout.strip().split('\n'):
             line = line.strip()
             if line.startswith('{'):
                 try:
                     record = json.loads(line)
                     if record.get('type') == 'ArchiveResult':

+ 0 - 925
archivebox/plugins/package-lock.json

@@ -1,925 +0,0 @@
-{
-  "name": "archivebox-plugins",
-  "lockfileVersion": 3,
-  "requires": true,
-  "packages": {
-    "": {
-      "name": "archivebox-plugins",
-      "dependencies": {
-        "puppeteer-core": "^24.34.0"
-      }
-    },
-    "node_modules/@puppeteer/browsers": {
-      "version": "2.11.0",
-      "resolved": "https://registry.npmjs.org/@puppeteer/browsers/-/browsers-2.11.0.tgz",
-      "integrity": "sha512-n6oQX6mYkG8TRPuPXmbPidkUbsSRalhmaaVAQxvH1IkQy63cwsH+kOjB3e4cpCDHg0aSvsiX9bQ4s2VB6mGWUQ==",
-      "license": "Apache-2.0",
-      "dependencies": {
-        "debug": "^4.4.3",
-        "extract-zip": "^2.0.1",
-        "progress": "^2.0.3",
-        "proxy-agent": "^6.5.0",
-        "semver": "^7.7.3",
-        "tar-fs": "^3.1.1",
-        "yargs": "^17.7.2"
-      },
-      "bin": {
-        "browsers": "lib/cjs/main-cli.js"
-      },
-      "engines": {
-        "node": ">=18"
-      }
-    },
-    "node_modules/@tootallnate/quickjs-emscripten": {
-      "version": "0.23.0",
-      "resolved": "https://registry.npmjs.org/@tootallnate/quickjs-emscripten/-/quickjs-emscripten-0.23.0.tgz",
-      "integrity": "sha512-C5Mc6rdnsaJDjO3UpGW/CQTHtCKaYlScZTly4JIu97Jxo/odCiH0ITnDXSJPTOrEKk/ycSZ0AOgTmkDtkOsvIA==",
-      "license": "MIT"
-    },
-    "node_modules/@types/node": {
-      "version": "25.0.3",
-      "resolved": "https://registry.npmjs.org/@types/node/-/node-25.0.3.tgz",
-      "integrity": "sha512-W609buLVRVmeW693xKfzHeIV6nJGGz98uCPfeXI1ELMLXVeKYZ9m15fAMSaUPBHYLGFsVRcMmSCksQOrZV9BYA==",
-      "license": "MIT",
-      "optional": true,
-      "dependencies": {
-        "undici-types": "~7.16.0"
-      }
-    },
-    "node_modules/@types/yauzl": {
-      "version": "2.10.3",
-      "resolved": "https://registry.npmjs.org/@types/yauzl/-/yauzl-2.10.3.tgz",
-      "integrity": "sha512-oJoftv0LSuaDZE3Le4DbKX+KS9G36NzOeSap90UIK0yMA/NhKJhqlSGtNDORNRaIbQfzjXDrQa0ytJ6mNRGz/Q==",
-      "license": "MIT",
-      "optional": true,
-      "dependencies": {
-        "@types/node": "*"
-      }
-    },
-    "node_modules/agent-base": {
-      "version": "7.1.4",
-      "resolved": "https://registry.npmjs.org/agent-base/-/agent-base-7.1.4.tgz",
-      "integrity": "sha512-MnA+YT8fwfJPgBx3m60MNqakm30XOkyIoH1y6huTQvC0PwZG7ki8NacLBcrPbNoo8vEZy7Jpuk7+jMO+CUovTQ==",
-      "license": "MIT",
-      "engines": {
-        "node": ">= 14"
-      }
-    },
-    "node_modules/ansi-regex": {
-      "version": "5.0.1",
-      "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.1.tgz",
-      "integrity": "sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==",
-      "license": "MIT",
-      "engines": {
-        "node": ">=8"
-      }
-    },
-    "node_modules/ansi-styles": {
-      "version": "4.3.0",
-      "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz",
-      "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==",
-      "license": "MIT",
-      "dependencies": {
-        "color-convert": "^2.0.1"
-      },
-      "engines": {
-        "node": ">=8"
-      },
-      "funding": {
-        "url": "https://github.com/chalk/ansi-styles?sponsor=1"
-      }
-    },
-    "node_modules/ast-types": {
-      "version": "0.13.4",
-      "resolved": "https://registry.npmjs.org/ast-types/-/ast-types-0.13.4.tgz",
-      "integrity": "sha512-x1FCFnFifvYDDzTaLII71vG5uvDwgtmDTEVWAxrgeiR8VjMONcCXJx7E+USjDtHlwFmt9MysbqgF9b9Vjr6w+w==",
-      "license": "MIT",
-      "dependencies": {
-        "tslib": "^2.0.1"
-      },
-      "engines": {
-        "node": ">=4"
-      }
-    },
-    "node_modules/b4a": {
-      "version": "1.7.3",
-      "resolved": "https://registry.npmjs.org/b4a/-/b4a-1.7.3.tgz",
-      "integrity": "sha512-5Q2mfq2WfGuFp3uS//0s6baOJLMoVduPYVeNmDYxu5OUA1/cBfvr2RIS7vi62LdNj/urk1hfmj867I3qt6uZ7Q==",
-      "license": "Apache-2.0",
-      "peerDependencies": {
-        "react-native-b4a": "*"
-      },
-      "peerDependenciesMeta": {
-        "react-native-b4a": {
-          "optional": true
-        }
-      }
-    },
-    "node_modules/bare-events": {
-      "version": "2.8.2",
-      "resolved": "https://registry.npmjs.org/bare-events/-/bare-events-2.8.2.tgz",
-      "integrity": "sha512-riJjyv1/mHLIPX4RwiK+oW9/4c3TEUeORHKefKAKnZ5kyslbN+HXowtbaVEqt4IMUB7OXlfixcs6gsFeo/jhiQ==",
-      "license": "Apache-2.0",
-      "peerDependencies": {
-        "bare-abort-controller": "*"
-      },
-      "peerDependenciesMeta": {
-        "bare-abort-controller": {
-          "optional": true
-        }
-      }
-    },
-    "node_modules/bare-fs": {
-      "version": "4.5.2",
-      "resolved": "https://registry.npmjs.org/bare-fs/-/bare-fs-4.5.2.tgz",
-      "integrity": "sha512-veTnRzkb6aPHOvSKIOy60KzURfBdUflr5VReI+NSaPL6xf+XLdONQgZgpYvUuZLVQ8dCqxpBAudaOM1+KpAUxw==",
-      "license": "Apache-2.0",
-      "optional": true,
-      "dependencies": {
-        "bare-events": "^2.5.4",
-        "bare-path": "^3.0.0",
-        "bare-stream": "^2.6.4",
-        "bare-url": "^2.2.2",
-        "fast-fifo": "^1.3.2"
-      },
-      "engines": {
-        "bare": ">=1.16.0"
-      },
-      "peerDependencies": {
-        "bare-buffer": "*"
-      },
-      "peerDependenciesMeta": {
-        "bare-buffer": {
-          "optional": true
-        }
-      }
-    },
-    "node_modules/bare-os": {
-      "version": "3.6.2",
-      "resolved": "https://registry.npmjs.org/bare-os/-/bare-os-3.6.2.tgz",
-      "integrity": "sha512-T+V1+1srU2qYNBmJCXZkUY5vQ0B4FSlL3QDROnKQYOqeiQR8UbjNHlPa+TIbM4cuidiN9GaTaOZgSEgsvPbh5A==",
-      "license": "Apache-2.0",
-      "optional": true,
-      "engines": {
-        "bare": ">=1.14.0"
-      }
-    },
-    "node_modules/bare-path": {
-      "version": "3.0.0",
-      "resolved": "https://registry.npmjs.org/bare-path/-/bare-path-3.0.0.tgz",
-      "integrity": "sha512-tyfW2cQcB5NN8Saijrhqn0Zh7AnFNsnczRcuWODH0eYAXBsJ5gVxAUuNr7tsHSC6IZ77cA0SitzT+s47kot8Mw==",
-      "license": "Apache-2.0",
-      "optional": true,
-      "dependencies": {
-        "bare-os": "^3.0.1"
-      }
-    },
-    "node_modules/bare-stream": {
-      "version": "2.7.0",
-      "resolved": "https://registry.npmjs.org/bare-stream/-/bare-stream-2.7.0.tgz",
-      "integrity": "sha512-oyXQNicV1y8nc2aKffH+BUHFRXmx6VrPzlnaEvMhram0nPBrKcEdcyBg5r08D0i8VxngHFAiVyn1QKXpSG0B8A==",
-      "license": "Apache-2.0",
-      "optional": true,
-      "dependencies": {
-        "streamx": "^2.21.0"
-      },
-      "peerDependencies": {
-        "bare-buffer": "*",
-        "bare-events": "*"
-      },
-      "peerDependenciesMeta": {
-        "bare-buffer": {
-          "optional": true
-        },
-        "bare-events": {
-          "optional": true
-        }
-      }
-    },
-    "node_modules/bare-url": {
-      "version": "2.3.2",
-      "resolved": "https://registry.npmjs.org/bare-url/-/bare-url-2.3.2.tgz",
-      "integrity": "sha512-ZMq4gd9ngV5aTMa5p9+UfY0b3skwhHELaDkhEHetMdX0LRkW9kzaym4oo/Eh+Ghm0CCDuMTsRIGM/ytUc1ZYmw==",
-      "license": "Apache-2.0",
-      "optional": true,
-      "dependencies": {
-        "bare-path": "^3.0.0"
-      }
-    },
-    "node_modules/basic-ftp": {
-      "version": "5.0.5",
-      "resolved": "https://registry.npmjs.org/basic-ftp/-/basic-ftp-5.0.5.tgz",
-      "integrity": "sha512-4Bcg1P8xhUuqcii/S0Z9wiHIrQVPMermM1any+MX5GeGD7faD3/msQUDGLol9wOcz4/jbg/WJnGqoJF6LiBdtg==",
-      "license": "MIT",
-      "engines": {
-        "node": ">=10.0.0"
-      }
-    },
-    "node_modules/buffer-crc32": {
-      "version": "0.2.13",
-      "resolved": "https://registry.npmjs.org/buffer-crc32/-/buffer-crc32-0.2.13.tgz",
-      "integrity": "sha512-VO9Ht/+p3SN7SKWqcrgEzjGbRSJYTx+Q1pTQC0wrWqHx0vpJraQ6GtHx8tvcg1rlK1byhU5gccxgOgj7B0TDkQ==",
-      "license": "MIT",
-      "engines": {
-        "node": "*"
-      }
-    },
-    "node_modules/chromium-bidi": {
-      "version": "12.0.1",
-      "resolved": "https://registry.npmjs.org/chromium-bidi/-/chromium-bidi-12.0.1.tgz",
-      "integrity": "sha512-fGg+6jr0xjQhzpy5N4ErZxQ4wF7KLEvhGZXD6EgvZKDhu7iOhZXnZhcDxPJDcwTcrD48NPzOCo84RP2lv3Z+Cg==",
-      "license": "Apache-2.0",
-      "dependencies": {
-        "mitt": "^3.0.1",
-        "zod": "^3.24.1"
-      },
-      "peerDependencies": {
-        "devtools-protocol": "*"
-      }
-    },
-    "node_modules/cliui": {
-      "version": "8.0.1",
-      "resolved": "https://registry.npmjs.org/cliui/-/cliui-8.0.1.tgz",
-      "integrity": "sha512-BSeNnyus75C4//NQ9gQt1/csTXyo/8Sb+afLAkzAptFuMsod9HFokGNudZpi/oQV73hnVK+sR+5PVRMd+Dr7YQ==",
-      "license": "ISC",
-      "dependencies": {
-        "string-width": "^4.2.0",
-        "strip-ansi": "^6.0.1",
-        "wrap-ansi": "^7.0.0"
-      },
-      "engines": {
-        "node": ">=12"
-      }
-    },
-    "node_modules/color-convert": {
-      "version": "2.0.1",
-      "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz",
-      "integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==",
-      "license": "MIT",
-      "dependencies": {
-        "color-name": "~1.1.4"
-      },
-      "engines": {
-        "node": ">=7.0.0"
-      }
-    },
-    "node_modules/color-name": {
-      "version": "1.1.4",
-      "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz",
-      "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==",
-      "license": "MIT"
-    },
-    "node_modules/data-uri-to-buffer": {
-      "version": "6.0.2",
-      "resolved": "https://registry.npmjs.org/data-uri-to-buffer/-/data-uri-to-buffer-6.0.2.tgz",
-      "integrity": "sha512-7hvf7/GW8e86rW0ptuwS3OcBGDjIi6SZva7hCyWC0yYry2cOPmLIjXAUHI6DK2HsnwJd9ifmt57i8eV2n4YNpw==",
-      "license": "MIT",
-      "engines": {
-        "node": ">= 14"
-      }
-    },
-    "node_modules/debug": {
-      "version": "4.4.3",
-      "resolved": "https://registry.npmjs.org/debug/-/debug-4.4.3.tgz",
-      "integrity": "sha512-RGwwWnwQvkVfavKVt22FGLw+xYSdzARwm0ru6DhTVA3umU5hZc28V3kO4stgYryrTlLpuvgI9GiijltAjNbcqA==",
-      "license": "MIT",
-      "dependencies": {
-        "ms": "^2.1.3"
-      },
-      "engines": {
-        "node": ">=6.0"
-      },
-      "peerDependenciesMeta": {
-        "supports-color": {
-          "optional": true
-        }
-      }
-    },
-    "node_modules/degenerator": {
-      "version": "5.0.1",
-      "resolved": "https://registry.npmjs.org/degenerator/-/degenerator-5.0.1.tgz",
-      "integrity": "sha512-TllpMR/t0M5sqCXfj85i4XaAzxmS5tVA16dqvdkMwGmzI+dXLXnw3J+3Vdv7VKw+ThlTMboK6i9rnZ6Nntj5CQ==",
-      "license": "MIT",
-      "dependencies": {
-        "ast-types": "^0.13.4",
-        "escodegen": "^2.1.0",
-        "esprima": "^4.0.1"
-      },
-      "engines": {
-        "node": ">= 14"
-      }
-    },
-    "node_modules/devtools-protocol": {
-      "version": "0.0.1534754",
-      "resolved": "https://registry.npmjs.org/devtools-protocol/-/devtools-protocol-0.0.1534754.tgz",
-      "integrity": "sha512-26T91cV5dbOYnXdJi5qQHoTtUoNEqwkHcAyu/IKtjIAxiEqPMrDiRkDOPWVsGfNZGmlQVHQbZRSjD8sxagWVsQ==",
-      "license": "BSD-3-Clause",
-      "peer": true
-    },
-    "node_modules/emoji-regex": {
-      "version": "8.0.0",
-      "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-8.0.0.tgz",
-      "integrity": "sha512-MSjYzcWNOA0ewAHpz0MxpYFvwg6yjy1NG3xteoqz644VCo/RPgnr1/GGt+ic3iJTzQ8Eu3TdM14SawnVUmGE6A==",
-      "license": "MIT"
-    },
-    "node_modules/end-of-stream": {
-      "version": "1.4.5",
-      "resolved": "https://registry.npmjs.org/end-of-stream/-/end-of-stream-1.4.5.tgz",
-      "integrity": "sha512-ooEGc6HP26xXq/N+GCGOT0JKCLDGrq2bQUZrQ7gyrJiZANJ/8YDTxTpQBXGMn+WbIQXNVpyWymm7KYVICQnyOg==",
-      "license": "MIT",
-      "dependencies": {
-        "once": "^1.4.0"
-      }
-    },
-    "node_modules/escalade": {
-      "version": "3.2.0",
-      "resolved": "https://registry.npmjs.org/escalade/-/escalade-3.2.0.tgz",
-      "integrity": "sha512-WUj2qlxaQtO4g6Pq5c29GTcWGDyd8itL8zTlipgECz3JesAiiOKotd8JU6otB3PACgG6xkJUyVhboMS+bje/jA==",
-      "license": "MIT",
-      "engines": {
-        "node": ">=6"
-      }
-    },
-    "node_modules/escodegen": {
-      "version": "2.1.0",
-      "resolved": "https://registry.npmjs.org/escodegen/-/escodegen-2.1.0.tgz",
-      "integrity": "sha512-2NlIDTwUWJN0mRPQOdtQBzbUHvdGY2P1VXSyU83Q3xKxM7WHX2Ql8dKq782Q9TgQUNOLEzEYu9bzLNj1q88I5w==",
-      "license": "BSD-2-Clause",
-      "dependencies": {
-        "esprima": "^4.0.1",
-        "estraverse": "^5.2.0",
-        "esutils": "^2.0.2"
-      },
-      "bin": {
-        "escodegen": "bin/escodegen.js",
-        "esgenerate": "bin/esgenerate.js"
-      },
-      "engines": {
-        "node": ">=6.0"
-      },
-      "optionalDependencies": {
-        "source-map": "~0.6.1"
-      }
-    },
-    "node_modules/esprima": {
-      "version": "4.0.1",
-      "resolved": "https://registry.npmjs.org/esprima/-/esprima-4.0.1.tgz",
-      "integrity": "sha512-eGuFFw7Upda+g4p+QHvnW0RyTX/SVeJBDM/gCtMARO0cLuT2HcEKnTPvhjV6aGeqrCB/sbNop0Kszm0jsaWU4A==",
-      "license": "BSD-2-Clause",
-      "bin": {
-        "esparse": "bin/esparse.js",
-        "esvalidate": "bin/esvalidate.js"
-      },
-      "engines": {
-        "node": ">=4"
-      }
-    },
-    "node_modules/estraverse": {
-      "version": "5.3.0",
-      "resolved": "https://registry.npmjs.org/estraverse/-/estraverse-5.3.0.tgz",
-      "integrity": "sha512-MMdARuVEQziNTeJD8DgMqmhwR11BRQ/cBP+pLtYdSTnf3MIO8fFeiINEbX36ZdNlfU/7A9f3gUw49B3oQsvwBA==",
-      "license": "BSD-2-Clause",
-      "engines": {
-        "node": ">=4.0"
-      }
-    },
-    "node_modules/esutils": {
-      "version": "2.0.3",
-      "resolved": "https://registry.npmjs.org/esutils/-/esutils-2.0.3.tgz",
-      "integrity": "sha512-kVscqXk4OCp68SZ0dkgEKVi6/8ij300KBWTJq32P/dYeWTSwK41WyTxalN1eRmA5Z9UU/LX9D7FWSmV9SAYx6g==",
-      "license": "BSD-2-Clause",
-      "engines": {
-        "node": ">=0.10.0"
-      }
-    },
-    "node_modules/events-universal": {
-      "version": "1.0.1",
-      "resolved": "https://registry.npmjs.org/events-universal/-/events-universal-1.0.1.tgz",
-      "integrity": "sha512-LUd5euvbMLpwOF8m6ivPCbhQeSiYVNb8Vs0fQ8QjXo0JTkEHpz8pxdQf0gStltaPpw0Cca8b39KxvK9cfKRiAw==",
-      "license": "Apache-2.0",
-      "dependencies": {
-        "bare-events": "^2.7.0"
-      }
-    },
-    "node_modules/extract-zip": {
-      "version": "2.0.1",
-      "resolved": "https://registry.npmjs.org/extract-zip/-/extract-zip-2.0.1.tgz",
-      "integrity": "sha512-GDhU9ntwuKyGXdZBUgTIe+vXnWj0fppUEtMDL0+idd5Sta8TGpHssn/eusA9mrPr9qNDym6SxAYZjNvCn/9RBg==",
-      "license": "BSD-2-Clause",
-      "dependencies": {
-        "debug": "^4.1.1",
-        "get-stream": "^5.1.0",
-        "yauzl": "^2.10.0"
-      },
-      "bin": {
-        "extract-zip": "cli.js"
-      },
-      "engines": {
-        "node": ">= 10.17.0"
-      },
-      "optionalDependencies": {
-        "@types/yauzl": "^2.9.1"
-      }
-    },
-    "node_modules/fast-fifo": {
-      "version": "1.3.2",
-      "resolved": "https://registry.npmjs.org/fast-fifo/-/fast-fifo-1.3.2.tgz",
-      "integrity": "sha512-/d9sfos4yxzpwkDkuN7k2SqFKtYNmCTzgfEpz82x34IM9/zc8KGxQoXg1liNC/izpRM/MBdt44Nmx41ZWqk+FQ==",
-      "license": "MIT"
-    },
-    "node_modules/fd-slicer": {
-      "version": "1.1.0",
-      "resolved": "https://registry.npmjs.org/fd-slicer/-/fd-slicer-1.1.0.tgz",
-      "integrity": "sha512-cE1qsB/VwyQozZ+q1dGxR8LBYNZeofhEdUNGSMbQD3Gw2lAzX9Zb3uIU6Ebc/Fmyjo9AWWfnn0AUCHqtevs/8g==",
-      "license": "MIT",
-      "dependencies": {
-        "pend": "~1.2.0"
-      }
-    },
-    "node_modules/get-caller-file": {
-      "version": "2.0.5",
-      "resolved": "https://registry.npmjs.org/get-caller-file/-/get-caller-file-2.0.5.tgz",
-      "integrity": "sha512-DyFP3BM/3YHTQOCUL/w0OZHR0lpKeGrxotcHWcqNEdnltqFwXVfhEBQ94eIo34AfQpo0rGki4cyIiftY06h2Fg==",
-      "license": "ISC",
-      "engines": {
-        "node": "6.* || 8.* || >= 10.*"
-      }
-    },
-    "node_modules/get-stream": {
-      "version": "5.2.0",
-      "resolved": "https://registry.npmjs.org/get-stream/-/get-stream-5.2.0.tgz",
-      "integrity": "sha512-nBF+F1rAZVCu/p7rjzgA+Yb4lfYXrpl7a6VmJrU8wF9I1CKvP/QwPNZHnOlwbTkY6dvtFIzFMSyQXbLoTQPRpA==",
-      "license": "MIT",
-      "dependencies": {
-        "pump": "^3.0.0"
-      },
-      "engines": {
-        "node": ">=8"
-      },
-      "funding": {
-        "url": "https://github.com/sponsors/sindresorhus"
-      }
-    },
-    "node_modules/get-uri": {
-      "version": "6.0.5",
-      "resolved": "https://registry.npmjs.org/get-uri/-/get-uri-6.0.5.tgz",
-      "integrity": "sha512-b1O07XYq8eRuVzBNgJLstU6FYc1tS6wnMtF1I1D9lE8LxZSOGZ7LhxN54yPP6mGw5f2CkXY2BQUL9Fx41qvcIg==",
-      "license": "MIT",
-      "dependencies": {
-        "basic-ftp": "^5.0.2",
-        "data-uri-to-buffer": "^6.0.2",
-        "debug": "^4.3.4"
-      },
-      "engines": {
-        "node": ">= 14"
-      }
-    },
-    "node_modules/http-proxy-agent": {
-      "version": "7.0.2",
-      "resolved": "https://registry.npmjs.org/http-proxy-agent/-/http-proxy-agent-7.0.2.tgz",
-      "integrity": "sha512-T1gkAiYYDWYx3V5Bmyu7HcfcvL7mUrTWiM6yOfa3PIphViJ/gFPbvidQ+veqSOHci/PxBcDabeUNCzpOODJZig==",
-      "license": "MIT",
-      "dependencies": {
-        "agent-base": "^7.1.0",
-        "debug": "^4.3.4"
-      },
-      "engines": {
-        "node": ">= 14"
-      }
-    },
-    "node_modules/https-proxy-agent": {
-      "version": "7.0.6",
-      "resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-7.0.6.tgz",
-      "integrity": "sha512-vK9P5/iUfdl95AI+JVyUuIcVtd4ofvtrOr3HNtM2yxC9bnMbEdp3x01OhQNnjb8IJYi38VlTE3mBXwcfvywuSw==",
-      "license": "MIT",
-      "dependencies": {
-        "agent-base": "^7.1.2",
-        "debug": "4"
-      },
-      "engines": {
-        "node": ">= 14"
-      }
-    },
-    "node_modules/ip-address": {
-      "version": "10.1.0",
-      "resolved": "https://registry.npmjs.org/ip-address/-/ip-address-10.1.0.tgz",
-      "integrity": "sha512-XXADHxXmvT9+CRxhXg56LJovE+bmWnEWB78LB83VZTprKTmaC5QfruXocxzTZ2Kl0DNwKuBdlIhjL8LeY8Sf8Q==",
-      "license": "MIT",
-      "engines": {
-        "node": ">= 12"
-      }
-    },
-    "node_modules/is-fullwidth-code-point": {
-      "version": "3.0.0",
-      "resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-3.0.0.tgz",
-      "integrity": "sha512-zymm5+u+sCsSWyD9qNaejV3DFvhCKclKdizYaJUuHA83RLjb7nSuGnddCHGv0hk+KY7BMAlsWeK4Ueg6EV6XQg==",
-      "license": "MIT",
-      "engines": {
-        "node": ">=8"
-      }
-    },
-    "node_modules/lru-cache": {
-      "version": "7.18.3",
-      "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-7.18.3.tgz",
-      "integrity": "sha512-jumlc0BIUrS3qJGgIkWZsyfAM7NCWiBcCDhnd+3NNM5KbBmLTgHVfWBcg6W+rLUsIpzpERPsvwUP7CckAQSOoA==",
-      "license": "ISC",
-      "engines": {
-        "node": ">=12"
-      }
-    },
-    "node_modules/mitt": {
-      "version": "3.0.1",
-      "resolved": "https://registry.npmjs.org/mitt/-/mitt-3.0.1.tgz",
-      "integrity": "sha512-vKivATfr97l2/QBCYAkXYDbrIWPM2IIKEl7YPhjCvKlG3kE2gm+uBo6nEXK3M5/Ffh/FLpKExzOQ3JJoJGFKBw==",
-      "license": "MIT"
-    },
-    "node_modules/ms": {
-      "version": "2.1.3",
-      "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz",
-      "integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==",
-      "license": "MIT"
-    },
-    "node_modules/netmask": {
-      "version": "2.0.2",
-      "resolved": "https://registry.npmjs.org/netmask/-/netmask-2.0.2.tgz",
-      "integrity": "sha512-dBpDMdxv9Irdq66304OLfEmQ9tbNRFnFTuZiLo+bD+r332bBmMJ8GBLXklIXXgxd3+v9+KUnZaUR5PJMa75Gsg==",
-      "license": "MIT",
-      "engines": {
-        "node": ">= 0.4.0"
-      }
-    },
-    "node_modules/once": {
-      "version": "1.4.0",
-      "resolved": "https://registry.npmjs.org/once/-/once-1.4.0.tgz",
-      "integrity": "sha512-lNaJgI+2Q5URQBkccEKHTQOPaXdUxnZZElQTZY0MFUAuaEqe1E+Nyvgdz/aIyNi6Z9MzO5dv1H8n58/GELp3+w==",
-      "license": "ISC",
-      "dependencies": {
-        "wrappy": "1"
-      }
-    },
-    "node_modules/pac-proxy-agent": {
-      "version": "7.2.0",
-      "resolved": "https://registry.npmjs.org/pac-proxy-agent/-/pac-proxy-agent-7.2.0.tgz",
-      "integrity": "sha512-TEB8ESquiLMc0lV8vcd5Ql/JAKAoyzHFXaStwjkzpOpC5Yv+pIzLfHvjTSdf3vpa2bMiUQrg9i6276yn8666aA==",
-      "license": "MIT",
-      "dependencies": {
-        "@tootallnate/quickjs-emscripten": "^0.23.0",
-        "agent-base": "^7.1.2",
-        "debug": "^4.3.4",
-        "get-uri": "^6.0.1",
-        "http-proxy-agent": "^7.0.0",
-        "https-proxy-agent": "^7.0.6",
-        "pac-resolver": "^7.0.1",
-        "socks-proxy-agent": "^8.0.5"
-      },
-      "engines": {
-        "node": ">= 14"
-      }
-    },
-    "node_modules/pac-resolver": {
-      "version": "7.0.1",
-      "resolved": "https://registry.npmjs.org/pac-resolver/-/pac-resolver-7.0.1.tgz",
-      "integrity": "sha512-5NPgf87AT2STgwa2ntRMr45jTKrYBGkVU36yT0ig/n/GMAa3oPqhZfIQ2kMEimReg0+t9kZViDVZ83qfVUlckg==",
-      "license": "MIT",
-      "dependencies": {
-        "degenerator": "^5.0.0",
-        "netmask": "^2.0.2"
-      },
-      "engines": {
-        "node": ">= 14"
-      }
-    },
-    "node_modules/pend": {
-      "version": "1.2.0",
-      "resolved": "https://registry.npmjs.org/pend/-/pend-1.2.0.tgz",
-      "integrity": "sha512-F3asv42UuXchdzt+xXqfW1OGlVBe+mxa2mqI0pg5yAHZPvFmY3Y6drSf/GQ1A86WgWEN9Kzh/WrgKa6iGcHXLg==",
-      "license": "MIT"
-    },
-    "node_modules/progress": {
-      "version": "2.0.3",
-      "resolved": "https://registry.npmjs.org/progress/-/progress-2.0.3.tgz",
-      "integrity": "sha512-7PiHtLll5LdnKIMw100I+8xJXR5gW2QwWYkT6iJva0bXitZKa/XMrSbdmg3r2Xnaidz9Qumd0VPaMrZlF9V9sA==",
-      "license": "MIT",
-      "engines": {
-        "node": ">=0.4.0"
-      }
-    },
-    "node_modules/proxy-agent": {
-      "version": "6.5.0",
-      "resolved": "https://registry.npmjs.org/proxy-agent/-/proxy-agent-6.5.0.tgz",
-      "integrity": "sha512-TmatMXdr2KlRiA2CyDu8GqR8EjahTG3aY3nXjdzFyoZbmB8hrBsTyMezhULIXKnC0jpfjlmiZ3+EaCzoInSu/A==",
-      "license": "MIT",
-      "dependencies": {
-        "agent-base": "^7.1.2",
-        "debug": "^4.3.4",
-        "http-proxy-agent": "^7.0.1",
-        "https-proxy-agent": "^7.0.6",
-        "lru-cache": "^7.14.1",
-        "pac-proxy-agent": "^7.1.0",
-        "proxy-from-env": "^1.1.0",
-        "socks-proxy-agent": "^8.0.5"
-      },
-      "engines": {
-        "node": ">= 14"
-      }
-    },
-    "node_modules/proxy-from-env": {
-      "version": "1.1.0",
-      "resolved": "https://registry.npmjs.org/proxy-from-env/-/proxy-from-env-1.1.0.tgz",
-      "integrity": "sha512-D+zkORCbA9f1tdWRK0RaCR3GPv50cMxcrz4X8k5LTSUD1Dkw47mKJEZQNunItRTkWwgtaUSo1RVFRIG9ZXiFYg==",
-      "license": "MIT"
-    },
-    "node_modules/pump": {
-      "version": "3.0.3",
-      "resolved": "https://registry.npmjs.org/pump/-/pump-3.0.3.tgz",
-      "integrity": "sha512-todwxLMY7/heScKmntwQG8CXVkWUOdYxIvY2s0VWAAMh/nd8SoYiRaKjlr7+iCs984f2P8zvrfWcDDYVb73NfA==",
-      "license": "MIT",
-      "dependencies": {
-        "end-of-stream": "^1.1.0",
-        "once": "^1.3.1"
-      }
-    },
-    "node_modules/puppeteer-core": {
-      "version": "24.34.0",
-      "resolved": "https://registry.npmjs.org/puppeteer-core/-/puppeteer-core-24.34.0.tgz",
-      "integrity": "sha512-24evawO+mUGW4mvS2a2ivwLdX3gk8zRLZr9HP+7+VT2vBQnm0oh9jJEZmUE3ePJhRkYlZ93i7OMpdcoi2qNCLg==",
-      "license": "Apache-2.0",
-      "dependencies": {
-        "@puppeteer/browsers": "2.11.0",
-        "chromium-bidi": "12.0.1",
-        "debug": "^4.4.3",
-        "devtools-protocol": "0.0.1534754",
-        "typed-query-selector": "^2.12.0",
-        "webdriver-bidi-protocol": "0.3.10",
-        "ws": "^8.18.3"
-      },
-      "engines": {
-        "node": ">=18"
-      }
-    },
-    "node_modules/require-directory": {
-      "version": "2.1.1",
-      "resolved": "https://registry.npmjs.org/require-directory/-/require-directory-2.1.1.tgz",
-      "integrity": "sha512-fGxEI7+wsG9xrvdjsrlmL22OMTTiHRwAMroiEeMgq8gzoLC/PQr7RsRDSTLUg/bZAZtF+TVIkHc6/4RIKrui+Q==",
-      "license": "MIT",
-      "engines": {
-        "node": ">=0.10.0"
-      }
-    },
-    "node_modules/semver": {
-      "version": "7.7.3",
-      "resolved": "https://registry.npmjs.org/semver/-/semver-7.7.3.tgz",
-      "integrity": "sha512-SdsKMrI9TdgjdweUSR9MweHA4EJ8YxHn8DFaDisvhVlUOe4BF1tLD7GAj0lIqWVl+dPb/rExr0Btby5loQm20Q==",
-      "license": "ISC",
-      "bin": {
-        "semver": "bin/semver.js"
-      },
-      "engines": {
-        "node": ">=10"
-      }
-    },
-    "node_modules/smart-buffer": {
-      "version": "4.2.0",
-      "resolved": "https://registry.npmjs.org/smart-buffer/-/smart-buffer-4.2.0.tgz",
-      "integrity": "sha512-94hK0Hh8rPqQl2xXc3HsaBoOXKV20MToPkcXvwbISWLEs+64sBq5kFgn2kJDHb1Pry9yrP0dxrCI9RRci7RXKg==",
-      "license": "MIT",
-      "engines": {
-        "node": ">= 6.0.0",
-        "npm": ">= 3.0.0"
-      }
-    },
-    "node_modules/socks": {
-      "version": "2.8.7",
-      "resolved": "https://registry.npmjs.org/socks/-/socks-2.8.7.tgz",
-      "integrity": "sha512-HLpt+uLy/pxB+bum/9DzAgiKS8CX1EvbWxI4zlmgGCExImLdiad2iCwXT5Z4c9c3Eq8rP2318mPW2c+QbtjK8A==",
-      "license": "MIT",
-      "dependencies": {
-        "ip-address": "^10.0.1",
-        "smart-buffer": "^4.2.0"
-      },
-      "engines": {
-        "node": ">= 10.0.0",
-        "npm": ">= 3.0.0"
-      }
-    },
-    "node_modules/socks-proxy-agent": {
-      "version": "8.0.5",
-      "resolved": "https://registry.npmjs.org/socks-proxy-agent/-/socks-proxy-agent-8.0.5.tgz",
-      "integrity": "sha512-HehCEsotFqbPW9sJ8WVYB6UbmIMv7kUUORIF2Nncq4VQvBfNBLibW9YZR5dlYCSUhwcD628pRllm7n+E+YTzJw==",
-      "license": "MIT",
-      "dependencies": {
-        "agent-base": "^7.1.2",
-        "debug": "^4.3.4",
-        "socks": "^2.8.3"
-      },
-      "engines": {
-        "node": ">= 14"
-      }
-    },
-    "node_modules/source-map": {
-      "version": "0.6.1",
-      "resolved": "https://registry.npmjs.org/source-map/-/source-map-0.6.1.tgz",
-      "integrity": "sha512-UjgapumWlbMhkBgzT7Ykc5YXUT46F0iKu8SGXq0bcwP5dz/h0Plj6enJqjz1Zbq2l5WaqYnrVbwWOWMyF3F47g==",
-      "license": "BSD-3-Clause",
-      "optional": true,
-      "engines": {
-        "node": ">=0.10.0"
-      }
-    },
-    "node_modules/streamx": {
-      "version": "2.23.0",
-      "resolved": "https://registry.npmjs.org/streamx/-/streamx-2.23.0.tgz",
-      "integrity": "sha512-kn+e44esVfn2Fa/O0CPFcex27fjIL6MkVae0Mm6q+E6f0hWv578YCERbv+4m02cjxvDsPKLnmxral/rR6lBMAg==",
-      "license": "MIT",
-      "dependencies": {
-        "events-universal": "^1.0.0",
-        "fast-fifo": "^1.3.2",
-        "text-decoder": "^1.1.0"
-      }
-    },
-    "node_modules/string-width": {
-      "version": "4.2.3",
-      "resolved": "https://registry.npmjs.org/string-width/-/string-width-4.2.3.tgz",
-      "integrity": "sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==",
-      "license": "MIT",
-      "dependencies": {
-        "emoji-regex": "^8.0.0",
-        "is-fullwidth-code-point": "^3.0.0",
-        "strip-ansi": "^6.0.1"
-      },
-      "engines": {
-        "node": ">=8"
-      }
-    },
-    "node_modules/strip-ansi": {
-      "version": "6.0.1",
-      "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz",
-      "integrity": "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==",
-      "license": "MIT",
-      "dependencies": {
-        "ansi-regex": "^5.0.1"
-      },
-      "engines": {
-        "node": ">=8"
-      }
-    },
-    "node_modules/tar-fs": {
-      "version": "3.1.1",
-      "resolved": "https://registry.npmjs.org/tar-fs/-/tar-fs-3.1.1.tgz",
-      "integrity": "sha512-LZA0oaPOc2fVo82Txf3gw+AkEd38szODlptMYejQUhndHMLQ9M059uXR+AfS7DNo0NpINvSqDsvyaCrBVkptWg==",
-      "license": "MIT",
-      "dependencies": {
-        "pump": "^3.0.0",
-        "tar-stream": "^3.1.5"
-      },
-      "optionalDependencies": {
-        "bare-fs": "^4.0.1",
-        "bare-path": "^3.0.0"
-      }
-    },
-    "node_modules/tar-stream": {
-      "version": "3.1.7",
-      "resolved": "https://registry.npmjs.org/tar-stream/-/tar-stream-3.1.7.tgz",
-      "integrity": "sha512-qJj60CXt7IU1Ffyc3NJMjh6EkuCFej46zUqJ4J7pqYlThyd9bO0XBTmcOIhSzZJVWfsLks0+nle/j538YAW9RQ==",
-      "license": "MIT",
-      "dependencies": {
-        "b4a": "^1.6.4",
-        "fast-fifo": "^1.2.0",
-        "streamx": "^2.15.0"
-      }
-    },
-    "node_modules/text-decoder": {
-      "version": "1.2.3",
-      "resolved": "https://registry.npmjs.org/text-decoder/-/text-decoder-1.2.3.tgz",
-      "integrity": "sha512-3/o9z3X0X0fTupwsYvR03pJ/DjWuqqrfwBgTQzdWDiQSm9KitAyz/9WqsT2JQW7KV2m+bC2ol/zqpW37NHxLaA==",
-      "license": "Apache-2.0",
-      "dependencies": {
-        "b4a": "^1.6.4"
-      }
-    },
-    "node_modules/tslib": {
-      "version": "2.8.1",
-      "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.8.1.tgz",
-      "integrity": "sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w==",
-      "license": "0BSD"
-    },
-    "node_modules/typed-query-selector": {
-      "version": "2.12.0",
-      "resolved": "https://registry.npmjs.org/typed-query-selector/-/typed-query-selector-2.12.0.tgz",
-      "integrity": "sha512-SbklCd1F0EiZOyPiW192rrHZzZ5sBijB6xM+cpmrwDqObvdtunOHHIk9fCGsoK5JVIYXoyEp4iEdE3upFH3PAg==",
-      "license": "MIT"
-    },
-    "node_modules/undici-types": {
-      "version": "7.16.0",
-      "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-7.16.0.tgz",
-      "integrity": "sha512-Zz+aZWSj8LE6zoxD+xrjh4VfkIG8Ya6LvYkZqtUQGJPZjYl53ypCaUwWqo7eI0x66KBGeRo+mlBEkMSeSZ38Nw==",
-      "license": "MIT",
-      "optional": true
-    },
-    "node_modules/webdriver-bidi-protocol": {
-      "version": "0.3.10",
-      "resolved": "https://registry.npmjs.org/webdriver-bidi-protocol/-/webdriver-bidi-protocol-0.3.10.tgz",
-      "integrity": "sha512-5LAE43jAVLOhB/QqX4bwSiv0Hg1HBfMmOuwBSXHdvg4GMGu9Y0lIq7p4R/yySu6w74WmaR4GM4H9t2IwLW7hgw==",
-      "license": "Apache-2.0"
-    },
-    "node_modules/wrap-ansi": {
-      "version": "7.0.0",
-      "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-7.0.0.tgz",
-      "integrity": "sha512-YVGIj2kamLSTxw6NsZjoBxfSwsn0ycdesmc4p+Q21c5zPuZ1pl+NfxVdxPtdHvmNVOQ6XSYG4AUtyt/Fi7D16Q==",
-      "license": "MIT",
-      "dependencies": {
-        "ansi-styles": "^4.0.0",
-        "string-width": "^4.1.0",
-        "strip-ansi": "^6.0.0"
-      },
-      "engines": {
-        "node": ">=10"
-      },
-      "funding": {
-        "url": "https://github.com/chalk/wrap-ansi?sponsor=1"
-      }
-    },
-    "node_modules/wrappy": {
-      "version": "1.0.2",
-      "resolved": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz",
-      "integrity": "sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ==",
-      "license": "ISC"
-    },
-    "node_modules/ws": {
-      "version": "8.18.3",
-      "resolved": "https://registry.npmjs.org/ws/-/ws-8.18.3.tgz",
-      "integrity": "sha512-PEIGCY5tSlUt50cqyMXfCzX+oOPqN0vuGqWzbcJ2xvnkzkq46oOpz7dQaTDBdfICb4N14+GARUDw2XV2N4tvzg==",
-      "license": "MIT",
-      "engines": {
-        "node": ">=10.0.0"
-      },
-      "peerDependencies": {
-        "bufferutil": "^4.0.1",
-        "utf-8-validate": ">=5.0.2"
-      },
-      "peerDependenciesMeta": {
-        "bufferutil": {
-          "optional": true
-        },
-        "utf-8-validate": {
-          "optional": true
-        }
-      }
-    },
-    "node_modules/y18n": {
-      "version": "5.0.8",
-      "resolved": "https://registry.npmjs.org/y18n/-/y18n-5.0.8.tgz",
-      "integrity": "sha512-0pfFzegeDWJHJIAmTLRP2DwHjdF5s7jo9tuztdQxAhINCdvS+3nGINqPd00AphqJR/0LhANUS6/+7SCb98YOfA==",
-      "license": "ISC",
-      "engines": {
-        "node": ">=10"
-      }
-    },
-    "node_modules/yargs": {
-      "version": "17.7.2",
-      "resolved": "https://registry.npmjs.org/yargs/-/yargs-17.7.2.tgz",
-      "integrity": "sha512-7dSzzRQ++CKnNI/krKnYRV7JKKPUXMEh61soaHKg9mrWEhzFWhFnxPxGl+69cD1Ou63C13NUPCnmIcrvqCuM6w==",
-      "license": "MIT",
-      "dependencies": {
-        "cliui": "^8.0.1",
-        "escalade": "^3.1.1",
-        "get-caller-file": "^2.0.5",
-        "require-directory": "^2.1.1",
-        "string-width": "^4.2.3",
-        "y18n": "^5.0.5",
-        "yargs-parser": "^21.1.1"
-      },
-      "engines": {
-        "node": ">=12"
-      }
-    },
-    "node_modules/yargs-parser": {
-      "version": "21.1.1",
-      "resolved": "https://registry.npmjs.org/yargs-parser/-/yargs-parser-21.1.1.tgz",
-      "integrity": "sha512-tVpsJW7DdjecAiFpbIB1e3qxIQsE6NoPc5/eTdrbbIC4h0LVsWhnoa3g+m2HclBIujHzsxZ4VJVA+GUuc2/LBw==",
-      "license": "ISC",
-      "engines": {
-        "node": ">=12"
-      }
-    },
-    "node_modules/yauzl": {
-      "version": "2.10.0",
-      "resolved": "https://registry.npmjs.org/yauzl/-/yauzl-2.10.0.tgz",
-      "integrity": "sha512-p4a9I6X6nu6IhoGmBqAcbJy1mlC4j27vEPZX9F4L4/vZT3Lyq1VkFHw/V/PUcB9Buo+DG3iHkT0x3Qya58zc3g==",
-      "license": "MIT",
-      "dependencies": {
-        "buffer-crc32": "~0.2.3",
-        "fd-slicer": "~1.1.0"
-      }
-    },
-    "node_modules/zod": {
-      "version": "3.25.76",
-      "resolved": "https://registry.npmjs.org/zod/-/zod-3.25.76.tgz",
-      "integrity": "sha512-gzUt/qt81nXsFGKIFcC3YnfEAx5NkunCfnDlvuBSSFS02bcXu4Lmea0AFIUwbLWxWPx3d9p8S5QoaujKcNQxcQ==",
-      "license": "MIT",
-      "funding": {
-        "url": "https://github.com/sponsors/colinhacks"
-      }
-    }
-  }
-}

+ 0 - 1
archivebox/plugins/package.json

@@ -1 +0,0 @@
-{"name":"archivebox-plugins","private":true,"dependencies":{"puppeteer-core":"^24.34.0"}}

+ 2 - 1
archivebox/plugins/papersdl/config.json

@@ -3,9 +3,10 @@
   "type": "object",
   "additionalProperties": false,
   "properties": {
-    "SAVE_PAPERSDL": {
+    "PAPERSDL_ENABLED": {
       "type": "boolean",
       "default": true,
+      "x-aliases": ["SAVE_PAPERSDL", "USE_PAPERSDL"],
       "description": "Enable paper downloading with papers-dl"
     },
     "PAPERSDL_BINARY": {

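The rename above keeps old installs working: PAPERSDL_ENABLED becomes the canonical key, while the new x-aliases list preserves the legacy SAVE_PAPERSDL and USE_PAPERSDL spellings. A minimal sketch of what an alias-aware lookup could look like (the resolver function here is illustrative, not ArchiveBox's actual config loader):

    # Hypothetical alias-aware config lookup; the schema fragment mirrors config.json above.
    schema = {
        "properties": {
            "PAPERSDL_ENABLED": {
                "type": "boolean",
                "default": True,
                "x-aliases": ["SAVE_PAPERSDL", "USE_PAPERSDL"],
            },
        },
    }

    def resolve_config_value(schema: dict, env: dict, key: str):
        """Return env[key], falling back to legacy x-aliases, then the schema default."""
        prop = schema["properties"][key]
        # Prefer the canonical name, then each legacy alias in declaration order
        for name in (key, *prop.get("x-aliases", [])):
            if name in env:
                return env[name]
        return prop.get("default")

    assert resolve_config_value(schema, {}, "PAPERSDL_ENABLED") is True
    assert resolve_config_value(schema, {"SAVE_PAPERSDL": False}, "PAPERSDL_ENABLED") is False
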
+ 11 - 5
archivebox/plugins/parse_html_urls/on_Snapshot__70_parse_html_urls.py

@@ -170,10 +170,6 @@ def main(url: str, snapshot_id: str = None, crawl_id: str = None, depth: int = 0
             if normalized != url:
                 urls_found.add(unescape(normalized))
 
-    if not urls_found:
-        click.echo('No URLs found', err=True)
-        sys.exit(1)
-
     # Emit Snapshot records to stdout (JSONL)
     for found_url in sorted(urls_found):
         record = {
@@ -189,7 +185,17 @@ def main(url: str, snapshot_id: str = None, crawl_id: str = None, depth: int = 0
 
         print(json.dumps(record))
 
-    click.echo(f'Found {len(urls_found)} URLs', err=True)
+    # Emit ArchiveResult record to mark completion
+    status = 'succeeded' if urls_found else 'skipped'
+    output_str = f'Found {len(urls_found)} URLs' if urls_found else 'No URLs found'
+    ar_record = {
+        'type': 'ArchiveResult',
+        'status': status,
+        'output_str': output_str,
+    }
+    print(json.dumps(ar_record))
+
+    click.echo(output_str, err=True)
     sys.exit(0)
 
 

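With this change the hook no longer exits 1 on link-free pages: it always exits 0, emitting one Snapshot record per discovered URL plus a trailing ArchiveResult record ("succeeded" or "skipped") on stdout, and the human-readable summary on stderr. A rough sketch of how a consumer might split that JSONL stream by record type (the sample lines and field values are illustrative, shaped like the hunk above):

    import json

    # Example stdout in the shape the hook now emits (values illustrative):
    stdout = '\n'.join([
        '{"type": "Snapshot", "url": "https://example.com/page", "plugin": "parse_html_urls"}',
        '{"type": "ArchiveResult", "status": "succeeded", "output_str": "Found 1 URLs"}',
    ])

    snapshots, final_status = [], None
    for line in stdout.splitlines():
        if not line.strip():
            continue
        record = json.loads(line)
        if record['type'] == 'Snapshot':
            snapshots.append(record)          # discovered outlink to enqueue
        elif record['type'] == 'ArchiveResult':
            final_status = record['status']   # 'succeeded' or 'skipped'

    print(f'{len(snapshots)} outlink(s), final status: {final_status}')
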
+ 32 - 29
archivebox/plugins/parse_html_urls/tests/test_parse_html_urls.py

@@ -27,12 +27,13 @@ class TestParseHtmlUrls:
 
         assert result.returncode == 0, f"Failed to parse example.com: {result.stderr}"
 
-        output_file = tmp_path / 'urls.jsonl'
-        assert output_file.exists(), "Output file not created"
+        # Verify stdout contains JSONL records for discovered URLs
+        # example.com links to iana.org
+        assert 'iana.org' in result.stdout or 'example' in result.stdout, "Expected links from example.com not found"
 
-        # Verify output contains IANA link (example.com links to iana.org)
-        content = output_file.read_text()
-        assert 'iana.org' in content or 'example' in content, "Expected links from example.com not found"
+        # Verify ArchiveResult record is present
+        assert '"type": "ArchiveResult"' in result.stdout, "Missing ArchiveResult record"
+        assert '"status": "succeeded"' in result.stdout, "Missing success status"
 
     def test_extracts_href_urls(self, tmp_path):
         """Test extracting URLs from anchor tags."""
@@ -56,17 +57,16 @@ class TestParseHtmlUrls:
         )
 
         assert result.returncode == 0
-        assert 'Found 3 URLs' in result.stdout
+        assert 'Found 3 URLs' in result.stderr
 
-        output_file = tmp_path / 'urls.jsonl'
-        assert output_file.exists()
-
-        lines = output_file.read_text().strip().split('\n')
-        assert len(lines) == 3
+        # Parse Snapshot records from stdout
+        lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '"type": "Snapshot"' in line]
+        assert len(lines) == 3, f"Expected 3 Snapshot records, got {len(lines)}"
 
         urls = set()
         for line in lines:
             entry = json.loads(line)
+            assert entry['type'] == 'Snapshot'
             assert 'url' in entry
             urls.add(entry['url'])
 
@@ -74,6 +74,10 @@ class TestParseHtmlUrls:
         assert 'https://foo.bar/page' in urls
         assert 'http://test.org' in urls
 
+        # Verify ArchiveResult record
+        assert '"type": "ArchiveResult"' in result.stdout
+        assert '"status": "succeeded"' in result.stdout
+
     def test_ignores_non_http_schemes(self, tmp_path):
         """Test that non-http schemes are ignored."""
         input_file = tmp_path / 'page.html'
@@ -96,9 +100,10 @@ class TestParseHtmlUrls:
         )
 
         assert result.returncode == 0
-        output_file = tmp_path / 'urls.jsonl'
-        lines = output_file.read_text().strip().split('\n')
-        assert len(lines) == 1
+
+        # Parse Snapshot records from stdout
+        lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '"type": "Snapshot"' in line]
+        assert len(lines) == 1, f"Expected 1 Snapshot record, got {len(lines)}"
 
         entry = json.loads(lines[0])
         assert entry['url'] == 'https://valid.com'
@@ -122,8 +127,8 @@ class TestParseHtmlUrls:
         )
 
         assert result.returncode == 0
-        output_file = tmp_path / 'urls.jsonl'
-        entry = json.loads(output_file.read_text().strip())
+        lines = [line for line in result.stdout.strip().split('\n') if '"type": "Snapshot"' in line]
+        entry = json.loads(lines[0])
         assert entry['url'] == 'https://example.com/page?a=1&b=2'
 
     def test_deduplicates_urls(self, tmp_path):
@@ -147,8 +152,7 @@ class TestParseHtmlUrls:
         )
 
         assert result.returncode == 0
-        output_file = tmp_path / 'urls.jsonl'
-        lines = output_file.read_text().strip().split('\n')
+        lines = [line for line in result.stdout.strip().split('\n') if '"type": "Snapshot"' in line]
         assert len(lines) == 1
 
     def test_excludes_source_url(self, tmp_path):
@@ -172,14 +176,13 @@ class TestParseHtmlUrls:
         )
 
         assert result.returncode == 0
-        output_file = tmp_path / 'urls.jsonl'
-        lines = output_file.read_text().strip().split('\n')
+        lines = [line for line in result.stdout.strip().split('\n') if '"type": "Snapshot"' in line]
         assert len(lines) == 1
         entry = json.loads(lines[0])
         assert entry['url'] == 'https://other.com'
 
-    def test_exits_1_when_no_urls_found(self, tmp_path):
-        """Test that script exits with code 1 when no URLs found."""
+    def test_skips_when_no_urls_found(self, tmp_path):
+        """Test that script returns skipped status when no URLs found."""
         input_file = tmp_path / 'page.html'
         input_file.write_text('<html><body>No links here</body></html>')
 
@@ -190,8 +193,9 @@ class TestParseHtmlUrls:
             text=True,
         )
 
-        assert result.returncode == 1
+        assert result.returncode == 0
         assert 'No URLs found' in result.stderr
+        assert '"status": "skipped"' in result.stdout
 
     def test_handles_malformed_html(self, tmp_path):
         """Test handling of malformed HTML."""
@@ -212,8 +216,7 @@ class TestParseHtmlUrls:
         )
 
         assert result.returncode == 0
-        output_file = tmp_path / 'urls.jsonl'
-        lines = output_file.read_text().strip().split('\n')
+        lines = [line for line in result.stdout.strip().split('\n') if '"type": "Snapshot"' in line]
         assert len(lines) == 2
 
     def test_output_is_valid_json(self, tmp_path):
@@ -229,11 +232,11 @@ class TestParseHtmlUrls:
         )
 
         assert result.returncode == 0
-        output_file = tmp_path / 'urls.jsonl'
-        entry = json.loads(output_file.read_text().strip())
+        lines = [line for line in result.stdout.strip().split('\n') if '"type": "Snapshot"' in line]
+        entry = json.loads(lines[0])
         assert entry['url'] == 'https://example.com'
-        assert 'type' in entry
-        assert 'plugin' in entry
+        assert entry['type'] == 'Snapshot'
+        assert entry['plugin'] == 'parse_html_urls'
 
 
 if __name__ == '__main__':

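The stdout-filtering idiom repeated throughout these tests could eventually be factored into a shared helper; one possible shape (a suggestion, not code from this commit):

    import json

    def records_of_type(stdout: str, record_type: str) -> list[dict]:
        """Parse JSONL stdout, returning only records whose 'type' field matches."""
        records = []
        for line in stdout.splitlines():
            line = line.strip()
            if not line:
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                continue  # ignore any non-JSON noise on stdout
            if record.get('type') == record_type:
                records.append(record)
        return records

    # usage inside a test:
    #   snapshots = records_of_type(result.stdout, 'Snapshot')
    #   assert len(snapshots) == 3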