Improve concurrency control between plugin hooks (#1721)

Nick Sweeting · 1 month ago · commit 54f91c1339
30 changed files with 327 additions and 127 deletions
  1. TODO_hook_concurrency.md (+48 -36)
  2. archivebox/cli/archivebox_status.py (+27 -41)
  3. archivebox/core/migrations/0032_alter_archiveresult_binary_and_more.py (+2 -2)
  4. archivebox/core/migrations/0034_snapshot_current_step.py (+23 -0)
  5. archivebox/core/models.py (+116 -37)
  6. archivebox/core/statemachines.py (+5 -0)
  7. archivebox/hooks.py (+73 -2)
  8. archivebox/machine/migrations/0003_alter_dependency_id_alter_installedbinary_dependency_and_more.py (+5 -5)
  9. archivebox/plugins/dom/on_Snapshot__53_dom.js (+0 -0, renamed)
  10. archivebox/plugins/forumdl/on_Snapshot__65_forumdl.bg.py (+0 -0, renamed)
  11. archivebox/plugins/gallerydl/on_Snapshot__64_gallerydl.bg.py (+0 -0, renamed)
  12. archivebox/plugins/git/on_Snapshot__62_git.py (+0 -0, renamed)
  13. archivebox/plugins/headers/on_Snapshot__55_headers.js (+0 -0, renamed)
  14. archivebox/plugins/htmltotext/on_Snapshot__57_htmltotext.py (+0 -0, renamed)
  15. archivebox/plugins/media/on_Snapshot__63_media.bg.py (+0 -0, renamed)
  16. archivebox/plugins/mercury/on_Snapshot__56_mercury.py (+0 -0, renamed)
  17. archivebox/plugins/papersdl/on_Snapshot__66_papersdl.bg.py (+0 -0, renamed)
  18. archivebox/plugins/parse_dom_outlinks/on_Snapshot__75_parse_dom_outlinks.js (+0 -0, renamed)
  19. archivebox/plugins/parse_html_urls/on_Snapshot__70_parse_html_urls.py (+0 -0, renamed)
  20. archivebox/plugins/parse_jsonl_urls/on_Snapshot__74_parse_jsonl_urls.py (+0 -0, renamed)
  21. archivebox/plugins/parse_netscape_urls/on_Snapshot__73_parse_netscape_urls.py (+0 -0, renamed)
  22. archivebox/plugins/parse_rss_urls/on_Snapshot__72_parse_rss_urls.py (+0 -0, renamed)
  23. archivebox/plugins/parse_txt_urls/on_Snapshot__71_parse_txt_urls.py (+0 -0, renamed)
  24. archivebox/plugins/pdf/on_Snapshot__52_pdf.js (+0 -0, renamed)
  25. archivebox/plugins/readability/on_Snapshot__55_readability.py (+0 -0, renamed)
  26. archivebox/plugins/screenshot/on_Snapshot__51_screenshot.js (+0 -0, renamed)
  27. archivebox/plugins/singlefile/on_Snapshot__50_singlefile.py (+0 -0, renamed)
  28. archivebox/plugins/title/on_Snapshot__54_title.js (+0 -0, renamed)
  29. archivebox/plugins/wget/on_Snapshot__61_wget.py (+0 -0, renamed)
  30. archivebox/workers/worker.py (+28 -4)

+ 48 - 36
TODO_hook_concurrency.md

@@ -310,37 +310,40 @@ archivebox/plugins/{plugin_name}/
 ## Implementation Checklist
 
 ### Phase 1: Schema Migration ✅
-- [ ] Add `Snapshot.current_step` (IntegerField 0-9, default=0)
-- [ ] Add `ArchiveResult.hook_name` (CharField, nullable) - just filename
-- [ ] Create migration: `0033_snapshot_current_step_archiveresult_hook_name.py`
+- [x] Add `Snapshot.current_step` (IntegerField 0-9, default=0)
+- [x] Add `ArchiveResult.hook_name` (CharField, nullable) - just filename
+- [x] Create migration: `0034_snapshot_current_step.py`
 
-### Phase 2: Core Logic Updates
-- [ ] Add `extract_step(hook_name)` utility in `archivebox/hooks.py`
+### Phase 2: Core Logic Updates
+- [x] Add `extract_step(hook_name)` utility in `archivebox/hooks.py`
   - Extract first digit from `__XX_` pattern
   - Default to 9 for unnumbered hooks
-- [ ] Update `Snapshot.create_pending_archiveresults()` in `archivebox/core/models.py`:
+- [x] Add `is_background_hook(hook_name)` utility in `archivebox/hooks.py`
+  - Check for `.bg.` in filename
+- [x] Update `Snapshot.create_pending_archiveresults()` in `archivebox/core/models.py`:
   - Discover all hooks (not plugins)
   - Create one AR per hook with `hook_name` set
-- [ ] Update `ArchiveResult.run()` in `archivebox/core/models.py`:
+- [x] Update `ArchiveResult.run()` in `archivebox/core/models.py`:
   - If `hook_name` set: run single hook
   - If `hook_name` None: discover all plugin hooks (existing behavior)
-- [ ] Add `Snapshot.advance_step_if_ready()` method:
+- [x] Add `Snapshot.advance_step_if_ready()` method:
   - Check if all foreground ARs in current step finished
   - Increment `current_step` if ready
   - Ignore background hooks (.bg) in completion check
-- [ ] Integrate with `SnapshotMachine.is_finished()` in `archivebox/core/statemachines.py`:
+- [x] Integrate with `SnapshotMachine.is_finished()` in `archivebox/core/statemachines.py`:
   - Call `advance_step_if_ready()` before checking if done
 
-### Phase 3: Worker Coordination
-- [ ] Update worker AR claiming query in `archivebox/workers/worker.py`:
+### Phase 3: Worker Coordination
+- [x] Update worker AR claiming query in `archivebox/workers/worker.py`:
   - Filter: `extract_step(ar.hook_name) <= snapshot.current_step`
-  - Note: May need to denormalize or use clever query since step is derived
-  - Alternative: Claim any AR in QUEUED state, check step in Python before processing
+  - Claims ARs in QUEUED state, checks step in Python before processing
+  - Orders by hook_name for deterministic execution within step
 
-### Phase 4: Hook Renumbering
-- [ ] Renumber hooks per renumbering map below
-- [ ] Add `.bg` suffix to long-running hooks
-- [ ] Test all hooks still work after renumbering
+### Phase 4: Hook Renumbering ✅
+- [x] Renumber hooks per renumbering map below
+- [x] Add `.bg` suffix to long-running hooks (media, gallerydl, forumdl, papersdl)
+- [x] Move parse_* hooks to step 7 (70-79)
+- [x] Test all hooks still work after renumbering
 
 ## Migration Path
 
@@ -353,25 +356,34 @@ No special migration needed:
 
 ### Renumbering Map
 
-**Current → New:**
-```
-git/on_Snapshot__12_git.py                    → git/on_Snapshot__62_git.py
-media/on_Snapshot__51_media.py                → media/on_Snapshot__63_media.bg.py
-gallerydl/on_Snapshot__52_gallerydl.py        → gallerydl/on_Snapshot__64_gallerydl.bg.py
-forumdl/on_Snapshot__53_forumdl.py            → forumdl/on_Snapshot__65_forumdl.bg.py
-papersdl/on_Snapshot__54_papersdl.py          → papersdl/on_Snapshot__66_papersdl.bg.py
-
-readability/on_Snapshot__52_readability.py    → readability/on_Snapshot__55_readability.py
-mercury/on_Snapshot__53_mercury.py            → mercury/on_Snapshot__56_mercury.py
-
-singlefile/on_Snapshot__37_singlefile.py      → singlefile/on_Snapshot__50_singlefile.py
-screenshot/on_Snapshot__34_screenshot.js      → screenshot/on_Snapshot__51_screenshot.js
-pdf/on_Snapshot__35_pdf.js                    → pdf/on_Snapshot__52_pdf.js
-dom/on_Snapshot__36_dom.js                    → dom/on_Snapshot__53_dom.js
-title/on_Snapshot__32_title.js                → title/on_Snapshot__54_title.js
-headers/on_Snapshot__33_headers.js            → headers/on_Snapshot__55_headers.js
-
-wget/on_Snapshot__50_wget.py                  → wget/on_Snapshot__61_wget.py
+**Completed Renames:**
+```
+# Step 5: DOM Extraction (sequential, non-background)
+singlefile/on_Snapshot__37_singlefile.py      → singlefile/on_Snapshot__50_singlefile.py ✅
+screenshot/on_Snapshot__34_screenshot.js      → screenshot/on_Snapshot__51_screenshot.js ✅
+pdf/on_Snapshot__35_pdf.js                    → pdf/on_Snapshot__52_pdf.js ✅
+dom/on_Snapshot__36_dom.js                    → dom/on_Snapshot__53_dom.js ✅
+title/on_Snapshot__32_title.js                → title/on_Snapshot__54_title.js ✅
+readability/on_Snapshot__52_readability.py    → readability/on_Snapshot__55_readability.py ✅
+headers/on_Snapshot__33_headers.js            → headers/on_Snapshot__55_headers.js ✅
+mercury/on_Snapshot__53_mercury.py            → mercury/on_Snapshot__56_mercury.py ✅
+htmltotext/on_Snapshot__54_htmltotext.py      → htmltotext/on_Snapshot__57_htmltotext.py ✅
+
+# Step 6: Post-DOM Extraction (background for long-running)
+wget/on_Snapshot__50_wget.py                  → wget/on_Snapshot__61_wget.py ✅
+git/on_Snapshot__12_git.py                    → git/on_Snapshot__62_git.py ✅
+media/on_Snapshot__51_media.py                → media/on_Snapshot__63_media.bg.py ✅
+gallerydl/on_Snapshot__52_gallerydl.py        → gallerydl/on_Snapshot__64_gallerydl.bg.py ✅
+forumdl/on_Snapshot__53_forumdl.py            → forumdl/on_Snapshot__65_forumdl.bg.py ✅
+papersdl/on_Snapshot__54_papersdl.py          → papersdl/on_Snapshot__66_papersdl.bg.py ✅
+
+# Step 7: URL Extraction (parse_* hooks moved from step 6)
+parse_html_urls/on_Snapshot__60_parse_html_urls.py      → parse_html_urls/on_Snapshot__70_parse_html_urls.py ✅
+parse_txt_urls/on_Snapshot__62_parse_txt_urls.py        → parse_txt_urls/on_Snapshot__71_parse_txt_urls.py ✅
+parse_rss_urls/on_Snapshot__61_parse_rss_urls.py        → parse_rss_urls/on_Snapshot__72_parse_rss_urls.py ✅
+parse_netscape_urls/on_Snapshot__63_parse_netscape_urls.py → parse_netscape_urls/on_Snapshot__73_parse_netscape_urls.py ✅
+parse_jsonl_urls/on_Snapshot__64_parse_jsonl_urls.py    → parse_jsonl_urls/on_Snapshot__74_parse_jsonl_urls.py ✅
+parse_dom_outlinks/on_Snapshot__40_parse_dom_outlinks.js → parse_dom_outlinks/on_Snapshot__75_parse_dom_outlinks.js ✅
 ```
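The map above encodes the scheduling rule: the first digit of the two-digit `__XX_` prefix selects the step (0-9), and hooks sharing a step may run in parallel while steps run in order. A minimal sketch of how the renamed hooks group into steps, using a hypothetical `step_of` helper and a few of the filenames above as sample data:

```python
import re
from collections import defaultdict

def step_of(hook_name: str) -> int:
    # First digit of the two-digit __XX_ prefix is the step; unnumbered hooks default to 9.
    match = re.search(r'__(\d{2})_', hook_name)
    return int(match.group(1)) // 10 if match else 9

renamed = [
    'on_Snapshot__50_singlefile.py',
    'on_Snapshot__55_readability.py',
    'on_Snapshot__61_wget.py',
    'on_Snapshot__63_media.bg.py',
    'on_Snapshot__70_parse_html_urls.py',
]

# Group by step: steps run sequentially, members of a step may run in parallel.
by_step = defaultdict(list)
for name in renamed:
    by_step[step_of(name)].append(name)

print(sorted(by_step))  # → [5, 6, 7]
```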
 
 ## Testing Strategy

+ 27 - 41
archivebox/cli/archivebox_status.py

@@ -11,18 +11,6 @@ from archivebox.misc.util import enforce_types, docstring
 from archivebox.config import DATA_DIR, CONSTANTS, ARCHIVE_DIR
 from archivebox.config.common import SHELL_CONFIG
 from archivebox.misc.legacy import parse_json_links_details
-from archivebox.misc.folders import (
-    get_indexed_folders,
-    get_archived_folders,
-    get_invalid_folders,
-    get_unarchived_folders,
-    get_present_folders,
-    get_valid_folders,
-    get_duplicate_folders,
-    get_orphaned_folders,
-    get_corrupted_folders,
-    get_unrecognized_folders,
-)
 from archivebox.misc.system import get_dir_size
 from archivebox.misc.logging_util import printable_filesize
 
@@ -55,42 +43,40 @@ def status(out_dir: Path=DATA_DIR) -> None:
     size = printable_filesize(num_bytes)
     print(f'    Size: {size} across {num_files} files in {num_dirs} directories')
 
-    num_indexed = len(get_indexed_folders(links, out_dir=out_dir))
-    num_archived = len(get_archived_folders(links, out_dir=out_dir))
-    num_unarchived = len(get_unarchived_folders(links, out_dir=out_dir))
-    print(f'    > indexed: {num_indexed}'.ljust(36), f'({get_indexed_folders.__doc__})')
-    print(f'      > archived: {num_archived}'.ljust(36), f'({get_archived_folders.__doc__})')
-    print(f'      > unarchived: {num_unarchived}'.ljust(36), f'({get_unarchived_folders.__doc__})')
-    
-    num_present = len(get_present_folders(links, out_dir=out_dir))
-    num_valid = len(get_valid_folders(links, out_dir=out_dir))
+    # Use DB as source of truth for snapshot status
+    num_indexed = links.count()
+    num_archived = links.filter(status='archived').count() or links.exclude(downloaded_at=None).count()
+    num_unarchived = links.filter(status='queued').count() or links.filter(downloaded_at=None).count()
+    print(f'    > indexed: {num_indexed}'.ljust(36), '(total snapshots in DB)')
+    print(f'      > archived: {num_archived}'.ljust(36), '(snapshots with archived content)')
+    print(f'      > unarchived: {num_unarchived}'.ljust(36), '(snapshots pending archiving)')
+
+    # Count directories on filesystem
+    num_present = 0
+    orphaned_dirs = []
+    if ARCHIVE_DIR.exists():
+        for entry in ARCHIVE_DIR.iterdir():
+            if entry.is_dir():
+                num_present += 1
+                if not links.filter(timestamp=entry.name).exists():
+                    orphaned_dirs.append(str(entry))
+
+    num_valid = min(num_present, num_indexed)  # approximate
     print()
-    print(f'    > present: {num_present}'.ljust(36), f'({get_present_folders.__doc__})')
-    print(f'      > [green]valid:[/green] {num_valid}'.ljust(36), f'               ({get_valid_folders.__doc__})')
-    
-    duplicate = get_duplicate_folders(links, out_dir=out_dir)
-    orphaned = get_orphaned_folders(links, out_dir=out_dir)
-    corrupted = get_corrupted_folders(links, out_dir=out_dir)
-    unrecognized = get_unrecognized_folders(links, out_dir=out_dir)
-    num_invalid = len({**duplicate, **orphaned, **corrupted, **unrecognized})
-    print(f'      > [red]invalid:[/red] {num_invalid}'.ljust(36), f'           ({get_invalid_folders.__doc__})')
-    print(f'        > duplicate: {len(duplicate)}'.ljust(36), f'({get_duplicate_folders.__doc__})')
-    print(f'        > orphaned: {len(orphaned)}'.ljust(36), f'({get_orphaned_folders.__doc__})')
-    print(f'        > corrupted: {len(corrupted)}'.ljust(36), f'({get_corrupted_folders.__doc__})')
-    print(f'        > unrecognized: {len(unrecognized)}'.ljust(36), f'({get_unrecognized_folders.__doc__})')
+    print(f'    > present: {num_present}'.ljust(36), '(directories in archive/)')
+    print(f'      > [green]valid:[/green] {num_valid}'.ljust(36), '               (directories with matching DB entry)')
+
+    num_orphaned = len(orphaned_dirs)
+    print(f'      > [red]orphaned:[/red] {num_orphaned}'.ljust(36), '         (directories without matching DB entry)')
 
     if num_indexed:
-        print('    [violet]Hint:[/violet] You can list link data directories by status like so:')
-        print('        [green]archivebox list --status=<status>  (e.g. indexed, corrupted, archived, etc.)[/green]')
+        print('    [violet]Hint:[/violet] You can list snapshots by status like so:')
+        print('        [green]archivebox list --status=<status>  (e.g. archived, queued, etc.)[/green]')
 
-    if orphaned:
+    if orphaned_dirs:
         print('    [violet]Hint:[/violet] To automatically import orphaned data directories into the main index, run:')
         print('        [green]archivebox init[/green]')
 
-    if num_invalid:
-        print('    [violet]Hint:[/violet] You may need to manually remove or fix some invalid data directories, afterwards make sure to run:')
-        print('        [green]archivebox init[/green]')
-    
     print()
     print('[green]\\[*] Scanning recent archive changes and user logins:[/green]')
     print(f'[yellow]   {CONSTANTS.LOGS_DIR}/*[/yellow]')
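The directory scan above issues one `.exists()` query per directory in `archive/`. A sketch of the same orphan check done against a single set of timestamps (which could be fetched once up front, e.g. with Django's `values_list('timestamp', flat=True)`), with plain strings standing in for directory entries:

```python
def find_orphans(dir_names, db_timestamps):
    """Directory names present on disk but absent from the snapshot index.

    `db_timestamps` stands in for the set of Snapshot timestamps, which the
    real code could fetch in one query instead of one .exists() per directory.
    """
    known = set(db_timestamps)
    return sorted(name for name in dir_names if name not in known)

print(find_orphans(['1700000001', '1700000002'], {'1700000001'}))  # → ['1700000002']
```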

+ 2 - 2
archivebox/core/migrations/0032_alter_archiveresult_binary_and_more.py

@@ -1,7 +1,7 @@
 # Generated by Django 6.0 on 2025-12-28 05:12
 
 import django.db.models.deletion
-import uuid
+from archivebox import uuid_compat
 from django.conf import settings
 from django.db import migrations, models
 
@@ -49,7 +49,7 @@ class Migration(migrations.Migration):
         migrations.AlterField(
             model_name='archiveresult',
             name='uuid',
-            field=models.UUIDField(blank=True, db_index=True, default=uuid.uuid7, null=True),
+            field=models.UUIDField(blank=True, db_index=True, default=uuid_compat.uuid7, null=True),
         ),
         migrations.AddConstraint(
             model_name='snapshot',

+ 23 - 0
archivebox/core/migrations/0034_snapshot_current_step.py

@@ -0,0 +1,23 @@
+# Generated by Django 6.0 on 2025-12-28
+# Add Snapshot.current_step field for hook step-based execution
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('core', '0033_rename_extractor_add_hook_name'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='snapshot',
+            name='current_step',
+            field=models.PositiveSmallIntegerField(
+                default=0,
+                db_index=True,
+                help_text='Current hook step being executed (0-9). Used for sequential hook execution.'
+            ),
+        ),
+    ]

+ 116 - 37
archivebox/core/models.py

@@ -334,6 +334,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
     downloaded_at = models.DateTimeField(default=None, null=True, editable=False, db_index=True, blank=True)
     depth = models.PositiveSmallIntegerField(default=0, db_index=True)  # 0 for root snapshot, 1+ for discovered URLs
     fs_version = models.CharField(max_length=10, default='0.9.0', help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). Used to trigger lazy migration on save().')
+    current_step = models.PositiveSmallIntegerField(default=0, db_index=True, help_text='Current hook step being executed (0-9). Used for sequential hook execution.')
 
     retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)
     status = ModelWithStateMachine.StatusField(choices=ModelWithStateMachine.StatusChoices, default=ModelWithStateMachine.StatusChoices.QUEUED)
@@ -1243,23 +1244,33 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
 
     def create_pending_archiveresults(self) -> list['ArchiveResult']:
         """
-        Create ArchiveResult records for all enabled plugins.
+        Create ArchiveResult records for all enabled hooks.
 
-        Uses the hooks system to discover available plugins from:
+        Uses the hooks system to discover available hooks from:
         - archivebox/plugins/*/on_Snapshot__*.{py,sh,js}
         - data/plugins/*/on_Snapshot__*.{py,sh,js}
+
+        Creates one ArchiveResult per hook (not per plugin), with hook_name set.
+        This enables step-based execution where all hooks in a step can run in parallel.
         """
-        from archivebox.hooks import get_enabled_plugins
+        from archivebox.hooks import discover_hooks
 
-        plugins = get_enabled_plugins()
+        hooks = discover_hooks('Snapshot')
         archiveresults = []
 
-        for plugin in plugins:
-            if ArchiveResult.objects.filter(snapshot=self, plugin=plugin).exists():
+        for hook_path in hooks:
+            hook_name = hook_path.name  # e.g., 'on_Snapshot__50_wget.py'
+            plugin = hook_path.parent.name  # e.g., 'wget'
+
+            # Check if AR already exists for this specific hook
+            if ArchiveResult.objects.filter(snapshot=self, hook_name=hook_name).exists():
                 continue
-            archiveresult, _ = ArchiveResult.objects.get_or_create(
-                snapshot=self, plugin=plugin,
+
+            archiveresult, created = ArchiveResult.objects.get_or_create(
+                snapshot=self,
+                hook_name=hook_name,
                 defaults={
+                    'plugin': plugin,
                     'status': ArchiveResult.INITIAL_STATE,
                     'retry_at': timezone.now(),
                     'created_by_id': self.created_by_id,
@@ -1267,8 +1278,57 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
             )
             if archiveresult.status == ArchiveResult.INITIAL_STATE:
                 archiveresults.append(archiveresult)
+
         return archiveresults
 
+    def advance_step_if_ready(self) -> bool:
+        """
+        Advance current_step if all foreground hooks in current step are finished.
+
+        Called by the state machine to check if step can advance.
+        Background hooks (.bg) don't block step advancement.
+
+        Step advancement rules:
+        - All foreground ARs in current step must be finished (SUCCEEDED/FAILED/SKIPPED)
+        - Background ARs (hook_name contains '.bg.') are ignored for advancement
+        - When ready, increments current_step by 1 (up to 9)
+
+        Returns:
+            True if step was advanced, False if not ready or already at step 9.
+        """
+        from archivebox.hooks import extract_step, is_background_hook
+
+        if self.current_step >= 9:
+            return False  # Already at final step
+
+        # Get all ARs for current step that are foreground
+        current_step_ars = self.archiveresult_set.filter(
+            hook_name__isnull=False
+        ).exclude(hook_name='')
+
+        # Check each AR in current step
+        for ar in current_step_ars:
+            ar_step = extract_step(ar.hook_name)
+            if ar_step != self.current_step:
+                continue  # Not in current step
+
+            if is_background_hook(ar.hook_name):
+                continue  # Background hooks don't block
+
+            # Foreground hook in current step - check if finished
+            if ar.status not in ArchiveResult.FINAL_OR_ACTIVE_STATES:
+                # Still pending/queued - can't advance
+                return False
+
+            if ar.status == ArchiveResult.StatusChoices.STARTED:
+                # Still running - can't advance
+                return False
+
+        # All foreground hooks in current step are finished - advance!
+        self.current_step += 1
+        self.save(update_fields=['current_step', 'modified_at'])
+        return True
+
     def retry_failed_archiveresults(self, retry_at: Optional['timezone.datetime'] = None) -> int:
         """
         Reset failed/skipped ArchiveResults to queued for retry.
@@ -1301,11 +1361,12 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
             end_ts=None,
         )
 
-        # Also reset the snapshot so it gets re-checked
+        # Also reset the snapshot and current_step so it gets re-checked from the beginning
         if count > 0:
             self.status = self.StatusChoices.STARTED
             self.retry_at = retry_at
-            self.save(update_fields=['status', 'retry_at', 'modified_at'])
+            self.current_step = 0  # Reset to step 0 for retry
+            self.save(update_fields=['status', 'retry_at', 'current_step', 'modified_at'])
 
         return count
 
@@ -1841,45 +1902,63 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
 
     def run(self):
         """
-        Execute this ArchiveResult's plugin and update status.
+        Execute this ArchiveResult's hook and update status.
 
-        Discovers and runs the hook script for self.plugin,
-        updates status/output fields, queues discovered URLs, and triggers indexing.
+        If self.hook_name is set, runs only that specific hook.
+        If self.hook_name is empty, discovers and runs all hooks for self.plugin (backwards compat).
+
+        Updates status/output fields, queues discovered URLs, and triggers indexing.
         """
         from django.utils import timezone
-        from archivebox.hooks import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR, run_hook
+        from archivebox.hooks import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR, run_hook, is_background_hook
 
         config_objects = [self.snapshot.crawl, self.snapshot] if self.snapshot.crawl else [self.snapshot]
 
-        # Find ALL hooks for this plugin
-        # plugin = plugin name (e.g., 'chrome')
-        # Each plugin can have multiple hooks that run in sequence
+        # Determine which hook(s) to run
         hooks = []
-        for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
-            if not base_dir.exists():
-                continue
-            plugin_dir = base_dir / self.plugin
-            if plugin_dir.exists():
-                matches = list(plugin_dir.glob('on_Snapshot__*.*'))
-                if matches:
-                    # Sort by name for deterministic order (numeric prefix controls execution order)
-                    hooks.extend(sorted(matches))
+
+        if self.hook_name:
+            # SPECIFIC HOOK MODE: Find the specific hook by name
+            for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
+                if not base_dir.exists():
+                    continue
+                plugin_dir = base_dir / self.plugin
+                if plugin_dir.exists():
+                    hook_path = plugin_dir / self.hook_name
+                    if hook_path.exists():
+                        hooks.append(hook_path)
+                        break
+        else:
+            # LEGACY MODE: Discover all hooks for this plugin (backwards compatibility)
+            for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
+                if not base_dir.exists():
+                    continue
+                plugin_dir = base_dir / self.plugin
+                if plugin_dir.exists():
+                    matches = list(plugin_dir.glob('on_Snapshot__*.*'))
+                    if matches:
+                        hooks.extend(sorted(matches))
 
         if not hooks:
             self.status = self.StatusChoices.FAILED
-            self.output_str = f'No hooks found for plugin: {self.plugin}'
+            if self.hook_name:
+                self.output_str = f'Hook not found: {self.plugin}/{self.hook_name}'
+            else:
+                self.output_str = f'No hooks found for plugin: {self.plugin}'
             self.retry_at = None
             self.save()
             return
 
-        # plugin field contains plugin name
+        # Output directory is plugin_dir for the hook output
         plugin_dir = Path(self.snapshot.output_dir) / self.plugin
 
-        # Run ALL hooks in the plugin sequentially
         start_ts = timezone.now()
-        has_background_hook = False
+        is_bg_hook = False
 
         for hook in hooks:
+            # Check if this is a background hook
+            is_bg_hook = is_background_hook(hook.name)
+
             result = run_hook(
                 hook,
                 output_dir=plugin_dir,
@@ -1890,20 +1969,21 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
                 depth=self.snapshot.depth,
             )
 
-            # If any hook is background, mark this ArchiveResult as started
+            # Background hooks return None
             if result is None:
-                has_background_hook = True
+                is_bg_hook = True
 
         # Update status based on hook execution
-        if has_background_hook:
-            # BACKGROUND HOOK(S) - still running, return immediately
+        if is_bg_hook:
+            # BACKGROUND HOOK - still running, return immediately
+            # Status stays STARTED, will be finalized by Snapshot.cleanup()
             self.status = self.StatusChoices.STARTED
             self.start_ts = start_ts
             self.pwd = str(plugin_dir)
             self.save()
             return
 
-        # ALL FOREGROUND HOOKS - completed, update from filesystem
+        # FOREGROUND HOOK - completed, update from filesystem
         self.start_ts = start_ts
         self.pwd = str(plugin_dir)
         self.update_from_output()
@@ -1911,11 +1991,10 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
         # Clean up empty output directory if no files were created
         if plugin_dir.exists() and not self.output_files:
             try:
-                # Only remove if directory is completely empty
                 if not any(plugin_dir.iterdir()):
                     plugin_dir.rmdir()
             except (OSError, RuntimeError):
-                pass  # Directory not empty or can't be removed, that's fine
+                pass
 
     def update_from_output(self):
         """

+ 5 - 0
archivebox/core/statemachines.py

@@ -60,6 +60,11 @@ class SnapshotMachine(StateMachine, strict_states=True):
         if not self.snapshot.archiveresult_set.exists():
             return False
 
+        # Try to advance step if ready (handles step-based hook execution)
+        # This will increment current_step when all foreground hooks in current step are done
+        while self.snapshot.advance_step_if_ready():
+            pass  # Keep advancing until we can't anymore
+
         # if archiveresults exist but are still pending, it's not finished
         if self.snapshot.pending_archiveresults().exists():
             return False
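The `while self.snapshot.advance_step_if_ready(): pass` loop keeps advancing through any step whose foreground hooks are all finished, including steps that have no hooks at all. A simplified model of that rule, where tuples of `(step, is_background, status)` stand in for ArchiveResult rows and the status names are illustrative rather than the model's exact state constants:

```python
FINISHED = {'succeeded', 'failed', 'skipped'}

def can_advance(current_step, archive_results):
    """True if every foreground result in the current step has finished.

    Background results never block; a step with no results trivially passes.
    """
    return all(
        status in FINISHED
        for step, is_bg, status in archive_results
        if step == current_step and not is_bg
    )

def advance(current_step, archive_results):
    """Mirror of the state machine's loop: keep advancing until blocked (cap at 9)."""
    while current_step < 9 and can_advance(current_step, archive_results):
        current_step += 1
    return current_step

results = [
    (5, False, 'succeeded'),   # foreground, done
    (6, False, 'queued'),      # foreground, blocks step 6
    (6, True,  'started'),     # background, ignored
]
assert advance(5, results) == 6   # step 5 done, step 6 blocked by the queued hook
```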

+ 73 - 2
archivebox/hooks.py

@@ -15,10 +15,21 @@ Hook contract:
     Exit:   0 = success, non-zero = failure
 
 Execution order:
-    - Extractors run sequentially within each Snapshot (ordered by numeric prefix)
-    - Multiple Snapshots can process in parallel
+    - Hooks are numbered 00-99 with first digit determining step (0-9)
+    - All hooks in a step can run in parallel
+    - Steps execute sequentially (step 0 → step 1 → ... → step 9)
+    - Background hooks (.bg suffix) don't block step advancement
     - Failed extractors don't block subsequent extractors
 
+Hook Naming Convention:
+    on_{ModelName}__{run_order}_{description}[.bg].{ext}
+
+    Examples:
+        on_Snapshot__00_setup.py         # Step 0, runs first
+        on_Snapshot__20_chrome_tab.bg.js # Step 2, background (doesn't block)
+        on_Snapshot__50_screenshot.js    # Step 5, foreground (blocks step)
+        on_Snapshot__63_media.bg.py      # Step 6, background (long-running)
+
 Dependency handling:
     Extractor plugins that depend on other plugins' output should check at runtime:
 
@@ -39,11 +50,14 @@ API (all hook logic lives here):
     discover_hooks(event)     -> List[Path]     Find hook scripts
     run_hook(script, ...)     -> HookResult     Execute a hook script
     run_hooks(event, ...)     -> List[HookResult]  Run all hooks for an event
+    extract_step(hook_name)   -> int            Get step number (0-9) from hook name
+    is_background_hook(name)  -> bool           Check if hook is background (.bg suffix)
 """
 
 __package__ = 'archivebox'
 
 import os
+import re
 import json
 import signal
 import time
@@ -60,6 +74,63 @@ BUILTIN_PLUGINS_DIR = Path(__file__).parent / 'plugins'
 USER_PLUGINS_DIR = Path(getattr(settings, 'DATA_DIR', Path.cwd())) / 'plugins'
 
 
+# =============================================================================
+# Hook Step Extraction
+# =============================================================================
+
+def extract_step(hook_name: str) -> int:
+    """
+    Extract step number (0-9) from hook name.
+
+    Hooks are numbered 00-99 with the first digit determining the step.
+    Pattern: on_{Model}__{XX}_{description}[.bg].{ext}
+
+    Args:
+        hook_name: Hook filename (e.g., 'on_Snapshot__50_wget.py')
+
+    Returns:
+        Step number 0-9, or 9 (default) for unnumbered hooks.
+
+    Examples:
+        extract_step('on_Snapshot__05_chrome.py') -> 0
+        extract_step('on_Snapshot__50_wget.py') -> 5
+        extract_step('on_Snapshot__63_media.bg.py') -> 6
+        extract_step('on_Snapshot__99_cleanup.sh') -> 9
+        extract_step('on_Snapshot__unnumbered.py') -> 9 (default)
+    """
+    # Pattern matches __XX_ where XX is two digits
+    match = re.search(r'__(\d{2})_', hook_name)
+    if match:
+        two_digit = int(match.group(1))
+        step = two_digit // 10  # First digit is the step (0-9)
+        return step
+
+    # Log warning for unnumbered hooks and default to step 9
+    import sys
+    print(f"Warning: Hook '{hook_name}' has no step number (expected __XX_), defaulting to step 9", file=sys.stderr)
+    return 9
+
+
+def is_background_hook(hook_name: str) -> bool:
+    """
+    Check if a hook is a background hook (doesn't block step advancement).
+
+    Background hooks have '.bg.' in their filename before the extension.
+
+    Args:
+        hook_name: Hook filename (e.g., 'on_Snapshot__20_chrome_tab.bg.js')
+
+    Returns:
+        True if background hook, False if foreground.
+
+    Examples:
+        is_background_hook('on_Snapshot__20_chrome_tab.bg.js') -> True
+        is_background_hook('on_Snapshot__50_wget.py') -> False
+        is_background_hook('on_Snapshot__63_media.bg.py') -> True
+    """
+    return '.bg.' in hook_name or '__background' in hook_name
+
+
 class HookResult(TypedDict, total=False):
     """Raw result from run_hook()."""
     returncode: int
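The two new helpers can be exercised standalone; this sketch mirrors the regex and `.bg.` check from the diff above and asserts the behaviors their docstrings document:

```python
import re

def extract_step(hook_name: str) -> int:
    # First digit of the two-digit __XX_ prefix; unnumbered hooks default to step 9.
    match = re.search(r'__(\d{2})_', hook_name)
    return int(match.group(1)) // 10 if match else 9

def is_background_hook(hook_name: str) -> bool:
    # Background hooks carry '.bg.' before the extension and never block a step.
    return '.bg.' in hook_name or '__background' in hook_name

assert extract_step('on_Snapshot__05_chrome.py') == 0
assert extract_step('on_Snapshot__63_media.bg.py') == 6
assert extract_step('on_Snapshot__unnumbered.py') == 9
assert is_background_hook('on_Snapshot__63_media.bg.py')
assert not is_background_hook('on_Snapshot__50_singlefile.py')
```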

+ 5 - 5
archivebox/machine/migrations/0003_alter_dependency_id_alter_installedbinary_dependency_and_more.py

@@ -1,7 +1,7 @@
 # Generated by Django 6.0 on 2025-12-28 05:12
 
 import django.db.models.deletion
-import uuid
+from archivebox import uuid_compat
 from django.db import migrations, models
 
 
@@ -15,7 +15,7 @@ class Migration(migrations.Migration):
         migrations.AlterField(
             model_name='dependency',
             name='id',
-            field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
+            field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
         ),
         migrations.AlterField(
             model_name='binary',
@@ -25,7 +25,7 @@ class Migration(migrations.Migration):
         migrations.AlterField(
             model_name='binary',
             name='id',
-            field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
+            field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
         ),
         migrations.AlterField(
             model_name='machine',
@@ -35,11 +35,11 @@ class Migration(migrations.Migration):
         migrations.AlterField(
             model_name='machine',
             name='id',
-            field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
+            field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
         ),
         migrations.AlterField(
             model_name='networkinterface',
             name='id',
-            field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
+            field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
         ),
     ]

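This migration swaps `uuid.uuid7` for `archivebox.uuid_compat.uuid7`. The `uuid_compat` module itself is not part of this diff; presumably it backfills `uuid7` on interpreters older than Python 3.14 (where `uuid.uuid7` first landed). A hypothetical sketch of such a shim, assuming it is a thin RFC 9562 fallback:

```python
# Hypothetical sketch of a uuid_compat module (the real one is not in this diff).
import os
import time
import uuid

try:
    from uuid import uuid7  # Python 3.14+
except ImportError:
    def uuid7() -> uuid.UUID:
        """Fallback: 48-bit ms timestamp + version/variant bits + 74 random bits."""
        timestamp_ms = time.time_ns() // 1_000_000
        rand = int.from_bytes(os.urandom(10), 'big')  # 80 random bits
        value = (timestamp_ms & ((1 << 48) - 1)) << 80 | rand
        # Force version 7 (bits 76-79) and the RFC 4122 variant (bits 62-63)
        value &= ~(0xF << 76)
        value |= 0x7 << 76
        value &= ~(0x3 << 62)
        value |= 0x2 << 62
        return uuid.UUID(int=value)
```

Because the timestamp occupies the high bits, these UUIDs sort roughly by creation time, which keeps primary-key inserts append-heavy.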
+ 0 - 0
archivebox/plugins/dom/on_Snapshot__36_dom.js → archivebox/plugins/dom/on_Snapshot__53_dom.js


+ 0 - 0
archivebox/plugins/forumdl/on_Snapshot__53_forumdl.py → archivebox/plugins/forumdl/on_Snapshot__65_forumdl.bg.py


+ 0 - 0
archivebox/plugins/gallerydl/on_Snapshot__52_gallerydl.py → archivebox/plugins/gallerydl/on_Snapshot__64_gallerydl.bg.py


+ 0 - 0
archivebox/plugins/git/on_Snapshot__12_git.py → archivebox/plugins/git/on_Snapshot__62_git.py


+ 0 - 0
archivebox/plugins/headers/on_Snapshot__33_headers.js → archivebox/plugins/headers/on_Snapshot__55_headers.js


+ 0 - 0
archivebox/plugins/htmltotext/on_Snapshot__54_htmltotext.py → archivebox/plugins/htmltotext/on_Snapshot__57_htmltotext.py


+ 0 - 0
archivebox/plugins/media/on_Snapshot__51_media.py → archivebox/plugins/media/on_Snapshot__63_media.bg.py


+ 0 - 0
archivebox/plugins/mercury/on_Snapshot__53_mercury.py → archivebox/plugins/mercury/on_Snapshot__56_mercury.py


+ 0 - 0
archivebox/plugins/papersdl/on_Snapshot__54_papersdl.py → archivebox/plugins/papersdl/on_Snapshot__66_papersdl.bg.py


+ 0 - 0
archivebox/plugins/parse_dom_outlinks/on_Snapshot__40_parse_dom_outlinks.js → archivebox/plugins/parse_dom_outlinks/on_Snapshot__75_parse_dom_outlinks.js


+ 0 - 0
archivebox/plugins/parse_html_urls/on_Snapshot__60_parse_html_urls.py → archivebox/plugins/parse_html_urls/on_Snapshot__70_parse_html_urls.py


+ 0 - 0
archivebox/plugins/parse_jsonl_urls/on_Snapshot__64_parse_jsonl_urls.py → archivebox/plugins/parse_jsonl_urls/on_Snapshot__74_parse_jsonl_urls.py


+ 0 - 0
archivebox/plugins/parse_netscape_urls/on_Snapshot__63_parse_netscape_urls.py → archivebox/plugins/parse_netscape_urls/on_Snapshot__73_parse_netscape_urls.py


+ 0 - 0
archivebox/plugins/parse_rss_urls/on_Snapshot__61_parse_rss_urls.py → archivebox/plugins/parse_rss_urls/on_Snapshot__72_parse_rss_urls.py


+ 0 - 0
archivebox/plugins/parse_txt_urls/on_Snapshot__62_parse_txt_urls.py → archivebox/plugins/parse_txt_urls/on_Snapshot__71_parse_txt_urls.py


+ 0 - 0
archivebox/plugins/pdf/on_Snapshot__35_pdf.js → archivebox/plugins/pdf/on_Snapshot__52_pdf.js


+ 0 - 0
archivebox/plugins/readability/on_Snapshot__52_readability.py → archivebox/plugins/readability/on_Snapshot__55_readability.py


+ 0 - 0
archivebox/plugins/screenshot/on_Snapshot__34_screenshot.js → archivebox/plugins/screenshot/on_Snapshot__51_screenshot.js


+ 0 - 0
archivebox/plugins/singlefile/on_Snapshot__37_singlefile.py → archivebox/plugins/singlefile/on_Snapshot__50_singlefile.py


+ 0 - 0
archivebox/plugins/title/on_Snapshot__32_title.js → archivebox/plugins/title/on_Snapshot__54_title.js


+ 0 - 0
archivebox/plugins/wget/on_Snapshot__50_wget.py → archivebox/plugins/wget/on_Snapshot__61_wget.py


+ 28 - 4
archivebox/workers/worker.py

@@ -352,18 +352,42 @@ class ArchiveResultWorker(Worker):
         return ArchiveResult
 
     def get_queue(self) -> QuerySet:
-        """Get queue of ArchiveResults ready for processing."""
+        """
+        Get queue of ArchiveResults ready for processing.
+
+        Uses step-based filtering: only claims ARs where hook step <= snapshot.current_step.
+        This ensures hooks execute in order (step 0 → 1 → 2 ... → 9).
+        """
         from core.models import ArchiveResult
+        from archivebox.hooks import extract_step
 
         qs = super().get_queue()
 
         if self.plugin:
             qs = qs.filter(plugin=self.plugin)
 
-        # Note: Removed blocking logic since plugins have separate output directories
-        # and don't interfere with each other. Each plugin runs independently.
+        # Step-based filtering: only process ARs whose step <= snapshot.current_step.
+        # The step is derived from hook_name, so we filter in Python after the initial
+        # query; the base query already narrows by retry_at and status, so at most 50
+        # rows are hydrated here.
+
+        # Get candidate ARs; select_related avoids an N+1 query on ar.snapshot below
+        candidates = list(qs.select_related('snapshot')[:50])
+        ready_pks = []
+
+        for ar in candidates:
+            if not ar.hook_name:
+                # Legacy ARs without hook_name - process them
+                ready_pks.append(ar.pk)
+                continue
+
+            ar_step = extract_step(ar.hook_name)
+            snapshot_step = ar.snapshot.current_step
+
+            if ar_step <= snapshot_step:
+                ready_pks.append(ar.pk)
 
-        return qs
+        # Return filtered queryset ordered by hook_name (so earlier hooks run first within a step)
+        return ArchiveResult.objects.filter(pk__in=ready_pks).order_by('hook_name', 'retry_at')
 
     def process_item(self, obj) -> bool:
         """Process an ArchiveResult by running its plugin."""