
continue renaming extractor to plugin, add plan for hook concurrency, add chrome kill helper script

Nick Sweeting 1 month ago
parent commit 4ccb0863bb
53 changed files with 459 additions and 496 deletions
  1. CLAUDE.md (+5 -0)
  2. TODO_hook_concurrency.md (+88 -54)
  3. archivebox/core/admin_archiveresults.py (+2 -2)
  4. archivebox/core/admin_snapshots.py (+3 -3)
  5. archivebox/core/models.py (+2 -2)
  6. archivebox/core/statemachines.py (+6 -6)
  7. archivebox/core/views.py (+21 -21)
  8. archivebox/crawls/models.py (+1 -1)
  9. archivebox/misc/logging_util.py (+8 -8)
  10. archivebox/misc/process_utils.py (+34 -214)
  11. archivebox/misc/shell_welcome_message.py (+1 -1)
  12. archivebox/plugins/accessibility/on_Snapshot__39_accessibility.js (+9 -5)
  13. archivebox/plugins/archive_org/on_Snapshot__13_archive_org.py (+1 -1)
  14. archivebox/plugins/chrome/on_Crawl__20_chrome_launch.bg.js (+13 -95)
  15. archivebox/plugins/chrome/on_Snapshot__20_chrome_tab.bg.js (+1 -1)
  16. archivebox/plugins/chrome/on_Snapshot__30_chrome_navigate.js (+1 -1)
  17. archivebox/plugins/consolelog/on_Snapshot__21_consolelog.bg.js (+1 -1)
  18. archivebox/plugins/dom/on_Snapshot__36_dom.js (+9 -5)
  19. archivebox/plugins/favicon/on_Snapshot__11_favicon.py (+1 -1)
  20. archivebox/plugins/forumdl/on_Snapshot__53_forumdl.py (+1 -1)
  21. archivebox/plugins/gallerydl/on_Snapshot__52_gallerydl.py (+1 -1)
  22. archivebox/plugins/git/on_Snapshot__12_git.py (+1 -1)
  23. archivebox/plugins/headers/on_Snapshot__33_headers.js (+1 -1)
  24. archivebox/plugins/htmltotext/on_Snapshot__54_htmltotext.py (+1 -1)
  25. archivebox/plugins/media/on_Snapshot__51_media.py (+1 -1)
  26. archivebox/plugins/mercury/on_Snapshot__53_mercury.py (+1 -1)
  27. archivebox/plugins/papersdl/on_Snapshot__54_papersdl.py (+1 -1)
  28. archivebox/plugins/parse_dom_outlinks/on_Snapshot__40_parse_dom_outlinks.js (+10 -6)
  29. archivebox/plugins/parse_html_urls/on_Snapshot__60_parse_html_urls.py (+2 -2)
  30. archivebox/plugins/parse_html_urls/tests/test_parse_html_urls.py (+1 -1)
  31. archivebox/plugins/parse_jsonl_urls/on_Snapshot__64_parse_jsonl_urls.py (+2 -2)
  32. archivebox/plugins/parse_jsonl_urls/tests/test_parse_jsonl_urls.py (+1 -1)
  33. archivebox/plugins/parse_netscape_urls/on_Snapshot__63_parse_netscape_urls.py (+2 -2)
  34. archivebox/plugins/parse_rss_urls/on_Snapshot__61_parse_rss_urls.py (+2 -2)
  35. archivebox/plugins/parse_rss_urls/tests/test_parse_rss_urls_comprehensive.py (+1 -1)
  36. archivebox/plugins/parse_txt_urls/on_Snapshot__62_parse_txt_urls.py (+2 -2)
  37. archivebox/plugins/parse_txt_urls/tests/test_parse_txt_urls.py (+1 -1)
  38. archivebox/plugins/pdf/on_Snapshot__35_pdf.js (+9 -5)
  39. archivebox/plugins/plugin_utils.py (+6 -6)
  40. archivebox/plugins/readability/on_Snapshot__52_readability.py (+1 -1)
  41. archivebox/plugins/redirects/on_Snapshot__31_redirects.bg.js (+2 -2)
  42. archivebox/plugins/responses/on_Snapshot__24_responses.bg.js (+1 -1)
  43. archivebox/plugins/screenshot/on_Snapshot__34_screenshot.js (+9 -5)
  44. archivebox/plugins/search_backend_sonic/on_Snapshot__91_index_sonic.py (+5 -5)
  45. archivebox/plugins/search_backend_sqlite/on_Snapshot__90_index_sqlite.py (+5 -5)
  46. archivebox/plugins/seo/on_Snapshot__38_seo.js (+9 -5)
  47. archivebox/plugins/singlefile/on_Snapshot__37_singlefile.py (+1 -1)
  48. archivebox/plugins/ssl/on_Snapshot__23_ssl.bg.js (+1 -1)
  49. archivebox/plugins/staticfile/on_Snapshot__31_staticfile.bg.js (+6 -6)
  50. archivebox/plugins/title/on_Snapshot__32_title.js (+1 -1)
  51. archivebox/plugins/wget/on_Snapshot__50_wget.py (+1 -1)
  52. bin/kill_chrome.sh (+156 -0)
  53. tests/test_recursive_crawl.py (+7 -1)

+ 5 - 0
CLAUDE.md

@@ -212,3 +212,8 @@ sqlite3 /path/to/index.sqlite3 "PRAGMA table_info(core_snapshot);"
 ```bash
 sudo -u testuser bash -c 'source .venv/bin/activate && python -m pytest archivebox/tests/test_migrations_08_to_09.py -xvs 2>&1 | head -200'
 ```
+
+### Kill Zombie Chrome Processes
+```bash
+./bin/kill_chrome.sh
+```
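
The script body itself is not part of the hunks shown here; as a rough illustration only, a Python equivalent of the zombie sweep it is meant to perform (mirroring the `killZombieChrome()` logic later in this commit; the glob path is an assumption) could look like:

```python
import os
import signal
from pathlib import Path

# Hypothetical layout: each crawl has a chrome/ output dir containing a *.pid file
for pid_file in Path('.').glob('**/chrome/*.pid'):
    try:
        pid = int(pid_file.read_text().strip())
        os.kill(pid, 0)  # raises ProcessLookupError if the process is already gone
        try:
            os.killpg(pid, signal.SIGKILL)  # kill the whole Chrome process group
        except ProcessLookupError:
            os.kill(pid, signal.SIGKILL)  # not a group leader: kill just the PID
        print(f'[+] Killed zombie Chrome (PID {pid})')
    except (ValueError, ProcessLookupError, PermissionError):
        pass  # stale, malformed, or not ours
    pid_file.unlink(missing_ok=True)  # drop the PID file either way
```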

+ 88 - 54
TODO_hook_concurrency.md

@@ -6,6 +6,48 @@ Snapshot.run() should enforce that snapshot hooks are run in **10 discrete, sequ
 
 For every discovered hook script, ArchiveBox should create an ArchiveResult in `queued` state, then manage running them using `retry_at` and inline logic to enforce this ordering.
 
+## Design Decisions
+
+### ArchiveResult Schema
+- Add `ArchiveResult.hook_name` (CharField, nullable) - just filename, e.g., `'on_Snapshot__20_chrome_tab.bg.js'`
+- Keep `ArchiveResult.plugin` - still important (plugin directory name)
+- Step number derived on-the-fly from `hook_name` via `extract_step(hook_name)` - not stored
+
+### Snapshot Schema
+- Add `Snapshot.current_step` (IntegerField 0-9, default=0)
+- Integrate with `SnapshotMachine` state transitions for step advancement
+
+### Hook Discovery & Execution
+- `Snapshot.run()` discovers all hooks upfront, creates one AR per hook with `hook_name` set
+- All ARs for a given step can be claimed and executed in parallel by workers
+- Workers claim ARs where `extract_step(ar.hook_name) <= snapshot.current_step`
+- `Snapshot.advance_step_if_ready()` increments `current_step` when:
+  - All **foreground** hooks in current step are finished (SUCCEEDED/FAILED/SKIPPED)
+  - Background hooks don't block advancement (they continue running)
+  - Called from `SnapshotMachine` state transitions
+
+### ArchiveResult.run() Behavior
+- If `self.hook_name` is set: run that single hook
+- If `self.hook_name` is None: discover all hooks for `self.plugin` and run sequentially
+- Background hooks detected by `.bg.` in filename (e.g., `on_Snapshot__20_chrome_tab.bg.js`)
+- Background hooks return immediately (ArchiveResult stays in STARTED state)
+- Foreground hooks wait for completion, update status from JSONL output
+
+### Hook Execution Flow
+1. **Within a step**: Workers claim all ARs for current step in parallel
+2. **Foreground hooks** (no .bg): ArchiveResult waits for completion, transitions to SUCCEEDED/FAILED/SKIPPED
+3. **Background hooks** (.bg): ArchiveResult transitions to STARTED, hook continues running
+4. **Step advancement**: `Snapshot.advance_step_if_ready()` checks:
+   - Are all foreground ARs in current step finished? (SUCCEEDED/FAILED/SKIPPED)
+   - Ignore ARs still in STARTED (background hooks)
+   - If yes, increment `current_step`
+5. **Snapshot sealing**: When `current_step=9` and all foreground hooks done, kill background hooks via `Snapshot.cleanup()`
+
+### Unnumbered Hooks
+- Extract step via `re.search(r'__(\d{2})_', hook_name)`, default to 9 if no match
+- Log warning for unnumbered hooks
+- Purely runtime derivation - no stored field (see the `extract_step()` sketch below)
+
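
For reference, a minimal sketch of the step-derivation helpers described above (the plan puts `extract_step()` in `archivebox/hooks.py`; `is_background()` is a hypothetical companion named here for clarity):

```python
import re
import logging

logger = logging.getLogger(__name__)


def extract_step(hook_name: str | None) -> int:
    """Derive the 0-9 step from a filename like 'on_Snapshot__20_chrome_tab.bg.js'."""
    match = re.search(r'__(\d{2})_', hook_name or '')
    if not match:
        logger.warning('Unnumbered hook %r, defaulting to step 9', hook_name)
        return 9
    return int(match.group(1)[0])  # first digit of the two-digit prefix is the step


def is_background(hook_name: str) -> bool:
    """Background hooks are marked with '.bg.' in the filename."""
    return '.bg.' in hook_name
```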
 ## Hook Numbering Convention
 
 Hook scripts are numbered `00` to `99` to control:
@@ -31,20 +73,19 @@ on_Snapshot__53_media.bg.py
 ## Background (.bg) vs Foreground Scripts
 
 ### Foreground Scripts (no .bg suffix)
-- Run sequentially within their step
-- Block step progression until they exit
-- Should exit naturally when work is complete
+- Launch in parallel with other hooks in their step
+- Step waits for all foreground hooks to complete or timeout
 - Get killed with SIGTERM if they exceed their `PLUGINNAME_TIMEOUT`
+- Step advances when all foreground hooks finish (see the `advance_step_if_ready()` sketch below)
 
 ### Background Scripts (.bg suffix)
-- Spawned and allowed to continue running
-- Do NOT block step progression
-- Run until **their own `PLUGINNAME_TIMEOUT` is reached** (not until step 99)
-- Get polite SIGTERM when timeout expires, then SIGKILL 60s later if not exited
-- Must implement their own concurrency control using filesystem (semaphore files, locks, etc.)
+- Launch in parallel with other hooks in their step
+- Do NOT block step progression - step can advance while they run
+- Continue running across step boundaries until complete or timeout
+- Get killed with SIGTERM when Snapshot transitions to SEALED (via `Snapshot.cleanup()`)
 - Should exit naturally when work is complete (best case)
 
-**Important:** If a .bg script starts at step 05 with `MEDIA_TIMEOUT=3600s`, it gets the full 3600s regardless of when step 99 completes. It runs on its own timeline.
+**Important:** A .bg script started in step 2 can keep running through steps 3, 4, 5... until the Snapshot seals or the hook exits naturally.
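
A sketch of the `Snapshot.advance_step_if_ready()` method this behavior implies, as it might sit on the `Snapshot` model (it reuses the `extract_step()`/`is_background()` helpers sketched earlier; the exact query shape is an assumption):

```python
def advance_step_if_ready(self) -> bool:
    """Advance current_step once every foreground hook in it has finished."""
    finished = (
        ArchiveResult.StatusChoices.SUCCEEDED,
        ArchiveResult.StatusChoices.FAILED,
        ArchiveResult.StatusChoices.SKIPPED,
    )
    for ar in self.archiveresult_set.exclude(hook_name=None):
        if extract_step(ar.hook_name) != self.current_step:
            continue  # only hooks in the current step gate advancement
        if is_background(ar.hook_name):
            continue  # .bg hooks never block the step
        if ar.status not in finished:
            return False  # a foreground hook is still queued/running
    if self.current_step < 9:
        self.current_step += 1
        self.save(update_fields=['current_step'])
        return True
    return False
```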
 
 ## Execution Step Guidelines
 
@@ -268,54 +309,47 @@ archivebox/plugins/{plugin_name}/
 
 ## Implementation Checklist
 
-### Phase 1: Renumber Existing Hooks ✅
-- [ ] Renumber DOM extractors to 50-59 range
-- [ ] Ensure pdf/screenshot are NOT .bg (need sequential access)
-- [ ] Ensure media (ytdlp) IS .bg (can run for hours)
-- [ ] Add step comments to each plugin for clarity
-
-### Phase 2: Timeout Consistency ✅
-- [x] All plugins support `PLUGINNAME_TIMEOUT` env var
-- [x] All plugins fall back to generic `TIMEOUT` env var
-- [x] Background scripts handle SIGTERM gracefully (or exit naturally)
-
-### Phase 3: Refactor Snapshot.run()
-- [ ] Parse hook filenames to extract step number (first digit)
-- [ ] Group hooks by step (0-9)
-- [ ] Run each step sequentially
-- [ ] Within each step:
-  - [ ] Launch foreground hooks sequentially
-  - [ ] Launch .bg hooks and track PIDs
-  - [ ] Wait for foreground hooks to complete before next step
-- [ ] Track .bg script timeouts independently
-- [ ] Send SIGTERM to .bg scripts when their timeout expires
-- [ ] Send SIGKILL 60s after SIGTERM if not exited
-
-### Phase 4: ArchiveResult Management
-- [ ] Create one ArchiveResult per hook (not per plugin)
-- [ ] Set initial state to `queued`
-- [ ] Update state based on JSONL output and exit code
-- [ ] Set `retry_at` for hooks that exit non-zero with no JSONL
-- [ ] Don't retry hooks that emit `{"status": "failed"}`
-
-### Phase 5: JSONL Streaming
-- [ ] Parse stdout JSONL line-by-line during hook execution
-- [ ] Create/update DB rows as JSONL is emitted (streaming mode)
-- [ ] Handle partial JSONL on hook crash
-
-### Phase 6: Zombie Process Management
-- [ ] Read `.pid` files from hook output directories
-- [ ] Sweep zombies on cleanup
-- [ ] Handle double-forked processes correctly
+### Phase 1: Schema Migration ✅
+- [ ] Add `Snapshot.current_step` (IntegerField 0-9, default=0)
+- [ ] Add `ArchiveResult.hook_name` (CharField, nullable) - just filename
+- [ ] Create migration: `0033_snapshot_current_step_archiveresult_hook_name.py`
+
+### Phase 2: Core Logic Updates
+- [ ] Add `extract_step(hook_name)` utility in `archivebox/hooks.py`
+  - Extract first digit from `__XX_` pattern
+  - Default to 9 for unnumbered hooks
+- [ ] Update `Snapshot.create_pending_archiveresults()` in `archivebox/core/models.py`:
+  - Discover all hooks (not plugins)
+  - Create one AR per hook with `hook_name` set
+- [ ] Update `ArchiveResult.run()` in `archivebox/core/models.py`:
+  - If `hook_name` set: run single hook
+  - If `hook_name` None: discover all plugin hooks (existing behavior)
+- [ ] Add `Snapshot.advance_step_if_ready()` method:
+  - Check if all foreground ARs in current step finished
+  - Increment `current_step` if ready
+  - Ignore background hooks (.bg) in completion check
+- [ ] Integrate with `SnapshotMachine.is_finished()` in `archivebox/core/statemachines.py`:
+  - Call `advance_step_if_ready()` before checking if done
+
+### Phase 3: Worker Coordination
+- [ ] Update worker AR claiming query in `archivebox/workers/worker.py`:
+  - Filter: `extract_step(ar.hook_name) <= snapshot.current_step`
+  - Note: May need to denormalize or use clever query since step is derived
+  - Alternative: Claim any AR in QUEUED state, check step in Python before processing (sketched below)
+
+### Phase 4: Hook Renumbering
+- [ ] Renumber hooks per renumbering map below
+- [ ] Add `.bg` suffix to long-running hooks
+- [ ] Test all hooks still work after renumbering
 
 ## Migration Path
 
-### Backward Compatibility
-During migration, support both old and new numbering:
-1. Run hooks numbered 00-99 in step order
-2. Run unnumbered hooks last (step 9) for compatibility
-3. Log warnings for unnumbered hooks
-4. Eventually require all hooks to be numbered
+### Natural Compatibility
+No special migration needed:
+1. Existing ARs with `hook_name=None` continue to work (discover all plugin hooks at runtime)
+2. New ARs get `hook_name` set (single hook per AR)
+3. `ArchiveResult.run()` handles both cases naturally
+4. Unnumbered hooks default to step 9 (log warning)
 
 ### Renumbering Map
 

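A sketch of the Phase 3 alternative noted above (claim any QUEUED AR, then gate on step in Python), assuming the worker re-queues deferred results via the existing `retry_at` mechanism; the 5-second backoff is an arbitrary choice:

```python
from datetime import timedelta

from django.utils import timezone


def is_runnable(ar, snapshot) -> bool:
    """Return True if a claimed ArchiveResult's hook may run at the snapshot's current step."""
    if ar.hook_name and extract_step(ar.hook_name) > snapshot.current_step:
        # Hook belongs to a future step: defer it instead of running it early
        ar.retry_at = timezone.now() + timedelta(seconds=5)
        ar.save(update_fields=['retry_at'])
        return False
    return True  # current/earlier step, or a legacy AR with hook_name=None
```
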
+ 2 - 2
archivebox/core/admin_archiveresults.py

@@ -252,9 +252,9 @@ class ArchiveResultInline(admin.TabularInline):
 
 
 class ArchiveResultAdmin(BaseModelAdmin):
-    list_display = ('id', 'created_by', 'created_at', 'snapshot_info', 'tags_str', 'status', 'extractor_with_icon', 'cmd_str', 'output_str')
+    list_display = ('id', 'created_by', 'created_at', 'snapshot_info', 'tags_str', 'status', 'plugin_with_icon', 'cmd_str', 'output_str')
     sort_fields = ('id', 'created_by', 'created_at', 'plugin', 'status')
-    readonly_fields = ('cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'extractor_with_icon', 'iface')
+    readonly_fields = ('cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'plugin_with_icon', 'iface')
     search_fields = ('id', 'snapshot__url', 'plugin', 'output_str', 'cmd_version', 'cmd', 'snapshot__timestamp')
     autocomplete_fields = ['snapshot']
 

+ 3 - 3
archivebox/core/admin_snapshots.py

@@ -46,9 +46,9 @@ class SnapshotActionForm(ActionForm):
             ),
         )
 
-    # TODO: allow selecting actions for specific extractors? is this useful?
-    # extractor = forms.ChoiceField(
-    #     choices=ArchiveResult.EXTRACTOR_CHOICES,
+    # TODO: allow selecting actions for specific extractor plugins? is this useful?
+    # plugin = forms.ChoiceField(
+    #     choices=ArchiveResult.PLUGIN_CHOICES,
     #     required=False,
    #     widget=forms.MultipleChoiceField(attrs={'class': "form-control"})
     # )

+ 2 - 2
archivebox/core/models.py

@@ -1041,7 +1041,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
         return cache.get_or_set(cache_key, calc_tags_str) if not nocache else calc_tags_str()
 
     def icons(self) -> str:
-        """Generate HTML icons showing which extractors have succeeded for this snapshot"""
+        """Generate HTML icons showing which extractor plugins have succeeded for this snapshot"""
         from django.utils.html import format_html, mark_safe
 
         cache_key = f'result_icons:{self.pk}:{(self.downloaded_at or self.modified_at or self.created_at or self.bookmarked_at).timestamp()}'
@@ -1475,7 +1475,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
                         priority = 50
                 elif 'index' in name_lower:
                     priority = 100
-                elif name_lower.startswith(('output', 'content', extractor_name)):
+                elif name_lower.startswith(('output', 'content', plugin_name)):
                     priority = 50
                 elif ext in ('html', 'htm', 'pdf'):
                     priority = 30

+ 6 - 6
archivebox/core/statemachines.py

@@ -91,7 +91,7 @@ class SnapshotMachine(StateMachine, strict_states=True):
             retry_at=timezone.now() + timedelta(seconds=30),  # if failed, wait 30s before retrying
         )
 
-        # Run the snapshot - creates pending archiveresults for all enabled extractors
+        # Run the snapshot - creates pending archiveresults for all enabled plugins
         self.snapshot.run()
 
         # unlock the snapshot after we're done + set status = started
@@ -179,15 +179,15 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
         return can_start
     
     def is_succeeded(self) -> bool:
-        """Check if extraction succeeded (status was set by run_extractor())."""
+        """Check if extractor plugin succeeded (status was set by run())."""
         return self.archiveresult.status == ArchiveResult.StatusChoices.SUCCEEDED
-    
+
     def is_failed(self) -> bool:
-        """Check if extraction failed (status was set by run_extractor())."""
+        """Check if extractor plugin failed (status was set by run())."""
         return self.archiveresult.status == ArchiveResult.StatusChoices.FAILED
-    
+
     def is_skipped(self) -> bool:
-        """Check if extraction was skipped (status was set by run_extractor())."""
+        """Check if extractor plugin was skipped (status was set by run())."""
         return self.archiveresult.status == ArchiveResult.StatusChoices.SKIPPED
     
     def is_backoff(self) -> bool:

+ 21 - 21
archivebox/core/views.py

@@ -96,8 +96,8 @@ class SnapshotView(View):
             if not key.endswith('_path') or not path or path.startswith('http'):
                 continue
 
-            extractor_name = key.replace('_path', '')
-            if extractor_name in archiveresults:
+            plugin_name = key.replace('_path', '')
+            if plugin_name in archiveresults:
                 continue  # Already have this from ArchiveResult
 
             file_path = snap_dir / path
@@ -107,8 +107,8 @@ class SnapshotView(View):
             try:
                 file_size = file_path.stat().st_size
                 if file_size >= 15_000:  # Only show files > 15KB
-                    archiveresults[extractor_name] = {
-                        'name': extractor_name,
+                    archiveresults[plugin_name] = {
+                        'name': plugin_name,
                         'path': path,
                         'ts': ts_to_date_str(file_path.stat().st_mtime or 0),
                         'size': file_size,
@@ -117,7 +117,7 @@ class SnapshotView(View):
             except OSError:
                 continue
 
-        # Get available extractors from hooks (sorted by numeric prefix for ordering)
+        # Get available extractor plugins from hooks (sorted by numeric prefix for ordering)
         # Convert to base names for display ordering
         all_plugins = [get_extractor_name(e) for e in get_extractors()]
         preferred_types = tuple(all_plugins)
@@ -437,7 +437,7 @@ class AddView(UserPassesTestMixin, FormView):
         parser = form.cleaned_data.get("parser", "auto")  # default to auto-detect parser
         tag = form.cleaned_data["tag"]
         depth = 0 if form.cleaned_data["depth"] == "0" else 1
-        extractors = ','.join(form.cleaned_data["archive_methods"])
+        plugins = ','.join(form.cleaned_data["archive_methods"])
         input_kwargs = {
             "urls": urls,
             "tag": tag,
@@ -447,8 +447,8 @@ class AddView(UserPassesTestMixin, FormView):
             "out_dir": DATA_DIR,
             "created_by_id": self.request.user.pk,
         }
-        if extractors:
-            input_kwargs.update({"extractors": extractors})
+        if plugins:
+            input_kwargs.update({"plugins": plugins})
 
 
         from archivebox.config.permissions import HOSTNAME
@@ -472,7 +472,7 @@ class AddView(UserPassesTestMixin, FormView):
                 # 'INDEX_ONLY': index_only,
                 # 'OVERWRITE': False,
                 'DEPTH': depth,
-                'EXTRACTORS': extractors or '',
+                'PLUGINS': plugins or '',
                 # 'DEFAULT_PERSONA': persona or 'Default',
             }
         )
@@ -580,17 +580,17 @@ def live_progress_view(request):
                 snapshot_results = snapshot.archiveresult_set.all()
 
                 # Count in memory instead of DB queries
-                total_extractors = len(snapshot_results)
-                completed_extractors = sum(1 for ar in snapshot_results if ar.status == ArchiveResult.StatusChoices.SUCCEEDED)
-                failed_extractors = sum(1 for ar in snapshot_results if ar.status == ArchiveResult.StatusChoices.FAILED)
-                pending_extractors = sum(1 for ar in snapshot_results if ar.status == ArchiveResult.StatusChoices.QUEUED)
+                total_plugins = len(snapshot_results)
+                completed_plugins = sum(1 for ar in snapshot_results if ar.status == ArchiveResult.StatusChoices.SUCCEEDED)
+                failed_plugins = sum(1 for ar in snapshot_results if ar.status == ArchiveResult.StatusChoices.FAILED)
+                pending_plugins = sum(1 for ar in snapshot_results if ar.status == ArchiveResult.StatusChoices.QUEUED)
 
                 # Calculate snapshot progress
-                snapshot_progress = int(((completed_extractors + failed_extractors) / total_extractors) * 100) if total_extractors > 0 else 0
+                snapshot_progress = int(((completed_plugins + failed_plugins) / total_plugins) * 100) if total_plugins > 0 else 0
 
-                # Get all extractors for this snapshot (already prefetched, sort in Python)
+                # Get all extractor plugins for this snapshot (already prefetched, sort in Python)
                 # Order: started first, then queued, then completed
-                def extractor_sort_key(ar):
+                def plugin_sort_key(ar):
                     status_order = {
                         ArchiveResult.StatusChoices.STARTED: 0,
                         ArchiveResult.StatusChoices.QUEUED: 1,
@@ -605,7 +605,7 @@ def live_progress_view(request):
                         'plugin': ar.plugin,
                         'status': ar.status,
                     }
-                    for ar in sorted(snapshot_results, key=extractor_sort_key)
+                    for ar in sorted(snapshot_results, key=plugin_sort_key)
                 ]
 
                 active_snapshots_for_crawl.append({
@@ -614,10 +614,10 @@ def live_progress_view(request):
                     'status': snapshot.status,
                     'started': snapshot.modified_at.isoformat() if snapshot.modified_at else None,
                     'progress': snapshot_progress,
-                    'total_extractors': total_extractors,
-                    'completed_extractors': completed_extractors,
-                    'failed_extractors': failed_extractors,
-                    'pending_extractors': pending_extractors,
+                    'total_plugins': total_plugins,
+                    'completed_plugins': completed_plugins,
+                    'failed_plugins': failed_plugins,
+                    'pending_plugins': pending_plugins,
                     'all_plugins': all_plugins,
                 })
 

+ 1 - 1
archivebox/crawls/models.py

@@ -196,7 +196,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
         Add a URL to the crawl queue if not already present.
 
         Args:
-            entry: dict with 'url', optional 'depth', 'title', 'timestamp', 'tags', 'via_snapshot', 'via_extractor'
+            entry: dict with 'url', optional 'depth', 'title', 'timestamp', 'tags', 'via_snapshot', 'plugin'
 
         Returns:
             True if URL was added, False if skipped (duplicate or depth exceeded)
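
For illustration, an outlink record in the shape this docstring now describes, matching what the parse_* hooks in this commit write to `urls.jsonl` (the concrete values are hypothetical):

```python
entry = {
    'type': 'Snapshot',                  # record type, as emitted by parse_* hooks
    'url': 'https://example.com/page2',  # required
    'depth': 1,                          # optional: parent depth + 1
    'plugin': 'parse_dom_outlinks',      # which plugin discovered the URL (was 'via_extractor')
    # also optional: 'title', 'timestamp', 'tags', 'via_snapshot'
}
```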

+ 8 - 8
archivebox/misc/logging_util.py

@@ -522,7 +522,7 @@ def log_worker_event(
     pid: Optional[int] = None,
     worker_id: Optional[str] = None,
     url: Optional[str] = None,
-    extractor: Optional[str] = None,
+    plugin: Optional[str] = None,
     metadata: Optional[Dict[str, Any]] = None,
     error: Optional[Exception] = None,
 ) -> None:
@@ -534,9 +534,9 @@ def log_worker_event(
         event: Event name (Starting, Completed, Failed, etc.)
         indent_level: Indentation level (0=Orchestrator, 1=CrawlWorker, 2=SnapshotWorker, 3=ArchiveResultWorker)
         pid: Process ID
-        worker_id: Worker ID (UUID for CrawlWorker, url for SnapshotWorker, extractor for ArchiveResultWorker)
+        worker_id: Worker ID (UUID for CrawlWorker, url for SnapshotWorker, plugin for ArchiveResultWorker)
         url: URL being processed (for SnapshotWorker/ArchiveResultWorker)
-        extractor: Extractor name (for ArchiveResultWorker)
+        plugin: Plugin name (for ArchiveResultWorker)
         metadata: Dict of metadata to show in curly braces
         error: Exception if event is an error
     """
@@ -544,7 +544,7 @@ def log_worker_event(
 
     from rich.markup import escape
 
-    # Build worker identifier (without URL/extractor)
+    # Build worker identifier (without URL/plugin)
     worker_parts = [worker_type]
     # Don't add pid/worker_id for DB operations (they happen in whatever process is running)
     if pid and worker_type != 'DB':
@@ -556,12 +556,12 @@ def log_worker_event(
     worker_label_base = worker_parts[0]
     worker_bracket_content = ", ".join(worker_parts[1:]) if len(worker_parts) > 1 else None
 
-    # Build URL/extractor display (shown AFTER the label, outside brackets)
+    # Build URL/plugin display (shown AFTER the label, outside brackets)
     url_extractor_parts = []
     if url:
         url_extractor_parts.append(f'url: {escape(url)}')
-    if extractor:
-        url_extractor_parts.append(f'extractor: {escape(extractor)}')
+    if plugin:
+        url_extractor_parts.append(f'extractor: {escape(plugin)}')
 
     url_extractor_str = ' | '.join(url_extractor_parts) if url_extractor_parts else ''
 
@@ -623,7 +623,7 @@ def log_worker_event(
 
     text.append(f' {event}{error_str}', style=color)
 
-    # Add URL/extractor info first (more important)
+    # Add URL/plugin info first (more important)
     if url_extractor_str:
         text.append(f' | {url_extractor_str}')
 

+ 34 - 214
archivebox/misc/process_utils.py

@@ -1,14 +1,9 @@
 """
-Cross-platform process validation utilities using psutil.
+Process validation using psutil and filesystem mtime.
 
-Uses filesystem mtime as a "password" to validate PIDs haven't been reused.
-Since filesystem mtimes can be set arbitrarily, but process start times cannot,
-we can detect PID reuse by comparing:
-  - PID file mtime (set to process start time when we launched it)
-  - Actual process start time (from psutil)
-
-If they match (within tolerance), it's our process.
-If they don't match, the PID was reused by a different process.
+Uses mtime as a "password": PID files are timestamped with process start time.
+Since filesystem mtimes can be set arbitrarily but process start times cannot,
+comparing them detects PID reuse.
 """
 
 __package__ = 'archivebox.misc'
@@ -20,245 +15,70 @@ from typing import Optional
 
 try:
     import psutil
+    PSUTIL_AVAILABLE = True
 except ImportError:
-    psutil = None
-
-
-def get_process_info(pid: int) -> Optional[dict]:
-    """
-    Get process information using psutil.
-
-    Args:
-        pid: Process ID
+    PSUTIL_AVAILABLE = False
 
-    Returns:
-        Dict with 'start_time', 'cmdline', 'name', 'status' or None if not found
-    """
-    if psutil is None:
-        return None
-
-    try:
-        proc = psutil.Process(pid)
-        return {
-            'start_time': proc.create_time(),  # Unix epoch seconds
-            'cmdline': proc.cmdline(),
-            'name': proc.name(),
-            'status': proc.status(),
-        }
-    except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
-        return None
 
-
-def validate_pid_file(
-    pid_file: Path,
-    cmd_file: Optional[Path] = None,
-    tolerance_seconds: float = 5.0
-) -> bool:
-    """
-    Validate PID file using mtime as "password".
-
-    Returns True only if ALL checks pass:
-    1. PID file exists and contains valid integer
-    2. Process with that PID exists
-    3. File mtime matches process start time (within tolerance)
-    4. If cmd_file provided, process cmdline contains expected args
-
-    Args:
-        pid_file: Path to .pid file
-        cmd_file: Optional path to cmd.sh for command validation
-        tolerance_seconds: Allowed difference between mtime and start time
-
-    Returns:
-        True if PID is validated, False if reused/invalid
-    """
-    if psutil is None:
-        # Fallback: just check if process exists (no validation)
-        return _validate_pid_file_without_psutil(pid_file)
-
-    # Check PID file exists
-    if not pid_file.exists():
+def validate_pid_file(pid_file: Path, cmd_file: Optional[Path] = None, tolerance: float = 5.0) -> bool:
+    """Validate PID using mtime and optional cmd.sh. Returns True if process is ours."""
+    if not PSUTIL_AVAILABLE or not pid_file.exists():
         return False
 
-    # Read PID
     try:
         pid = int(pid_file.read_text().strip())
-    except (ValueError, OSError):
-        return False
-
-    # Get process info
-    proc_info = get_process_info(pid)
-    if proc_info is None:
-        return False  # Process doesn't exist
-
-    # Check mtime matches process start time
-    try:
-        file_mtime = pid_file.stat().st_mtime
-    except OSError:
-        return False
-
-    proc_start_time = proc_info['start_time']
-    time_diff = abs(file_mtime - proc_start_time)
-
-    if time_diff > tolerance_seconds:
-        # PID was reused by different process
-        return False
-
-    # Validate command if provided
-    if cmd_file and cmd_file.exists():
-        try:
-            expected_cmd = cmd_file.read_text().strip()
-            actual_cmdline = ' '.join(proc_info['cmdline'])
-
-            # Check for key indicators (chrome, debug port, etc.)
-            # This is a heuristic - just checks if critical args are present
-            if '--remote-debugging-port' in expected_cmd:
-                if '--remote-debugging-port' not in actual_cmdline:
-                    return False
+        proc = psutil.Process(pid)
 
-            if 'chrome' in expected_cmd.lower() or 'chromium' in expected_cmd.lower():
-                proc_name_lower = proc_info['name'].lower()
-                if 'chrome' not in proc_name_lower and 'chromium' not in proc_name_lower:
+        # Check mtime matches process start time
+        if abs(pid_file.stat().st_mtime - proc.create_time()) > tolerance:
+            return False  # PID reused
+
+        # Validate command if provided
+        if cmd_file and cmd_file.exists():
+            cmd = cmd_file.read_text()
+            cmdline = ' '.join(proc.cmdline())
+            if '--remote-debugging-port' in cmd and '--remote-debugging-port' not in cmdline:
+                return False
+            if ('chrome' in cmd.lower() or 'chromium' in cmd.lower()):
+                if 'chrome' not in proc.name().lower() and 'chromium' not in proc.name().lower():
                     return False
 
-        except OSError:
-            pass  # Can't validate command, but other checks passed
-
-    return True
-
-
-def _validate_pid_file_without_psutil(pid_file: Path) -> bool:
-    """
-    Fallback validation when psutil not available.
-    Only checks if process exists, no validation.
-    """
-    if not pid_file.exists():
-        return False
-
-    try:
-        pid = int(pid_file.read_text().strip())
-        os.kill(pid, 0)  # Signal 0 = check existence
         return True
-    except (OSError, ValueError, ProcessLookupError):
+    except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess, ValueError, OSError):
         return False
 
 
 def write_pid_file_with_mtime(pid_file: Path, pid: int, start_time: float):
-    """
-    Write PID file and set mtime to process start time.
-
-    This creates a "password" that can be validated later to ensure
-    the PID hasn't been reused by a different process.
-
-    Args:
-        pid_file: Path to .pid file to create
-        pid: Process ID to write
-        start_time: Process start time as Unix epoch seconds
-    """
+    """Write PID file and set mtime to process start time."""
     pid_file.write_text(str(pid))
-
-    # Set both atime and mtime to process start time
     try:
         os.utime(pid_file, (start_time, start_time))
     except OSError:
-        # If we can't set mtime, file is still written
-        # Validation will be less reliable but won't break
-        pass
+        pass  # mtime optional, validation degrades gracefully
 
 
 def write_cmd_file(cmd_file: Path, cmd: list[str]):
-    """
-    Write command script for validation.
-
-    Args:
-        cmd_file: Path to cmd.sh to create
-        cmd: Command list (e.g., ['chrome', '--remote-debugging-port=9222', ...])
-    """
-    # Shell escape arguments with spaces or special chars
-    def shell_escape(arg: str) -> str:
-        if ' ' in arg or '"' in arg or "'" in arg or '$' in arg:
-            # Escape double quotes and wrap in double quotes
-            return f'"{arg.replace(chr(34), chr(92) + chr(34))}"'
-        return arg
-
-    escaped_cmd = [shell_escape(arg) for arg in cmd]
-    script = '#!/bin/bash\n' + ' '.join(escaped_cmd) + '\n'
+    """Write shell command script."""
+    def escape(arg: str) -> str:
+        return f'"{arg.replace(chr(34), chr(92)+chr(34))}"' if any(c in arg for c in ' "$') else arg
 
+    script = '#!/bin/bash\n' + ' '.join(escape(arg) for arg in cmd) + '\n'
     cmd_file.write_text(script)
     try:
         cmd_file.chmod(0o755)
     except OSError:
-        pass  # Best effort
-
-
-def safe_kill_process(
-    pid_file: Path,
-    cmd_file: Optional[Path] = None,
-    signal_num: int = 15,  # SIGTERM
-    validate: bool = True
-) -> bool:
-    """
-    Safely kill a process with validation.
+        pass
 
-    Args:
-        pid_file: Path to .pid file
-        cmd_file: Optional path to cmd.sh for validation
-        signal_num: Signal to send (default SIGTERM=15)
-        validate: If True, validate process identity before killing
 
-    Returns:
-        True if process was killed, False if not found or validation failed
-    """
-    if not pid_file.exists():
+def safe_kill_process(pid_file: Path, cmd_file: Optional[Path] = None, signal_num: int = 15) -> bool:
+    """Kill process after validation. Returns True if killed."""
+    if not validate_pid_file(pid_file, cmd_file):
+        pid_file.unlink(missing_ok=True)  # Clean stale file
         return False
 
-    # Validate process identity first
-    if validate:
-        if not validate_pid_file(pid_file, cmd_file):
-            # PID reused by different process, don't kill
-            # Clean up stale PID file
-            try:
-                pid_file.unlink()
-            except OSError:
-                pass
-            return False
-
-    # Read PID and kill
     try:
         pid = int(pid_file.read_text().strip())
         os.kill(pid, signal_num)
         return True
     except (OSError, ValueError, ProcessLookupError):
         return False
-
-
-def cleanup_stale_pid_files(directory: Path, cmd_file_name: str = 'cmd.sh') -> int:
-    """
-    Remove stale PID files from directory.
-
-    A PID file is stale if:
-    - Process no longer exists, OR
-    - Process exists but validation fails (PID reused)
-
-    Args:
-        directory: Directory to scan for *.pid files
-        cmd_file_name: Name of command file for validation (default: cmd.sh)
-
-    Returns:
-        Number of stale PID files removed
-    """
-    if not directory.exists():
-        return 0
-
-    removed = 0
-    for pid_file in directory.glob('**/*.pid'):
-        cmd_file = pid_file.parent / cmd_file_name
-
-        # Check if valid
-        if not validate_pid_file(pid_file, cmd_file):
-            try:
-                pid_file.unlink()
-                removed += 1
-            except OSError:
-                pass
-
-    return removed

+ 1 - 1
archivebox/misc/shell_welcome_message.py

@@ -53,5 +53,5 @@ if __name__ == '__main__':
     prnt('    add[blink][deep_sky_blue4]?[/deep_sky_blue4][/blink]                                                                        [grey53]# add ? after anything to get help[/]')
     prnt('    add("https://example.com/some/new/url")                                     [grey53]# call CLI methods from the shell[/]')
     prnt('    snap = Snapshot.objects.filter(url__contains="https://example.com").last()  [grey53]# query for individual snapshots[/]')
-    prnt('    snap.archiveresult_set.all()                                                [grey53]# see extractor results[/]')
+    prnt('    snap.archiveresult_set.all()                                                [grey53]# see extractor plugin results[/]')
     prnt('    bool(re.compile(CONFIG.URL_DENYLIST).search("https://example.com/abc.exe")) [grey53]# test out a config change[/]')

+ 9 - 5
archivebox/plugins/accessibility/on_Snapshot__39_accessibility.js

@@ -20,7 +20,7 @@ const path = require('path');
 const puppeteer = require('puppeteer-core');
 
 // Extractor metadata
-const EXTRACTOR_NAME = 'accessibility';
+const PLUGIN_NAME = 'accessibility';
 const OUTPUT_DIR = '.';
 const OUTPUT_FILE = 'accessibility.json';
 const CHROME_SESSION_DIR = '../chrome';
@@ -223,10 +223,14 @@ async function main() {
             process.exit(0);
         }
 
-        // Wait for page to be fully loaded
-        const pageLoaded = await waitForChromeTabLoaded(60000);
-        if (!pageLoaded) {
-            throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
+        // Check if Chrome session exists, then wait for page load
+        const cdpUrl = getCdpUrl();
+        if (cdpUrl) {
+            // Wait for page to be fully loaded
+            const pageLoaded = await waitForChromeTabLoaded(60000);
+            if (!pageLoaded) {
+                throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
+            }
         }
 
         const result = await extractAccessibility(url);

+ 1 - 1
archivebox/plugins/archive_org/on_Snapshot__13_archive_org.py

@@ -25,7 +25,7 @@ import rich_click as click
 
 
 # Extractor metadata
-EXTRACTOR_NAME = 'archive_org'
+PLUGIN_NAME = 'archive_org'
 OUTPUT_DIR = '.'
 OUTPUT_FILE = 'archive.org.txt'
 

+ 13 - 95
archivebox/plugins/chrome/on_Crawl__20_chrome_launch.bg.js

@@ -26,101 +26,23 @@ const { spawn } = require('child_process');
 const http = require('http');
 
 // Extractor metadata
-const EXTRACTOR_NAME = 'chrome_launch';
+const PLUGIN_NAME = 'chrome_launch';
 const OUTPUT_DIR = 'chrome';
 
-// Helper: Write PID file with mtime set to process start time
+// Helpers for PID file creation
 function writePidWithMtime(filePath, pid, startTimeSeconds) {
     fs.writeFileSync(filePath, String(pid));
-    // Set both atime and mtime to process start time for validation
     const startTimeMs = startTimeSeconds * 1000;
     fs.utimesSync(filePath, new Date(startTimeMs), new Date(startTimeMs));
 }
 
-// Helper: Write command script for validation
 function writeCmdScript(filePath, binary, args) {
-    // Shell escape arguments containing spaces or special characters
-    const escapedArgs = args.map(arg => {
-        if (arg.includes(' ') || arg.includes('"') || arg.includes('$')) {
-            return `"${arg.replace(/"/g, '\\"')}"`;
-        }
-        return arg;
-    });
-    const script = `#!/bin/bash\n${binary} ${escapedArgs.join(' ')}\n`;
-    fs.writeFileSync(filePath, script);
+    const escape = arg => (arg.includes(' ') || arg.includes('"') || arg.includes('$'))
+        ? `"${arg.replace(/"/g, '\\"')}"` : arg;
+    fs.writeFileSync(filePath, `#!/bin/bash\n${binary} ${args.map(escape).join(' ')}\n`);
     fs.chmodSync(filePath, 0o755);
 }
 
-// Helper: Get process start time (cross-platform)
-function getProcessStartTime(pid) {
-    try {
-        const { execSync } = require('child_process');
-        if (process.platform === 'darwin') {
-            // macOS: ps -p PID -o lstart= gives start time
-            const output = execSync(`ps -p ${pid} -o lstart=`, { encoding: 'utf8', timeout: 1000 });
-            return Date.parse(output.trim()) / 1000;  // Convert to epoch seconds
-        } else {
-            // Linux: read /proc/PID/stat field 22 (starttime in clock ticks)
-            const stat = fs.readFileSync(`/proc/${pid}/stat`, 'utf8');
-            const match = stat.match(/\) \w+ (\d+)/);
-            if (match) {
-                const startTicks = parseInt(match[1], 10);
-                // Convert clock ticks to seconds (assuming 100 ticks/sec)
-                const uptimeSeconds = parseFloat(fs.readFileSync('/proc/uptime', 'utf8').split(' ')[0]);
-                const bootTime = Date.now() / 1000 - uptimeSeconds;
-                return bootTime + (startTicks / 100);
-            }
-        }
-    } catch (e) {
-        // Can't get start time
-        return null;
-    }
-    return null;
-}
-
-// Helper: Validate PID using mtime and command
-function validatePid(pid, pidFile, cmdFile) {
-    try {
-        // Check process exists
-        try {
-            process.kill(pid, 0);  // Signal 0 = check existence
-        } catch (e) {
-            return false;  // Process doesn't exist
-        }
-
-        // Check mtime matches process start time (within 5 sec tolerance)
-        const fileStat = fs.statSync(pidFile);
-        const fileMtime = fileStat.mtimeMs / 1000;  // Convert to seconds
-        const procStartTime = getProcessStartTime(pid);
-
-        if (procStartTime === null) {
-            // Can't validate - fall back to basic existence check
-            return true;
-        }
-
-        if (Math.abs(fileMtime - procStartTime) > 5) {
-            // PID was reused by different process
-            return false;
-        }
-
-        // Validate command if available
-        if (fs.existsSync(cmdFile)) {
-            const cmd = fs.readFileSync(cmdFile, 'utf8');
-            // Check for Chrome/Chromium and debug port
-            if (!cmd.includes('chrome') && !cmd.includes('chromium')) {
-                return false;
-            }
-            if (!cmd.includes('--remote-debugging-port')) {
-                return false;
-            }
-        }
-
-        return true;
-    } catch (e) {
-        return false;
-    }
-}
-
 // Global state for cleanup
 let chromePid = null;
 
@@ -332,20 +254,20 @@ function killZombieChrome() {
                         const pid = parseInt(fs.readFileSync(pidFile, 'utf8').trim(), 10);
                         if (isNaN(pid) || pid <= 0) continue;
 
-                        // Validate PID before killing
-                        const cmdFile = path.join(chromeDir, 'cmd.sh');
-                        if (!validatePid(pid, pidFile, cmdFile)) {
-                            // PID reused or validation failed
-                            console.error(`[!] PID ${pid} failed validation (reused or wrong process) - cleaning up`);
+                        // Check if process exists (simple check, Python will validate properly)
+                        try {
+                            process.kill(pid, 0);
+                        } catch (e) {
+                            // Process dead, remove stale PID file
                             try { fs.unlinkSync(pidFile); } catch (e) {}
                             continue;
                         }
 
-                        // Process alive, validated, and crawl is stale - zombie!
-                        console.error(`[!] Found validated zombie (PID ${pid}) from stale crawl ${crawl.name}`);
+                        // Process alive and crawl is stale - zombie!
+                        console.error(`[!] Found zombie (PID ${pid}) from stale crawl ${crawl.name}`);
 
                         try {
-                            // Kill process group first
+                            // Kill process group
                             try {
                                 process.kill(-pid, 'SIGKILL');
                             } catch (e) {
@@ -354,14 +276,10 @@ function killZombieChrome() {
 
                             killed++;
                             console.error(`[+] Killed zombie (PID ${pid})`);
-
-                            // Remove PID file
                             try { fs.unlinkSync(pidFile); } catch (e) {}
-
                         } catch (e) {
                             console.error(`[!] Failed to kill PID ${pid}: ${e.message}`);
                         }
-
                     } catch (e) {
                         // Skip invalid PID files
                     }

+ 1 - 1
archivebox/plugins/chrome/on_Snapshot__20_chrome_tab.bg.js

@@ -29,7 +29,7 @@ const http = require('http');
 const puppeteer = require('puppeteer-core');
 
 // Extractor metadata
-const EXTRACTOR_NAME = 'chrome_tab';
+const PLUGIN_NAME = 'chrome_tab';
 const OUTPUT_DIR = '.';  // Hook already runs in chrome/ output directory
 const CHROME_SESSION_DIR = '.';
 

+ 1 - 1
archivebox/plugins/chrome/on_Snapshot__30_chrome_navigate.js

@@ -19,7 +19,7 @@ const fs = require('fs');
 const path = require('path');
 const puppeteer = require('puppeteer-core');
 
-const EXTRACTOR_NAME = 'chrome_navigate';
+const PLUGIN_NAME = 'chrome_navigate';
 const CHROME_SESSION_DIR = '.';
 const OUTPUT_DIR = '.';
 

+ 1 - 1
archivebox/plugins/consolelog/on_Snapshot__21_consolelog.bg.js

@@ -14,7 +14,7 @@ const fs = require('fs');
 const path = require('path');
 const puppeteer = require('puppeteer-core');
 
-const EXTRACTOR_NAME = 'consolelog';
+const PLUGIN_NAME = 'consolelog';
 const OUTPUT_DIR = '.';
 const OUTPUT_FILE = 'console.jsonl';
 const PID_FILE = 'hook.pid';

+ 9 - 5
archivebox/plugins/dom/on_Snapshot__36_dom.js

@@ -23,7 +23,7 @@ const path = require('path');
 const puppeteer = require('puppeteer-core');
 
 // Extractor metadata
-const EXTRACTOR_NAME = 'dom';
+const PLUGIN_NAME = 'dom';
 const OUTPUT_DIR = '.';
 const OUTPUT_FILE = 'output.html';
 const CHROME_SESSION_DIR = '../chrome';
@@ -252,10 +252,14 @@ async function main() {
             }));
             process.exit(0);
         } else {
-            // Wait for page to be fully loaded
-            const pageLoaded = await waitForChromeTabLoaded(60000);
-            if (!pageLoaded) {
-                throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
+            // Only wait for page load if using shared Chrome session
+            const cdpUrl = getCdpUrl();
+            if (cdpUrl) {
+                // Wait for page to be fully loaded
+                const pageLoaded = await waitForChromeTabLoaded(60000);
+                if (!pageLoaded) {
+                    throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
+                }
             }
 
             const result = await dumpDom(url);

+ 1 - 1
archivebox/plugins/favicon/on_Snapshot__11_favicon.py

@@ -27,7 +27,7 @@ import rich_click as click
 
 
 # Extractor metadata
-EXTRACTOR_NAME = 'favicon'
+PLUGIN_NAME = 'favicon'
 OUTPUT_DIR = '.'
 OUTPUT_FILE = 'favicon.ico'
 

+ 1 - 1
archivebox/plugins/forumdl/on_Snapshot__53_forumdl.py

@@ -31,7 +31,7 @@ import rich_click as click
 
 
 # Extractor metadata
-EXTRACTOR_NAME = 'forumdl'
+PLUGIN_NAME = 'forumdl'
 BIN_NAME = 'forum-dl'
 BIN_PROVIDERS = 'pip,env'
 OUTPUT_DIR = '.'

+ 1 - 1
archivebox/plugins/gallerydl/on_Snapshot__52_gallerydl.py

@@ -32,7 +32,7 @@ import rich_click as click
 
 
 # Extractor metadata
-EXTRACTOR_NAME = 'gallerydl'
+PLUGIN_NAME = 'gallerydl'
 BIN_NAME = 'gallery-dl'
 BIN_PROVIDERS = 'pip,env'
 OUTPUT_DIR = '.'

+ 1 - 1
archivebox/plugins/git/on_Snapshot__12_git.py

@@ -24,7 +24,7 @@ import rich_click as click
 
 
 # Extractor metadata
-EXTRACTOR_NAME = 'git'
+PLUGIN_NAME = 'git'
 BIN_NAME = 'git'
 BIN_PROVIDERS = 'apt,brew,env'
 OUTPUT_DIR = '.'

+ 1 - 1
archivebox/plugins/headers/on_Snapshot__33_headers.js

@@ -21,7 +21,7 @@ const https = require('https');
 const http = require('http');
 
 // Extractor metadata
-const EXTRACTOR_NAME = 'headers';
+const PLUGIN_NAME = 'headers';
 const OUTPUT_DIR = '.';
 const OUTPUT_FILE = 'headers.json';
 const CHROME_SESSION_DIR = '../chrome';

+ 1 - 1
archivebox/plugins/htmltotext/on_Snapshot__54_htmltotext.py

@@ -26,7 +26,7 @@ import rich_click as click
 
 
 # Extractor metadata
-EXTRACTOR_NAME = 'htmltotext'
+PLUGIN_NAME = 'htmltotext'
 OUTPUT_DIR = '.'
 OUTPUT_FILE = 'htmltotext.txt'
 

+ 1 - 1
archivebox/plugins/media/on_Snapshot__51_media.py

@@ -34,7 +34,7 @@ import rich_click as click
 
 
 # Extractor metadata
-EXTRACTOR_NAME = 'media'
+PLUGIN_NAME = 'media'
 BIN_NAME = 'yt-dlp'
 BIN_PROVIDERS = 'pip,apt,brew,env'
 OUTPUT_DIR = '.'

+ 1 - 1
archivebox/plugins/mercury/on_Snapshot__53_mercury.py

@@ -25,7 +25,7 @@ import rich_click as click
 
 
 # Extractor metadata
-EXTRACTOR_NAME = 'mercury'
+PLUGIN_NAME = 'mercury'
 BIN_NAME = 'postlight-parser'
 BIN_PROVIDERS = 'npm,env'
 OUTPUT_DIR = '.'

+ 1 - 1
archivebox/plugins/papersdl/on_Snapshot__54_papersdl.py

@@ -28,7 +28,7 @@ import rich_click as click
 
 
 # Extractor metadata
-EXTRACTOR_NAME = 'papersdl'
+PLUGIN_NAME = 'papersdl'
 BIN_NAME = 'papers-dl'
 BIN_PROVIDERS = 'pip,env'
 OUTPUT_DIR = '.'

+ 10 - 6
archivebox/plugins/parse_dom_outlinks/on_Snapshot__40_parse_dom_outlinks.js

@@ -23,7 +23,7 @@ const path = require('path');
 const puppeteer = require('puppeteer-core');
 
 // Extractor metadata
-const EXTRACTOR_NAME = 'parse_dom_outlinks';
+const PLUGIN_NAME = 'parse_dom_outlinks';
 const OUTPUT_DIR = '.';
 const OUTPUT_FILE = 'outlinks.json';
 const URLS_FILE = 'urls.jsonl';  // For crawl system
@@ -190,7 +190,7 @@ async function extractOutlinks(url) {
         const urlsJsonl = crawlableUrls.map(href => JSON.stringify({
             type: 'Snapshot',
             url: href,
-            via_extractor: EXTRACTOR_NAME,
+            plugin: PLUGIN_NAME,
         })).join('\n');
 
         if (urlsJsonl) {
@@ -236,10 +236,14 @@ async function main() {
             process.exit(0);
         }
 
-        // Wait for page to be fully loaded
-        const pageLoaded = await waitForChromeTabLoaded(60000);
-        if (!pageLoaded) {
-            throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
+        // Check if Chrome session exists, then wait for page load
+        const cdpUrl = getCdpUrl();
+        if (cdpUrl) {
+            // Wait for page to be fully loaded
+            const pageLoaded = await waitForChromeTabLoaded(60000);
+            if (!pageLoaded) {
+                throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
+            }
         }
 
         const result = await extractOutlinks(url);

+ 2 - 2
archivebox/plugins/parse_html_urls/on_Snapshot__60_parse_html_urls.py

@@ -28,7 +28,7 @@ from urllib.parse import urljoin, urlparse
 
 import rich_click as click
 
-EXTRACTOR_NAME = 'parse_html_urls'
+PLUGIN_NAME = 'parse_html_urls'
 
 # Check if parse_dom_outlinks extractor already ran
 DOM_OUTLINKS_URLS_FILE = Path('parse_dom_outlinks/urls.jsonl')
@@ -179,7 +179,7 @@ def main(url: str, snapshot_id: str = None, crawl_id: str = None, depth: int = 0
         record = {
             'type': 'Snapshot',
             'url': found_url,
-            'via_extractor': EXTRACTOR_NAME,
+            'plugin': PLUGIN_NAME,
             'depth': depth + 1,
         }
         if snapshot_id:

+ 1 - 1
archivebox/plugins/parse_html_urls/tests/test_parse_html_urls.py

@@ -233,7 +233,7 @@ class TestParseHtmlUrls:
         entry = json.loads(output_file.read_text().strip())
         assert entry['url'] == 'https://example.com'
         assert 'type' in entry
-        assert 'via_extractor' in entry
+        assert 'plugin' in entry
 
 
 if __name__ == '__main__':

+ 2 - 2
archivebox/plugins/parse_jsonl_urls/on_Snapshot__64_parse_jsonl_urls.py

@@ -24,7 +24,7 @@ from urllib.parse import urlparse
 
 import rich_click as click
 
-EXTRACTOR_NAME = 'parse_jsonl_urls'
+PLUGIN_NAME = 'parse_jsonl_urls'
 
 
 def parse_bookmarked_at(link: dict) -> str | None:
@@ -75,7 +75,7 @@ def json_object_to_entry(link: dict) -> dict | None:
     entry = {
         'type': 'Snapshot',
         'url': unescape(url),
-        'via_extractor': EXTRACTOR_NAME,
+        'plugin': PLUGIN_NAME,
     }
 
     # Parse title

+ 1 - 1
archivebox/plugins/parse_jsonl_urls/tests/test_parse_jsonl_urls.py

@@ -265,7 +265,7 @@ class TestParseJsonlUrls:
         entry = json.loads(output_file.read_text().strip())
         assert entry['url'] == 'https://example.com'
         assert 'type' in entry
-        assert 'via_extractor' in entry
+        assert 'plugin' in entry
 
 
 if __name__ == '__main__':

+ 2 - 2
archivebox/plugins/parse_netscape_urls/on_Snapshot__63_parse_netscape_urls.py

@@ -22,7 +22,7 @@ from urllib.parse import urlparse
 
 import rich_click as click
 
-EXTRACTOR_NAME = 'parse_netscape_urls'
+PLUGIN_NAME = 'parse_netscape_urls'
 
 # Constants for timestamp epoch detection
 UNIX_EPOCH = 0  # 1970-01-01 00:00:00 UTC
@@ -187,7 +187,7 @@ def main(url: str, snapshot_id: str = None):
             entry = {
                 'type': 'Snapshot',
                 'url': unescape(bookmark_url),
-                'via_extractor': EXTRACTOR_NAME,
+                'plugin': PLUGIN_NAME,
             }
             if title:
                 entry['title'] = unescape(title)

+ 2 - 2
archivebox/plugins/parse_rss_urls/on_Snapshot__61_parse_rss_urls.py

@@ -23,7 +23,7 @@ from urllib.parse import urlparse
 
 import rich_click as click
 
-EXTRACTOR_NAME = 'parse_rss_urls'
+PLUGIN_NAME = 'parse_rss_urls'
 
 try:
     import feedparser
@@ -107,7 +107,7 @@ def main(url: str, snapshot_id: str = None, crawl_id: str = None, depth: int = 0
         entry = {
             'type': 'Snapshot',
             'url': unescape(item_url),
-            'via_extractor': EXTRACTOR_NAME,
+            'plugin': PLUGIN_NAME,
             'depth': depth + 1,
         }
         if snapshot_id:

+ 1 - 1
archivebox/plugins/parse_rss_urls/tests/test_parse_rss_urls_comprehensive.py

@@ -47,7 +47,7 @@ class TestRssVariants:
 
         assert entry['url'] == 'https://example.com/article1'
         assert entry['title'] == 'RSS 0.91 Article'
-        assert entry['via_extractor'] == 'parse_rss_urls'
+        assert entry['plugin'] == 'parse_rss_urls'
 
     def test_rss_10_rdf(self, tmp_path):
         """Test RSS 1.0 (RDF) format."""

+ 2 - 2
archivebox/plugins/parse_txt_urls/on_Snapshot__62_parse_txt_urls.py

@@ -25,7 +25,7 @@ from urllib.request import urlopen
 
 import rich_click as click
 
-EXTRACTOR_NAME = 'parse_txt_urls'
+PLUGIN_NAME = 'parse_txt_urls'
 
 # URL regex from archivebox/misc/util.py
 # https://mathiasbynens.be/demo/url-regex
@@ -127,7 +127,7 @@ def main(url: str, snapshot_id: str = None):
             f.write(json.dumps({
                 'type': 'Snapshot',
                 'url': found_url,
-                'via_extractor': EXTRACTOR_NAME,
+                'plugin': PLUGIN_NAME,
             }) + '\n')
 
     click.echo(f'Found {len(urls_found)} URLs')

+ 1 - 1
archivebox/plugins/parse_txt_urls/tests/test_parse_txt_urls.py

@@ -186,7 +186,7 @@ https://other.com
         entry = json.loads(output_file.read_text().strip())
         assert entry['url'] == 'https://example.com'
         assert 'type' in entry
-        assert 'via_extractor' in entry
+        assert 'plugin' in entry
 
 
 if __name__ == '__main__':

+ 9 - 5
archivebox/plugins/pdf/on_Snapshot__35_pdf.js

@@ -22,7 +22,7 @@ const path = require('path');
 const puppeteer = require('puppeteer-core');
 
 // Extractor metadata
-const EXTRACTOR_NAME = 'pdf';
+const PLUGIN_NAME = 'pdf';
 const OUTPUT_DIR = '.';
 const OUTPUT_FILE = 'output.pdf';
 const CHROME_SESSION_DIR = '../chrome';
@@ -254,10 +254,14 @@ async function main() {
             }));
             process.exit(0);  // Permanent skip - staticfile already handled
         } else {
-            // Wait for page to be fully loaded
-            const pageLoaded = await waitForChromeTabLoaded(60000);
-            if (!pageLoaded) {
-                throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
+            // Only wait for page load if using shared Chrome session
+            const cdpUrl = getCdpUrl();
+            if (cdpUrl) {
+                // Wait for page to be fully loaded
+                const pageLoaded = await waitForChromeTabLoaded(60000);
+                if (!pageLoaded) {
+                    throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
+                }
             }
 
             const result = await printToPdf(url);

+ 6 - 6
archivebox/plugins/extractor_utils.py → archivebox/plugins/plugin_utils.py

@@ -1,11 +1,11 @@
 #!/usr/bin/env python3
 """
-Shared utilities for extractor hooks.
+Shared utilities for extractor plugin hooks.
 
-This module provides common functionality for all extractors to ensure
+This module provides common functionality for all extractor plugins to ensure
 consistent behavior, output format, error handling, and timing.
 
-All extractors should:
+All extractor plugins should:
 1. Import and use these utilities
 2. Output consistent metadata (CMD, VERSION, OUTPUT, timing)
 3. Write all files to $PWD
@@ -35,7 +35,7 @@ STATIC_EXTENSIONS = (
 
 
 def is_static_file(url: str) -> bool:
-    """Check if URL points to a static file that may not need browser extraction."""
+    """Check if URL points to a static file that may not need browser-based extractor plugins."""
     return url.lower().split('?')[0].split('#')[0].endswith(STATIC_EXTENSIONS)
 
 
@@ -96,7 +96,7 @@ def get_version(binary: str, version_args: list[str] | None = None) -> str:
 
 class ExtractorResult:
     """
-    Tracks extractor execution and produces consistent output.
+    Tracks extractor plugin execution and produces consistent output.
 
     Usage:
         result = ExtractorResult(name='wget', url=url)
@@ -152,7 +152,7 @@ class ExtractorResult:
         return 1
 
     def finish(self, status: str | None = None):
-        """Mark extraction as finished and print results."""
+        """Mark extractor plugin execution as finished and print results."""
         self.end_ts = datetime.now(timezone.utc)
         if status:
             self.status = status
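
Going by the docstring, a hook drives this helper roughly as follows (a minimal sketch; only ExtractorResult(name=..., url=...) and finish(status=...) appear in the code above, the error-handling shape is assumed):

```python
from plugin_utils import ExtractorResult  # module just renamed from extractor_utils

result = ExtractorResult(name='wget', url='https://example.com')
try:
    # ... run the binary, writing all output files into $PWD ...
    result.finish(status='succeeded')
except Exception:
    result.finish(status='failed')
```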

+ 1 - 1
archivebox/plugins/readability/on_Snapshot__52_readability.py

@@ -27,7 +27,7 @@ import rich_click as click
 
 
 # Extractor metadata
-EXTRACTOR_NAME = 'readability'
+PLUGIN_NAME = 'readability'
 BIN_NAME = 'readability-extractor'
 BIN_PROVIDERS = 'npm,env'
 OUTPUT_DIR = '.'

+ 2 - 2
archivebox/plugins/redirects/on_Snapshot__31_redirects.bg.js

@@ -14,7 +14,7 @@ const fs = require('fs');
 const path = require('path');
 const puppeteer = require('puppeteer-core');
 
-const EXTRACTOR_NAME = 'redirects';
+const PLUGIN_NAME = 'redirects';
 const OUTPUT_DIR = '.';
 const OUTPUT_FILE = 'redirects.jsonl';
 const PID_FILE = 'hook.pid';
@@ -235,7 +235,7 @@ function handleShutdown(signal) {
         type: 'ArchiveResult',
         status: 'succeeded',
         output_str: OUTPUT_FILE,
-        extractor: EXTRACTOR_NAME,
+        plugin: PLUGIN_NAME,
         original_url: originalUrl,
         final_url: finalUrl || originalUrl,
         redirect_count: redirectChain.length,

+ 1 - 1
archivebox/plugins/responses/on_Snapshot__24_responses.bg.js

@@ -15,7 +15,7 @@ const path = require('path');
 const crypto = require('crypto');
 const puppeteer = require('puppeteer-core');
 
-const EXTRACTOR_NAME = 'responses';
+const PLUGIN_NAME = 'responses';
 const OUTPUT_DIR = '.';
 const PID_FILE = 'hook.pid';
 const CHROME_SESSION_DIR = '../chrome';

+ 9 - 5
archivebox/plugins/screenshot/on_Snapshot__34_screenshot.js

@@ -22,7 +22,7 @@ const path = require('path');
 const puppeteer = require('puppeteer-core');
 
 // Extractor metadata
-const EXTRACTOR_NAME = 'screenshot';
+const PLUGIN_NAME = 'screenshot';
 const OUTPUT_DIR = '.';
 const OUTPUT_FILE = 'screenshot.png';
 const CHROME_SESSION_DIR = '../chrome';
@@ -250,10 +250,14 @@ async function main() {
             }));
             process.exit(0);  // Permanent skip - staticfile already handled
         } else {
-            // Wait for page to be fully loaded
-            const pageLoaded = await waitForChromeTabLoaded(60000);
-            if (!pageLoaded) {
-                throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
+            // Only wait for page load if using shared Chrome session
+            const cdpUrl = getCdpUrl();
+            if (cdpUrl) {
+                // Wait for page to be fully loaded
+                const pageLoaded = await waitForChromeTabLoaded(60000);
+                if (!pageLoaded) {
+                    throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
+                }
             }
 
             const result = await takeScreenshot(url);

+ 5 - 5
archivebox/plugins/search_backend_sonic/on_Snapshot__91_index_sonic.py

@@ -27,7 +27,7 @@ import rich_click as click
 
 
 # Extractor metadata
-EXTRACTOR_NAME = 'index_sonic'
+PLUGIN_NAME = 'index_sonic'
 OUTPUT_DIR = '.'
 
 # Text file patterns to index
@@ -83,14 +83,14 @@ def find_indexable_content() -> list[tuple[str, str]]:
     cwd = Path.cwd()
 
     for extractor, file_pattern in INDEXABLE_FILES:
-        extractor_dir = cwd / extractor
-        if not extractor_dir.exists():
+        plugin_dir = cwd / extractor
+        if not plugin_dir.exists():
             continue
 
         if '*' in file_pattern:
-            matches = list(extractor_dir.glob(file_pattern))
+            matches = list(plugin_dir.glob(file_pattern))
         else:
-            match = extractor_dir / file_pattern
+            match = plugin_dir / file_pattern
             matches = [match] if match.exists() else []
 
         for match in matches:

+ 5 - 5
archivebox/plugins/search_backend_sqlite/on_Snapshot__90_index_sqlite.py

@@ -25,7 +25,7 @@ import rich_click as click
 
 
 # Extractor metadata
-EXTRACTOR_NAME = 'index_sqlite'
+PLUGIN_NAME = 'index_sqlite'
 OUTPUT_DIR = '.'
 
 # Text file patterns to index, in priority order
@@ -74,14 +74,14 @@ def find_indexable_content() -> list[tuple[str, str]]:
     cwd = Path.cwd()
 
     for extractor, file_pattern in INDEXABLE_FILES:
-        extractor_dir = cwd / extractor
-        if not extractor_dir.exists():
+        plugin_dir = cwd / extractor
+        if not plugin_dir.exists():
             continue
 
         if '*' in file_pattern:
-            matches = list(extractor_dir.glob(file_pattern))
+            matches = list(plugin_dir.glob(file_pattern))
         else:
-            match = extractor_dir / file_pattern
+            match = plugin_dir / file_pattern
             matches = [match] if match.exists() else []
 
         for match in matches:
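
Both indexers walk the same INDEXABLE_FILES table, which isn't visible in these hunks. Inferred from the loop, each entry pairs a plugin output dir with a filename or glob, resolved as <snapshot_dir>/<plugin>/<pattern>. A hypothetical shape:

```python
# Entries are illustrative guesses; the real table lives near the top of each indexer
INDEXABLE_FILES = [
    ('htmltotext', 'htmltotext.txt'),  # (plugin output dir, file pattern)
    ('readability', 'content.txt'),
    ('media', '*.txt'),                # '*' patterns are resolved via plugin_dir.glob()
]
```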

+ 9 - 5
archivebox/plugins/seo/on_Snapshot__38_seo.js

@@ -20,7 +20,7 @@ const path = require('path');
 const puppeteer = require('puppeteer-core');
 
 // Extractor metadata
-const EXTRACTOR_NAME = 'seo';
+const PLUGIN_NAME = 'seo';
 const OUTPUT_DIR = '.';
 const OUTPUT_FILE = 'seo.json';
 const CHROME_SESSION_DIR = '../chrome';
@@ -177,10 +177,14 @@ async function main() {
             process.exit(0);
         }
 
-        // Wait for page to be fully loaded
-        const pageLoaded = await waitForChromeTabLoaded(60000);
-        if (!pageLoaded) {
-            throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
+        // Check if Chrome session exists, then wait for page load
+        const cdpUrl = getCdpUrl();
+        if (cdpUrl) {
+            // Wait for page to be fully loaded
+            const pageLoaded = await waitForChromeTabLoaded(60000);
+            if (!pageLoaded) {
+                throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
+            }
         }
 
         const result = await extractSeo(url);

+ 1 - 1
archivebox/plugins/singlefile/on_Snapshot__37_singlefile.py

@@ -36,7 +36,7 @@ import rich_click as click
 
 
 # Extractor metadata
-EXTRACTOR_NAME = 'singlefile'
+PLUGIN_NAME = 'singlefile'
 BIN_NAME = 'single-file'
 BIN_PROVIDERS = 'npm,env'
 OUTPUT_DIR = '.'

+ 1 - 1
archivebox/plugins/ssl/on_Snapshot__23_ssl.bg.js

@@ -14,7 +14,7 @@ const fs = require('fs');
 const path = require('path');
 const puppeteer = require('puppeteer-core');
 
-const EXTRACTOR_NAME = 'ssl';
+const PLUGIN_NAME = 'ssl';
 const OUTPUT_DIR = '.';
 const OUTPUT_FILE = 'ssl.jsonl';
 const PID_FILE = 'hook.pid';

+ 6 - 6
archivebox/plugins/staticfile/on_Snapshot__31_staticfile.bg.js

@@ -14,7 +14,7 @@ const fs = require('fs');
 const path = require('path');
 const puppeteer = require('puppeteer-core');
 
-const EXTRACTOR_NAME = 'staticfile';
+const PLUGIN_NAME = 'staticfile';
 const OUTPUT_DIR = '.';
 const PID_FILE = 'hook.pid';
 const CHROME_SESSION_DIR = '../chrome';
@@ -326,7 +326,7 @@ function handleShutdown(signal) {
             type: 'ArchiveResult',
             status: 'skipped',
             output_str: 'No Content-Type detected',
-            extractor: EXTRACTOR_NAME,
+            plugin: PLUGIN_NAME,
         };
     } else if (!isStaticFile) {
         // Not a static file (normal case for HTML pages)
@@ -334,7 +334,7 @@ function handleShutdown(signal) {
             type: 'ArchiveResult',
             status: 'skipped',
             output_str: `Not a static file (Content-Type: ${detectedContentType})`,
-            extractor: EXTRACTOR_NAME,
+            plugin: PLUGIN_NAME,
             content_type: detectedContentType,
         };
     } else if (downloadError) {
@@ -343,7 +343,7 @@ function handleShutdown(signal) {
             type: 'ArchiveResult',
             status: 'failed',
             output_str: downloadError,
-            extractor: EXTRACTOR_NAME,
+            plugin: PLUGIN_NAME,
             content_type: detectedContentType,
         };
     } else if (downloadedFilePath) {
@@ -352,7 +352,7 @@ function handleShutdown(signal) {
             type: 'ArchiveResult',
             status: 'succeeded',
             output_str: downloadedFilePath,
-            extractor: EXTRACTOR_NAME,
+            plugin: PLUGIN_NAME,
             content_type: detectedContentType,
         };
     } else {
@@ -361,7 +361,7 @@ function handleShutdown(signal) {
             type: 'ArchiveResult',
             status: 'failed',
             output_str: 'Static file detected but download did not complete',
-            extractor: EXTRACTOR_NAME,
+            plugin: PLUGIN_NAME,
             content_type: detectedContentType,
         };
     }
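
For anything consuming staticfile's output: every branch above now emits an ArchiveResult record keyed on 'plugin'. An illustrative 'succeeded' record (values are made up):

```python
import json

print(json.dumps({
    'type': 'ArchiveResult',
    'status': 'succeeded',              # or 'skipped' / 'failed' per the branches above
    'output_str': 'example.pdf',        # downloadedFilePath (illustrative)
    'plugin': 'staticfile',
    'content_type': 'application/pdf',  # illustrative
}))
```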

+ 1 - 1
archivebox/plugins/title/on_Snapshot__32_title.js

@@ -20,7 +20,7 @@ const https = require('https');
 const http = require('http');
 
 // Extractor metadata
-const EXTRACTOR_NAME = 'title';
+const PLUGIN_NAME = 'title';
 const OUTPUT_DIR = '.';
 const OUTPUT_FILE = 'title.txt';
 const CHROME_SESSION_DIR = '../chrome';

+ 1 - 1
archivebox/plugins/wget/on_Snapshot__50_wget.py

@@ -39,7 +39,7 @@ import rich_click as click
 
 
 # Extractor metadata
-EXTRACTOR_NAME = 'wget'
+PLUGIN_NAME = 'wget'
 BIN_NAME = 'wget'
 BIN_PROVIDERS = 'apt,brew,env'
 OUTPUT_DIR = '.'

+ 153 - 0
bin/kill_chrome.sh

@@ -0,0 +1,153 @@
+#!/usr/bin/env bash
+# Kill zombie Chrome/Chromium processes listening on 127.0.0.1
+# Works cross-platform on macOS and Linux
+#
+# Usage:
+#   ./bin/kill_chrome.sh           # Kill Chrome processes with verification
+#   ./bin/kill_chrome.sh --pkill   # Quick kill using pkill (less precise)
+#   ./bin/kill_chrome.sh --help    # Show this help
+
+set -e
+
+# Detect OS
+OS="$(uname -s)"
+
+# Chrome binary patterns to search for (cross-platform)
+CHROME_PATTERNS=(
+    "Google Chrome"
+    "google-chrome"
+    "chrome"
+    "chromium"
+    "chromium-browser"
+    "Chromium"
+)
+
+# Function to kill Chrome processes
+kill_chrome_processes() {
+    echo "Searching for Chrome processes listening on 127.0.0.1..."
+    local killed=0
+
+    for pattern in "${CHROME_PATTERNS[@]}"; do
+        # Find processes matching the pattern with remote debugging
+        if [ "$OS" = "Darwin" ]; then
+            # macOS
+            pids=$(ps aux | grep -i "$pattern" | grep -E "(remote-debugging-port|remote-debugging-address=127\.0\.0\.1)" | grep -v grep | awk '{print $2}' || true)
+        else
+            # Linux
+            pids=$(ps aux | grep -i "$pattern" | grep -E "(remote-debugging-port|remote-debugging-address=127\.0\.0\.1)" | grep -v grep | awk '{print $2}' || true)
+        fi
+
+        if [ -n "$pids" ]; then
+            echo "Found Chrome processes ($pattern): $pids"
+            for pid in $pids; do
+                # Try a regular TERM kill; only count it once the process is confirmed gone
+                if kill "$pid" 2>/dev/null; then
+                    sleep 0.1
+                    if ! ps -p "$pid" > /dev/null 2>&1; then
+                        echo "  Killed $pid"
+                        killed=$((killed + 1))
+                    fi
+                fi
+
+                # Check if still alive
+                if ps -p "$pid" > /dev/null 2>&1; then
+                    # Check process state first to avoid attempting impossible kills
+                    if [ "$OS" = "Darwin" ]; then
+                        state=$(ps -o state -p "$pid" 2>/dev/null | tail -1 | tr -d ' ')
+                    else
+                        state=$(ps -o stat -p "$pid" 2>/dev/null | tail -1 | tr -d ' ')
+                    fi
+
+                    # Check if it's a zombie/uninterruptible process BEFORE trying to kill
+                    if [[ "$state" == *"Z"* ]] || [[ "$state" == *"D"* ]] || [[ "$state" == *"UNE"* ]]; then
+                        echo "  WARNING: $pid is in uninterruptible/zombie state ($state) - cannot be killed"
+                        echo "           Process will clean up automatically or requires system reboot"
+                    else
+                        # Try force kill
+                        echo "  Force killing $pid with -9..."
+                        if kill -9 "$pid" 2>/dev/null; then
+                            # Wait briefly and verify
+                            sleep 0.2
+                            if ! ps -p "$pid" > /dev/null 2>&1; then
+                                echo "  Force killed $pid"
+                                killed=$((killed + 1))
+                            else
+                                echo "  WARNING: $pid survived kill -9 (state: $state)"
+                            fi
+                        else
+                            echo "  ERROR: Failed to kill $pid (state: $state)"
+                        fi
+                    fi
+                fi
+            done
+        fi
+    done
+
+    if [ $killed -eq 0 ]; then
+        echo "No Chrome processes listening on 127.0.0.1 found (or all are zombie/uninterruptible)"
+    else
+        echo "Successfully killed $killed Chrome process(es)"
+    fi
+
+    # Show remaining Chrome processes (if any)
+    echo ""
+    echo "Remaining Chrome processes listening on 127.0.0.1:"
+    for pattern in "${CHROME_PATTERNS[@]}"; do
+        ps aux | grep -i "$pattern" | grep -E "(remote-debugging-port|remote-debugging-address=127\.0\.0\.1)" | grep -v grep || true
+    done | head -10
+
+    if [ $(ps aux | grep -iE "(chrome|chromium)" | grep -E "(remote-debugging-port|remote-debugging-address=127\.0\.0\.1)" | grep -v grep | wc -l) -eq 0 ]; then
+        echo "  (none)"
+    fi
+}
+
+# Alternative approach using pkill (faster but less precise)
+kill_chrome_pkill() {
+    echo "Using pkill to kill all Chrome processes..."
+
+    for pattern in "${CHROME_PATTERNS[@]}"; do
+        if pkill -9 -f "$pattern" 2>/dev/null; then
+            echo "  Killed processes matching: $pattern"
+        fi
+    done
+
+    sleep 0.5
+    echo "Done"
+}
+
+# Show help
+show_help() {
+    cat << EOF
+Kill zombie Chrome/Chromium processes listening on 127.0.0.1
+
+Usage:
+  $0 [OPTIONS]
+
+Options:
+  (none)           Kill Chrome processes with state verification (recommended)
+  --pkill, -p      Quick kill using pkill (faster but less precise)
+  --help, -h       Show this help message
+
+Description:
+  This script finds and kills Chrome/Chromium processes that are listening
+  on 127.0.0.1 (with --remote-debugging-port or --remote-debugging-address).
+
+  Supports multiple Chrome binary names:
+    - Google Chrome / chrome / google-chrome
+    - Chromium / chromium / chromium-browser
+
+  Works on macOS and Linux.
+
+  Zombie/uninterruptible processes (states Z/D/UNE) are detected and reported
+  but cannot be killed; they clean up on their own or require a reboot.
+
+Examples:
+  $0                 # Kill with verification
+  $0 --pkill         # Quick kill all Chrome processes
+
+EOF
+}
+
+# Parse arguments
+if [ "$1" = "--help" ] || [ "$1" = "-h" ]; then
+    show_help
+elif [ "$1" = "--pkill" ] || [ "$1" = "-p" ]; then
+    kill_chrome_pkill
+else
+    kill_chrome_processes
+fi

+ 7 - 1
tests/test_recursive_crawl.py

@@ -219,7 +219,13 @@ def test_recursive_crawl_creates_child_snapshots(tmp_path, process):
 
     # Kill the process
     proc.kill()
-    proc.wait()
+    stdout, stderr = proc.communicate()
+
+    # Debug: print stderr to see what's happening
+    if stderr:
+        print(f"\n=== STDERR ===\n{stderr}\n=== END STDERR ===\n")
+    if stdout:
+        print(f"\n=== STDOUT (last 2000 chars) ===\n{stdout[-2000:]}\n=== END STDOUT ===\n")
 
     conn = sqlite3.connect('index.sqlite3')
     c = conn.cursor()
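
Aside from surfacing debug output, communicate() is the safer way to reap a child opened with stdout=PIPE/stderr=PIPE: wait() can deadlock if the child filled a pipe buffer, whereas communicate() drains both pipes while waiting. A standalone sketch of the pattern (the command is illustrative):

```python
import subprocess

proc = subprocess.Popen(
    ['python3', '-c', 'print("working"); import time; time.sleep(60)'],
    stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True,
)
proc.kill()
stdout, stderr = proc.communicate()  # reaps the child and drains both pipes
print(repr(stdout), repr(stderr))
```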