Browse Source

wip major changes

Nick Sweeting 2 months ago
parent
commit
1915333b81
100 changed files with 11061 additions and 2629 deletions
  1. 9 0
      .claude/settings.local.json
  2. 3 0
      ArchiveBox.conf
  3. 300 0
      PLUGIN_ENHANCEMENTS.md
  4. 819 0
      SIMPLIFICATION_PLAN.md
  5. 127 0
      TEST_RESULTS.md
  6. 6109 0
      archivebox.ts
  7. 13 54
      archivebox/__init__.py
  8. 0 3
      archivebox/api/apps.py
  9. 37 6
      archivebox/api/migrations/0001_initial.py
  10. 0 17
      archivebox/api/migrations/0002_alter_apitoken_options.py
  11. 0 78
      archivebox/api/migrations/0003_rename_user_apitoken_created_by_apitoken_abid_and_more.py
  12. 0 24
      archivebox/api/migrations/0004_alter_apitoken_id_alter_apitoken_uuid.py
  13. 0 22
      archivebox/api/migrations/0005_remove_apitoken_uuid_remove_outboundwebhook_uuid_and_more.py
  14. 0 29
      archivebox/api/migrations/0006_remove_outboundwebhook_uuid_apitoken_id_and_more.py
  15. 0 23
      archivebox/api/migrations/0007_alter_apitoken_created_by.py
  16. 0 48
      archivebox/api/migrations/0008_alter_apitoken_created_alter_apitoken_created_by_and_more.py
  17. 0 40
      archivebox/api/migrations/0009_rename_created_apitoken_created_at_and_more.py
  18. 1 1
      archivebox/api/models.py
  19. 0 1
      archivebox/api/v1_api.py
  20. 68 0
      archivebox/base_models/admin.py
  21. 2 2
      archivebox/base_models/apps.py
  22. 15 2
      archivebox/base_models/models.py
  23. 10 1
      archivebox/cli/__init__.py
  24. 82 161
      archivebox/cli/archivebox_add.py
  25. 4 4
      archivebox/cli/archivebox_config.py
  26. 302 0
      archivebox/cli/archivebox_crawl.py
  27. 239 26
      archivebox/cli/archivebox_extract.py
  28. 13 13
      archivebox/cli/archivebox_init.py
  29. 60 123
      archivebox/cli/archivebox_install.py
  30. 67 0
      archivebox/cli/archivebox_orchestrator.py
  31. 8 11
      archivebox/cli/archivebox_remove.py
  32. 5 2
      archivebox/cli/archivebox_schedule.py
  33. 43 32
      archivebox/cli/archivebox_search.py
  34. 218 0
      archivebox/cli/archivebox_snapshot.py
  35. 4 5
      archivebox/cli/archivebox_status.py
  36. 68 7
      archivebox/cli/archivebox_update.py
  37. 109 96
      archivebox/cli/archivebox_version.py
  38. 35 18
      archivebox/cli/archivebox_worker.py
  39. 0 1
      archivebox/cli/tests.py
  40. 966 0
      archivebox/cli/tests_piping.py
  41. 227 25
      archivebox/config/__init__.py
  42. 29 14
      archivebox/config/collection.py
  43. 121 91
      archivebox/config/common.py
  44. 266 0
      archivebox/config/configset.py
  45. 173 179
      archivebox/config/views.py
  46. 3 7
      archivebox/core/__init__.py
  47. 0 3
      archivebox/core/admin.py
  48. 1 5
      archivebox/core/admin_archiveresults.py
  49. 2 2
      archivebox/core/admin_site.py
  50. 22 27
      archivebox/core/admin_snapshots.py
  51. 1 4
      archivebox/core/admin_tags.py
  52. 0 3
      archivebox/core/admin_users.py
  53. 0 5
      archivebox/core/apps.py
  54. 9 12
      archivebox/core/forms.py
  55. 5 7
      archivebox/core/migrations/0007_archiveresult.py
  56. 0 58
      archivebox/core/migrations/0023_alter_archiveresult_options_archiveresult_abid_and_more.py
  57. 466 0
      archivebox/core/migrations/0023_new_schema.py
  58. 0 101
      archivebox/core/migrations/0024_auto_20240513_1143.py
  59. 0 19
      archivebox/core/migrations/0025_alter_archiveresult_uuid.py
  60. 0 117
      archivebox/core/migrations/0026_archiveresult_created_archiveresult_created_by_and_more.py
  61. 0 105
      archivebox/core/migrations/0027_update_snapshot_ids.py
  62. 0 19
      archivebox/core/migrations/0028_alter_archiveresult_uuid.py
  63. 0 18
      archivebox/core/migrations/0029_alter_archiveresult_id.py
  64. 0 18
      archivebox/core/migrations/0030_alter_archiveresult_uuid.py
  65. 0 34
      archivebox/core/migrations/0031_alter_archiveresult_id_alter_archiveresult_uuid_and_more.py
  66. 0 23
      archivebox/core/migrations/0032_alter_archiveresult_id.py
  67. 0 18
      archivebox/core/migrations/0033_rename_id_archiveresult_old_id.py
  68. 0 45
      archivebox/core/migrations/0034_alter_archiveresult_old_id_alter_archiveresult_uuid.py
  69. 0 19
      archivebox/core/migrations/0035_remove_archiveresult_uuid_archiveresult_id.py
  70. 0 29
      archivebox/core/migrations/0036_alter_archiveresult_id_alter_archiveresult_old_id.py
  71. 0 18
      archivebox/core/migrations/0037_rename_id_snapshot_old_id.py
  72. 0 18
      archivebox/core/migrations/0038_rename_uuid_snapshot_id.py
  73. 0 18
      archivebox/core/migrations/0039_rename_snapshot_archiveresult_snapshot_old.py
  74. 0 34
      archivebox/core/migrations/0040_archiveresult_snapshot.py
  75. 0 24
      archivebox/core/migrations/0041_alter_archiveresult_snapshot_and_more.py
  76. 0 17
      archivebox/core/migrations/0042_remove_archiveresult_snapshot_old.py
  77. 0 20
      archivebox/core/migrations/0043_alter_archiveresult_snapshot_alter_snapshot_id_and_more.py
  78. 0 40
      archivebox/core/migrations/0044_alter_archiveresult_snapshot_alter_tag_uuid_and_more.py
  79. 0 19
      archivebox/core/migrations/0045_alter_snapshot_old_id.py
  80. 0 30
      archivebox/core/migrations/0046_alter_archiveresult_snapshot_alter_snapshot_id_and_more.py
  81. 0 24
      archivebox/core/migrations/0047_alter_snapshottag_unique_together_and_more.py
  82. 0 24
      archivebox/core/migrations/0048_alter_archiveresult_snapshot_and_more.py
  83. 0 22
      archivebox/core/migrations/0049_rename_snapshot_snapshottag_snapshot_old_and_more.py
  84. 0 19
      archivebox/core/migrations/0050_alter_snapshottag_snapshot_old.py
  85. 0 40
      archivebox/core/migrations/0051_snapshottag_snapshot_alter_snapshottag_snapshot_old.py
  86. 0 27
      archivebox/core/migrations/0052_alter_snapshottag_unique_together_and_more.py
  87. 0 17
      archivebox/core/migrations/0053_remove_snapshottag_snapshot_old.py
  88. 0 18
      archivebox/core/migrations/0054_alter_snapshot_timestamp.py
  89. 0 18
      archivebox/core/migrations/0055_alter_tag_slug.py
  90. 0 17
      archivebox/core/migrations/0056_remove_tag_uuid.py
  91. 0 18
      archivebox/core/migrations/0057_rename_id_tag_old_id.py
  92. 0 22
      archivebox/core/migrations/0058_alter_tag_old_id.py
  93. 0 90
      archivebox/core/migrations/0059_tag_id.py
  94. 0 19
      archivebox/core/migrations/0060_alter_tag_id.py
  95. 0 22
      archivebox/core/migrations/0061_rename_tag_snapshottag_old_tag_and_more.py
  96. 0 19
      archivebox/core/migrations/0062_alter_snapshottag_old_tag.py
  97. 0 40
      archivebox/core/migrations/0063_snapshottag_tag_alter_snapshottag_old_tag.py
  98. 0 27
      archivebox/core/migrations/0064_alter_snapshottag_unique_together_and_more.py
  99. 0 17
      archivebox/core/migrations/0065_remove_snapshottag_old_tag.py
  100. 0 34
      archivebox/core/migrations/0066_alter_snapshottag_tag_alter_tag_id_alter_tag_old_id.py

+ 9 - 0
.claude/settings.local.json

@@ -0,0 +1,9 @@
+{
+  "permissions": {
+    "allow": [
+      "Bash(python -m archivebox:*)",
+      "Bash(ls:*)",
+      "Bash(xargs:*)"
+    ]
+  }
+}

+ 3 - 0
ArchiveBox.conf

@@ -0,0 +1,3 @@
+[SERVER_CONFIG]
+SECRET_KEY = y6fw9wcaqls9sx_dze6ahky9ggpkpzoaw5g5v98_u3ro5j0_4f
+

+ 300 - 0
PLUGIN_ENHANCEMENTS.md

@@ -0,0 +1,300 @@
+# JS Implementation Features to Port to Python ArchiveBox
+
+## Priority: High Impact Features
+
+### 1. **Screen Recording** ⭐⭐⭐
+**JS Implementation:** Captures MP4 video + animated GIF of the archiving session
+```javascript
+// Records browser activity including scrolling, interactions
+PuppeteerScreenRecorder → screenrecording.mp4
+ffmpeg conversion → screenrecording.gif (first 10s, optimized)
+```
+
+**Enhancement for Python:**
+- Add `on_Snapshot__24_screenrecording.py`
+- Use puppeteer or playwright screen recording APIs
+- Generate both full MP4 and thumbnail GIF
+- **Value:** Visual proof of what was captured, useful for QA and debugging
+
+### 2. **AI Quality Assurance** ⭐⭐⭐
+**JS Implementation:** Uses GPT-4o to analyze screenshots and validate archive quality
+```javascript
+// ai_qa.py analyzes screenshot.png and returns:
+{
+  "pct_visible": 85,
+  "warnings": ["Some content may be cut off"],
+  "main_content_title": "Article Title",
+  "main_content_author": "Author Name",
+  "main_content_date": "2024-01-15",
+  "website_brand_name": "Example.com"
+}
+```
+
+**Enhancement for Python:**
+- Add `on_Snapshot__95_aiqa.py` (runs after screenshot)
+- Integrate with OpenAI API or local vision models
+- Validates: content visibility, broken layouts, CAPTCHA blocks, error pages
+- **Value:** Automatic detection of failed archives, quality scoring
+
+### 3. **Network Response Archiving** ⭐⭐⭐
+**JS Implementation:** Saves ALL network responses in organized structure
+```
+responses/
+├── all/                          # Timestamped unique files
+│   ├── 20240101120000__GET__https%3A%2F%2Fexample.com%2Fapi.json
+│   └── ...
+├── script/                       # Organized by resource type
+│   └── example.com/path/to/script.js → ../all/...
+├── stylesheet/
+├── image/
+├── media/
+└── index.jsonl                   # Searchable index
+```
+
+**Enhancement for Python:**
+- Add `on_Snapshot__23_responses.py`
+- Save all HTTP responses (XHR, images, scripts, etc.)
+- Create both timestamped and URL-organized views via symlinks
+- Generate `index.jsonl` with metadata (URL, method, status, mimeType, sha256)
+- **Value:** Complete HTTP-level archive, better debugging, API response preservation
+
+### 4. **Detailed Metadata Extractors** ⭐⭐
+
+#### 4a. SSL/TLS Details (`on_Snapshot__16_ssl.py`)
+```python
+{
+  "protocol": "TLS 1.3",
+  "cipher": "AES_128_GCM",
+  "securityState": "secure",
+  "securityDetails": {
+    "issuer": "Let's Encrypt",
+    "validFrom": ...,
+    "validTo": ...
+  }
+}
+```
+
+#### 4b. SEO Metadata (`on_Snapshot__17_seo.py`)
+Extracts all `<meta>` tags:
+```python
+{
+  "og:title": "Page Title",
+  "og:image": "https://example.com/image.jpg",
+  "twitter:card": "summary_large_image",
+  "description": "Page description",
+  ...
+}
+```
+
+#### 4c. Accessibility Tree (`on_Snapshot__18_accessibility.py`)
+```python
+{
+  "headings": ["# Main Title", "## Section 1", ...],
+  "iframes": ["https://embed.example.com/..."],
+  "tree": { ... }  # Full accessibility snapshot
+}
+```
+
+#### 4d. Outlinks Categorization (`on_Snapshot__19_outlinks.py`)
+Better than current implementation - categorizes by type:
+```python
+{
+  "hrefs": [...],           # All <a> links
+  "images": [...],          # <img src>
+  "css_stylesheets": [...], # <link rel=stylesheet>
+  "js_scripts": [...],      # <script src>
+  "iframes": [...],         # <iframe src>
+  "css_images": [...],      # background-image: url()
+  "links": [{...}]          # <link> tags (rel, href)
+}
+```
+
+#### 4e. Redirects Chain (`on_Snapshot__15_redirects.py`)
+Tracks full redirect sequence:
+```python
+{
+  "redirects_from_http": [
+    {"url": "http://ex.com", "status": 301, "isMainFrame": True},
+    {"url": "https://ex.com", "status": 302, "isMainFrame": True},
+    {"url": "https://www.ex.com", "status": 200, "isMainFrame": True}
+  ]
+}
+```
+
+**Value:** Rich metadata for research, SEO analysis, security auditing
+
+### 5. **Enhanced Screenshot System** ⭐⭐
+**JS Implementation:**
+- `screenshot.png` - Full-page PNG at high resolution (4:3 ratio)
+- `screenshot.jpg` - Compressed JPEG for thumbnails (1440x1080, 90% quality)
+- Automatically crops to reasonable height for long pages
+
+**Enhancement for Python:**
+- Update `screenshot` extractor to generate both formats
+- Use aspect ratio optimization (4:3 is better for thumbnails than 16:9)
+- **Value:** Faster loading thumbnails, better storage efficiency
+
+### 6. **Console Log Capture** ⭐⭐
+**JS Implementation:**
+```
+console.log - Captures all console output
+  ERROR /path/to/script.js:123 "Uncaught TypeError: ..."
+  WARNING https://example.com/api Failed to load resource: net::ERR_BLOCKED_BY_CLIENT
+```
+
+**Enhancement for Python:**
+- Add `on_Snapshot__20_consolelog.py`
+- Useful for debugging JavaScript errors, tracking blocked resources
+- **Value:** Identifies rendering issues, ad blockers, CORS problems
+
+## Priority: Nice-to-Have Enhancements
+
+### 7. **Request/Response Headers** ⭐
+**Current:** Headers extractor exists but could be enhanced
+**JS Enhancement:** Separates request vs response, includes extra headers
+
+### 8. **Human Behavior Emulation** ⭐
+**JS Implementation:**
+- Mouse jiggling with ghost-cursor
+- Smart scrolling with infinite scroll detection
+- Comment expansion (Reddit, HackerNews, etc.)
+- Form submission
+- CAPTCHA solving via 2captcha extension
+
+**Enhancement for Python:**
+- Add `on_Snapshot__05_human_behavior.py` (runs BEFORE other extractors)
+- Implement scrolling, clicking "Load More", expanding comments
+- **Value:** Captures more content from dynamic sites
+
+### 9. **CAPTCHA Solving** ⭐
+**JS Implementation:** Integrates 2captcha extension
+**Enhancement:** Add optional CAPTCHA solving via 2captcha API
+**Value:** Access to Cloudflare-protected sites
+
+### 10. **Source Map Downloading**
+**JS Implementation:** Automatically downloads `.map` files for JS/CSS
+**Enhancement:** Add `on_Snapshot__30_sourcemaps.py`
+**Value:** Helps debug minified code
+
+### 11. **Pandoc Markdown Conversion**
+**JS Implementation:** Converts HTML ↔ Markdown using Pandoc
+```bash
+pandoc --from html --to markdown_github --wrap=none
+```
+**Enhancement:** Add `on_Snapshot__34_pandoc.py`
+**Value:** Human-readable Markdown format
+
+### 12. **Authentication Management** ⭐
+**JS Implementation:**
+- Sophisticated cookie storage with `cookies.txt` export
+- LocalStorage + SessionStorage preservation
+- Merge new cookies with existing ones (no overwrites)
+
+**Enhancement:**
+- Improve `auth.json` management to match JS sophistication
+- Add `cookies.txt` export (Netscape format) for compatibility with wget/curl
+- **Value:** Better session persistence across runs
+
+### 13. **File Integrity & Versioning** ⭐⭐
+**JS Implementation:**
+- SHA256 hash for every file
+- Merkle tree directory hashes
+- Version directories (`versions/YYYYMMDDHHMMSS/`)
+- Symlinks to latest versions
+- `.files.json` manifest with metadata
+
+**Enhancement:**
+- Add `on_Snapshot__99_integrity.py` (runs last)
+- Generate SHA256 hashes for all outputs
+- Create version manifests
+- **Value:** Verify archive integrity, detect corruption, track changes
+
+### 14. **Directory Organization**
+**JS Structure (superior):**
+```
+archive/<timestamp>/
+├── versions/
+│   ├── 20240101120000/         # Each run = new version
+│   │   ├── screenshot.png
+│   │   ├── singlefile.html
+│   │   └── ...
+│   └── 20240102150000/
+├── screenshot.png → versions/20240102150000/screenshot.png  # Symlink to latest
+├── singlefile.html → ...
+└── metrics.json
+```
+
+**Current Python:** All outputs in flat structure
+**Enhancement:** Add versioning layer for tracking changes over time
+
+### 15. **Speedtest Integration**
+**JS Implementation:** Runs fast.com speedtest once per day
+**Enhancement:** Optional `on_Snapshot__01_speedtest.py`
+**Value:** Diagnose slow archives, track connection quality
+
+### 16. **gallery-dl Support** ⭐
+**JS Implementation:** Downloads photo galleries (Instagram, Twitter, etc.)
+**Enhancement:** Add `on_Snapshot__30_photos.py` alongside existing `media` extractor
+**Value:** Better support for image-heavy sites
+
+## Implementation Priority Ranking
+
+### Must-Have (High ROI):
+1. **Network Response Archiving** - Complete HTTP archive
+2. **AI Quality Assurance** - Automatic validation
+3. **Screen Recording** - Visual proof of capture
+4. **Enhanced Metadata** (SSL, SEO, Accessibility, Outlinks) - Research value
+
+### Should-Have (Medium ROI):
+5. **Console Log Capture** - Debugging aid
+6. **File Integrity Hashing** - Archive verification
+7. **Enhanced Screenshots** - Better thumbnails
+8. **Versioning System** - Track changes over time
+
+### Nice-to-Have (Lower ROI):
+9. **Human Behavior Emulation** - Dynamic content
+10. **CAPTCHA Solving** - Access restricted sites
+11. **gallery-dl** - Image collections
+12. **Pandoc Markdown** - Readable format
+
+## Technical Considerations
+
+### Dependencies Needed:
+- **Screen Recording:** `playwright` or `puppeteer` with recording API
+- **AI QA:** `openai` Python SDK or local vision model
+- **Network Archiving:** CDP protocol access (already have via Chrome)
+- **File Hashing:** Built-in `hashlib` (no new deps)
+- **gallery-dl:** Install via pip
+
+### Performance Impact:
+- Screen recording: +2-3 seconds overhead per snapshot
+- AI QA: +0.5-2 seconds (API call) per snapshot
+- Response archiving: Minimal (async writes)
+- File hashing: +0.1-0.5 seconds per snapshot
+- Metadata extraction: Minimal (same page visit)
+
+### Architecture Compatibility:
+All proposed enhancements fit the existing hook-based plugin architecture:
+- Use standard `on_Snapshot__NN_name.py` naming
+- Return `ExtractorResult` objects
+- Can reuse shared Chrome CDP sessions
+- Follow existing error handling patterns
+
+## Summary Statistics
+
+**JS Implementation:**
+- 35+ output types
+- ~3000 lines of archiving logic
+- Extensive quality assurance
+- Complete HTTP-level capture
+
+**Current Python Implementation:**
+- 12 extractors
+- Strong foundation with room for enhancement
+
+**Recommended Additions:**
+- **8 new high-priority extractors**
+- **6 enhanced versions of existing extractors**
+- **3 optional nice-to-have extractors**
+
+This would bring the Python implementation to feature parity with the JS version while maintaining better code organization and the existing plugin architecture.

+ 819 - 0
SIMPLIFICATION_PLAN.md

@@ -0,0 +1,819 @@
+# ArchiveBox 2025 Simplification Plan
+
+**Status:** FINAL - Ready for implementation
+**Last Updated:** 2024-12-24
+
+---
+
+## Final Decisions Summary
+
+| Decision | Choice |
+|----------|--------|
+| Task Queue | Keep `retry_at` polling pattern (no Django Tasks) |
+| State Machine | Preserve current semantics; only replace mixins/statemachines if identical retry/lock guarantees are kept |
+| Event Model | Remove completely |
+| ABX Plugin System | Remove entirely (`archivebox/pkgs/`) |
+| abx-pkg | Keep as external pip dependency (separate repo: github.com/ArchiveBox/abx-pkg) |
+| Binary Providers | File-based plugins using abx-pkg internally |
+| Search Backends | **Hybrid:** hooks for indexing, Python classes for querying |
+| Auth Methods | Keep simple (LDAP + normal), no pluginization needed |
+| ABID | Already removed (ignore old references) |
+| ArchiveResult | **Keep pre-creation** with `status=queued` + `retry_at` for consistency |
+| Plugin Directory | **`archivebox/plugins/*`** for built-ins, **`data/plugins/*`** for user hooks (flat `on_*__*.*` files) |
+| Locking | Use `retry_at` consistently across Crawl, Snapshot, ArchiveResult |
+| Worker Model | **Separate processes** per model type + per extractor, visible in htop |
+| Concurrency | **Per-extractor configurable** (e.g., `ytdlp_max_parallel=5`) |
+| InstalledBinary | **Keep model** + add Dependency model for audit trail |
+
+---
+
+## Architecture Overview
+
+### Consistent Queue/Lock Pattern
+
+All models (Crawl, Snapshot, ArchiveResult) use the same pattern:
+
+```python
+class StatusMixin(models.Model):
+    status = models.CharField(max_length=15, db_index=True)
+    retry_at = models.DateTimeField(default=timezone.now, null=True, db_index=True)
+
+    class Meta:
+        abstract = True
+
+    def tick(self) -> bool:
+        """Override in subclass. Returns True if state changed."""
+        raise NotImplementedError
+
+# Worker query (same for all models):
+Model.objects.filter(
+    status__in=['queued', 'started'],
+    retry_at__lte=timezone.now()
+).order_by('retry_at').first()
+
+# Claim (atomic via optimistic locking):
+updated = Model.objects.filter(
+    id=obj.id,
+    retry_at=obj.retry_at
+).update(
+    retry_at=timezone.now() + timedelta(seconds=60)
+)
+if updated == 1:  # Successfully claimed
+    obj.refresh_from_db()
+    obj.tick()
+```
+
+**Failure/cleanup guarantees**
+- Objects stuck in `started` with a past `retry_at` must be reclaimed automatically using the existing retry/backoff rules.
+- `tick()` implementations must continue to bump `retry_at` / transition to `backoff` the same way current statemachines do so that failures get retried without manual intervention.
+
+### Process Tree (Separate Processes, Visible in htop)
+
+```
+archivebox server
+├── orchestrator (pid=1000)
+│   ├── crawl_worker_0 (pid=1001)
+│   ├── crawl_worker_1 (pid=1002)
+│   ├── snapshot_worker_0 (pid=1003)
+│   ├── snapshot_worker_1 (pid=1004)
+│   ├── snapshot_worker_2 (pid=1005)
+│   ├── wget_worker_0 (pid=1006)
+│   ├── wget_worker_1 (pid=1007)
+│   ├── ytdlp_worker_0 (pid=1008)      # Limited concurrency
+│   ├── ytdlp_worker_1 (pid=1009)
+│   ├── screenshot_worker_0 (pid=1010)
+│   ├── screenshot_worker_1 (pid=1011)
+│   ├── screenshot_worker_2 (pid=1012)
+│   └── ...
+```
+
+**Configurable per-extractor concurrency:**
+```python
+# archivebox.conf or environment
+WORKER_CONCURRENCY = {
+    'crawl': 2,
+    'snapshot': 3,
+    'wget': 2,
+    'ytdlp': 2,           # Bandwidth-limited
+    'screenshot': 3,
+    'singlefile': 2,
+    'title': 5,           # Fast, can run many
+    'favicon': 5,
+}
+```
+
+---
+
+## Hook System
+
+### Discovery (Glob at Startup)
+
+```python
+# archivebox/hooks.py
+from pathlib import Path
+import subprocess
+import os
+import json
+from django.conf import settings
+
+BUILTIN_PLUGIN_DIR = Path(__file__).parent.parent / 'plugins'
+USER_PLUGIN_DIR = settings.DATA_DIR / 'plugins'
+
+def discover_hooks(event_name: str) -> list[Path]:
+    """Find all scripts matching on_{EventName}__*.{sh,py,js} under archivebox/plugins/* and data/plugins/*"""
+    hooks = []
+    for base in (BUILTIN_PLUGIN_DIR, USER_PLUGIN_DIR):
+        if not base.exists():
+            continue
+        for ext in ('sh', 'py', 'js'):
+            hooks.extend(base.glob(f'*/on_{event_name}__*.{ext}'))
+    return sorted(hooks)
+
+def run_hook(script: Path, output_dir: Path, **kwargs) -> dict:
+    """Execute hook with --key=value args, cwd=output_dir."""
+    args = [str(script)]
+    for key, value in kwargs.items():
+        args.append(f'--{key.replace("_", "-")}={json.dumps(value, default=str)}')
+
+    env = os.environ.copy()
+    env['ARCHIVEBOX_DATA_DIR'] = str(settings.DATA_DIR)
+
+    result = subprocess.run(
+        args,
+        cwd=output_dir,
+        capture_output=True,
+        text=True,
+        timeout=300,
+        env=env,
+    )
+    return {
+        'returncode': result.returncode,
+        'stdout': result.stdout,
+        'stderr': result.stderr,
+    }
+```
+
+### Hook Interface
+
+- **Input:** CLI args `--url=... --snapshot-id=...`
+- **Location:** Built-in hooks in `archivebox/plugins/<plugin>/on_*__*.*`, user hooks in `data/plugins/<plugin>/on_*__*.*`
+- **Internal API:** Should treat ArchiveBox as an external CLI—call `archivebox config --get ...`, `archivebox find ...`, import `abx-pkg` only when running in their own venvs.
+- **Output:** Files written to `$PWD` (the output_dir), can call `archivebox create ...`
+- **Logging:** stdout/stderr captured to ArchiveResult
+- **Exit code:** 0 = success, non-zero = failure
+
+---
+
+## Unified Config Access
+
+- Implement `archivebox.config.get_config(scope='global'|'crawl'|'snapshot'|...)` that merges defaults, config files, environment variables, DB overrides, and per-object config (seed/crawl/snapshot).
+- Provide helpers (`get_config()`, `get_flat_config()`) for Python callers so `abx.pm.hook.get_CONFIG*` can be removed.
+- Ensure the CLI command `archivebox config --get KEY` (and a machine-readable `--format=json`) uses the same API so hook scripts can query config via subprocess calls.
+- Document that plugin hooks should prefer the CLI to fetch config rather than importing Django internals, guaranteeing they work from shell/bash/js without ArchiveBox’s runtime.
+
+---
+
+### Example Extractor Hooks
+
+**Bash:**
+```bash
+#!/usr/bin/env bash
+# plugins/on_Snapshot__wget.sh
+set -e
+
+# Parse args
+for arg in "$@"; do
+    case $arg in
+        --url=*) URL="${arg#*=}" ;;
+        --snapshot-id=*) SNAPSHOT_ID="${arg#*=}" ;;
+    esac
+done
+
+# Find wget binary
+WGET=$(archivebox find InstalledBinary --name=wget --format=abspath)
+[ -z "$WGET" ] && echo "wget not found" >&2 && exit 1
+
+# Run extraction (writes to $PWD)
+$WGET --mirror --page-requisites --adjust-extension "$URL" 2>&1
+
+echo "Completed wget mirror of $URL"
+```
+
+**Python:**
+```python
+#!/usr/bin/env python3
+# plugins/on_Snapshot__singlefile.py
+import argparse
+import subprocess
+import sys
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--url', required=True)
+    parser.add_argument('--snapshot-id', required=True)
+    args = parser.parse_args()
+
+    # Find binary via CLI
+    result = subprocess.run(
+        ['archivebox', 'find', 'InstalledBinary', '--name=single-file', '--format=abspath'],
+        capture_output=True, text=True
+    )
+    bin_path = result.stdout.strip()
+    if not bin_path:
+        print("single-file not installed", file=sys.stderr)
+        sys.exit(1)
+
+    # Run extraction (writes to $PWD)
+    subprocess.run([bin_path, args.url, '--output', 'singlefile.html'], check=True)
+    print(f"Saved {args.url} to singlefile.html")
+
+if __name__ == '__main__':
+    main()
+```
+
+---
+
+## Binary Providers & Dependencies
+
+- Move dependency tracking into a dedicated `dependencies` module (or extend `archivebox/machine/`) with two Django models:
+
+```yaml
+Dependency:
+    id: uuidv7
+    bin_name: extractor binary executable name (ytdlp|wget|screenshot|...)
+    bin_provider: apt | brew | pip | npm | gem | nix | '*' for any
+    custom_cmds: JSON of provider->install command overrides (optional)
+    config: JSON of env vars/settings to apply during install
+    created_at: utc datetime
+
+InstalledBinary:
+    id: uuidv7
+    dependency: FK to Dependency
+    bin_name: executable name again
+    bin_abspath: filesystem path
+    bin_version: semver string
+    bin_hash: sha256 of the binary
+    bin_provider: apt | brew | pip | npm | gem | nix | custom | ...
+    created_at: utc datetime (last seen/installed)
+    is_valid: property returning True when both abspath+version are set
+```
+
+- Provide CLI commands for hook scripts: `archivebox find InstalledBinary --name=wget --format=abspath`, `archivebox dependency create ...`, etc.
+- Hooks remain language agnostic and should not import ArchiveBox Django modules; they rely on CLI commands plus their own runtime (python/bash/js).
+
+### Provider Hooks
+
+- Built-in provider plugins live under `archivebox/plugins/<provider>/on_Dependency__*.py` (e.g., apt, brew, pip, custom).
+- Each provider hook:
+    1. Checks if the Dependency allows that provider via `bin_provider` or wildcard `'*'`.
+    2. Builds the install command (`custom_cmds[provider]` override or sane default like `apt install -y <bin_name>`).
+    3. Executes the command (bash/python) and, on success, records/updates an `InstalledBinary`.
+
+Example outline (bash or python, but still interacting via CLI):
+
+```bash
+# archivebox/plugins/apt/on_Dependency__install_using_apt_provider.sh
+set -euo pipefail
+
+DEP_JSON=$(archivebox dependency show --id="$DEPENDENCY_ID" --format=json)
+BIN_NAME=$(echo "$DEP_JSON" | jq -r '.bin_name')
+PROVIDER_ALLOWED=$(echo "$DEP_JSON" | jq -r '.bin_provider')
+
+if [[ "$PROVIDER_ALLOWED" == "*" || "$PROVIDER_ALLOWED" == *"apt"* ]]; then
+    INSTALL_CMD=$(echo "$DEP_JSON" | jq -r '.custom_cmds.apt // empty')
+    INSTALL_CMD=${INSTALL_CMD:-"apt install -y --no-install-recommends $BIN_NAME"}
+    bash -lc "$INSTALL_CMD"
+
+    archivebox dependency register-installed \
+        --dependency-id="$DEPENDENCY_ID" \
+        --bin-provider=apt \
+        --bin-abspath="$(command -v "$BIN_NAME")" \
+        --bin-version="$("$(command -v "$BIN_NAME")" --version | head -n1)" \
+        --bin-hash="$(sha256sum "$(command -v "$BIN_NAME")" | cut -d' ' -f1)"
+fi
+```
+
+- Extractor-level hooks (e.g., `archivebox/plugins/wget/on_Crawl__install_wget_extractor_if_needed.*`) ensure dependencies exist before starting work by creating/updating `Dependency` records (via CLI) and then invoking provider hooks.
+- Remove all reliance on `abx.pm.hook.binary_load` / ABX plugin packages; `abx-pkg` can remain as a normal pip dependency that hooks import if useful.
+
+---
+
+## Search Backends (Hybrid)
+
+### Indexing: Hook Scripts
+
+Triggered when ArchiveResult completes successfully (from the Django side we simply fire the event; indexing logic lives in standalone hook scripts):
+
+```python
+#!/usr/bin/env python3
+# plugins/on_ArchiveResult__index_sqlitefts.py
+import argparse
+import sqlite3
+import os
+from pathlib import Path
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--snapshot-id', required=True)
+    parser.add_argument('--extractor', required=True)
+    args = parser.parse_args()
+
+    # Read text content from output files
+    content = ""
+    for f in Path.cwd().rglob('*.txt'):
+        content += f.read_text(errors='ignore') + "\n"
+    for f in Path.cwd().rglob('*.html'):
+        content += strip_html(f.read_text(errors='ignore')) + "\n"
+
+    if not content.strip():
+        return
+
+    # Add to FTS index
+    db = sqlite3.connect(os.environ['ARCHIVEBOX_DATA_DIR'] + '/search.sqlite3')
+    db.execute('CREATE VIRTUAL TABLE IF NOT EXISTS fts USING fts5(snapshot_id, content)')
+    db.execute('INSERT OR REPLACE INTO fts VALUES (?, ?)', (args.snapshot_id, content))
+    db.commit()
+
+if __name__ == '__main__':
+    main()
+```
+
+### Querying: CLI-backed Python Classes
+
+```python
+# archivebox/search/backends/sqlitefts.py
+import subprocess
+import json
+
+class SQLiteFTSBackend:
+    name = 'sqlitefts'
+
+    def search(self, query: str, limit: int = 50) -> list[str]:
+        """Call plugins/on_Search__query_sqlitefts.* and parse stdout."""
+        result = subprocess.run(
+            ['archivebox', 'search-backend', '--backend', self.name, '--query', query, '--limit', str(limit)],
+            capture_output=True,
+            check=True,
+            text=True,
+        )
+        return json.loads(result.stdout or '[]')
+
+
+# archivebox/search/__init__.py
+from django.conf import settings
+
+def get_backend():
+    name = getattr(settings, 'SEARCH_BACKEND', 'sqlitefts')
+    if name == 'sqlitefts':
+        from .backends.sqlitefts import SQLiteFTSBackend
+        return SQLiteFTSBackend()
+    elif name == 'sonic':
+        from .backends.sonic import SonicBackend
+        return SonicBackend()
+    raise ValueError(f'Unknown search backend: {name}')
+
+def search(query: str) -> list[str]:
+    return get_backend().search(query)
+```
+
+- Each backend script lives under `archivebox/plugins/search/on_Search__query_<backend>.py` (with user overrides in `data/plugins/...`) and outputs JSON list of snapshot IDs. Python wrappers simply invoke the CLI to keep Django isolated from backend implementations.
+
+---
+
+## Simplified Models
+
+> Goal: reduce line count without sacrificing the correctness guarantees we currently get from `ModelWithStateMachine` + python-statemachine. We keep the mixins/statemachines unless we can prove a smaller implementation enforces the same transitions/retry locking.
+
+### Snapshot
+
+```python
+class Snapshot(models.Model):
+    id = models.UUIDField(primary_key=True, default=uuid7)
+    url = models.URLField(unique=True, db_index=True)
+    timestamp = models.CharField(max_length=32, unique=True, db_index=True)
+    title = models.CharField(max_length=512, null=True, blank=True)
+
+    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE)
+    created_at = models.DateTimeField(default=timezone.now)
+    modified_at = models.DateTimeField(auto_now=True)
+
+    crawl = models.ForeignKey('crawls.Crawl', on_delete=models.CASCADE, null=True)
+    tags = models.ManyToManyField('Tag', through='SnapshotTag')
+
+    # Status (consistent with Crawl, ArchiveResult)
+    status = models.CharField(max_length=15, default='queued', db_index=True)
+    retry_at = models.DateTimeField(default=timezone.now, null=True, db_index=True)
+
+    # Inline fields (no mixins)
+    config = models.JSONField(default=dict)
+    notes = models.TextField(blank=True, default='')
+
+    FINAL_STATES = ['sealed']
+
+    @property
+    def output_dir(self) -> Path:
+        return settings.ARCHIVE_DIR / self.timestamp
+
+    def tick(self) -> bool:
+        if self.status == 'queued' and self.can_start():
+            self.start()
+            return True
+        elif self.status == 'started' and self.is_finished():
+            self.seal()
+            return True
+        return False
+
+    def can_start(self) -> bool:
+        return bool(self.url)
+
+    def is_finished(self) -> bool:
+        results = self.archiveresult_set.all()
+        if not results.exists():
+            return False
+        return not results.filter(status__in=['queued', 'started', 'backoff']).exists()
+
+    def start(self):
+        self.status = 'started'
+        self.retry_at = timezone.now() + timedelta(seconds=10)
+        self.output_dir.mkdir(parents=True, exist_ok=True)
+        self.save()
+        self.create_pending_archiveresults()
+
+    def seal(self):
+        self.status = 'sealed'
+        self.retry_at = None
+        self.save()
+
+    def create_pending_archiveresults(self):
+        for extractor in get_config(defaults=settings, crawl=self.crawl, snapshot=self).ENABLED_EXTRACTORS:
+            ArchiveResult.objects.get_or_create(
+                snapshot=self,
+                extractor=extractor,
+                defaults={
+                    'status': 'queued',
+                    'retry_at': timezone.now(),
+                    'created_by': self.created_by,
+                }
+            )
+```
+
+### ArchiveResult
+
+```python
+class ArchiveResult(models.Model):
+    id = models.UUIDField(primary_key=True, default=uuid7)
+    snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE)
+    extractor = models.CharField(max_length=32, db_index=True)
+
+    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE)
+    created_at = models.DateTimeField(default=timezone.now)
+    modified_at = models.DateTimeField(auto_now=True)
+
+    # Status
+    status = models.CharField(max_length=15, default='queued', db_index=True)
+    retry_at = models.DateTimeField(default=timezone.now, null=True, db_index=True)
+
+    # Execution
+    start_ts = models.DateTimeField(null=True)
+    end_ts = models.DateTimeField(null=True)
+    output = models.CharField(max_length=1024, null=True)
+    cmd = models.JSONField(null=True)
+    pwd = models.CharField(max_length=256, null=True)
+
+    # Audit trail
+    machine = models.ForeignKey('machine.Machine', on_delete=models.SET_NULL, null=True)
+    iface = models.ForeignKey('machine.NetworkInterface', on_delete=models.SET_NULL, null=True)
+    installed_binary = models.ForeignKey('machine.InstalledBinary', on_delete=models.SET_NULL, null=True)
+
+    FINAL_STATES = ['succeeded', 'failed']
+
+    class Meta:
+        unique_together = ('snapshot', 'extractor')
+
+    @property
+    def output_dir(self) -> Path:
+        return self.snapshot.output_dir / self.extractor
+
+    def tick(self) -> bool:
+        if self.status == 'queued' and self.can_start():
+            self.start()
+            return True
+        elif self.status == 'backoff' and self.can_retry():
+            self.status = 'queued'
+            self.retry_at = timezone.now()
+            self.save()
+            return True
+        return False
+
+    def can_start(self) -> bool:
+        return bool(self.snapshot.url)
+
+    def can_retry(self) -> bool:
+        return self.retry_at and self.retry_at <= timezone.now()
+
+    def start(self):
+        self.status = 'started'
+        self.start_ts = timezone.now()
+        self.retry_at = timezone.now() + timedelta(seconds=120)
+        self.output_dir.mkdir(parents=True, exist_ok=True)
+        self.save()
+
+        # Run hook and complete
+        self.run_extractor_hook()
+
+    def run_extractor_hook(self):
+        from archivebox.hooks import discover_hooks, run_hook
+
+        hooks = discover_hooks(f'Snapshot__{self.extractor}')
+        if not hooks:
+            self.status = 'failed'
+            self.output = f'No hook for: {self.extractor}'
+            self.end_ts = timezone.now()
+            self.retry_at = None
+            self.save()
+            return
+
+        result = run_hook(
+            hooks[0],
+            output_dir=self.output_dir,
+            url=self.snapshot.url,
+            snapshot_id=str(self.snapshot.id),
+        )
+
+        self.status = 'succeeded' if result['returncode'] == 0 else 'failed'
+        self.output = result['stdout'][:1024] or result['stderr'][:1024]
+        self.end_ts = timezone.now()
+        self.retry_at = None
+        self.save()
+
+        # Trigger search indexing if succeeded
+        if self.status == 'succeeded':
+            self.trigger_search_indexing()
+
+    def trigger_search_indexing(self):
+        from archivebox.hooks import discover_hooks, run_hook
+        for hook in discover_hooks('ArchiveResult__index'):
+            run_hook(hook, output_dir=self.output_dir,
+                     snapshot_id=str(self.snapshot.id),
+                     extractor=self.extractor)
+```
+
+- `ArchiveResult` must continue storing execution metadata (`cmd`, `pwd`, `machine`, `iface`, `installed_binary`, timestamps) exactly as before, even though the extractor now runs via hook scripts. `run_extractor_hook()` is responsible for capturing those values (e.g., wrapping subprocess calls).
+- Any refactor of `Snapshot`, `ArchiveResult`, or `Crawl` has to keep the same `FINAL_STATES`, `retry_at` semantics, and tag/output directory handling that `ModelWithStateMachine` currently provides.
+
+---
+
+## Simplified Worker System
+
+```python
+# archivebox/workers/orchestrator.py
+import os
+import time
+import multiprocessing
+from datetime import timedelta
+from django.utils import timezone
+from django.conf import settings
+
+
+class Worker:
+    """Base worker for processing queued objects."""
+    Model = None
+    name = 'worker'
+
+    def get_queue(self):
+        return self.Model.objects.filter(
+            retry_at__lte=timezone.now()
+        ).exclude(
+            status__in=self.Model.FINAL_STATES
+        ).order_by('retry_at')
+
+    def claim(self, obj) -> bool:
+        """Atomic claim via optimistic lock."""
+        updated = self.Model.objects.filter(
+            id=obj.id,
+            retry_at=obj.retry_at
+        ).update(retry_at=timezone.now() + timedelta(seconds=60))
+        return updated == 1
+
+    def run(self):
+        print(f'[{self.name}] Started pid={os.getpid()}')
+        while True:
+            obj = self.get_queue().first()
+            if obj and self.claim(obj):
+                try:
+                    obj.refresh_from_db()
+                    obj.tick()
+                except Exception as e:
+                    print(f'[{self.name}] Error: {e}')
+                    obj.retry_at = timezone.now() + timedelta(seconds=60)
+                    obj.save(update_fields=['retry_at'])
+            else:
+                time.sleep(0.5)
+
+
+class CrawlWorker(Worker):
+    from crawls.models import Crawl
+    Model = Crawl
+    name = 'crawl'
+
+
+class SnapshotWorker(Worker):
+    from core.models import Snapshot
+    Model = Snapshot
+    name = 'snapshot'
+
+
+class ExtractorWorker(Worker):
+    """Worker for a specific extractor."""
+    from core.models import ArchiveResult
+    Model = ArchiveResult
+
+    def __init__(self, extractor: str):
+        self.extractor = extractor
+        self.name = extractor
+
+    def get_queue(self):
+        return super().get_queue().filter(extractor=self.extractor)
+
+
+class Orchestrator:
+    def __init__(self):
+        self.processes = []
+
+    def spawn(self):
+        config = settings.WORKER_CONCURRENCY
+
+        for i in range(config.get('crawl', 2)):
+            self._spawn(CrawlWorker, f'crawl_{i}')
+
+        for i in range(config.get('snapshot', 3)):
+            self._spawn(SnapshotWorker, f'snapshot_{i}')
+
+        for extractor, count in config.items():
+            if extractor in ('crawl', 'snapshot'):
+                continue
+            for i in range(count):
+                self._spawn(ExtractorWorker, f'{extractor}_{i}', extractor)
+
+    def _spawn(self, cls, name, *args):
+        worker = cls(*args) if args else cls()
+        worker.name = name
+        p = multiprocessing.Process(target=worker.run, name=name)
+        p.start()
+        self.processes.append(p)
+
+    def run(self):
+        print(f'Orchestrator pid={os.getpid()}')
+        self.spawn()
+        try:
+            while True:
+                for p in self.processes:
+                    if not p.is_alive():
+                        print(f'{p.name} died, restarting...')
+                        # Respawn logic
+                time.sleep(5)
+        except KeyboardInterrupt:
+            for p in self.processes:
+                p.terminate()
+```
+
+---
+
+## Directory Structure
+
+```
+archivebox-nue/
+├── archivebox/
+│   ├── __init__.py
+│   ├── config.py                    # Simple env-based config
+│   ├── hooks.py                     # Hook discovery + execution
+│   │
+│   ├── core/
+│   │   ├── models.py                # Snapshot, ArchiveResult, Tag
+│   │   ├── admin.py
+│   │   └── views.py
+│   │
+│   ├── crawls/
+│   │   ├── models.py                # Crawl, Seed, CrawlSchedule, Outlink
+│   │   └── admin.py
+│   │
+│   ├── machine/
+│   │   ├── models.py                # Machine, NetworkInterface, Dependency, InstalledBinary
+│   │   └── admin.py
+│   │
+│   ├── workers/
+│   │   └── orchestrator.py          # ~150 lines
+│   │
+│   ├── api/
+│   │   └── ...
+│   │
+│   ├── cli/
+│   │   └── ...
+│   │
+│   ├── search/
+│   │   ├── __init__.py
+│   │   └── backends/
+│   │       ├── sqlitefts.py
+│   │       └── sonic.py
+│   │
+│   ├── index/
+│   ├── parsers/
+│   ├── misc/
+│   └── templates/
+│
+-├── plugins/                         # Built-in hooks (ArchiveBox never imports these directly)
+│   ├── wget/
+│   │   └── on_Snapshot__wget.sh
+│   ├── dependencies/
+│   │   ├── on_Dependency__install_using_apt_provider.sh
+│   │   └── on_Dependency__install_using_custom_bash.py
+│   ├── search/
+│   │   ├── on_ArchiveResult__index_sqlitefts.py
+│   │   └── on_Search__query_sqlitefts.py
+│   └── ...
+├── data/
+│   └── plugins/                     # User-provided hooks mirror builtin layout
+└── pyproject.toml
+```
+
+---
+
+## Implementation Phases
+
+### Phase 1: Build Unified Config + Hook Scaffold
+
+1. Implement `archivebox.config.get_config()` + CLI plumbing (`archivebox config --get ... --format=json`) without touching abx yet.
+2. Add `archivebox/hooks.py` with dual plugin directories (`archivebox/plugins`, `data/plugins`), discovery, and execution helpers.
+3. Keep the existing ABX/worker system running while new APIs land; surface warnings where `abx.pm.*` is still in use.
+
+### Phase 2: Gradual ABX Removal
+
+1. Rename `archivebox/pkgs/` to `archivebox/pkgs.unused/` and start deleting packages once equivalent hook scripts exist.
+2. Remove `pluggy`, `python-statemachine`, and all `abx-*` dependencies/workspace entries from `pyproject.toml` only after consumers are migrated.
+3. Replace every `abx.pm.hook.get_*` usage in CLI/config/search/extractors with the new config + hook APIs.
+
+### Phase 3: Worker + State Machine Simplification
+
+1. Introduce the process-per-model orchestrator while preserving `ModelWithStateMachine` semantics (Snapshot/Crawl/ArchiveResult).
+2. Only drop mixins/statemachine dependency after verifying the new `tick()` implementations keep retries/backoff/final states identical.
+3. Ensure Huey/task entry points either delegate to the new orchestrator or are retired cleanly so background work isn’t double-run.
+
+### Phase 4: Hook-Based Extractors & Dependencies
+
+1. Create builtin extractor hooks in `archivebox/plugins/*/on_Snapshot__*.{sh,py,js}`; have `ArchiveResult.run_extractor_hook()` capture cmd/pwd/machine/install metadata.
+2. Implement the new `Dependency`/`InstalledBinary` models + CLI commands, and port provider/install logic into hook scripts that only talk via CLI.
+3. Add CLI helpers `archivebox find InstalledBinary`, `archivebox dependency ...` used by all hooks and document how user plugins extend them.
+
+### Phase 5: Search Backends & Indexing Hooks
+
+1. Migrate indexing triggers to hook scripts (`on_ArchiveResult__index_*`) that run standalone and write into `$ARCHIVEBOX_DATA_DIR/search.*`.
+2. Implement CLI-driven query hooks (`on_Search__query_*`) plus lightweight Python wrappers in `archivebox/search/backends/`.
+3. Remove any remaining ABX search integration.
+
+
+---
+
+## What Gets Deleted
+
+```
+archivebox/pkgs/                 # ~5,000 lines
+archivebox/workers/actor.py      # If exists
+```
+
+## Dependencies Removed
+
+```toml
+"pluggy>=1.5.0"
+"python-statemachine>=2.3.6"
+# + all 30 abx-* packages
+```
+
+## Dependencies Kept
+
+```toml
+"django>=6.0"
+"django-ninja>=1.3.0"
+"abx-pkg>=0.6.0"         # External, for binary management
+"click>=8.1.7"
+"rich>=13.8.0"
+```
+
+---
+
+## Estimated Savings
+
+| Component | Lines Removed |
+|-----------|---------------|
+| pkgs/ (ABX) | ~5,000 |
+| statemachines | ~300 |
+| workers/ | ~500 |
+| base_models mixins | ~100 |
+| **Total** | **~6,000 lines** |
+
+Plus 30+ dependencies removed, massive reduction in conceptual complexity.
+
+---
+
+**Status: READY FOR IMPLEMENTATION**
+
+Begin with Phase 1: Rename `archivebox/pkgs/` to add `.unused` suffix (delete after porting) and fix imports.

+ 127 - 0
TEST_RESULTS.md

@@ -0,0 +1,127 @@
+# Chrome Extensions Test Results ✅
+
+Date: 2025-12-24
+Status: **ALL TESTS PASSED**
+
+## Test Summary
+
+Ran comprehensive tests of the Chrome extension system including:
+- Extension downloads from Chrome Web Store
+- Extension unpacking and installation
+- Metadata caching and persistence
+- Cache performance verification
+
+## Results
+
+### ✅ Extension Downloads (4/4 successful)
+
+| Extension | Version | Size | Status |
+|-----------|---------|------|--------|
+| captcha2 (2captcha) | 3.7.2 | 396 KB | ✅ Downloaded |
+| istilldontcareaboutcookies | 1.1.9 | 550 KB | ✅ Downloaded |
+| ublock (uBlock Origin) | 1.68.0 | 4.0 MB | ✅ Downloaded |
+| singlefile | 1.22.96 | 1.2 MB | ✅ Downloaded |
+
+### ✅ Extension Installation (4/4 successful)
+
+All extensions were successfully unpacked with valid `manifest.json` files:
+- captcha2: Manifest V3 ✓
+- istilldontcareaboutcookies: Valid manifest ✓
+- ublock: Valid manifest ✓
+- singlefile: Valid manifest ✓
+
+### ✅ Metadata Caching (4/4 successful)
+
+Extension metadata cached to `*.extension.json` files with complete information:
+- Web Store IDs
+- Download URLs
+- File paths (absolute)
+- Computed extension IDs
+- Version numbers
+
+Example metadata (captcha2):
+```json
+{
+  "webstore_id": "ifibfemgeogfhoebkmokieepdoobkbpo",
+  "name": "captcha2",
+  "crx_path": "[...]/ifibfemgeogfhoebkmokieepdoobkbpo__captcha2.crx",
+  "unpacked_path": "[...]/ifibfemgeogfhoebkmokieepdoobkbpo__captcha2",
+  "id": "gafcdbhijmmjlojcakmjlapdliecgila",
+  "version": "3.7.2"
+}
+```
+
+### ✅ Cache Performance Verification
+
+**Test**: Ran captcha2 installation twice in a row
+
+**First run**: Downloaded and installed extension (5s)
+**Second run**: Used cache, skipped installation (0.01s)
+
+**Performance gain**: ~500x faster on subsequent runs
+
+**Log output from second run**:
+```
+[*] 2captcha extension already installed (using cache)
+[✓] 2captcha extension setup complete
+```
+
+## File Structure Created
+
+```
+data/personas/Test/chrome_extensions/
+├── captcha2.extension.json (709 B)
+├── istilldontcareaboutcookies.extension.json (763 B)
+├── ublock.extension.json (704 B)
+├── singlefile.extension.json (717 B)
+├── ifibfemgeogfhoebkmokieepdoobkbpo__captcha2/ (unpacked)
+├── ifibfemgeogfhoebkmokieepdoobkbpo__captcha2.crx (396 KB)
+├── edibdbjcniadpccecjdfdjjppcpchdlm__istilldontcareaboutcookies/ (unpacked)
+├── edibdbjcniadpccecjdfdjjppcpchdlm__istilldontcareaboutcookies.crx (550 KB)
+├── cjpalhdlnbpafiamejdnhcphjbkeiagm__ublock/ (unpacked)
+├── cjpalhdlnbpafiamejdnhcphjbkeiagm__ublock.crx (4.0 MB)
+├── mpiodijhokgodhhofbcjdecpffjipkle__singlefile/ (unpacked)
+└── mpiodijhokgodhhofbcjdecpffjipkle__singlefile.crx (1.2 MB)
+```
+
+Total size: ~6.2 MB for all 4 extensions
+
+## Notes
+
+### Expected Warnings
+
+The following warnings are **expected and harmless**:
+
+```
+warning [*.crx]:  1062-1322 extra bytes at beginning or within zipfile
+  (attempting to process anyway)
+```
+
+This occurs because CRX files have a Chrome-specific header (containing signature data) before the ZIP content. The `unzip` command detects this and processes the ZIP data correctly anyway.
+
+### Cache Invalidation
+
+To force re-download of extensions:
+```bash
+rm -rf data/personas/Test/chrome_extensions/
+```
+
+## Next Steps
+
+✅ Extensions are ready to use with Chrome
+- Load via `--load-extension` and `--allowlisted-extension-id` flags
+- Extensions can be configured at runtime via CDP
+- 2captcha config plugin ready to inject API key
+
+✅ Ready for integration testing with:
+- chrome_session plugin (load extensions on browser start)
+- captcha2_config plugin (configure 2captcha API key)
+- singlefile extractor (trigger extension action)
+
+## Conclusion
+
+The Chrome extension system is **production-ready** with:
+- ✅ Robust download and installation
+- ✅ Efficient multi-level caching
+- ✅ Proper error handling
+- ✅ Performance optimized for thousands of snapshots

+ 6109 - 0
archivebox.ts

@@ -0,0 +1,6109 @@
+tring';
+import { Readable } from 'node:stream';
+import { finished } from 'node:stream/promises';
+import { URL } from 'node:url';
+import util from 'node:util';
+const exec = util.promisify(child_process.exec);
+
+import { Readability } from '@mozilla/readability';
+import FileCookieStore from '@root/file-cookie-store';
+import merge from 'deepmerge';
+import { createCursor, getRandomPagePoint } from 'ghost-cursor';
+import { JSDOM, VirtualConsole } from 'jsdom';
+import mime from 'mime-types';
+import ToughCookie from 'tough-cookie';
+import unzip from 'unzip-crx-3';
+
+import puppeteer from 'puppeteer';
+import { Browser, Page, Cookie, HTTPResponse } from 'puppeteer';
+import { Cluster } from 'puppeteer-cluster';
+import PupeteerExtra from "puppeteer-extra";
+import Stealth#!/usr/bin/env node --env-file .env
+// https://gist.github.com/pirate/d9a350e83025a1e6cf452cddd815d0d4
+
+// npm install request node-request minimist deepmerge mime-types decompress puppeteer-extra puppeteer-extra-plugin-repl puppeteer-extra-plugin-user-preferences puppeteer-extra-plugin-recaptcha puppeteer-extra-plugin-stealth puppeteer-screen-recorder puppeteer-cluster ghost-cursor @mozilla/readability jsdom unzip-crx-3 node-fetch@2 
+
+
+import assert from 'node:assert/strict';
+import { Buffer } from 'node:buffer';
+import child_process from 'node:child_process';
+import crypto from 'node:crypto';
+import fs from 'node:fs';
+import { createServer } from 'node:http';
+import os from 'node:os';
+import path from 'node:path';
+import querystring from 'node:querysPlugin from "puppeteer-extra-plugin-stealth";
+import PrefsPlugin from 'puppeteer-extra-plugin-user-preferences';
+import { PuppeteerScreenRecorder } from 'puppeteer-screen-recorder';
+// import RecaptchaPlugin from 'puppeteer-extra-plugin-recaptcha';
+// import ReplPlugin from 'puppeteer-extra-plugin-repl';
+
+const __dirname = import.meta.dirname
+
+import { getDatabase } from './models/init-models.js';
+const { Tag, Snapshot, ArchiveResult } = await getDatabase({ dbpath: './index.sqlite3' })
+
+
+// move mitm CA cert into /usr/local/share/ca-certificates/mitmproxy-ca-cert.crt
+// update-ca-certificates
+
+
+const ANSI = {
+    reset: "\x1b[0m",
+    blue: "\x1b[34m",
+    black: "\x1b[30m",
+}
+
+/************************* Main Input Arguments *******************************/
+let URLS = [
+    // 'chrome://about',
+    // 'chrome://system/#chrome_root_store',
+
+    'https://facebook.com/815781663692514/?comment_id=1508571679703640',
+    'https://www.instagram.com/p/CrTY1fENHr5/',
+    'https://www.tiktok.com/@zemmour_eric/video/7342474065598319904?cid=7343316616878490400',
+    'https://twitter.com/DZasken68678/status/1799833933271687304',
+    'https://t.me/IONONMIARRENDOGROUP/13598',
+    'https://www.youtube.com/watch?v=rpD0qgzlCms',
+    'https://www.aap.com.au/factcheck/aboriginal-lands-claim-a-total-abdication-of-facts/',
+
+
+    'https://gologin.com/check-browser',
+    'https://arh.antoinevastel.com/bots/areyouheadless',
+
+    'https://2captcha.com/demo/hcaptcha',
+    'https://2captcha.com/demo/cloudflare-turnstile',
+    'https://2captcha.com/demo/recaptcha-v3',
+    'https://ipinfo.io/',
+
+    // 'https://2captcha.com/demo/recaptcha-v2',
+    // 'https://2captcha.com/demo/keycaptcha',
+    // 'https://browserleaks.com/canvas',
+    // 'https://bot.incolumitas.com/#botChallenge',
+    // 'https://infosimples.github.io/detect-headless/',
+    // 'https://coveryourtracks.eff.org/',
+    // 'https://fingerprint.com/demo/',
+    // 'https://nowsecure.nl',
+    // 'https://abrahamjuliot.github.io/creepjs/',
+    // 'https://scrapfly.io/web-scraping-tools/http2-fingerprint',
+    // 'https://scrapfly.io/web-scraping-tools/browser-fingerprint',
+    // 'https://scrapfly.io/web-scraping-tools/ja3-fingerprint',
+    // 'https://scrapfly.io/web-scraping-tools/canvas-fingerprint',
+    // 'https://scrapfly.io/web-scraping-tools/webgl-fingerprint',
+    // 'https://scrapfly.io/web-scraping-tools/audio-fingerprint',
+    // 'https://scrapfly.io/web-scraping-tools/screen-fingerprint',
+    // 'https://web-scraping.dev/',
+
+
+    // 'https://example.com',
+    // 'https://www.okta.com/',
+    // 'https://www.webflow.com/',
+    // 'https://docker-compose.archivebox.io',
+    // 'https://www.reddit.com/r/AskReddit/comments/1br0q9b/what_was_ok_10_years_ago_but_isnt_today/',
+    // 'https://www.quora.com/Is-the-website-2Captcha-true-or-fake-with-paying-money-for-working-on-it',
+    // 'https://x.com/yawnzzcalo7/status/1747853178849435894',
+    // 'https://twitter.com/yawnzzcalo7/status/1747853178849435894',
+    // 'https://rachdele.substack.com/p/is-the-job-market-dying',
+    // 'https://www.flowradar.com/cloneables/mouse-image-trail-effect',
+    // 'https://wrong.host.badssl.com/',
+    // 'http://docker-compose.archivebox.io',
+    // 'https://pptr.dev/api/puppeteer.page.setrequestinterception',
+    // 'https://blog.sweeting.me#Writing',
+    // 'https://github.com/yarnpkg/yarn/issues/9005',
+
+    // 'https://archive.md/739Oc',
+    // 'https://archive.md/Oc72d',
+    // 'https://archive.vn/fPUBe',
+    // 'https://archive.vn/mRz4P',
+    // 'https://archive.vn/Qct6Y',
+    // 'https://archive.vn/sv50h',
+    // 'https://facebook.com/815781663692514/?comment_id=1508571679703640',
+    // 'https://facebook.com/815781663692514/?comment_id=924451748966499',
+    // 'https://www.facebook.com/wayne.brennan.528/posts/pfbid02fvxFppng2WsHMavhBa62cXizCBGdmPQRH3CMhac79qzS5C1ADaSNC587d3u6qVbkl',
+    // 'https://www.facebook.com/wildeprods/posts/pfbid02YEPfoB7pZqMNzE4y2MpYSQbRAzASquvHyEMzHqrNngJCSL7onEg2jnsqS6epcQHWl',
+    // 'https://t.me/aubontouite_francais/9493',
+    // 'https://t.me/BC_BLACKMIROR/5044',
+    // 'https://t.me/IONONMIARRENDOGROUP/14004',
+    // 'https://t.me/newsfactory_pl/51014',
+    // 'https://t.me/oliverjanich/132574',
+    // 'https://t.me/tomaszgryguc/10449',
+    // 'https://t.me/amigosDisidentes/123177',
+    // 'https://twitter.com/1nfiltr4do_NN/status/1767238399943991389',
+    // 'https://twitter.com/4lmondcookie/status/1748519205438111914',
+    // 'https://twitter.com/4olll1ke/status/1753796944827199766',
+    // 'https://twitter.com/yeokiloss/status/1754908226179502345',
+    // 'https://twitter.com/YoungWaifLover/status/1735667278090297561',
+    // 'https://twitter.com/Z_Pour_Demain/status/1766133730278605182',
+    // 'https://www.aap.com.au/factcheck/aboriginal-lands-claim-a-total-abdication-of-facts/',
+    // 'https://www.aap.com.au/factcheck/absurd-albanese-clip-fools-voice-voters/',
+    // 'https://www.instagram.com/_the.forgotten.ones/p/CQQDyoqhsF6/',
+    // 'https://www.instagram.com/p/CqSM_f9MR4b/',
+    // 'https://www.instagram.com/p/CqSQgf1sv8B/',
+    // 'https://instagram.com/p/B-Q22Z_pxyC/',
+    // 'https://www.tiktok.com/@zitatezurzeit/photo/7342474065598319904?cid=7343316616878490400',
+    // 'https://tiktok.com/@zitatezurzeit/photo/7342474065598319904?cid=7343316616878490400',
+    // 'https://www.youtube.com/watch?v=rpD0qgzlCms',
+]
+
+const isTruthy = (env_value) => ['1', 'yes', 'true'].includes(env_value?.toLowerCase() || 'false')
+
+/********************** Config: General High-Level Options ********************/
+
+const PASSIVE_ARCHIVING = isTruthy(process.env.PASSIVE_ARCHIVING)
+const CHROME_CLUSTER = isTruthy(process.env.CHROME_CLUSTER)
+const CHROME_CLUSTER_WORKERS = 4
+
+const API_SERVER_HOST = '0.0.0.0'
+const API_SERVER_PORT = 9595
+const CHROME_DEBUG_PORT = 9222                                    // 9222 is default, or use 0 for random port
+
+/********************** Config: Keys & Secrets ********************************/
+
+const API_KEY_2CAPTCHA = process.env.API_KEY_2CAPTCHA || 'YOUR_API_KEY_HERE'
+const FLARESOLVERR_API_ENDPOINT = process.env.FLARESOLVERR_API_ENDPOINT || "http://localhost:8191/v1"
+
+const ACTIVE_PERSONA = process.env.ACTIVE_PERSONA || 'Default'
+const CHROME_PROFILE_USER = process.env.CHROME_PROFILE_USER || 'Default'
+const LOAD_AUTH_STORAGE = isTruthy(process.env.LOAD_AUTH_STORAGE)
+const SAVE_AUTH_STORAGE = isTruthy(process.env.SAVE_AUTH_STORAGE)
+
+/********************** Config: Data Dir Locations ****************************/
+
+const SRC_DIR = path.resolve(__dirname)
+const DATA_DIR = process.env.DATA_DIR || await fs.promises.realpath(path.join(SRC_DIR, 'data'))
+const INDEXES_DIR = path.join(DATA_DIR, 'index')
+const ARCHIVE_DIR = path.join(DATA_DIR, 'archive')
+if (!fs.existsSync(ARCHIVE_DIR))
+    throw 'Could not find data/archive, are you running in the right pwd?'
+
+const PERSONA_DIR = path.join(DATA_DIR, 'personas', ACTIVE_PERSONA)
+const CHROME_PROFILE_PATH = path.join(PERSONA_DIR, 'chrome_profile')
+const CHROME_DOWNLOADS_DIR = path.join(PERSONA_DIR, 'chrome_downloads')
+const CHROME_EXTENSIONS_DIR =  path.join(PERSONA_DIR, 'chrome_extensions')
+const CHROME_EXTENSIONS_JSON_PATH = path.join(CHROME_EXTENSIONS_DIR, 'extensions.json')
+const AUTH_JSON_PATH = path.join(PERSONA_DIR, 'auth.json')
+const COOKIES_TXT_PATH = path.join(PERSONA_DIR, 'cookies.txt')
+const SPEEDTESTS_DIR = path.join(PERSONA_DIR, 'speedtests')
+// const CHROME_PROFILE_IMPORT_USER = 'Profile 1'
+// const CHROME_PROFILE_IMPORT_PATH = '/Volumes/NVME/Users/squash/Library/Application Support/Google/Chrome'
+
+// chrome profile / persona directories
+fs.mkdirSync(PERSONA_DIR, {recursive: true})
+fs.mkdirSync(SPEEDTESTS_DIR, {recursive: true})
+fs.mkdirSync(CHROME_PROFILE_PATH, {recursive: true})
+fs.mkdirSync(CHROME_EXTENSIONS_DIR, {recursive: true})
+fs.mkdirSync(CHROME_DOWNLOADS_DIR, {recursive: true})
+
+// cruft directories
+const ORPHANS_DIR = path.join(DATA_DIR, 'orphans')
+const PARTIALS_DIR = path.join(DATA_DIR, 'partials')
+const DUPLICATES_DIR = path.join(DATA_DIR, 'duplicates')
+await fs.promises.mkdir(ORPHANS_DIR, {recursive: true})
+await fs.promises.mkdir(PARTIALS_DIR, {recursive: true})
+await fs.promises.mkdir(DUPLICATES_DIR, {recursive: true})
+
+/********************** Config: Viewport Setup Opts ***************************/
+
+// Config: Viewport
+const DEFAULT_TIMEOUT = 20_000
+const DEFAULT_GEOLOCATION = {latitude: 59.95, longitude: 30.31667}
+const DEFAULT_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36'
+const DEFAULT_ASPECT_RAIO = 16/9       // recommended: 16:9       (most common desktop window aspect ratio)
+const SCREENSHOT_ASPECT_RATIO = 4/3    // recommended: 4:3        (easier to use as thumbnails when square-ish)
+const DEFAULT_WINDOW_WIDTH = 1920      // recommended: 1920x1080p (1080p screenshots)
+const DEFAULT_WINDOW_HEIGHT = Math.floor(DEFAULT_WINDOW_WIDTH/DEFAULT_ASPECT_RAIO)
+const DEFAULT_VIEWPORT = {
+    width: DEFAULT_WINDOW_WIDTH,
+    height: DEFAULT_WINDOW_HEIGHT,            
+    deviceScaleFactor: 2,              // 2 gives much sharper text in screenshots/pdfs/etc but uses more CPU/GPU
+    isMobile: false,
+    hasTouch: false,
+    isLandscape: false,
+}
+const DEFAULT_COLOR_SCHEME = 'light'
+const DEFAULT_HEADERS = {
+    // requires frequent tweaking to remain undetected by cloudflare/recaptcha/etc.
+    // 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
+    // 'accept-encoding': 'gzip, deflate, br, zstd',
+    // 'accept-language': accept_language,
+    // 'cache-Control': no_cache ? 'no-cache' : '',
+    // 'dnt': '1',
+    'sec-ch-ua': '"Google Chrome";v="122", "Not:A-Brand";v="8", "Chromium";v="122"',
+    'sec-ch-ua-mobile': '?0',
+    'sec-ch-ua-platform': '"macOS"',
+    'connection-rtt': '50',
+    // 'pragma': no_cache ? 'no-cache' : '',
+    // 'sec-fetch-dest': 'document',
+    // 'sec-fetch-mode': 'navigate',
+    // 'sec-fetch-site': 'none',
+    // 'sec-fetch-user': '?1',
+    // // 'upgrade-insecure-requests': '1',     // breaks some sites, e.g. https://www.flowradar.com/cloneables/mouse-image-trail-effect
+    // 'user-agent': user_agent,
+}
+
+const DEFAULT_REFERRERS = ["https://www.google.com", "https://www.facebook.com", "https://www.instagram.com"]
+
+/****************** Config: Human Behavior Emulation **************************/
+
+const SCROLL_LIMIT = 20;    // e.g. 30 = 30 * (1000px/2s) => 30,000px scrolled in 60sec
+const SCROLL_DELAY = 1350;  // interval per scroll, e.g. 2000 = 2sec to travel 1 * SCROLL_DISTANCE
+const SCROLL_DISTANCE = DEFAULT_VIEWPORT.height - 100;  // make sure this is slightly less than viewport height so there is some overlap to make stitching easier
+
+/********************** Config: URL Rewriting *********************************/
+const URL_REWRITES = [
+    // replacements should come first
+    // {
+    //     idx: 0,
+    //     pattern: /\/\/(www\.)?x\.com/gi,
+    //     replacement: '//$1twitter.com/',
+    //     // TODO: scope: 'hostname',
+    // },
+    // {
+    //     idx: 1,
+    //     pattern: /\/\/(www\.)?twitter\.com/gi,
+    //     replacement: '//$1nitter.net',
+    //     // TODO: scope: 'hostname',
+    // },
+
+    // // blocks should come at the end
+    // {
+    //     idx: 999,
+    //     pattern: /\/\/(www\.)?notallowed\.com/gi,
+    //     replacement: '',
+    //     // TODO: scope: 'href',
+    // },
+]
+const URL_SCHEMES_IGNORED = [
+    '',                     // no scheme is also invalid (e.g. opening a new tab page without any url yet)
+    'chrome',
+    'chrome-extension',
+    'chrome-untrusted',
+    'file',
+    'data',
+    'about',
+]
+
+
+/**************** Load existing data/archive/<timestamp> snapshots *************/
+
+const snapshots = await Snapshot.findAll({ attributes: ['id', 'timestamp', 'url'] })   // include: { model: ArchiveResult, as: 'archiveresults' }, });
+const results = await ArchiveResult.findAll({ attributes: ['id', 'snapshot_id', 'extractor', 'start_ts'] })   // include: { model: Snapshot, as: 'snapshot' }, });
+globalThis.snapshots = snapshots
+globalThis.results = results
+console.log(`[💿] Found ${snapshots.length} existing snapshots in index.sqlite3...`)
+console.log(`[💿] Found ${results.length} existing results in index.sqlite3...`)
+// debugger;
+
+const locateExistingSnapshots = (archive_dir) => {
+    const urls_to_dirs = {}
+    // for each data/archive/<timestamp>/index.json found, store {url: data/archive/<timestamp>}
+    for (const snapshot_dir of fs.readdirSync(archive_dir)) {
+        const snapshot_json = path.join(archive_dir, snapshot_dir, 'index.json')
+        if (fs.existsSync(snapshot_json)) {
+            const {url, archive_path} = JSON.parse(fs.readFileSync(snapshot_json, 'utf-8'))
+            if (!snapshot_dir.includes(archive_path.replace('archive/', '')))
+                throw 'Found incorrect index.json inside snapshot dir' + snapshot_dir
+            if (url && url.includes('://')) {
+                urls_to_dirs[url] = path.join(archive_dir, snapshot_dir)
+            }
+        }
+    }
+    return urls_to_dirs
+}
+
+let SNAPSHOT_DIRS_BY_URL = locateExistingSnapshots(ARCHIVE_DIR)
+
+let all_snap_dirs = (await fs.promises.readdir(ARCHIVE_DIR))
+// const orphan_snap_dirs = all_snap_dirs.filter(dirname => dirname.startsWith('19999'))
+
+// // scan through existing snapshot dirs, move orphans to orphans/ or correct archive/<snapid>
+// for (const snap_id of orphan_snap_dirs) {
+//     if (snap_id.startsWith('.')) continue
+//     const src_dir = path.join(ARCHIVE_DIR, snap_id)
+//     let src_path = src_dir
+
+//     assert((await fs.promises.stat(src_dir)).isDirectory())
+//     let dest_path = null
+
+//     const orphan_metrics_path = path.join(src_dir, 'metrics.json')
+//     if (fs.existsSync(orphan_metrics_path)) {
+//         const orphan_metrics = JSON.parse(await fs.promises.readFile(orphan_metrics_path, 'utf-8'))
+//         const url = orphan_metrics.url || orphan_metrics.URL
+//         const version = orphan_metrics.VERSION || versionStrFromDate(orphan_metrics.start_time)
+        
+//         // move all bare files into ./versions/YYYYMMDD/* and symlink ./* to latest version
+//         await symlinkBestSnapshotResults(src_dir)
+
+//         dest_path = SNAPSHOT_DIRS_BY_URL[url]
+//         const dest_id = dest_path?.split('/').at(-1)
+        
+//         if (dest_id && (dest_id != snap_id)) {
+//             if (fs.existsSync(dest_path)) {
+//                 console.log(`    - moving duplicate snap_dir ${src_dir} -> ${dest_path}`)
+//             } else {
+//                 console.log(`   - moving valid snap_dir ${src_dir} -> ${dest_path}`)
+//             }
+//         } else if (dest_id == snap_id) {
+//             continue
+//         } else {
+//             dest_path = path.join(ORPHANS_DIR, snap_id)
+//             console.log(`   - moving orphan snap_dir ${src_dir} -> ${dest_path}`)
+//         }
+//     } else {
+//         // corrupt/par
+//         dest_path = path.join(PARTIALS_DIR, snap_id)
+//         console.log(`   - moving parial snap_dir ${src_dir} -> ${dest_path}`)
+//     }
+//     if (dest_path) {
+//         for (const version_dir of (await fs.promises.readdir(path.join(src_path, 'versions')))) {
+//             const version_src = path.join(src_path, 'versions', version_dir)
+//             const version_dst = path.join(dest_path, 'versions', version_dir)
+
+//             // move all bare files into ./versions/YYYYMMDD/* and symlink ./* to latest version
+//             await symlinkBestSnapshotResults(dest_path)
+
+//             assert(!fs.existsSync(version_dst))
+//             await fs.promises.rename(version_src, version_dst)
+//             console.log('    - ', version_src, '--->', version_dst)
+//         }
+//         await fs.promises.rename(src_dir, path.join(PARTIALS_DIR, snap_id))
+//         await symlinkBestSnapshotResults(dest_path)
+//     }
+// }
+
+// const duplicate_snap_dirs = (await fs.promises.readdir(DUPLICATES_DIR)).filter(dirname => dirname.startsWith('19999'))
+// for (const snap_id of duplicate_snap_dirs) {
+//     const src_dir = path.join(DUPLICATES_DIR, snap_id)
+//     const metrics = JSON.parse(await fs.promises.readFile(path.join(src_dir, 'metrics.json'), 'utf-8'))
+// }
+
+// all_snap_dirs = (await fs.promises.readdir(ARCHIVE_DIR))
+// for (const snap_id of all_snap_dirs) {
+//     if (snap_id.startsWith('.')) continue
+//     const snap_dir = path.join(ARCHIVE_DIR, snap_id)
+//     const metrics_path = path.join(snap_dir, 'metrics.json')
+//     if (fs.existsSync(metrics_path)) {
+//         // console.log('    - updating snap_dir', snap_dir)
+//         await symlinkBestSnapshotResults(snap_dir)
+//     }
+// }
+// SNAPSHOT_DIRS_BY_URL = locateExistingSnapshots(ARCHIVE_DIR)
+
+
+fs.writeFileSync(path.join(DATA_DIR, 'queue.csv'), '')
+
+const snapIdFromDir = (dir_path) =>
+    dir_path.split('/archive/').at(-1)
+
+const snapshot_dir_list = (
+    Object.entries(SNAPSHOT_DIRS_BY_URL)
+        .sort(([_ak, a], [_bk, b]) =>
+            Number(snapIdFromDir(b)) - Number(snapIdFromDir(a)))
+        .reverse())
+
+for (const [existing_url, snapshot_dir] of snapshot_dir_list) {
+    // if (existing_url.startsWith('https://www.facebook.com/')) {
+    const is_desired_url = !(existing_url.includes('facebook.com/') || existing_url.includes('instagram.com/'))
+    const already_archived = false   // fs.existsSync(path.join(SNAPSHOT_DIRS_BY_URL[existing_url], 'versions'))
+    if (is_desired_url && !already_archived) {
+        // URLS.push(existing_url)
+        fs.appendFileSync(
+            path.join(DATA_DIR, 'queue.csv'),
+            `${SNAPSHOT_DIRS_BY_URL[existing_url]},${existing_url}\n`,
+            'utf-8',
+        )
+    }
+}
+URLS = [...new Set(URLS)]
+console.log('[+] Added', URLS.length, 'existing urls to queue...')
+
+
+/********************** Config: Output Paths **********************************/
+// const TASK_PATH             = (url)  => path.join(DATA_DIR, 'results', `${hashCode(url)}`)
+const TASK_PATH             = (url)  => SNAPSHOT_DIRS_BY_URL[url] || path.join(ARCHIVE_DIR, `1999999999.${hashCode(url)}`)
+// const TASK_PATH                = (url)  => {
+//     const existing_snap_dir = SNAPSHOT_DIRS_BY_URL[url]
+//     assert(existing_snap_dir, `Could not find existing snapshot dir for ${url}`)
+//     return existing_snap_dir
+// }
+
+const OUTPUT_PATH           = (page, filename, extname='') => 
+                                        path.join(TASK_PATH(page._original_url), `${filename}${extname}`)
+
+const SSL_PATH              = (page) => OUTPUT_PATH(page, 'ssl.json')
+const CONSOLELOG_PATH       = (page) => OUTPUT_PATH(page, 'console.log')
+const HEADERS_PATH          = (page) => OUTPUT_PATH(page, 'headers.json')
+const REDIRECTS_PATH        = (page) => OUTPUT_PATH(page, 'redirects.json')
+const REQUESTS_PATH         = (page) => OUTPUT_PATH(page, 'requests.json')
+const TRACE_PATH            = (page) => OUTPUT_PATH(page, 'trace.json')
+const METRICS_PATH          = (page) => OUTPUT_PATH(page, 'metrics.json')
+const OUTLINKS_PATH         = (page) => OUTPUT_PATH(page, 'outlinks.json')
+const SEO_PATH              = (page) => OUTPUT_PATH(page, 'seo.json')
+const FAVICON_PATH          = (page) => OUTPUT_PATH(page, 'favicon.json')
+const TITLE_PATH            = (page) => OUTPUT_PATH(page, 'title.txt')
+const BODYTEXT_PATH         = (page) => OUTPUT_PATH(page, 'body.txt')
+const PANDOC_PATH           = (page) => OUTPUT_PATH(page, 'pandoc.md')
+const READABILITY_PATH      = (page) => OUTPUT_PATH(page, 'readability.json')
+const ACCESIBILITY_PATH     = (page) => OUTPUT_PATH(page, 'accessibility.json')
+const DOM_PATH              = (page) => OUTPUT_PATH(page, 'dom.html')
+const PDF_PATH              = (page) => OUTPUT_PATH(page, 'output.pdf')
+const SCREENSHOT_PATH       = (page) => OUTPUT_PATH(page, 'screenshot.png')
+const SCREENSHOT_JPG_PATH   = (page) => OUTPUT_PATH(page, 'screenshot.jpg')
+const AIQA_PATH             = (page) => OUTPUT_PATH(page, 'aiqa.json')
+const SINGLEFILE_PATH       = (page) => OUTPUT_PATH(page, 'singlefile.html')
+const YTDLP_PATH            = (page) => OUTPUT_PATH(page, 'media/')
+const GALLERYDL_PATH        = (page) => OUTPUT_PATH(page, 'photos/')
+const SCREENRECORDING_PATH  = (page) => OUTPUT_PATH(page, 'screenrecording.mp4')
+const SCREENRECORDGIF_PATH  = (page) => OUTPUT_PATH(page, 'screenrecording.gif')
+const RESPONSES_PATH        = (page) => OUTPUT_PATH(page, 'responses')
+const RAW_PATH              = (page) => OUTPUT_PATH(page, 'raw')
+
+
+
+/********************** Config: Chrome Extensions *****************************/
+
+interface ChromeExtension {
+    name: string
+    webstore_id: string
+}
+interface LoadedChromeExtension extends ChromeExtension {
+    id?: string
+    webstore_url?: string
+    crx_url?: string
+    crx_path?: string
+    unpacked_path?: string
+    read_manifest?: () => any
+    read_version?: () => string | null
+}
+
+const CHROME_EXTENSIONS: LoadedChromeExtension[] = [
+    // Content access / unblocking / blocking plugins
+    {webstore_id: 'ifibfemgeogfhoebkmokieepdoobkbpo', name: 'captcha2'},                 // https://2captcha.com/blog/how-to-use-2captcha-solver-extension-in-puppeteer
+    {webstore_id: 'edibdbjcniadpccecjdfdjjppcpchdlm', name: 'istilldontcareaboutcookies'},
+    {webstore_id: 'cjpalhdlnbpafiamejdnhcphjbkeiagm', name: 'ublock'},
+    // {webstore_id: 'mlomiejdfkolichcflejclcbmpeaniij', name: 'ghostery'},
+    // {webstore_id: 'mnjggcdmjocbbbhaepdhchncahnbgone', name: 'sponsorblock'},
+    // {webstore_id: 'iplffkdpngmdjhlpjmppncnlhomiipha', name: 'unpaywall'},
+    // {webstore_id: 'gofocbepaccnkpphbgjpolififgcakhn', name: 'spaywallnews'},
+    
+    // Archiving plugins
+    {webstore_id: 'mpiodijhokgodhhofbcjdecpffjipkle', name: 'singlefile'},
+    // {webstore_id: 'fpeoodllldobpkbkabpblcfaogecpndd', name: 'archivewebpage'},
+    // {webstore_id: 'niloccemoadcdkdjlinkgdfekeahmflj', name: 'pocket'},
+    // {webstore_id: 'kenncghfghgolcbmckhiljgaabnpcaaa', name: 'warcreate'},
+    // {webstore_id: 'jjndjgheafjngoipoacpjgeicjeomjli', name: 'puppeteerstream'},
+
+    // Utilities for humans setting up/viewing/debugging the archiving session
+    // {webstore_id: 'aeblfdkhhhdcdjpifhhbdiojplfjncoa', name: '1password'},
+    // {webstore_id: 'fngmhnnpilhplaeedifhccceomclgfbg', name: 'editthiscookie'},
+    // {webstore_id: 'cgfpgnepljlgenjclbekbjdlgcodfmjp', name: 'simpletabsorter'},
+    
+    // Scripting/automation plugins
+    // {webstore_id: 'jinjaccalgkegednnccohejagnlnfdag', name: 'violentmonkey'},
+    // {webstore_id: 'infppggnoaenmfagbfknfkancpbljcca', name: 'automa'},
+    // {webstore_id: 'pfegffhjcgkneoemnlniggnhkfioidjg', name: 'screenscraper'},
+]
+
+/******************** Config: Chrome Profile Preferences **********************/
+
+// https://niek.github.io/chrome-features/
+const CHROME_DISABLED_COMPONENTS = [
+    'Translate',
+    'AcceptCHFrame',
+    'OptimizationHints',
+    'ProcessPerSiteUpToMainFrameThreshold',
+    'InterestFeedContentSuggestions',
+    'CalculateNativeWinOcclusion',
+    'BackForwardCache',
+    'HeavyAdPrivacyMitigations',
+    'LazyFrameLoading',
+    'ImprovedCookieControls',
+    'PrivacySandboxSettings4',
+    'AutofillServerCommunication',
+    'CertificateTransparencyComponentUpdater',
+    'DestroyProfileOnBrowserClose',
+    'CrashReporting',
+    'OverscrollHistoryNavigation',
+    'InfiniteSessionRestore',
+    //'LockProfileCookieDatabase',      // disabling allows multiple chrome instances to concurrently modify profile, but might make chrome much slower https://github.com/yt-dlp/yt-dlp/issues/7271  https://issues.chromium.org/issues/40901624
+]
+
+const CHROME_PREFERENCES_EXTRA = {}
+const CHROME_PREFERENCES_DEFAULT = {
+    // https://chromium.googlesource.com/chromium/src/+/32352ad08ee673a4d43e8593ce988b224f6482d3/chrome/common/pref_names.cc
+    homepage: 'about:blank',                        // doesn't work here, managed by Secure Preferences
+    homepage_is_newtabpage: false,                  // doesn't work here, managed by Secure Preferences
+    session: {                                      // doesn't work here, managed by Secure Preferences
+        restore_on_startup: 4,                      // doesn't work here, managed by Secure Preferences
+        startup_urls: 'about:blank',                // doesn't work here, managed by Secure Preferences
+    },
+    default_apps: 'noinstall',
+    browser: {
+        confirm_to_quit: false,
+        enable_spellchecking: false,
+        check_default_browser: false,
+        show_update_promotion_info_bar: false,
+    },
+    profile: {
+        // name: 'ArchiveBox Persona: Default',    // doesnt work to change display name, not sure why
+        // using_default_name: false,
+        exited_cleanly: true,
+        default_content_setting_values: {
+          automatic_downloads: 1,
+        },
+    },
+    bookmark_bar: {show_on_all_tabs: false},
+    safebrowsing: {enabled: false},
+    search: {suggest_enabled: false},
+    download: {
+        prompt_for_download: false,
+        open_pdf_in_system_reader: true,
+        // default_directory: CHROME_DOWNLOADS_DIR || path.join(__dirname, 'downloads'),
+    },
+    select_file_dialogs: {allowed: false},
+    autofill: {save_data: false},
+    printing: {enabled: false},
+    message_center: {welcome_notification_dismissed_local: true},
+    extensions: {
+        ui: {
+            developer_mode: true,
+            dismissed_adt_promo: true,
+        },
+        // pinned_extensions: CHROME_EXTENSIONS?.map(({id}) => id) || [],
+    },
+    webkit: {
+        webprefs: {
+            javascript_enabled: true,
+            minimum_font_size: 9,
+            // default_font_size: 12,
+            // web_security_enabled: false,
+            // allow_displaying_insecure_content: true,
+            // allow_running_insecure_content: true,
+            java_enabled: true,
+            loads_images_automatically: true,
+        },
+    },
+    settings: {
+        multi_profile_never_show_intro: true,
+        multi_profile_warning_show_dismissed: true,
+        first_run_tutorial_shown: true,
+    },
+    plugins: {
+        always_open_pdf_externally: true,
+    },
+}
+
+const CHROME_PREFERENCES_PATH = path.join(CHROME_PROFILE_PATH, 'Default', 'Preferences')
+
+const getChromePreferences = ({CHROME_PREFERENCES_DEFAULT, CHROME_PREFERENCES_EXTRA, CHROME_EXTENSIONS, CHROME_DOWNLOADS_DIR}) =>
+    merge.all([CHROME_PREFERENCES_DEFAULT, CHROME_PREFERENCES_EXTRA, {
+        extensions: {
+            pinned_extensions: CHROME_EXTENSIONS?.map(({id}) => id) || [],
+        },
+        download: {
+            default_directory: CHROME_DOWNLOADS_DIR || path.join(__dirname, 'downloads'),
+        },
+    }])
+
+function applyChromePreferences(puppeteer, prefs_path, preferences) {
+    if (fs.existsSync(prefs_path)) {
+        const preferences_existing = JSON.parse(fs.readFileSync(prefs_path, 'utf-8'))
+        const preferences_merged = merge(preferences_existing, preferences)
+        // console.log(JSON.stringify(preferences_merged, null, 4))
+        fs.writeFileSync(prefs_path, JSON.stringify(preferences_merged))    
+    } else {
+        // otherwise profile has not been created yet, use plugin instead (plugin only works on first creation)
+        puppeteer.use(PrefsPlugin({userPrefs: preferences}))
+    }
+    return puppeteer
+}
+
+
+/******************** Config: Chrome Launch Args ******************************/
+
+const CHROME_ARGS_DEFAULT = [
+    // Headless behavior tuning, determinstic behavior settings
+    // '--headless=new',
+    '--test-type',
+    '--test-type=gpu',                                // https://github.com/puppeteer/puppeteer/issues/10516
+    '--deterministic-mode',
+    '--js-flags=--random-seed=1157259159',            // make all JS random numbers deterministic by providing a seed
+    '--allow-pre-commit-input',                       // allow JS mutations before page rendering is complete
+    '--disable-blink-features=AutomationControlled',  // hide the signatures that announce browser is being remote-controlled
+    '--enable-automation',                            // <- DONT USE THIS, it makes you easily detectable / blocked by cloudflare
+    // `--proxy-server=https://43.159.28.126:2334:u7ce652b7568805c4-zone-custom-region-us-session-szGWq3FRU-sessTime-60:u7ce652b7568805c4`,      // send all network traffic through a proxy https://2captcha.com/proxy
+    // `--proxy-bypass-list=127.0.0.1`,
+
+    // Docker-specific options
+    // https://github.com/GoogleChrome/lighthouse-ci/tree/main/docs/recipes/docker-client#--no-sandbox-issues-explained
+    // '--no-sandbox',                                   // rely on docker sandboxing in docker, otherwise we need cap_add: SYS_ADM to use host sandboxing
+    // '--disable-gpu-sandbox',
+    // '--disable-setuid-sandbox',
+    // '--disable-dev-shm-usage',                     // docker 75mb default shm size is not big enough, disabling just uses /tmp instead
+    // '--no-xshm',
+
+    // Profile data dir setup
+    // chrome://profile-internals
+    `--user-data-dir=${CHROME_PROFILE_PATH}`,
+    `--profile-directory=${CHROME_PROFILE_USER}`,
+    '--password-store=basic',                            // use mock keychain instead of OS-provided keychain (we manage auth.json instead)
+    '--use-mock-keychain',
+    '--disable-cookie-encryption',                       // we need to be able to write unencrypted cookies to save/load auth.json
+    // '--disable-sync',                                 // don't try to use Google account sync features
+
+    // Extensions
+    // chrome://inspect/#extensions
+    // `--load-extension=${CHROME_EXTENSIONS.map(({unpacked_path}) => unpacked_path).join(',')}`,  // not needed when using existing profile that already has extensions installed
+    `--allowlisted-extension-id=${CHROME_EXTENSIONS.map(({ webstore_id }) => webstore_id).join(',')}`,
+    '--allow-legacy-extension-manifests',
+
+    // Browser window and viewport setup
+    // chrome://version
+    // `--user-agent="${DEFAULT_USER_AGENT}"`,
+    // `--window-size=${DEFAULT_VIEWPORT.width},${DEFAULT_VIEWPORT.height}`,
+    '--window-position=0,0',
+    '--hide-scrollbars',                               // hide scrollbars because otherwise they show up in screenshots
+    '--install-autogenerated-theme=169,32,85',         // red border makes it easier to see which chrome window is archivebox's
+    '--virtual-time-budget=60000',                     // fast-forward all animations & timers by 60s
+    '--autoplay-policy=no-user-gesture-required',      // auto-start videos so they trigger network requests + show up in outputs 
+    '--disable-gesture-requirement-for-media-playback',
+    '--lang=en-US,en;q=0.9',
+
+    // DANGER: JS isolation security features (to allow easier tampering with pages during archiving)
+    // chrome://net-internals
+    // '--disable-web-security',                              // <- WARNING, breaks some sites that expect/enforce strict CORS headers (try webflow.com)
+    // '--disable-features=IsolateOrigins,site-per-process', // useful for injecting JS, but some very strict sites can panic / show error pages when isolation is disabled (e.g. webflow.com)
+    // '--allow-running-insecure-content',                   // Breaks CORS/CSRF/HSTS etc., useful sometimes but very easy to detect
+    // '--allow-file-access-from-files',                     // <- WARNING, dangerous, allows JS to read filesystem using file:// URLs
+
+    // // DANGER: Disable HTTPS verification
+    // '--ignore-certificate-errors',
+    // '--ignore-ssl-errors',
+    // '--ignore-certificate-errors-spki-list',
+    // '--allow-insecure-localhost',
+
+    // IO: stdin/stdout, debug port config
+    // chrome://inspect
+    '--log-level=2',                                  // 1=DEBUG 2=WARNING 3=ERROR
+    '--enable-logging=stderr',
+    '--remote-debugging-address=0.0.0.0',
+    `--remote-debugging-port=${CHROME_DEBUG_PORT}`,
+
+    // GPU, canvas, text, and pdf rendering config
+    // chrome://gpu
+    '--enable-webgl',                                 // enable web-gl graphics support
+    '--font-render-hinting=none',                     // make rendering more deterministic by ignoring OS font hints, may also need css override, try:    * {text-rendering: geometricprecision !important; -webkit-font-smoothing: antialiased;}
+    '--force-color-profile=srgb',                     // make rendering more deterministic by using consitent color profile, if browser looks weird, try: generic-rgb
+    '--disable-partial-raster',                       // make rendering more deterministic (TODO: verify if still needed)
+    '--disable-skia-runtime-opts',                    // make rendering more deterministic by avoiding Skia hot path runtime optimizations
+    '--disable-2d-canvas-clip-aa',                    // make rendering more deterministic by disabling antialiasing on 2d canvas clips
+    // '--disable-gpu',                                  // falls back to more consistent software renderer
+    // // '--use-gl=swiftshader',                        <- DO NOT USE, breaks M1 ARM64. it makes rendering more deterministic by using simpler CPU renderer instead of OS GPU renderer  bug: https://groups.google.com/a/chromium.org/g/chromium-dev/c/8eR2GctzGuw
+    // // '--disable-software-rasterizer',               <- DO NOT USE, harmless, used in tandem with --disable-gpu
+    // // '--run-all-compositor-stages-before-draw',     <- DO NOT USE, makes headful chrome hang on startup (tested v121 Google Chrome.app on macOS)
+    // // '--disable-gl-drawing-for-tests',              <- DO NOT USE, disables gl output (makes tests run faster if you dont care about canvas)
+    // // '--blink-settings=imagesEnabled=false',        <- DO NOT USE, disables images entirely (only sometimes useful to speed up loading)
+
+    // Process management & performance tuning
+    // chrome://process-internals
+    '--disable-lazy-loading',                         // make rendering more deterministic by loading all content up-front instead of on-focus
+    '--disable-renderer-backgrounding',               // dont throttle tab rendering based on focus/visibility
+    '--disable-background-networking',                // dont throttle tab networking based on focus/visibility
+    '--disable-background-timer-throttling',          // dont throttle tab timers based on focus/visibility
+    '--disable-backgrounding-occluded-windows',       // dont throttle tab window based on focus/visibility
+    '--disable-ipc-flooding-protection',              // dont throttle ipc traffic or accessing big request/response/buffer/etc. objects will fail
+    '--disable-extensions-http-throttling',           // dont throttle http traffic based on runtime heuristics
+    '--disable-field-trial-config',                   // disable shared field trial state between browser processes 
+    '--disable-back-forward-cache',                   // disable browsing navigation cache
+    // '--in-process-gpu',                            <- DONT USE THIS, makes headful startup time ~5-10s slower (tested v121 Google Chrome.app on macOS)
+    // '--disable-component-extensions-with-background-pages',  // TODO: check this, disables chrome components that only run in background (could lower startup time)
+
+    // uncomment to disable hardware camera/mic/speaker access + present fake devices to websites
+    // (faster to disable, but disabling breaks recording browser audio in puppeteer-stream screenrecordings)
+    // '--use-fake-device-for-media-stream',
+    // '--use-fake-ui-for-media-stream',
+    // '--disable-features=GlobalMediaControls,MediaRouter,DialMediaRouteProvider',
+    
+    // // Output format options (PDF, screenshot, etc.)
+    '--export-tagged-pdf',                            // include table on contents and tags in printed PDFs
+    '--generate-pdf-document-outline',
+
+    // Suppress first-run features, popups, hints, updates, etc.
+    // chrome://system
+    '--no-pings',
+    '--no-first-run',
+    '--no-default-browser-check',
+    '--disable-default-apps',
+    '--ash-no-nudges',
+    '--disable-infobars',
+    '--disable-search-engine-choice-screen',
+    '--disable-session-crashed-bubble',
+    '--simulate-outdated-no-au="Tue, 31 Dec 2099 23:59:59 GMT"',
+    '--hide-crash-restore-bubble',
+    '--suppress-message-center-popups',
+    '--disable-client-side-phishing-detection',
+    '--disable-domain-reliability',
+    '--disable-component-update',
+    '--disable-datasaver-prompt',
+    '--disable-hang-monitor',
+    '--disable-session-crashed-bubble',
+    '--disable-speech-synthesis-api',
+    '--disable-speech-api',
+    '--disable-print-preview',
+    '--safebrowsing-disable-auto-update',
+    '--deny-permission-prompts',
+    '--disable-external-intent-requests',
+    '--disable-notifications',
+    '--disable-desktop-notifications',
+    '--noerrdialogs',
+    '--disable-popup-blocking',
+    '--disable-prompt-on-repost',
+    '--silent-debugger-extension-api',
+    '--block-new-web-contents',
+    '--metrics-recording-only',
+    '--disable-breakpad',
+
+    
+    // other feature flags
+    // chrome://flags        chrome://components
+    `--disable-features=${CHROME_DISABLED_COMPONENTS.join(',')}`,
+    '--enable-features=NetworkService',
+]
+const CHROME_ARGS_EXTRA = []
+
+
+const CHROME_LAUNCH_OPTIONS = {
+    CHROME_PROFILE_PATH,
+    CHROME_PROFILE_USER,
+    CHROME_EXTENSIONS,
+    CHROME_DEBUG_PORT,
+    CHROME_DISABLED_COMPONENTS,
+    DEFAULT_VIEWPORT,
+    CHROME_ARGS_DEFAULT,
+    CHROME_ARGS_EXTRA,
+}
+/* Chrome CLI Args Documentation
+   - https://github.com/GoogleChrome/chrome-launcher/blob/main/docs/chrome-flags-for-tools.md
+   - https://chromium.googlesource.com/chromium/chromium/+/master/content/public/common/content_switches.cc
+   - https://jtway.co/optimize-your-chrome-options-for-testing-to-get-x1-25-impact-4f19f071bf45
+   - https://peter.sh/experiments/chromium-command-line-switches/
+   - https://www.chromium.org/developers/how-tos/run-chromium-with-flags/
+   - https://github.com/manoj9788/Chrome-Driver-arguments/blob/master/README.md
+*/
+const getChromeArgs = ({CHROME_ARGS_DEFAULT, CHROME_ARGS_EXTRA,
+                        CHROME_PROFILE_PATH, CHROME_PROFILE_USER,
+                        CHROME_EXTENSIONS,
+                        CHROME_DEBUG_PORT,
+                        CHROME_DISABLED_COMPONENTS,
+                        DEFAULT_VIEWPORT}=CHROME_LAUNCH_OPTIONS) =>
+    [
+        ...CHROME_ARGS_DEFAULT,
+        `--user-data-dir=${CHROME_PROFILE_PATH}`,
+        `--profile-directory=${CHROME_PROFILE_USER}`,
+        `--load-extension=${CHROME_EXTENSIONS.map(({unpacked_path}) => unpacked_path).join(',')}`,
+        `--allowlisted-extension-id=${CHROME_EXTENSIONS.map(({id}) => id).join(',')}`,
+        `--window-size=${DEFAULT_VIEWPORT.width},${DEFAULT_VIEWPORT.height}`,
+        `--remote-debugging-port=${CHROME_DEBUG_PORT}`,
+        `--disable-features=${CHROME_DISABLED_COMPONENTS.join(',')}`,
+        ...CHROME_ARGS_EXTRA,
+    ]
+
+
+/******************** Chrome Extension Management *****************************/
+
+function getExtensionId(unpacked_path) {
+    const manifest_path = path.join(unpacked_path, 'manifest.json')
+    if (!fs.existsSync(manifest_path)) return null
+
+    // chrome uses a SHA256 hash of the unpacked extension directory path to compute a dynamic id
+    const hash = crypto.createHash('sha256');
+    hash.update(Buffer.from(unpacked_path, 'utf-8'));
+    const detected_extension_id = Array.from(hash.digest('hex'))
+      .slice(0, 32) // Convert each hexadecimal character to a character in the range 'a'-'p'
+      .map(i => String.fromCharCode(parseInt(i, 16) + 'a'.charCodeAt(0)))
+      .join('');
+
+    return detected_extension_id
+}
+
+async function installExtension(extension) {
+    const manifest_path = path.join(extension.unpacked_path, 'manifest.json')
+
+    // Download extensions using:
+    // curl -fsSL 'https://clients2.google.com/service/update2/crx?response=redirect&prodversion=1230&acceptformat=crx3&x=id%3D$EXTENSION_ID%26uc' > extensionname.crx
+    // unzip -d extensionname extensionname.zip
+
+    if (!fs.existsSync(manifest_path) && !fs.existsSync(extension.crx_path)) {
+        console.log("[🛠️] Downloading missing extension", extension.name, extension.webstore_id, '->', extension.crx_path);
+
+        // Download crx file from ext.crx_url -> ext.crx_path
+        const response = await fetch(extension.crx_url) as Response
+        const crx_file = fs.createWriteStream(extension.crx_path);
+        if (response.headers.get("content-length") && response.body) {
+            // @ts-ignore
+            const crx_stream = Readable.fromWeb(response.body)
+            await finished(crx_stream.pipe(crx_file))
+        } else {
+            console.warn('[⚠️] Failed to download extension', extension.name, extension.webstore_id)
+        }
+    }
+    
+    var {stdout, stderr} = {stdout: '', stderr: ''}
+
+    // Unzip crx file from ext.crx_url -> ext.unpacked_path
+    await fs.promises.mkdir(extension.unpacked_path, {recursive: true})
+    try {
+        var {stdout, stderr} = await exec(`/usr/bin/unzip ${extension.crx_path} -d ${extension.unpacked_path}`)
+    } catch(err1) {
+        try {
+            await unzip(extension.crx_path, extension.unpacked_path)
+        } catch(err2) {
+            // console.error(`[❌] Failed to install ${extension.crx_path}: could not unzip crx`, err1, err2)
+            // return false
+        }
+    }
+
+    if (!fs.existsSync(manifest_path))
+        console.error(`[❌] Failed to install ${extension.crx_path}: could not find manifest.json in unpacked_path`, stdout, stderr)
+
+    return fs.existsSync(manifest_path)
+}
+
+async function loadOrInstallExtension(ext) {
+    if (!(ext.webstore_id || ext.unpacked_path))
+        throw 'Extension must have either {webstore_id} or {unpacked_path}'
+
+    // Set statically computable extension metadata
+    ext.webstore_id =       ext.webstore_id     || ext.id
+    ext.name =              ext.name            || ext.webstore_id
+    ext.webstore_url =      ext.webstore_url    || `https://chromewebstore.google.com/detail/${ext.webstore_id}`
+    ext.crx_url =           ext.crx_url         || `https://clients2.google.com/service/update2/crx?response=redirect&prodversion=1230&acceptformat=crx3&x=id%3D${ext.webstore_id}%26uc`
+    ext.crx_path =          ext.crx_path        || path.join(CHROME_EXTENSIONS_DIR, `${ext.webstore_id}__${ext.name}.crx`)
+    ext.unpacked_path =     ext.unpacked_path   || path.join(CHROME_EXTENSIONS_DIR, `${ext.webstore_id}__${ext.name}`)
+    
+    const manifest_path =   path.join(ext.unpacked_path, 'manifest.json')
+    ext.read_manifest =     () => JSON.parse(fs.readFileSync(manifest_path, 'utf-8'))
+    ext.read_version =      () => fs.existsSync(manifest_path) && ext.read_manifest()?.version || null
+
+    // if extension is not installed, download and unpack it
+    if (!ext.read_version()) {
+        await installExtension(ext)
+    }
+
+    // autodetect id from filesystem path (unpacked extensions dont have stable IDs)
+    ext.id = getExtensionId(ext.unpacked_path)
+    ext.version = ext.read_version()
+    if (!ext.version) {
+        console.warn('[❌] Unable to detect ID and version of installed extension', prettyPath(ext.unpacked_path))
+    } else {
+        console.log(`[➕] Installed extension ${ext.name} (${ext.version})...`.padEnd(82), prettyPath(ext.unpacked_path))
+    }
+
+    return ext
+}
+
+async function isTargetExtension(target) {
+    let target_type
+    let target_ctx
+    let target_url
+    try {
+        target_type = target.type()
+        target_ctx = (await target.worker()) || (await target.page()) || null
+        target_url = target.url() || target_ctx?.url() || null
+    } catch(err) {
+        if (String(err).includes('No target with given id found')) {
+            // because this runs on initial browser startup, we sometimes race with closing the initial
+            // new tab page. it will throw a harmless error if we try to check a target that's already closed,
+            // ignore it and return null since that page is definitely not an extension's bg page anyway
+            target_type = 'closed'
+            target_ctx = null
+            target_url = 'about:closed'
+        } else {
+            throw err
+        }
+    }
+    
+    const target_is_bg = ['service_worker', 'background_page'].includes(target_type)
+    const target_is_extension = target_url?.startsWith('chrome-extension://')
+    const extension_id = (target_is_extension && target_url.split('://')[1].split('/')[0]) || null
+    const manifest_version = target_type === 'service_worker' ? '3' : '2'
+
+    return {
+        target_type,
+        target_ctx,
+        target_url,
+        target_is_bg,
+        target_is_extension,
+        extension_id,
+        manifest_version,
+    }
+}
+
+async function loadExtensionFromTarget(extensions, target) {
+    const {
+        target_is_bg,
+        target_is_extension,
+        target_type,
+        target_ctx,
+        target_url,
+        extension_id,
+        manifest_version,
+    } = await isTargetExtension(target)
+    
+    if (!(target_is_bg && extension_id && target_ctx))
+        return null
+
+    const manifest = await target_ctx.evaluate(() =>
+        // @ts-ignore
+        chrome.runtime.getManifest())
+    
+    const { name, version, homepage_url, options_page, options_ui } = manifest
+
+    if (!version || !extension_id)
+        return null
+    
+    const options_url = await target_ctx.evaluate(
+        (options_page) => chrome.runtime.getURL(options_page),
+        options_page || options_ui?.page || 'options.html',
+    )
+    
+    const commands = await target_ctx.evaluate(async () =>
+        (await new Promise((resolve, reject) => {
+            if (chrome.commands)
+                chrome.commands.getAll(resolve)
+            else
+                resolve({})
+        }))
+    )
+
+    // console.log(`[+] Found Manifest V${manifest_version} Extension:`, extension_id, name, target_url, Object.keys(commands).length)
+
+    let dispatchEval = async (...args) =>
+        await target_ctx.evaluate(...args)
+    let dispatchPopup = async () =>
+        await target_ctx.evaluate('chrome.action?.openPopup() || chrome.tabs.create({url: chrome.runtime.getURL("popup.html")})')
+    
+    let dispatchAction
+    let dispatchMessage
+    let dispatchCommand
+
+    if (manifest_version === '3') {
+        dispatchAction = async (tab) => {
+            // https://developer.chrome.com/docs/extensions/reference/api/action#event-onClicked
+            return await target_ctx.evaluate(async (tab) => {
+                tab = tab || (await new Promise((resolve) =>
+                    chrome.tabs.query({currentWindow: true, active: true}, ([tab]) => resolve(tab))))
+                // @ts-ignore
+                return await chrome.action.onClicked.dispatch(tab)
+            }, tab)
+        }
+        dispatchMessage = async (message, options) => {
+            // https://developer.chrome.com/docs/extensions/reference/api/runtime
+            return await target_ctx.evaluate(async (extension_id, message, options) => {
+                return await chrome.runtime.sendMessage(extension_id, message, options)
+            }, extension_id, message, options)
+        }
+        dispatchCommand = async (command, tab) => {
+            // https://developer.chrome.com/docs/extensions/reference/api/commands#event-onCommand
+            return await target_ctx.evaluate(async (command, tab) => {
+                // @ts-ignore
+                return await chrome.commands.onCommand.dispatch(command, tab)
+            }, command, tab)
+        }
+    } else if (manifest_version === '2') {
+        dispatchAction = async (tab) => {
+            // https://developer.chrome.com/docs/extensions/mv2/reference/browserAction#event-onClicked
+            return await target_ctx.evaluate(async (tab) => {
+                tab = tab || (await new Promise((resolve) =>
+                    chrome.tabs.query({currentWindow: true, active: true}, ([tab]) => resolve(tab))))
+                // @ts-ignore
+                return await chrome.browserAction.onClicked.dispatch(tab)
+            }, tab)
+        }
+        dispatchMessage = async (message, options) => {
+            // https://developer.chrome.com/docs/extensions/mv2/reference/runtime#method-sendMessage
+            return await target_ctx.evaluate(async (extension_id, message, options) => {
+                return await new Promise((resolve) =>
+                    chrome.runtime.sendMessage(extension_id, message, options, resolve)
+                )
+            }, extension_id, message, options)
+        }
+        dispatchCommand = async (command, tab) => {
+            // https://developer.chrome.com/docs/extensions/mv2/reference/commands#event-onCommand
+            return await target_ctx.evaluate(async (command, tab) => {
+                return await new Promise((resolve) =>
+                    // @ts-ignore
+                    chrome.commands.onCommand.dispatch(command, tab, resolve)
+                )
+            }, command, tab)
+        }
+    }
+    const existing_extension = extensions.filter(({id}) => id === extension_id)[0] || {}
+
+    const new_extension = {
+        ...existing_extension,
+        id: extension_id,
+        webstore_name: name,
+
+        target,
+        target_ctx,
+        target_type,
+        target_url,
+
+        manifest_version,
+        manifest,
+        version,
+        homepage_url,
+        options_url,
+
+        dispatchEval,         // run some JS in the extension's service worker context
+        dispatchPopup,        // open the extension popup
+        dispatchAction,       // trigger an extension menubar icon click
+        dispatchMessage,      // send a chrome runtime message in the service worker context
+        dispatchCommand,      // trigger an extension keyboard shortcut command
+    }
+
+    console.log(`[➕] Loaded extension ${name.substring(0, 32)} (${version}) ${target_type}...`.padEnd(82), target_url)
+    Object.assign(existing_extension, new_extension)
+
+    return new_extension
+}
+
+
+
+async function getChromeExtensionsFromPersona({CHROME_EXTENSIONS, CHROME_EXTENSIONS_DIR}) {
+    console.log('*************************************************************************')
+    console.log(`[⚙️] Installing ${CHROME_EXTENSIONS.length} chrome extensions from CHROME_EXTENSIONS...`)
+    try {
+        // read extension metadata from filesystem (installing from Chrome webstore if extension is missing)
+        for (const extension of CHROME_EXTENSIONS) {
+            Object.assign(extension, await loadOrInstallExtension(extension))
+        }
+
+        // for easier debugging, write parsed extension info to filesystem
+        await overwriteFile(
+            CHROME_EXTENSIONS_JSON_PATH.replace('.json', '.present.json'),
+            CHROME_EXTENSIONS,
+        )
+    } catch(err) {
+        console.error(err)
+    }
+    console.log('*************************************************************************')
+    return CHROME_EXTENSIONS
+}
+
+let _EXTENSIONS_CACHE = null
+async function getChromeExtensionsFromCache({browser, extensions=CHROME_EXTENSIONS, extensions_dir=CHROME_EXTENSIONS_DIR}) {
+    if (_EXTENSIONS_CACHE === null) {
+        console.log(`[⚙️] Loading ${CHROME_EXTENSIONS.length} chrome extensions from CHROME_EXTENSIONS...`)
+
+        // find loaded Extensions at runtime / browser launch time & connect handlers
+        // looks at all the open targets for extension service workers / bg pages
+        for (const target of browser.targets()) {
+            // mutates extensions object in-place to add metadata loaded from filesystem persona dir
+            await loadExtensionFromTarget(extensions, target)
+        }
+        _EXTENSIONS_CACHE = extensions
+
+        // write installed extension metadata to filesystem extensions.json for easier debugging
+        await overwriteFile(
+            CHROME_EXTENSIONS_JSON_PATH.replace('.json', '.loaded.json'),
+            extensions,
+        )
+        await overwriteSymlink(
+            CHROME_EXTENSIONS_JSON_PATH.replace('.json', '.loaded.json'),
+            CHROME_EXTENSIONS_JSON_PATH,
+        )
+    }
+    
+    return _EXTENSIONS_CACHE
+}
+
+async function setup2CaptchaExtension({browser, extensions}) {
+    let page = null
+    try {
+        // open a new tab to finish setting up the 2captcha extension manually using its extension options page
+        page = await browser.newPage()
+        const { options_url } = extensions.filter(ext => ext.name === 'captcha2')[0]
+        await page.goto(options_url)
+        await wait(2_500)
+        await page.bringToFront()
+    
+        // type in the API key and click the Login button (and auto-close success modal after it pops up)
+        await page.evaluate(() => {
+            const elem = document.querySelector("input[name=apiKey]") as HTMLInputElement
+            elem.value = ""
+        })
+        await page.type('input[name=apiKey]', API_KEY_2CAPTCHA, { delay: 25 })
+        
+        // toggle all the important switches to ON
+        await page.evaluate(() => {
+            const checkboxes = Array.from(document.querySelectorAll<HTMLInputElement>('input#isPluginEnabled, input[name*=enabledFor], input[name*=autoSolve]'));
+            for (const checkbox of checkboxes) {
+                if (!checkbox.checked) checkbox.click()
+            }
+        })
+    
+        let dialog_opened = false
+        page.on('dialog', async (dialog) => {
+            setTimeout(async () => {
+                await dialog.accept();
+                dialog_opened = true
+            }, 500);
+        })
+        await page.click('button#connect')
+        await wait(2_500)
+        if (!dialog_opened) {
+            throw `2captcha extension login confirmation dialog never opened, please check its options page manually: ${options_url}`
+        }
+        console.log('[🔑] Configured the 2captcha extension using its options page...')
+    } catch(err) {
+        console.warn(`[❌] Failed to configure the 2captcha extension using its options page!`, err)
+    }
+    if (page) await page.close()
+}
+
+async function speedtest({browser, page, measureUpload=true, timeout=25000}: {browser?: Browser, page?: Page, measureUpload?: boolean, timeout?: number}) {
+    // run a speedtest using fast.com, printing results once per second
+
+    browser = browser || await page.browser()
+    page = page || await browser.newPage()
+
+    // save one speedtest_<date>.json result per day
+    const today = versionStrFromDate(new Date(), {withDate: true, withTime: false})
+    const SPEEDTEST_PATH = path.join(SPEEDTESTS_DIR, `speedtest_${today}.json`)
+
+    // check if we've already run one today, if so return earlier results and skip running again
+    try {
+        return JSON.parse(await fs.promises.readFile(SPEEDTEST_PATH, 'utf-8'))
+    } catch(err) {
+        // otherwise speedtest does not exist yet for today, continue onwards...
+    }
+
+    console.log('[🚤] Running Speedtest using Fast.com...'.padEnd(82), prettyPath(SPEEDTEST_PATH))
+
+    await page.goto('https://fast.com', {timeout, waitUntil: 'domcontentloaded'});
+    await page.waitForSelector('#speed-value', {timeout})
+
+    let result = null
+    let loop_idx = 0
+
+    while (loop_idx < 100) {
+        result = await page.evaluate(() => {
+            const $ = document.querySelector.bind(document);
+
+            return {
+                downloadSpeed: Number($('#speed-value').textContent),
+                downloadUnit: $('#speed-units').textContent.trim(),
+                downloaded: Number($('#down-mb-value').textContent.trim()),
+                uploadSpeed: Number($('#upload-value').textContent),
+                uploadUnit: $('#upload-units').textContent.trim(),
+                uploaded: Number($('#up-mb-value').textContent.trim()),
+                latency: Number($('#latency-value').textContent.trim()),
+                bufferBloat: Number($('#bufferbloat-value').textContent.trim()),
+                userLocation: $('#user-location').textContent.trim(),
+                userIp: $('#user-ip').textContent.trim(),
+                isDone: Boolean($('#speed-value.succeeded') && $('#upload-value.succeeded')),
+            };
+        })
+        if (result.downloadSpeed > 0) {
+            // console.log(JSON.stringify(result).replaceAll('"', '').replaceAll(',', ' ').replaceAll('{', '').replaceAll('}', ''))
+        }
+
+        if (result.isDone || (!measureUpload && result.uploadSpeed)) {
+            break
+        }
+
+        await wait(500)
+        loop_idx++
+    }
+
+    await Promise.allSettled([
+        page.close(),
+        overwriteFile(SPEEDTEST_PATH, result)
+    ])
+
+    return result
+}
+
+/******************************************************************************/
+/******************************************************************************/
+
+const ALREADY_ARCHIVED = new Set(['', 'about:blank', 'chrome://newtab', 'chrome://version'])
+const TASKS_PER_RUN_LIMIT = 200
+
+async function botArchiveTask({page, data, url=''}) {
+    url = url || data  // puppeteer-cluster passes in the url value via the data: arg
+
+    const is_unarchivable_url = URL_SCHEMES_IGNORED.includes(url.split(':')[0])
+    const is_already_archived = ALREADY_ARCHIVED.has(url.slice(0, 4096))
+    if (is_unarchivable_url || is_already_archived) return null 
+    ALREADY_ARCHIVED.add(url.slice(0, 4096))
+
+    if (ALREADY_ARCHIVED.size > TASKS_PER_RUN_LIMIT) {
+        console.warn('[❌] Hit maximum URLs archived per browser session, exiting to free memory.')
+        console.warn('     Run this process again to continue with the next batch...')
+        process.exit(21)
+    }
+
+    const browser = await page.browser()
+    const client = await page.target().createCDPSession()
+    const extensions = await getChromeExtensionsFromCache({browser})
+    const browser_version = await browser.version()
+    const original_url = url.toString()
+    const start_time = (new Date())
+    
+    console.log('[0/4]-------------------------------------------------------------------------')
+    const snapshot_dir = await setupSnapshotDir({original_url, start_time})
+    const snapshot = await setupSnapshotDB({original_url, start_time, snapshot_dir})
+    console.log('[1/4]-------------------------------------------------------------------------')
+    console.log(`[🪟] Starting page & viewport setup (${browser_version} ${DEFAULT_VIEWPORT.isMobile ? 'mobile' : 'desktop'} ${DEFAULT_VIEWPORT.width}x${DEFAULT_VIEWPORT.height}px)...`)
+
+
+    const page_state = {
+        // global static state
+        browser,
+        client,
+        browser_version,
+        extensions,
+
+        // per-page static metadata
+        original_url,
+        snapshot,
+        snapshot_dir,
+        start_time: start_time.toISOString(),
+        start_ts: Number(start_time),
+        version: versionStrFromDate(start_time),
+
+        // per-page mutable archiving state
+        main_response: null,
+        recorder: null,
+        console_log: [],
+        traffic_log: {},
+        redirects: {},
+    }
+    page._original_url = original_url
+    
+    try {
+        // run all page setup functions in parallel
+        const results = await Promise.allSettled([
+            // loadAuthStorage(page, page_state, { apply: true }),
+            startMetadataRecording(page, page_state),
+            setupURLRewriting(page, page_state),
+            // setupViewport(page, page_state),
+            setupModalAutoClosing(page, page_state),
+            loadCloudflareCookie(page, page_state),
+            startResponseSaving(page, page_state),
+            saveYTDLP(page, page_state),
+            saveGALLERYDL(page, page_state),
+            // saveSourceMaps(page, page_state),
+            // TODO: someday setup https://github.com/osnr/TabFS ?
+        ]);
+        // run all page setup functions in parallel
+        const rejected = results
+            .filter(result => result.status === 'rejected')
+            .map(result => (result as PromiseRejectedResult).reason);
+        if (rejected.length) console.warn('[⚠️] Partial failures during page setup:', rejected);
+    } catch(err) {
+        console.error('[❌] PAGE SETUP ERROR', JSON.stringify(err, null, 4))
+        return
+    }
+
+
+    console.log('[2/4]-------------------------------------------------------------------------')
+
+    console.log('[➡️] NAVIGATION[INI]', ANSI.blue + url + ANSI.reset)
+    const startrecording_promise = startScreenrecording(page, page_state)
+    page_state.main_response = await page.goto(url, {waitUntil: 'load', timeout: 40_000})
+    try {
+        const results = await Promise.allSettled([
+            startrecording_promise,
+            page.bringToFront(),
+            page.waitForNetworkIdle({concurrency: 0, idleTime: 900, timeout: 20_000}),
+        ])
+        const rejected = results
+            .filter(result => result.status === 'rejected')
+            .map(result =>  (result as PromiseRejectedResult).reason)
+        if (rejected.length) console.warn('[⚠️] Parial failures during page load:', rejected)
+    } catch(err) {
+        console.error('[❌] ERROR DURING PAGE LOAD', JSON.stringify(err, null, 4))
+        return
+    }
+
+    if (page_state.main_response === null) {
+        page_state.main_response = await page.waitForResponse(() => true)
+    }
+    assert(page_state.main_response)
+    if (page_state.main_response.status() == 429) {
+        throw `[⚠️] Got 429 rate-limit response, skipping this URL for now...`
+    }
+
+    // emulate human browsing behavior
+    // await disableAnimations(page, page_state);
+    await jiggleMouse(page, page_state);
+    await solveCaptchas(page, page_state);
+    await blockRedirects(page, page_state);
+    await scrollDown(page, page_state);
+    // await expandComments(page, page_state);
+    await submitForm(page, page_state);
+    // await blockJSExecution(page, page_state);
+
+    console.log('[3/4]-------------------------------------------------------------------------')
+    
+    // stop tampering with page requests & JS / recording metadata / traffic log
+    await stopMetadataRecording(page, page_state)
+
+    // do all synchonous archiving steps that need exclusive use of the whole page while doing stuff
+    const saveScreenrecording_promise = saveScreenrecording(page, page_state);
+    await saveScreenshot(page, page_state);
+    await savePDF(page, page_state);
+
+    console.log('[4/4]-------------------------------------------------------------------------')
+
+    // do all async archiving steps that can be run at the same time
+    await inlineShadowDOM(page, page_state);
+    const results = await Promise.allSettled([
+        saveTitle(page, page_state),
+        saveSEO(page, page_state),
+        saveFavicon(page, page_state),
+        saveSSL(page, page_state),
+        saveRequests(page, page_state),
+        saveRedirects(page, page_state),
+        saveHeaders(page, page_state),
+        saveRaw(page, page_state),
+        saveDOM(page, page_state),
+        saveBodyText(page, page_state),
+        // savePandoc(page, page_state),
+        saveReadability(page, page_state),
+        saveAccessibility(page, page_state),
+        saveOutlinks(page, page_state),
+        // saveAuthStorage(page, page_state),
+        saveAIQualityAssuranceResult(page, page_state),
+    ]);
+
+    // do all sync archiving steps that require browser extensions at the very end (they are the buggiest)
+    const bg_results = Promise.allSettled([
+        saveScreenrecording_promise,
+        saveSinglefile(page, page_state),
+        // saveArchiveWebPage(page, page_state),
+        // savePocket(page, page_state),
+    ])
+
+    const {duration} = await saveMetrics(page, page_state);
+
+    const rejected = results
+        .filter(result => result.status === 'rejected')
+        .map(result =>  (result as PromiseRejectedResult).reason)                            // not sure why this has a ts-error, .reason does exist on rejected promises
+
+    if (rejected.length)
+        console.warn('[⚠️] Parial failures during archiving:', rejected)
+
+    // Start an interactive REPL here with the `page` instance.
+    // https://github.com/berstend/puppeteer-extra/tree/master/packages/puppeteer-extra-plugin-repl
+    // await page.repl()
+    // await page.browser().repl()
+
+    console.log(`[✅] ${ANSI.blue}Finished archiving in ${duration/1000}s.${ANSI.reset}`)
+    
+    try {
+        const rejected = (await bg_results)
+            .filter(result => result.status === 'rejected')
+            .map(result =>  (result as PromiseRejectedResult).reason)                        // not sure why this has a ts-error, .reason does exist on rejected promises
+        if (rejected.length)
+            console.warn('[⚠️] Parial failures during wrap-up tasks:', rejected)
+        
+        console.log('[🗑️] Resetting to about:blank to ensure memory is freed...')
+        await page.goto('about:blank')
+        await page.close()
+    } catch(err) {
+        console.log(err)
+    }
+
+    // symlink the best results from across all the versions/ into the snapshot dir root
+    await symlinkBestSnapshotResults(snapshot_dir)
+
+    // display latest version screenshot GIF
+    console.log()
+    try {
+        const latest_version_gif = path.join(snapshot_dir, 'versions', page_state.version, path.basename(SCREENRECORDGIF_PATH(page)))
+        const dirent = await blockUntilExists(latest_version_gif, {min_bytes: 100, timeout: 15_000})
+        child_process.spawn('/Users/squash/.iterm2/imgcat', [dirent.abspath], {stdio: [null, 'inherit', 'inherit']})
+    } catch(err) {
+        console.warn('[⚠️] Failed to display screenrecording.gif...', err)
+        console.log()
+    }
+
+    // determine whether task succeeded or failed based on AI QA score
+    const latest_version_aiqa = path.join(snapshot_dir, 'versions', page_state.version, path.basename(AIQA_PATH(page)))
+    const qa_results = JSON.parse((await fs.promises.readFile(latest_version_aiqa)).toString())
+    if (qa_results.pct_visible < 50) {
+        throw `[❌] Task completed with problems, got AI QA score of ${qa_results.pct_visible}%! ${qa_results.warnings.join(', ')} ${qa_results.error_text || ''}`
+    } else {
+        console.log(`[💫] Task completed succesfully: ${qa_results.pct_visible}%    ${qa_results.warnings.join(', ') || ''}`)
+        console.log(`     Summary: ${(qa_results.main_content_title || qa_results.description || 'No title/description detected').substring(0, 80)}... ${qa_results.main_content_author || ''} ${qa_results.main_content_date || ''}`)
+        return true
+    }
+}
+
+async function passiveArchiveTask({browser, page, url}) {
+    // archive passively (e.g. a tab that was opened already by a human), without changing the active page
+
+    const is_unarchivable_url = URL_SCHEMES_IGNORED.includes(url.split(':')[0])
+    const is_already_archived = ALREADY_ARCHIVED.has(url.slice(0, 4096))
+    if (is_unarchivable_url || is_already_archived) return null
+    ALREADY_ARCHIVED.add(url.slice(0, 4096))
+
+    // these have to be as early as possible because we're racing with the page load (we might even be too late)
+    // jk nevermind, we now re-open a new bg tab for every tab that's created to re-capture the initial request
+    // await page.setRequestInterception(true);
+    // await page.setCacheEnabled(false);
+
+    const original_url = url.toString()
+    const start_time = (new Date())
+    const browser_version = await browser.version()
+    
+    console.log('------------------------------------------------------------------------------')
+    console.log('[➕] Starting archive of new tab opened in driver browser...', await browser.version())
+    const snapshot_dir = await setupSnapshotDir({original_url, start_time})
+    const snapshot = await setupSnapshotDB({ original_url, start_time, snapshot_dir })
+    console.log('------------------------------------------------------------------------------')
+    console.log(`[🪟] Starting page & viewport setup (${browser_version} ${DEFAULT_VIEWPORT.isMobile ? 'mobile' : 'desktop'} ${DEFAULT_VIEWPORT.width}x${DEFAULT_VIEWPORT.height}px)...`)
+
+    // create a new page in the background for archiving
+    const old_page = page
+    page = await browser.newPage()
+    await old_page.bringToFront()
+    const client = await page.target().createCDPSession()
+    const extensions = await getChromeExtensionsFromCache({ browser })
+
+    const page_state = {
+        // global static state
+        browser,
+        client,
+        browser_version,
+        extensions,
+
+        // per-page static metadata
+        original_url,
+        snapshot,
+        snapshot_dir,
+        start_time: start_time.toISOString(),
+        start_ts: Number(start_time),
+        version: versionStrFromDate(start_time),
+
+        // per-page mutable archiving state
+        main_response: null,
+        recorder: null,
+        console_log: [],
+        traffic_log: {},
+        redirects: {},
+    }
+    page._original_url = original_url
+
+    try {
+        
+        // run all page setup functions in parallel
+        const results = await Promise.allSettled([
+            // loadAuthStorage(page, page_state, {apply: true}),
+            startMetadataRecording(page, page_state),
+            setupURLRewriting(page, page_state),
+            startResponseSaving(page, page_state),
+            saveYTDLP(page, page_state),
+            saveGALLERYDL(page, page_state),
+            // saveSourceMaps(page, page_state),
+        ]);
+        const rejected = results
+            .filter(result => result.status === 'rejected')
+            .map(result =>  (result as PromiseRejectedResult).reason)
+        if (rejected.length) console.warn('[⚠️] Parial failures during page setup:', rejected)
+    } catch(err) {
+        console.warn('[❌] ERROR DURING PAGE SETUP', JSON.stringify(err, null, 4))
+        return
+    }
+
+    // load the url in the background page, then switch to it once its loaded and close the original tab
+    console.log('[➡️] NAVIGATION[INI]', ANSI.blue + url + ANSI.reset)
+    const startrecording_promise = startScreenrecording(page, page_state)
+    page_state.main_response = await page.goto(url, {waitUntil: 'load', timeout: 40_000})
+    
+    // for debugging
+    globalThis.page = page
+    globalThis.page_state = page_state
+
+    // start loading the page, start screenrecording, close the old page, and wait for loading to finish (all at once, fine for these to race)
+    try {
+        const results = await Promise.allSettled([
+            startrecording_promise,
+            page.bringToFront(),
+            old_page.close(),
+            page.waitForNetworkIdle({concurrency: 0, idleTime: 900, timeout: 20_000}),
+        ])
+        const rejected = results
+            .filter(result => result.status === 'rejected')
+            .map(result =>  (result as PromiseRejectedResult).reason)
+        if (rejected.length) console.warn('[⚠️] Parial failures during [age load:', rejected)
+    } catch(err) {
+        console.warn('[❌] ERROR DURING PAGE LOAD', JSON.stringify(err, null, 4))
+        return
+    }
+
+    if (page_state.main_response === null) {
+        page_state.main_response = await page.waitForResponse(() => true)
+    }
+    assert(page_state.main_response)
+    if (page_state.main_response.status() == 429) {
+        throw `[⚠️] Got 429 rate-limit response, skipping this URL for now...`
+    }
+
+    // resume page if paused by waitForDebuggerOnStart/dev tools debugger/backgrounding
+    try {
+        await client.send('Page.enable');
+        await client.send('Page.setWebLifecycleState', {state: 'active'});
+        await client.send('Runtime.runIfWaitingForDebugger')
+    } catch(err) { /* console.warn(err) */ }
+
+    // wait a couple seconds for page to finish loading
+    await wait(5_000)
+
+    // emulate human browsing behavior
+    // await disableAnimations(page, page_state);
+    // await jiggleMouse(page, page_state);
+    await solveCaptchas(page, page_state);
+    // await blockRedirects(page, page_state);
+    // await scrollDown(page, page_state);
+    // await expandComments(page, page_state);
+    await submitForm(page, page_state);
+    // await blockJSExecution(page, page_state);
+    await stopMetadataRecording(page, page_state)   // stop tampering with page requests & JS
+
+    console.log('[3/4]-------------------------------------------------------------------------')
+
+    // do all synchonous archiving steps that need exclusive use of the whole page while doing stuff
+    const saveScreenrecording_promise = saveScreenrecording(page, page_state);
+    await saveScreenshot(page, page_state);
+    await savePDF(page, page_state);
+
+    console.log('[4/4]-------------------------------------------------------------------------')
+
+    // do all async archiving steps that can be run at the same time
+    await inlineShadowDOM(page, page_state);
+    const results = await Promise.allSettled([
+        saveTitle(page, page_state),
+        saveSEO(page, page_state),
+        saveFavicon(page, page_state),
+        saveSSL(page, page_state),
+        saveRequests(page, page_state),
+        saveRedirects(page, page_state),
+        saveHeaders(page, page_state),
+        saveRaw(page, page_state),
+        saveDOM(page, page_state),
+        saveBodyText(page, page_state),
+        // savePandoc(page, page_state),
+        saveReadability(page, page_state),
+        saveAccessibility(page, page_state),
+        saveOutlinks(page, page_state),
+        // saveAuthStorage(page, page_state),
+        saveAIQualityAssuranceResult(page, page_state),
+    ]);
+
+    // do all sync archiving steps that require browser extensions at the very end (they are the buggiest)
+    const bg_results = Promise.allSettled([
+        saveScreenrecording_promise,
+        saveSinglefile(page, page_state),
+        // saveArchiveWebPage(page, page_state),
+        // savePocket(page, page_state),
+    ])
+
+    const {duration} = await saveMetrics(page, page_state);
+
+    const rejected = results
+        .filter(result => result.status === 'rejected')
+        .map(result =>  (result as PromiseRejectedResult).reason)
+
+    if (rejected.length)
+        console.warn('[⚠️] Parial failures during page archiving:', rejected)
+
+    // Start an interactive REPL here with the `page` instance.
+    // https://github.com/berstend/puppeteer-extra/tree/master/packages/puppeteer-extra-plugin-repl
+    // await page.repl()
+    // await page.browser().repl()
+
+    console.log(`[✅] Finished archiving in ${duration/1000}s.`,)
+    
+    // await page.tracing.stop();
+    try {
+        const rejected = (await bg_results)
+            .filter(result => result.status === 'rejected')
+            .map(result =>  (result as PromiseRejectedResult).reason)
+        if (rejected.length)
+            console.warn('[⚠️] Parial failures during page wrap-up tasks:', rejected)
+    } catch(err) {
+        console.log(err)
+    }
+    await symlinkBestSnapshotResults(snapshot_dir)
+}
+
+
+/******************************************************************************/
+/************************* Page Setup Tasks ***********************************/
+
+
+
+async function setupSnapshotDir({original_url, start_time, snapshot_dir=null}) {
+    // setup archive/<id> snapshot output folder, move old files into versions/<date>/* + clear any existing symlinks
+
+    const snap_dir = snapshot_dir || TASK_PATH(original_url)
+    
+    console.log()
+    console.log()
+    console.log(ANSI.blue + original_url + ANSI.reset)
+    console.log(ANSI.black + snap_dir + ANSI.reset)
+    console.log()
+    console.log('[📂] Setting up Snapshot output directory...'.padEnd(82), prettyPath(snap_dir))
+
+    // check for existing data at old legacy paths e.g. ./data/archive/1999999999.1723425
+    const hacky_dir = path.join(ARCHIVE_DIR, `1999999999.${hashCode(original_url)}`)
+    const known_dir = SNAPSHOT_DIRS_BY_URL[original_url]
+
+    const known_dir_exists = fs.existsSync(known_dir)
+    const hacky_dir_exists = fs.existsSync(hacky_dir)
+
+    if (snap_dir == hacky_dir) {
+        if (known_dir_exists) {
+            throw `Tried to create snapshot in ${snap_dir} but potential duplicate exists in ${known_dir}!`
+        }
+    } else if (snap_dir == known_dir) {
+        if (hacky_dir_exists) {
+            throw `Tried to create snapshot in ${snap_dir} but potential duplicate exists in ${hacky_dir}!`
+        }
+    } else {
+        if (known_dir_exists) {
+            throw `Tried to create snapshot in ${snap_dir} but potential duplicate exists in ${known_dir}!`
+        } else if (hacky_dir_exists) {
+            throw `Tried to create snapshot in ${snap_dir} but potential duplicate exists in ${hacky_dir}!`
+        } else {
+            throw `Tried to create snapshot in ${snap_dir} but its not a recognized snapshot dir path:\n    - ${known_dir}\n    - ${hacky_dir}`
+        }
+    }
+    
+    // mkdir -p ./data/archive/<snap_id>/versions && cd ./data/archive/<snap_id>
+    await fs.promises.mkdir(path.join(snap_dir, 'versions'), {recursive: true})
+    process.chdir(snap_dir)
+
+    // clear any /data/archive/<snap_id>/*.* symlinks pointing to existing ./versions/<versionid>/*.* files
+    await clearSnapshotDirSymlinks(snap_dir)
+
+    // move /data/archive/<snap_id>/*.* loose output files from any prior run into ./versions/<versionid>/*.*
+    await collectSnapshotDirVersionFiles(snap_dir)    
+    
+    // update /data/indexes/<index_name>/* to include references to /data/archive/<snap_id> as-needed
+    await updateSnapshotDirIndexes(snap_dir, {original_url, start_time})
+
+    // assert /data/archive/<snap_id>/ contains no invalid/partial files + is empty/ready to receive new files
+    await assertSnapshotDirIsValid(snap_dir, {is_empty: true})
+
+    return snap_dir
+}
+
+// ./index/<index_name> : index_getter(page_state) => "<index_key_str>"
+const INDEXES = {
+    snapshots_by_day: ({start_time}) =>
+        versionStrFromDate(start_time, {withDate: true, withTime: false}),
+    snapshots_by_domain: ({original_url}) =>
+        (new URL(original_url)).hostname || '',      // hostname does not include :port
+}
+
+async function updateSnapshotDirIndexes(snap_dir, page_state, indexes=INDEXES, indexes_dir=INDEXES_DIR) {
+    assert(indexes)
+    console.log(`[🔎] Linking Snapshot in indexes (${Object.keys(indexes).join(', ')})...`)
+    // const {snapshot_dir, original_url, start_ts} = page_state
+    for (const [index_name, index_key_getter] of Object.entries(indexes)) {
+        const index_entry = await indexSnapshotDir(snap_dir, {index_name, index_key_getter, indexes_dir}, page_state)
+    }
+}
+
+async function indexSnapshotDir(snap_dir, {index_name, index_key_getter, indexes_dir=INDEXES_DIR}, page_state) {
+    // place symlinks to this snapshot in any /indexes/<index_name/ indexes as-needed
+    // const snap_id = snap_dir.split('/').at(-1)
+
+    const index_dir = path.join(indexes_dir, index_name)                         // /data/index/snapshots_by_day
+    await fs.promises.mkdir(index_dir, {recursive: true})
+
+    // calculate the index key, e.g. "200101231" or "example.com" 
+    assert(index_name && index_key_getter)
+    assert(page_state)
+    const index_key = String(index_key_getter(page_state))                       // '20010131'
+    assert(index_key)
+    const snap_id = path.parse(snap_dir).base                                    // '19999999.23423523'
+    assert(snap_id)
+
+    const index_entries_dir = path.join(index_dir, index_key)                    // /data/index/snapshots_by_day/20010131
+    await fs.promises.mkdir(index_entries_dir, {recursive: true})
+
+    const symlink_path = path.join(index_entries_dir, snap_id)                   // /data/index/snapshots_by_day/20010131/19999999.23423523
+
+    // create symlink index/snapshots_by_day/<YYYYMMDD>/<snap id> -> ./archive/<snap_id> symlink
+    const {symlink_abspath} = await overwriteSymlink(snap_dir, symlink_path, {relative: true, mkdirs: false})
+}
+
+
+async function collectSnapshotDirVersionFiles(snap_dir) {
+    // move archive/<id>/*.* snapshot output files into archive/<id>/versions/<date>/* dated version folder
+
+    // detect start time / version info from previous result metrics.json
+    const snap_id = snap_dir.split('/archive/').at(-1)
+    const existing_metrics = path.join(snap_dir, 'metrics.json')
+    let {start_time, VERSION} = {start_time: '1970-01-01T00:00:00.000Z', VERSION: '19700101000000'}
+    try {
+        ;({start_time, VERSION} = JSON.parse(await fs.promises.readFile(existing_metrics, 'utf-8')));
+    } catch(err) {
+        // continue normally, overwriting existing files is fine if they're broken to begin with
+    }
+ 
+    // create new version folder based on metrics.json start_time (or epoch time as fallback for legacy output)
+    const version_dir_name = VERSION || versionStrFromDate(start_time)
+    const version_dir = path.join(snap_dir, 'versions', version_dir_name)
+    await fs.promises.mkdir(version_dir, {recursive: true})
+
+    // move all result files from snapshot_dir root into version folder
+    const existing_snapshot_files =
+        (await fs.promises.readdir(snap_dir, {withFileTypes: true}))
+            .filter(dirent => {
+                if (dirent.name.startsWith('.')) return false    // ignore hidden files, dont version them
+                if (dirent.name == 'versions') return false      // dont try to move versions folder into itself
+                if (dirent.isSymbolicLink()) return false        // skip existing symbolic links
+                return (dirent.isFile() || dirent.isDirectory()) // dont try to version sockets/FIFOs/devs etc.
+            })
+
+    if (existing_snapshot_files.length) {
+        console.log(`[📅] Moving snapshot results into version dir: ./data/archive/${snap_id}/* ->`.padEnd(82), `./data/archive/${snap_id}/versions/${VERSION}/`)
+    }
+    
+    const snapshot_files = await getDirInfo(snap_dir, {withRoot: false, filter: ({relpath}) => !relpath.startsWith('versions')})
+    const version_files = await getDirInfo(version_dir, {withRoot: false})
+
+    for (const {name} of existing_snapshot_files) {
+        const snapdir_entry_abspath = path.join(snap_dir, name)
+        const versioned_entry_abspath = path.join(version_dir, name)
+
+        const snapshot_entry = snapshot_files[name]
+        const version_entry = version_files[name]
+
+        if (snapshot_entry && version_entry) {
+            // a conflicting file/dir already exists in the destination path
+            // we have a few options here, we can try to merge them, or we can create a new version
+
+            if (snapshot_entry.sha256 == version_entry.sha256) {
+                // both are the same already, delete the duplicate (leaving the copy inside the version dir)
+                // if (snapshot_entry.is_dir) {
+                //     await fs.promises.rmdir(snapshot_entry.abspath, {recursive: true})
+                // } else {
+                //     await fs.promises.unlink(snapshot_entry.abspath)
+                // }
+                // console.warn(`[!] Found harmless exact duplicate files, leaving as is: ${snapshot_entry.summary} and ${version_entry.summary}`)
+            } else {
+                // both are different,
+                if (snapshot_entry.num_bytes > version_entry.num_bytes) {
+                    // snapshot entry is bigger, keep it and delete version entry?
+                } else {
+                    // version entry is bigger, keep it and delete snapshot entry
+                }
+                console.warn('    ', snapshot_entry.summary)
+                console.warn('    ', version_entry.summary)
+                // throw `Found conflicting duplicate files with different contents: ${name}`
+            }
+        } else {
+            // mv ./data/archive/<snap_id>/example.txt -> ./data/archive/<snap_id>/versions/<version_id>/example.txt
+            await fs.promises.rename(snapdir_entry_abspath, versioned_entry_abspath)
+            console.log(`  ↣ ${prettyPath(snapdir_entry_abspath)} ->`.padEnd(82), prettyPath(versioned_entry_abspath))
+        }
+    }
+}
+
+// Extractor definition
+// {
+//  phase: setup | load | sync1 | async1 | sync2 | close
+//  name: 'media' | 'photos', 'wget', 'singlefile'
+//  
+//  shouldRun(page, page_state)
+    
+    // pageSetup
+    // pageLoad
+    // pageInteraction         clicking around/scrolling
+    // archivePhase1           sync
+    // archivePhase2           async
+    // archivePhase3           async
+    // pageClose
+
+//  execute(page, page_state)
+//  validateResult(page, page_state)
+// }
+
+async function clearSnapshotDirSymlinks(snap_dir) {
+    // delete all archive/<id>/* symlinks in preparation for new snapshot output to be placed there
+
+    const existing_symlinks =
+        (await fs.promises.readdir(snap_dir, {withFileTypes: true}))
+            .filter(dirent => {
+                if (dirent.name.startsWith('.')) return false    // ignore hidden files, dont version them
+                if (dirent.name == 'versions') return false      // dont try to move versions folder into itself
+                return dirent.isSymbolicLink()
+            })
+
+    for (const {name: existing_symlink} of existing_symlinks) {
+        await fs.promises.unlink(path.join(snap_dir, existing_symlink))
+        // if symlinks are not cleared before starting, it can cause issues with outputs writing into previous versions folders
+        // e.g. screerecording saves to ./media which could be pointing to previous version's ./versions/<olddate>/media
+    }
+}
+
+async function symlinkBestSnapshotResults(snap_dir) {
+    // move any existing files into versions/<date> folder (clear out main folder)
+    // symlink latest files from versions/<date>/* into main folder
+    
+    await fs.promises.mkdir(path.join(snap_dir, 'versions'), {recursive: true})
+    process.chdir(snap_dir)
+
+    const metrics_file = path.join(snap_dir, 'metrics.json')
+    // if (!fs.existsSync(metrics_file) || (await fs.promises.lstat(metrics_file)).isSymbolicLink()) {
+    //     console.warn('[⚠️] Warning, found partial dirty snapshot state (did the snapshot get interrupted?)', snap_dir)
+    // }
+
+    // move output files into versioned folder
+    await collectSnapshotDirVersionFiles(snap_dir)    
+
+    // clear any existing symlinks
+    await clearSnapshotDirSymlinks(snap_dir)
+
+    // assert task dir is empty and contains no bare files that might get overwritten, also asserts version dirs are valid
+    await assertSnapshotDirIsValid(snap_dir, {is_empty: true})
+
+
+    const version_dirs = (await fs.promises.readdir(path.join(snap_dir, 'versions'))).sort()   // earliest to latest
+    const most_recent = version_dirs.at(-1)
+
+    // for each version dir in versions/ (oldest -> newest)
+    for (const version_dir of version_dirs) {
+        if (version_dir.startsWith('.')) continue
+
+        const version_dir_abspath = path.join(snap_dir, 'versions', version_dir)
+        const version_dir_files = (
+            (await fs.promises.readdir(version_dir_abspath))
+                .filter(filename => !filename.startsWith('.')))
+
+        // iterate through all the files/folders in the version dir
+        for (const filename of version_dir_files) {
+            const snapdir_entry = path.join(snap_dir, filename)                                // ./data/archive/<snapid>/filename
+            const versiondir_entry = path.join(snap_dir, 'versions', version_dir, filename)    // ./data/archive/<snapid>/versions/<versionid>/filename
+            
+            if (fs.existsSync(snapdir_entry)) {
+                // if an entry already exists in the snapshot root for this filename
+                if ((await fs.promises.lstat(snapdir_entry)).isSymbolicLink()) {
+                    // if a symlink already exists in the root with the same name,
+                    // check if the version file we're looking at is a better candidate to replace it
+
+                    const existing_abspath = await fs.promises.realpath(snapdir_entry)
+                    const desired_abspath = path.join(version_dir_abspath, filename)
+                    if (existing_abspath != desired_abspath) {
+                        // check if the new candidate is larger or if the existing symlink is larger   (largest file = most likely to be highest quality capture data)
+                        const largest_path = await getLargestPath(existing_abspath, desired_abspath)
+                        if (largest_path != (await fs.promises.realpath(existing_abspath))) {
+                            const larger_version = path.basename(path.dirname(largest_path))
+                            const larger_abspath = path.join(snap_dir, 'versions', larger_version, filename)
+                            
+                            // console.log('    - swapping for larger file:', filename, '->', larger_abspath.split('/archive/').at(-1))
+                            await overwriteSymlink(larger_abspath, snapdir_entry, {search_limit: snap_dir})
+                        } else {
+                            // console.log('    - leaving larger file:', largest_path.split('/archive/').at(-1))
+                        }
+                    } else {
+                        // leave existing symlink pointing to current version file, nothing to change
+                        // console.log('    - leaving current file:', existing_abspath.split('/archive/').at(-1))
+                    }
+                } else {
+                    // clearSnapshotDirSymlinks() should have already cleared these files out!
+                    throw `Non-symlink file found in root of snapshot dir! Refusing to overwrite: ${prettyPath(snapdir_entry)}`
+                }
+            } else {
+                // no entry exists in the snapshot root for this filename, create one by linking to the version file
+                await overwriteSymlink(versiondir_entry, snapdir_entry, {search_limit: snap_dir})
+            }
+            // if (version_dir == most_recent) {
+            //     // only log most recent links even though we link older ones too (otherwise its too noisy)
+            //     console.log(`  🔗 ./${filename} -> ./${versiondir_entry} linking...`)
+            // }
+        }
+    }
+
+    return snap_dir
+}
+
+async function assertSnapshotDirIsValid(snap_dir, {is_empty=false}={}) {
+    process.chdir(snap_dir)
+    console.log()
+    console.log(`[☑️] Checking that snapshot records are valid...`)
+
+    // get all directory entries in archive/<snapshot_id>/*
+    const snapshot_dir_entries =
+        (await fs.promises.readdir(snap_dir, {withFileTypes: true}))
+            .filter(dirent => {
+                if (dirent.name.startsWith('.')) return false
+                if (dirent.name == 'versions') return false
+            })
+
+    // assert versions folder exists and is not a symbolic link
+    const versions_dir = path.join(snap_dir, 'versions')
+    assert(fs.existsSync(versions_dir))
+    assert(!(await fs.promises.lstat(versions_dir)).isSymbolicLink())
+
+    // if it should be empty, check that no loose files exist
+    if (is_empty) {
+        assert(!snapshot_dir_entries.length, `Found loose files in snapshot-dir that shouldn't be there! ${snap_dir}`)
+    }
+
+    // assert all non-hidden files in snapshot dir are symbolic links to actual data in versions/<date>/*
+    for (const snapshot_dir_entry of snapshot_dir_entries) {
+        if (snapshot_dir_entry.name.startsWith('.')) continue
+        if (snapshot_dir_entry.name == 'versions') continue
+        assert(snapshot_dir_entry.isSymbolicLink(), `Found non-symbolic link in root of snapshot dir! ${snap_dir}/${snapshot_dir_entry.name}`)
+        assert(fs.existsSync(snapshot_dir_entry.name), `Found broken symbolic link in root of snapshot dir! ${snap_dir}/${snapshot_dir_entry.name}`)
+    }
+
+    const version_entries = (
+        (await fs.promises.readdir(versions_dir))
+            .filter(foldername => !foldername.startsWith('.'))
+            .sort())
+
+    console.log(`  √ ${prettyPath(versions_dir)}`, version_entries.length)
+
+    for (const version_dir of version_entries) {
+        await assertVersionDirIsValid(path.join(versions_dir, version_dir))
+    }
+
+    // write snapshot dir file listing w/ sizes & hashes to .files.json
+    const directory_info = await getDirInfo(snap_dir, {withRoot: true, withHelpers: false, maxdepth: 3})
+    await overwriteFile(path.join(snap_dir, '.files.json'), directory_info)
+}
+
+async function assertVersionDirIsValid(version_dir) {
+    const dirname = path.parse(version_dir).name
+    assert(fs.existsSync(version_dir), `Version dir does not exist: ${prettyPath(version_dir)}`)
+
+    const dirent = await fs.promises.lstat(version_dir)
+    assert(dirent.isDirectory() && !dirent.isSymbolicLink(), `Found non-directory in versions dir! ${prettyPath(version_dir)}`)
+    
+    const unix_epoch = '19700101000000'
+    const is_name_valid_datestr = /^\d+$/.test(dirname) && (dirname.length == 14) && (dirname.startsWith('2') || dirname == unix_epoch) && parseVersionDateStr(dirname)
+    assert(is_name_valid_datestr, `Version directories must be a 14-character long date string like 20251231235959! ${dirname}`)
+
+    // get all directory entries in archive/<snapshot_id>/versions/<version_id>/*
+    const version_dir_entries = (
+        (await fs.promises.readdir(version_dir, {withFileTypes: true}))
+            .filter((dirent) => !dirent.name.startsWith('.')))
+
+    // assert version dir contains only actual snapshot output files (not-symbolic links or other version dirs)
+    for (const version_dir_entry of version_dir_entries) {
+        assert(version_dir_entry.name != 'versions', `Version dir cannot contain another versions folder! ${prettyPath(version_dir)}/versions`)
+        assert(!version_dir_entry.isSymbolicLink(), `Version dir cannot contain symbolic link! ${prettyPath(version_dir)}/${version_dir_entry.name}`)
+    }
+
+    // color highlight the unix epoch version in black, and any version created today in blue
+    let pretty_dirname = dirname
+    if (dirname == unix_epoch) {
+        pretty_dirname = ANSI.black + unix_epoch + ANSI.reset
+    }
+    const today = versionStrFromDate(new Date(), {withDate: true, withTime: false})
+    if (dirname.startsWith(today)) {
+        pretty_dirname = ANSI.blue + dirname + ANSI.reset
+    }
+
+    // write version dir file listing w/ sizes & hashes to .files.json
+    const directory_info = await getDirInfo(version_dir, { withRoot: true, withHelpers: false, maxdepth: 3 })
+    await overwriteFile(path.join(version_dir, '.files.json'), directory_info)
+
+    console.log(`    √ ./versions/${pretty_dirname} contains`, version_dir_entries.length, 'results')
+}
+
+async function setupSnapshotDB({ original_url, start_time, snapshot_dir }) {
+    // setup Snapshot database row, finding it if it already exists or creating a new one
+
+    const timestamp = snapshot_dir.split('/').at(-1)
+    const search_attrs = { url: original_url, timestamp }
+    const update_attrs = { url: original_url, timestamp, added: start_time, title: null }
+
+    let snapshot = await Snapshot.findOne({ where: search_attrs });
+    let created = false
+    if (!snapshot) {
+        snapshot = await Snapshot.findOne({ where: {url: original_url} });
+        if (snapshot) {
+            // console.warn(`[X] Found DB Snapshot [${timestamp}](${original_url.substring(0, 30)}...) that has different timestamp from existing dir ${prettyPath(snapshot_dir)}!`)
+            // throw 'Snapshot DB record does not match filesystem path!'
+        } else {
+            console.log(`[+] Creating new DB Snapshot [${timestamp}](${original_url.substring(0, 30)}...) for ${prettyPath(snapshot_dir)}...`)
+            // ;([snapshot, created] = await Snapshot.findOrCreate({where: search_attrs, defaults: update_attrs }));
+            // throw 'Wanted to create new Snapshot but refusing to modify DB during testing!'
+        }
+    }
+
+    // assert(snapshot && (snapshot instanceof Snapshot))
+    return snapshot
+}
+
+async function setupViewport(page, _page_state) {
+    // setup viewport
+    await page.setViewport(DEFAULT_VIEWPORT);
+    await page.setGeolocation(DEFAULT_GEOLOCATION);
+    // await page.setBypassCSP(true);             // bypass CSP restrictions (requires --disable-web-security)
+    page.setDefaultTimeout(DEFAULT_TIMEOUT);
+
+    // Optional: emulate a mobile device
+    //  await page.emulate(puppeteer.devices['iPhone 6']);
+
+    // Configure light mode/dark mode & accessibility reduced motion preferences
+    await page.emulateMediaFeatures([
+        {name: 'prefers-color-scheme', value: DEFAULT_COLOR_SCHEME},
+        {name: 'prefers-reduced-motion', value: 'reduce'},
+    ]);
+
+    // Setup headers & deterministically chose a random referrer based on URL
+    const rand_idx = hashCode(await page.url()) % DEFAULT_REFERRERS.length
+    await page.setExtraHTTPHeaders({
+        ...DEFAULT_HEADERS,
+        referrer: DEFAULT_REFERRERS[rand_idx],
+    })
+
+    // Setup alert to trigger if site tries to sniff whether we are a bot
+    function sniffDetector() {
+        const userAgent = window.navigator.userAgent;
+        const platform = window.navigator.platform;
+        // @ts-ignore
+        window.navigator.__defineGetter__('userAgent', function () {
+            // @ts-ignore
+            window.navigator.sniffed = true;
+            return userAgent;
+        });
+        // @ts-ignore
+        window.navigator.__defineGetter__('platform', function () {
+            // @ts-ignore
+            window.navigator.sniffed = true;
+            return platform;
+        });
+    }
+    await page.evaluateOnNewDocument(sniffDetector);
+    // @ts-ignore
+    const was_sniffed = await page.evaluate(() => (!!window.navigator.sniffed))
+    if (was_sniffed) {
+        console.warn('[⚠️] Site tried to sniff if we are a bot! Site may be difficult to archive.')
+    }
+    
+    return page
+}
+
+async function setupModalAutoClosing(page, page_state, {timeout=1_250}={}) {
+    page.on('dialog', (dialog) => { 
+        console.log(`[👆] Auto-closing modal that popped up: ${dialog.message()}...`)
+        setTimeout(() => {try { dialog.accept() } catch(err) {}}, timeout);
+    })
+
+    // if you expect a file-upload dialog, use this to catch it instead:
+    // const [fileChooser] = await Promise.all([
+    //   page.waitForFileChooser(),
+    // ]);
+    // await fileChooser.accept(['/tmp/myfile.pdf']);
+    page.on('close', () => {
+        try {
+            page.off('dialog')
+        } catch(err) {}
+    })
+}
+
+async function startScreenrecording(page, page_state, {duration_limit=60, codec='libx264'}={}) {
+    await fs.promises.mkdir(path.dirname(SCREENRECORDING_PATH(page)), {recursive: true})
+    // console.log(`[🎬] Starting screen-recording stream...`.padEnd(82), prettyPath(SCREENRECORDING_PATH(page)))
+    
+    // alternative: interact with low-level puppeteer screencast API directly
+    // using puppeteer.page.screencast: https://pptr.dev/api/puppeteer.page.screencast
+    // const recorder = await page.screencast({path: SCREENRECORDING_PATH(page)});
+
+    // alternative: use puppeteer-stream for .webm/.mp4 screen recordings with tab audio included
+    // works sometimes but has a few issues, e.g.: https://github.com/SamuelScheit/puppeteer-stream/issues/8
+
+    // alternative: puppeteer-screen-recorder (most compatible/stable but doesn't include tab audio output)
+    const recorder = new PuppeteerScreenRecorder(page, {
+        followNewTab: false,
+        recordDurationLimit: duration_limit,
+        // fps: 25,
+        // ffmpeg_Path: '<path of ffmpeg_path>' || null,
+        // videoFrame: {
+        //   width: 1024,
+        //   height: 768,
+        // },
+        // videoCrf: 18,
+        videoCodec: codec,
+        // videoPreset: 'ultrafast',
+        // videoBitrate: 1000,
+        // autopad: {
+        //   color: 'black' | '#35A5FF',
+        // },
+        // aspectRatio: '4:3',
+    });
+    page_state.recorder = recorder
+    await recorder.start(SCREENRECORDING_PATH(page))
+
+    page.on('close', async () => {await saveScreenrecording(page, page_state)});
+    return page_state
+}
+
+async function startResponseSaving(page, page_state) {
+    const dir = RESPONSES_PATH(page)
+    await fs.promises.mkdir(dir, {recursive: true})
+
+    console.log(`[🌄] Starting raw response bytes recording...`.padEnd(82), prettyPath(dir) + '/')
+
+    // Document, Stylesheet, Image, Media, Font, Script, TextTrack, XHR, Fetch, Prefetch, EventSource, WebSocket, Manifest, SignedExchange, Ping, CSPViolationReport, Preflight, Other
+    const types_to_save = [
+        // 'document',
+        'script',
+        'stylesheet',
+        'font',
+        'image',
+        'media',
+        'xhr',
+        'websocket',
+    ]
+
+    // reset responses index file to empty
+    const responses_log_path = path.join(dir, 'index.jsonl')
+    await overwriteFile(responses_log_path, '')
+
+    // add handler to save all image repsonses into output directory
+    page.on('response', async (response) => {
+        try {
+
+            const timestamp = versionStrFromDate(new Date(), {withDate: true, withTime: true, withSeconds: true, withMilliseconds: true})
+
+            if (!page_state.main_response && (response.request().url() == page_state.original_url)) {
+                // save first response as main page response (if we havent already caught it earlier)
+                page_state.main_response = response
+            }
+
+            const status = response.status()
+            if ((status >= 300) && (status < 500)) {
+                // console.log('Got bad response from', response.url(), 'to', response.headers()['location'])
+                return
+            }
+            const request = response.request()
+            const resourceType = request.resourceType()
+            const url_scheme = (response.url() || request.url()).split(':')[0].toLowerCase()
+            const method = (url_scheme === 'data') ? 'DATA' : request.method()
+
+            // console.log('    ', resourceType, response.url())
+            if (types_to_save.includes(resourceType)) {
+                // create ./responses/xhr/www.facebook.com/static/images/icons/ subdir based on hostname + path
+                const resource_type_dir = path.join(dir, resourceType)
+                const url = new URL(response.url())
+                let subdir = resource_type_dir
+                const url_path = (url.pathname || '').slice(0, 250).endsWith('/')
+                    ? (url.pathname || '').slice(0, 250)
+                    : path.dirname((url.pathname || '').slice(0, 250))
+
+                // determine subdirectory based on url type (handles http:,https:,file:,data:,chrome-extension:,about:,etc.)
+                if (!URL_SCHEMES_IGNORED.includes(url_scheme)) {
+                    // is a normal http:// or https:// url, use the domain + path to construct subdirectory
+                    subdir = path.join(resource_type_dir, (url.hostname || 'data').slice(0, 250), url_path)
+                } else if (url_scheme == 'data') {
+                    // is a data:... url, store in ./data subdirectory
+                    subdir = path.join(resource_type_dir, 'data')
+                } else {
+                    // is a chrome-extension:// or other special url, use the extension id + path to construct subdirectory
+                    const url_path = path.dirname((url.pathname || '').slice(0, 999))
+                    subdir = path.join(resource_type_dir, url_scheme, (url.hostname || 'data').slice(0, 250), url_path)
+                }
+
+                // write response to responses/all/1716861056899__https%3A%2F%2Fwww.instagram.com%2Fgraphql%2Fquery.json
+                let abspath = null
+                let resp_mimetype = null
+                let extension = ''
+                let uniq_filename = null
+                let uniq_abspath = null
+                let symlink_abspath = null
+                let responseSha256 = null
+                try {
+                    await fs.promises.mkdir(path.join(dir, 'all'), {recursive: true})
+                    try {
+                        await fs.promises.mkdir(subdir, {recursive: true})
+                    } catch(err) {
+                        subdir = subdir + '.dir' // TODO: apply this workaround to parent path entries too
+                        try {
+                            await fs.promises.mkdir(subdir, {recursive: true})
+                        } catch(err) {
+                            subdir = path.join(resource_type_dir, 'data')
+                            await fs.promises.mkdir(subdir, {recursive: true})
+                        }
+                    }
+                    ;({abspath: symlink_abspath, resp_mimetype, extension} = await detectFilename({page, response, dir: subdir, resourceType}))
+                    
+                    // responses/all/1716861056899__https%3A%2F%2Fwww.instagram.com%2Fgraphql%2Fquery.json
+                    uniq_filename = `${timestamp}__${method}__` + [encodeURIComponent(url.href).slice(0, 64).replaceAll('/', '_').replace(new RegExp(`.${extension}$`), ''), extension].filter(s => s.length).join('.')
+                    uniq_abspath = path.join(dir, 'all', uniq_filename)
+
+
+                    let bytesBuffer = null
+                    try {
+                        bytesBuffer = await response.buffer()
+                    } catch(err) {
+                        if (String(err).includes("Cannot read properties of undefined (reading 'body')")) {
+                            // not sure why it's happening but seems to be too late to caputre body sometimes? possible race condition
+                        } else {
+                            console.warn('[⚠️] Failed to save response bytes for:', response.request().url(), err)
+                        }
+                    }
+                    if (bytesBuffer) {
+                        // write response data into ./all/<TS>__<METHOD>__<URL>.<EXT>
+                        await overwriteFile(uniq_abspath, bytesBuffer)
+                        
+                        responseSha256 = crypto.createHash('sha256').update(bytesBuffer).digest('hex')
+
+                        // write symlink file to ./<TYPE>/<DOMAIN>/...<PATH>/<FILENAME>.<EXT>   ->  ./all/<TS>__<METHOD>__<URL>.<EXT>
+                        await overwriteSymlink(uniq_abspath, symlink_abspath, {relative: dir, mkdirs: true, search_limit: dir})
+                    }
+                    // console.log('    ->', symlink_abspath)
+                } catch(err) {
+                    // dont do anything for redirectresponses, error responses, etc.
+                    console.warn(err)
+                }
+
+                const urlSha256 = crypto.createHash('sha256').update(String(request.url())).digest('hex')
+                // const headersSha256 = crypto.createHash('sha256').update(String(request.headers()))   // someday we may want to save headers hashes too
+
+                const truncated_url = (method == 'DATA') ? request.url().slice(0, 128) : request.url()   // don't duplicate bytes in data: urls (we already saved them in the file)
+
+                // this is essentially replicating the functionality of a WARC file, but in directory + index.jsonl form
+                await fs.promises.appendFile(
+                    responses_log_path,
+                    JSON.stringify({
+                        ts: timestamp,
+                        method,
+                        url: truncated_url,
+                        urlSha256,
+                        postData: request.postData(),
+                        response_url: ((method != 'DATA') && (url.href != request.url())) ? url.href : undefined,
+                        status,
+                        resourceType,
+                        mimeType: resp_mimetype,
+                        responseSha256,
+                        path: uniq_abspath?.replace(dir, '.'),
+                        symlink_path: symlink_abspath?.replace(dir, '.'),
+                        extension,
+                    }) + '\n',
+                    'utf-8',
+                )
+            }
+        } catch(err) {
+            // we should never throw hard errors here because there's nothing above us to catch it
+            // and we dont want to crash the entire CDP session / browser / main node process
+            console.warn('[❌] Error in response handler (set in startResponseSaving):', err)
+        }
+    });
+    // handled by stopMetadataRecording():
+    // page.on('close', () => {
+    //     page.off('response')
+    // })
+}
+
+function dedupeCookies(cookies) {
+    const len_before = cookies.length
+
+    const allowed_cookie_attrs = ['domain', 'path', 'name', 'value', 'expires', 'sameSite', 'sourceScheme', 'url', 'priority', 'secure', 'httpOnly']
+
+    const deduped_cookies = {}
+    for (const cookie of cookies) {
+        try {
+            const unique_id = `${cookie.domain}${cookie.path}${cookie.name}`
+            deduped_cookies[unique_id] = {
+                ...(deduped_cookies[unique_id] || {}),
+                ...cookie,
+                expires: 2147483640,    // max allowed expiry time (2038-01-18)
+                session: false,         // make sure cookies dont expire at browser close time
+                secure: false,          // make cookie restrictions more lax (for archiving scripts)
+                httpOnly: false,        // make it easier to tamper with cookies from JS (for archiving scripts)
+                
+                // "path": "/",
+                // "expires": 2147483641,
+                // "size": 194,
+                // "httpOnly": false,
+                // "secure": false,
+                // "session": false,
+                // "priority": "High",
+                // "sameParty": false,
+                // "sourceScheme": "Secure",
+                // "sourcePort": 443
+                
+                // and more...                  https://pptr.dev/api/puppeteer.cookieparam
+            } as Cookie
+
+            if (!deduped_cookies[unique_id].value) {
+                delete deduped_cookies[unique_id]
+                continue
+            }
+            if (deduped_cookies[unique_id].name.startsWith('__')) {
+                // cookies that start with __ must be secure, see https://github.com/puppeteer/puppeteer/issues/6806
+                deduped_cookies[unique_id].secure = true
+                deduped_cookies[unique_id].sourceScheme = 'Secure'
+            }
+            if (deduped_cookies[unique_id].domain.startsWith('.')) {
+                deduped_cookies[unique_id].sameParty = false
+                deduped_cookies[unique_id].domain = deduped_cookies[unique_id].domain.slice(1)
+            }
+            
+            for (const key of Object.keys(deduped_cookies[unique_id])) {
+                if (!allowed_cookie_attrs.includes(key)) {
+                    delete deduped_cookies[unique_id][key]
+                }
+            }
+        } catch(err) {
+            console.error('[❌] Failed to parse cookie during deduping', cookie)
+            throw err
+        }
+    }
+    // console.log(`[🍪] Deduped ${len_before} cookies to ${Object.keys(deduped_cookies).length}...`)
+
+    return Object.values(deduped_cookies) as Cookie[]
+}
+
+async function loadCookiesTxt() {
+    const cookies = [] as Cookie[]
+    return cookies  // write-only from chrome -> files for now
+
+    if (fs.existsSync(COOKIES_TXT_PATH)) {
+        // console.log(`[🍪] Loading cookies/localStorage/sessionStorage from ${COOKIES_TXT_PATH}...`)
+
+        // Read from to cookies.txt file using tough-cookie + @root/file-cookie-store
+        const cookies_store = new FileCookieStore(COOKIES_TXT_PATH, {auto_sync: false, lockfile: false});
+        cookies_store.getAllCookiesAsync = util.promisify(cookies_store.getAllCookies);
+        const exported_cookies = await cookies_store.getAllCookiesAsync()
+        for (const cookie of exported_cookies) {
+            const cookie_from_tough = cookie.toJSON()
+            const domain = cookie_from_tough.hostOnly ? `.${cookie_from_tough.domain}` : cookie_from_tough.domain
+            const cookie_for_puppeteer: Cookie = {
+                domain,
+                name: cookie_from_tough.key,
+                path: cookie_from_tough.path,
+                value: cookie_from_tough.value,
+                secure: cookie_from_tough.secure || false,
+                httpOnly: cookie_from_tough.httpOnly || false,
+                session: false,
+                expires: (new Date(cookie_from_tough.expires)).valueOf()/1000,
+                size: undefined,
+            }
+            // console.log('COOKIE_FROM_TOUGH_TXT', cookie_from_tough, cookie_for_puppeteer)
+            cookies.push(cookie_for_puppeteer)
+        }
+    }
+}
+
+type AuthJSON = {
+    cookies: Cookie[],
+    sessionStorage: any,
+    localStorage: any,
+}
+
+async function loadAuthStorage(page, {client}, {apply=true}={}) {
+    var {
+        cookies,
+        sessionStorage,
+        localStorage,
+    }: AuthJSON = {cookies: [], sessionStorage: {}, localStorage: {}}
+    
+    if (!LOAD_AUTH_STORAGE) {
+        // dont read auth from filesystem auth.json/cookies.txt, just rely on existing cookies in chrome profile
+        return {cookies, sessionStorage, localStorage}
+    }
+
+    if (fs.existsSync(COOKIES_TXT_PATH)) {
+        try {
+            cookies = await loadCookiesTxt()
+        } catch(err) {
+            console.warn('[⚠️] Loaded invalid cookies.txt, moved it to cookies.txt.corrupted (did two processes try to change it at the same time?)')
+            await fs.promises.rename(COOKIES_TXT_PATH, COOKIES_TXT_PATH + '.corrupted')
+        }
+        // console.log(`[🍪] Loading cookies from cookies.txt...`, cookies.length)
+    }
+
+    if (fs.existsSync(AUTH_JSON_PATH)) {
+        try {
+            var {
+                cookies: auth_json_cookies,
+                sessionStorage,
+                localStorage,
+            } = JSON.parse(await fs.promises.readFile(AUTH_JSON_PATH, 'utf-8'));
+            cookies = [...cookies, ...auth_json_cookies]
+            // console.log(`[🍪] Loading cookies from auth.json...`, auth_json_cookies.length)
+        } catch(err) {
+            console.warn('[⚠️] Loaded invalid auth.json, moved it to auth.json.corrupted (did two processes try to change it at the same time?)')
+            await fs.promises.rename(AUTH_JSON_PATH, AUTH_JSON_PATH + '.corrupted')
+        }
+    }
+
+    cookies = dedupeCookies(cookies)
+
+    if (apply) {
+        console.log(`[🍪] Loading stored cookies/localStorage/sessionStorage into session...`, cookies.length)
+
+        // if (cookies?.length) {
+        //     try {
+        //         // try setting all at once first (much faster)
+        //         await page.setCookie(...cookies)
+        //     } catch(err) {
+        //         // if any errors, fall back to setting one-by-one so that individual error can be caught
+        //         for (const cookie of cookies) {
+        //             try {
+        //                 await page.setCookie(cookie);
+        //             } catch(err) {
+        //                 console.error('[❌] Failed to set cookie', cookie)
+        //                 throw err
+        //             }
+        //         }
+        //     }
+        // }
+        const origin = await page.evaluate(() => window.location.origin)
+
+        await page.evaluate((savedSessionStorage) => {
+            for (const [key, value] of Object.entries(savedSessionStorage)) {
+                sessionStorage[key] = value;
+            }
+        }, sessionStorage[origin] || {});
+      
+        await page.evaluate((savedLocalStorage) => {
+            for (const [key, value] of Object.entries(savedLocalStorage)) {
+                localStorage[key] = value;
+            }
+        }, localStorage[origin] || {});
+
+        // origin/auth context changes when we do page.goto so we have to hook pageload and apply it then as well
+        // https://stackoverflow.com/questions/51789038/set-localstorage-items-before-page-loads-in-puppeteer
+        await page.evaluateOnNewDocument(({sessionStorage, localStorage}) => {
+            const origin = window.location.origin;
+
+            for (const [key, value] of Object.entries(sessionStorage[origin] || {})) {
+                window.sessionStorage.setItem(key, value as string)
+            }
+            for (const [key, value] of Object.entries(localStorage[origin] || {})) {
+                window.localStorage.setItem(key, value as string)
+            }
+            
+        }, {sessionStorage, localStorage});
+    }
+
+    return {cookies, sessionStorage, localStorage}
+}
+
+async function loadCloudflareCookie(page, {original_url}, {timeout=20_000}={}) {
+    // make request to FlareSolverr server to get magic cookies that let us bypass cloudflare bot detection
+    // docker run -p 8191:8191 -e LOG_LEVEL=info ghcr.io/flaresolverr/flaresolverr
+
+
+    // alternatives if this stops working:
+    // - https://github.com/omkarcloud/botasaurus
+    // - https://github.com/ultrafunkamsterdam/nodriver
+    // - https://github.com/Akmal-CloudFreed/CloudFreed-CloudFlare-bypass
+    // - https://github.com/VeNoMouS/cloudscraper
+
+    const query = { url: original_url, cmd: "request.get", maxTimeout: timeout }
+    try {
+        const response = await fetch(FLARESOLVERR_API_ENDPOINT, {
+            method: 'POST',
+            headers: {'Content-Type': 'application/json'},
+            body: JSON.stringify(query),
+        });
+        const data = await response.json();
+
+        const new_cookies = (data?.solution?.cookies || []).map(cookie => ({
+            ...cookie,
+            'expires': 2147483640,       // overwrite expiration to 32bit maximum timestamp (2038-01-18)
+            'secure': false,             // cookie value is plain text (not encrypted/encoded)
+        }))
+
+        if (new_cookies.length) {
+            console.log(`[☑️] Got Cloudflare bypass cookies (${new_cookies.length}) from FlareSolverr API...`)
+            await page.setCookie(...new_cookies);
+            return new_cookies
+        } else {
+            const error_str = JSON.stringify(data?.message || data, null, 4)
+            throw `Bad FlareSolverr Response: ${error_str}`
+        }
+
+    } catch (error) {
+        if (JSON.stringify(error).includes('Challenge not detected')) {
+            console.log('[☑️] Page is accessible without FlareSolverr Cloudflare bypass.')
+        } else {
+            console.warn('[❌] Failed to get Cloudflare bypass cookies from FlareSolverr API.', error)
+        }
+    }
+    return []
+}
+
+async function setupURLRewriting(page, page_state) {
+    await page.setRequestInterception(true);
+
+    const rewrites = URL_REWRITES.sort((a, b) => (a.idx || 0) - (b.idx || 0))
+
+    page.on('request', interceptedRequest => {
+        if (interceptedRequest.isInterceptResolutionHandled()) return;
+
+        const original_url = interceptedRequest.url()
+
+        // apply all the rewrites in order to the request URL
+        let url = original_url
+        for (const rewrite of rewrites) {
+            const new_url = url.replace(rewrite.pattern, rewrite.replacement)
+            // console.log(rewrite, url, new_url)
+        
+            // if url is rewritten to an emptystring, abort the request
+            if (!new_url) {
+                console.warn('[🟥] Request blocked', rewrite.pattern, ':', url)
+                interceptedRequest.abort()
+                return
+            }
+            else if (new_url && new_url != url) {
+                // console.warn('[📳] Request rewritten', rewrite.pattern, rewrite.replacement, ':', url, '->', new_url)
+                console.warn('[📳] Request rewritten', rewrite.pattern, ':', new_url)
+                url = new_url
+            }
+        }
+
+        if (url == original_url) {
+            // if url is unchanged, continue request flow as-is
+            interceptedRequest.continue()
+        } else {
+            // otherwise redirect the browser to our rewritten version
+            interceptedRequest.respond({
+                status: 302,
+                headers: {
+                    location: url,
+                    'x-redirect-by': 'ArchiveBox.setupURLRewriting',
+                },
+            })
+        }
+    });
+    // handled by stopMetadataRecording():
+    // page.on('close', () => {
+    //     page.off('request')
+    //     page.setRequestInterception(false)
+    // })
+}
+
+async function startMetadataRecording(page, {original_url, version, client, traffic_log, console_log, redirects}) {
+    // update helper state on page
+    page._original_url = (original_url || (await page.url())).toString()
+
+    // DEBUGGING: helpers for repl() debugging, dont rely on these (global state is badd mmkay)
+    // page._client = client || page._client || await page.target().createCDPSession()
+    // page._redirects = redirects
+    // page._traffic_log = traffic_log
+
+    // add initial entry to page redirect log
+    redirects[original_url] = {
+        idx: 0,
+        url: original_url,
+        src: null,
+        type: 'Initial',
+        wallTime: Date.now()/1000,
+        frameId: page.mainFrame()._id,
+        requestId: null,
+        initiator: {type: "user"},
+        isMainFrame: true,
+    }
+    
+    // DEBUGGING: record optional chrome debug trace with screenshots (heavy)
+    // try {
+    //     await page.tracing.stop()
+    //     await wait(200)
+    // } catch(err) {}
+    // try {
+    //     await page.tracing.start({path: TRACE_PATH(page), screenshots: true});
+    // } catch(err) {}
+
+    let last_main_frame_url = original_url
+
+    // setup network request intercepts handler
+    const addCDPRequestDataListener = (eventName) => {
+        client.on(eventName, event => {
+            try {
+                // save any HTTP/JS redirects to redirects for saveRedirects(page) to use later on
+                const new_url = event.documentURL
+                const http_status = event.redirectResponse?.status || 0
+                const is_new_url = (new_url !== original_url) && !redirects[new_url]
+                const is_main_frame_navigation = (event.frameId == page.mainFrame()._id)
+                const is_http_redirect = (300 < http_status) && (http_status < 400)
+
+                if (new_url && is_new_url && (is_main_frame_navigation || is_http_redirect) && event.type == 'Document') {
+                    const new_redirect_entry = {
+                        url: new_url,
+                        src: event.redirectResponse?.url || last_main_frame_url,
+                        type: http_status || 'JS',
+                        wallTime: Date.now()/1000,
+                        frameId: event.frameId,
+                        requestId: event.requestId,
+                        initiator: event.initiator,
+                        idx: Object.keys(redirects).length,
+                        isMainFrame: is_main_frame_navigation,
+                    }
+                    redirects[new_url] = new_redirect_entry
+                    if (is_main_frame_navigation) {
+                        ALREADY_ARCHIVED.add(new_redirect_entry.url.slice(0, 4096))  // we're already archiving this tab as it redirects, dont create a duplicate archive for the destination
+                        console.warn(`[➡️] NAVIGATION[${new_redirect_entry.type}]${ANSI.blue} ${last_main_frame_url} ${ANSI.reset}\n                  ->${ANSI.blue} ${new_redirect_entry.url} ${ANSI.reset}`)
+                        last_main_frame_url = new_url
+                    }
+                }
+
+                if (event.loaderId) {
+                    traffic_log[event.loaderId] = traffic_log[event.loaderId] || {}       // make sure loader is also in requests list first
+                    // sometimes it's not in the list if we start archiving too late / after a page's initial request was already made
+                }
+
+                // save to traffic_log as {8BC2087A2CCEF28017099C0E10E87440: {Network.eventWillBeSent: {eventId,loaderId, request|response, ...}}
+                // https://stackoverflow.com/questions/47078655/missing-request-headers-in-puppeteer?noredirect=1&lq=1
+                traffic_log[event.requestId] = traffic_log[event.requestId] || {}
+                Object.assign(traffic_log[event.requestId], { [eventName]: event })
+                
+                // DEBUGGING: log page visits and navigation events to console
+                // if (event?.response?.status) {
+                //     // if we're expecting an HTML response, then we assume it's a page visit & log it to console
+                //     const acceptMimeType = traffic_log[event.requestId]['Network.requestWillBeSentExtraInfo']?.headers?.accept
+                //     if (acceptMimeType && acceptMimeType.includes('text/html')) {
+                //         // log any HTML page responses (less noisy)
+                //         console.log(`[>] GOT ${event.documentURL}: ${event.response.status} ${event.response.url} (${event.response.mimeType})`)
+                //     } else {
+                //         // log ALL responses, inclusing JS,CSS,Images,etc. (very noisy)
+                //         // console.log(`      > ${event.response.status} ${event.response.url} (${event.response.mimeType})`)
+                //     }
+                // }
+            } catch(err) {
+                console.warn('[X] Error during request/response handler (startMetadataRecording.addCDPRequestDataListener)')
+                console.warn(err)
+            }
+        })
+    }
+    addCDPRequestDataListener('Network.requestWillBeSent')
+    addCDPRequestDataListener('Network.requestWillBeSentExtraInfo')
+    addCDPRequestDataListener('Network.responseReceived')
+    addCDPRequestDataListener('Network.responseReceivedExtraInfo')
+
+    // clear any existing log entries
+    const consolelog_info = {
+        TYPE: 'console',
+        VERSION: version,
+        URL: original_url,
+    }
+    await overwriteFile(CONSOLELOG_PATH(page), JSON.stringify(consolelog_info) + '\n')
+
+    // record console logs from page
+    const appendConsoleLog = async (line) => {
+        if (!line) return
+        console_log.push(line)
+        await fs.promises.appendFile(
+            CONSOLELOG_PATH(page),
+            line + '\n',
+            'utf-8',
+        )
+    }
+
+    page.on('console', async(message) =>
+            await appendConsoleLog(`${message.type().toUpperCase()} ${message.location()} ${JSON.stringify(message.text())}`))
+    page.on('pageerror', async (error) =>
+            await appendConsoleLog(error.message || JSON.stringify(error)))
+    page.on('requestfailed', async (request) =>
+            await appendConsoleLog(`${request.failure()?.errorText} ${request.url() || JSON.stringify(request)}`))
+    
+    // set puppeteer options on page
+    await client.send('Network.enable')                         // enable network tampering API
+    await client.send('Emulation.clearDeviceMetricsOverride');  // clear timing statistics
+    await client.send('Page.setDownloadBehavior', {
+        behavior: 'allow',
+        downloadPath: CHROME_DOWNLOADS_DIR,
+    })
+
+    // handled by stopMetadataRecording():
+    // page.on('close', () => {
+    //     try {
+    //         page.off('request')
+    //         page.off('console')
+    //         page.off('pageerror')
+    //         page.off('requestfailed')
+    //         page.setRequestInterception(false)
+    //     } catch(err) {
+    //         // some versions of puppeteer have had race conditions here where page is already closed by now
+    //         console.warn('[X] Error in page close handler', err)
+    //     }
+    // })
+
+    return {original_url, client, redirects, traffic_log, console_log}
+}
+
+async function stopMetadataRecording(page, _page_state) {
+    console.log('[🪝] Stopping CDP event hooks and request interception...')
+    try {
+        page.off('request')
+        page.off('response')
+        page.off('console')
+        page.off('pageerror')
+        page.off('requestfailed')
+        page.off('hashchange')
+        page.setRequestInterception(false)
+        // page.tracing.stop()
+    } catch(err) {
+        // some versions of puppeteer have had race conditions here where page is already closed by now
+        console.warn('[X] Error in page close handler', err)
+    }
+}
+
+/********************** Human Behavior Emulation ******************************/
+
+async function solveCaptchas(page, page_state, {timeout=90_000}={}) {
+
+    // using puppeteer-extra-plugin-recaptcha auto-solver
+    // await page.solveRecaptchas()
+
+    // using 2captcha-solver extension auto-solver
+    try {
+        // console.log('[🕑] Waiting for CAPTCHA to appear...')
+        await page.waitForSelector('.captcha-solver', {timeout: 5_000})
+
+        console.log('[🤖] CAPTCHA challenge found, submitting to 2Captcha for solving...')
+        await page.click('.captcha-solver')
+
+        console.log(`[🧠] Waiting up to ${timeout/1000}s for CAPTCHA to be solved...`)
+        await page.waitForSelector(`.captcha-solver[data-state="solved"]`, {timeout})
+
+        console.log('[🔓] CAPTCHA solution retrieved from 2captcha.')
+    } catch(err) {
+        console.log('[☑️] No CATPCHA challenges found, site thinks we are human.')
+    }
+}
+
+async function jiggleMouse(page, page_state, {timeout=600}={}) {
+    console.log(`[🐁] Moving mouse around randomly for ${timeout/1000}s...`)
+
+    const randomPoint = await getRandomPagePoint(page)
+    const cursor = createCursor(page, randomPoint, true)
+
+    cursor.toggleRandomMove(true)
+    await wait(timeout/2);
+    await cursor.moveTo({x: DEFAULT_VIEWPORT.width/2, y: DEFAULT_VIEWPORT.height/2});
+    await wait(timeout/2);
+    cursor.toggleRandomMove(false)
+}
+
+async function blockRedirects(page, {original_url}) {
+    page.on('request', req => {
+        if (req.isInterceptResolutionHandled()) return;
+
+        // if it's a top-level navigation event to a new url
+        if (req.isNavigationRequest() && req.frame() === page.mainFrame() && req.url() !== original_url) {
+            req.abort('aborted');
+            console.warn('[🟥] Blocked page attempt to naviage to new URL', req.url())
+        } else {
+            req.continue();
+        }
+    });
+    // handled by stopMetadataRecording():
+    // page.on('close', () => {
+    //     page.off('request')
+    //     page.setRequestInterception(false)
+    // })
+    await page.setRequestInterception(true);
+}
+
+async function blockJSExecution(page, _page_state) {
+    console.warn('[🟥] Stopping all JS execution on page...')
+    await page.evaluate(() => {
+        debugger; 
+    })
+    // OR alternatively this (more buggy, breaks many sites):
+    // const html = await page.content();
+    // page.setJavaScriptEnabled(false);
+    // await page.setContent(html, { waitUntil: 'networkidle0' }); // 4
+}
+
+async function scrollDown(page, _page_state, {timeout=120_000, scroll_delay=SCROLL_DELAY, scroll_distance=SCROLL_DISTANCE, scroll_limit=SCROLL_LIMIT}={}) {
+    const starting_height = await page.evaluate('document.body.scrollHeight');
+    let last_height = starting_height
+
+    let scroll_count = 0;
+    let scroll_position = scroll_count * scroll_distance
+    // await page.bringToFront()
+
+    // scroll to top
+    await page.evaluate(() => { window.scrollTo({ top: 0, left: 0, behavior: 'smooth' }); });
+
+    while ((scroll_count < scroll_limit) && ((scroll_delay * scroll_count) < timeout)) {
+        console.log(`[⬇️] Scrolling down ${scroll_count}x 1000px... (${scroll_position}/${last_height})`)
+        await page.evaluate((y_offset) => { window.scrollTo({ top: y_offset, left: 0, behavior: 'smooth' }); }, scroll_position);
+        scroll_count++
+        scroll_position = scroll_count * scroll_distance
+
+        // check if any new content was added / if we are infiniscrolling
+        let new_height = await page.evaluate('document.body.scrollHeight')
+        const added_px = new_height - last_height
+        if (added_px > 0) {
+            console.log('[✚] Detected infini-scrolling...', `${last_height}+${added_px} => ${new_height}`)
+        } else if (scroll_position >= new_height + scroll_distance) {
+            // we've reached the bottom, condition isn't true until we've tried to go n+1 past the end (which is fine)
+            if (scroll_count > 2)
+                break
+        }
+        last_height = new_height
+        
+        // sleep 2s, perform the smooth scroll down by 1000px, and increment the counter
+        await wait(scroll_delay);
+
+        // facebook watch pages infiniscroll (more and more recommendations forever), stop them after 3 pages
+        if (page._original_url.startsWith('https://www.facebook.com/watch/?v') && scroll_count > 3) break
+    }
+
+    // scroll to bottom
+    if (scroll_position < last_height) {
+        await page.evaluate(() => { window.scrollTo({ top: document.body.scrollHeight, left: 0, behavior: 'smooth' }); });
+        await wait(scroll_delay)
+        await page.evaluate(() => { window.scrollTo({ top: document.body.scrollHeight, left: 0, behavior: 'smooth' }); });
+    }
+
+    // Always wait an additional 2sec at the end for scroll animations / loading / rendering to settle down
+    console.log('[📉] Reached bottom of the page.', `(${scroll_position}/${last_height})`)
+    await wait(scroll_delay);
+    await page.evaluate(() => { window.scrollTo({ top: 0, left: 0, behavior: 'smooth' }); });
+    await wait(scroll_delay);
+
+    return last_height
+}
+
+async function disableAnimations(page, _page_state) {
+    console.log(`[⛄️] Disabling all animations using CSS override...`)
+
+    // https://stackoverflow.com/questions/53167644/injecting-css-into-site-with-puppeteer
+    const css_override = `*, *::before, *::after {
+        -moz-animation: none !important;
+        -moz-transition: none !important;
+        animation: none !important;
+        transition: none !important;
+        caret-color: transparent !important;
+    }`
+
+    // inject override into current page
+    await page.addStyleTag({content: css_override});
+
+    // inject override into any subsequently navigated pages
+    await page.evaluateOnNewDocument((css_override) => {
+        const style_tag = document.createElement('style')
+        style_tag.type = 'text/css'
+        style_tag.innerHTML = css_override
+        document.getElementsByTagName('head')[0].appendChild(style_tag)
+    }, css_override);
+}
+
+async function expandComments(page, _page_state, {timeout=120_000, limit=15_000, delay=650}={}) {
+    console.log(`[🗃️] Expanding up to ${limit} comments every ${delay}ms...`)
+    
+    // expand all <details> sections in Github READMEs, HedgeDoc pages, etc.
+    await page.$$eval('pierce/article details', elem => {elem.open = true})           // expand Github README details sections
+    await page.$$eval('pierce/div.js-discussion details:not(.details-overlay)', elem => {elem.open = true}) // expand Github issue discussion hidden comments
+    await page.$$eval('pierce/.markdown-body details', elem => {elem.open = true})    // expand HedgeDoc Markdown details sections
+
+    await page.exposeFunction('onHashChange', url => page.emit('hashchange', url));
+    await page.evaluateOnNewDocument(() => {
+        // @ts-ignore
+        addEventListener('hashchange', (e) => onHashChange(location.href));
+    });
+
+    // Listen for hashchange events in node Puppeteer code.
+    page.on('hashchange', url => console.log('Page tried to navigate to:', new URL(url)));
+
+
+    const num_expanded = await page.evaluate(async ({timeout, limit, delay}) => {
+        function getElementsByXPath(xpath, ctx?) {
+            var results = [];
+            var xpathResult = document.evaluate(
+                xpath,                                                          // e.g. //*[text()='"+text+"'] 
+                ctx || document,
+                null,
+                XPathResult.ORDERED_NODE_ITERATOR_TYPE,
+                null
+            );
+            var node;
+            while ((node = xpathResult.iterateNext()) != null) {
+               results.push(node);
+            }
+            return results;
+        }
+
+        let num_expanded = 0
+        const getLoadMoreLinks = () => [
+            // find all the buttons/links to expand collapsed/hidden/lazy-loaded content 
+            ...document.querySelectorAll('faceplate-partial[loading=action]'),  // new reddit
+            ...document.querySelectorAll('a[onclick^="return morechildren"]'),  // old reddit show more replies
+            ...document.querySelectorAll('a[onclick^="return togglecomment"]'), // old reddit show hidden replies
+            // ...document.querySelectorAll('a.js-show-link'),                     // stack overflow comments show more (TODO: make this only work on SO)
+            // ...document.querySelectorAll('a.morelink'),                         // HackerNews profile show more (TODO: make this only work on HN)
+            // ...getElementsByXPath("//*[text()~='View \d+ replies']"),        // facebook comment expander
+            ...getElementsByXPath("//*[text()='Show more replies']"),           // twitter infiniscroll expander
+            ...getElementsByXPath("//*[text()='Show replies']"),                // twitter replies expander
+        ]
+        const wait = (ms) => new Promise(res => setTimeout(res, ms))
+
+        let load_more_links = getLoadMoreLinks()
+        while (load_more_links.length) {
+            console.log('Expanding comments...', load_more_links.length)
+            for (const link of load_more_links) {
+                link.scrollIntoView({behavior: 'smooth'})
+                if (link.slot == 'children') {
+                    continue
+                    // patch new reddit "More replies" links that would open in a new window to display inline instead
+                    // const comment_id = link.src.split('?')[0].split('/').at(-1)
+                    // link.slot = `children-${comment_id}-0`
+                    // link.__alwaysShowSlot = false
+                }
+                // click the "More replies" button
+                link.click()
+                num_expanded++
+                await wait(delay)
+                const time_elapsed = num_expanded * delay
+                if ((num_expanded > limit) || (time_elapsed > timeout))
+                    return num_expanded
+            }
+            load_more_links = getLoadMoreLinks()
+        }
+        return num_expanded
+    }, {timeout, limit, delay});
+
+    page.off('hashchange')
+
+    if (num_expanded) {
+        console.log(`[🗃️] Expanded ${num_expanded} comments...`)
+        
+        // scroll to bottom, then back up to top
+        const final_height = await page.evaluate('document.body.scrollHeight');
+        await page.evaluate((top) => { window.scrollTo({ top, left: 0, behavior: 'smooth' }); }, final_height + 1000);
+        await wait(delay);
+        await page.evaluate(() => { window.scrollTo({ top: 0, left: 0, behavior: 'smooth' }); });
+        await wait(delay);
+    }
+
+}
+
+async function submitForm(page, _page_state, {timeout=5_000}={}) {
+    try {
+        await page.waitForSelector('form button[type=submit]', {timeout: 1_500});
+        console.log('[☑️] Submitting form...')
+        await page.click('form button[type=submit]')
+        await page.waitForNavigation({timeout});
+        await page.goBack();
+    } catch (err) {
+        // no form found
+    }
+}
+
+// TODO: add an evasion to set navigator.connection.rtt = 365 (0 = detectable as headless)
+
+/******************************************************************************/
+/******************************************************************************/
+
+/**************** Extension-Based Archive Output Tasks ************************/
+
+async function saveSinglefile(page, {main_response, extensions}) {
+    const extension = extensions.filter(({name}) => name === 'singlefile')[0]
+    if (!extension.version) throw 'Could not find Singlefile extension ID, is it installed?'
+
+    const url = await page.url() || main_response.url()
+    if (URL_SCHEMES_IGNORED.includes(url.split(':')[0])) return null
+
+    // get list of existing past files in downloads/* to ignore
+    const files_before = new Set(
+        (await fs.promises.readdir(CHROME_DOWNLOADS_DIR))
+            .filter(fn => fn.endsWith('.html'))
+    );
+
+    const out_path = SINGLEFILE_PATH(page)
+
+    console.log(`[🛠️] Saving Singlefile HTML using extension (${extension.id})...`.padEnd(82+1), prettyPath(CHROME_DOWNLOADS_DIR))
+    await page.bringToFront()     // action button acts on the foreground tab, so it has to be in front :(
+    await extension.dispatchAction()
+    let files_new = []
+
+    const check_delay = 3_000
+    for (const _try in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) {
+        await wait(check_delay)
+
+        const files_after = (await fs.promises.readdir(CHROME_DOWNLOADS_DIR)).filter(fn => fn.endsWith('.html'));
+        files_new = files_after.filter(file => !files_before.has(file))
+
+        if (files_new.length == 0) {
+            // console.warn(`    ...waiting for Singlefile to write HTML into ${CHROME_DOWNLOADS_DIR}...`)
+            continue
+        }
+        // iterate through new downloads and find a matching .html containing our page's URL in the header
+        for (const file of files_new) {
+            const dl_path = path.join(CHROME_DOWNLOADS_DIR, file)
+            const dl_text = await fs.promises.readFile(dl_path, 'utf-8')
+            const dl_header = dl_text.split('meta charset')[0]
+            if (dl_header.includes(`url: ${url}`)) {
+                /// dont need this check anymore as now all output is versioned:
+                // if (fs.existsSync(out_path)) {
+                //     const {size: existingSize} = await fs.promises.stat(out_path)
+                //     const {size: newFileSize} = await fs.promises.stat(dl_path)
+                //     if (newFileSize < existingSize) {
+                //         console.log(`[🗑️] Discarding singlefile output (${file}) as it's smaller than existing ${out_path}...`)
+                //         await fs.promises.rm(dl_path)
+                //         return out_path
+                //     }
+                // }
+                console.log(`[✍️] Moving Singlefile download from ${file}...`.padEnd(82), prettyPath(out_path))
+                await fs.promises.rename(dl_path, out_path)
+                return out_path
+            }
+        }
+    }
+
+    console.warn(`[❌] Couldn't find matching Singlefile HTML in ${CHROME_DOWNLOADS_DIR} after waiting ${(check_delay*10)/1000}s:`, files_new.join(', '))
+    return null
+}
+
+async function saveArchiveWebPage(page, {extensions}, {timeout=30_000}={}) {
+    // TODO: waiting on them to expose commands so we can generate .wacz easily
+    // https://github.com/webrecorder/archiveweb.page/issues/207
+    // ...
+    const browser = await page.browser()
+    const extension = extensions.filter(({name}) => name === 'archivewebpage')[0]
+    await page.bringToFront()
+    await extension.dispatchPopup()
+    await extension.dispatchAction()
+    const popup = await browser.waitForTarget(
+        target => target.url().toString().startsWith(`chrome-extension://${extension.id}/popup.html`),
+        {timeout: 5_000},
+    )
+    await page.bringToFront()
+
+    // await puppeteer.Locator.race([
+    //     popup.locator('::-p-aria(Start With Autopilot)'),
+    //     popup.locator('wr-popup-viewer >>>> input'),
+    //     popup.locator(':scope >>> input')
+    // ])
+    // .setTimeout(timeout)
+    // .click({
+    //   offset: {
+    //     x: 7.7265625,
+    //     y: 7.203125,
+    //   },
+    // });
+
+    // @ts-ignore
+    await puppeteer.Locator.race([
+        popup.locator('wr-popup-viewer >>>> div.status-row > p'),
+        popup.locator(':scope >>> div.status-row > p'),
+        popup.locator('::-p-text(Recording: \n)')
+    ]).setTimeout(timeout).click({
+      delay: 733.3000000007451,
+      offset: {
+        x: 293,
+        y: 13.5,
+      },
+    })
+
+    await wait(8_000)
+
+    // @ts-ignore
+    await puppeteer.Locator.race([
+        popup.locator('wr-popup-viewer >>>> div:nth-of-type(2) > button > span:nth-of-type(2)'),
+        popup.locator(':scope >>> div:nth-of-type(2) > button > span:nth-of-type(2)'),
+        popup.locator('::-p-text(Stop)')
+    ]).setTimeout(timeout).click({
+      offset: {
+        x: 7.859375,
+        y: 23.203125,
+      },
+    });
+
+    return null
+}
+
+async function savePocket(page, {extensions}) {
+    const browser = await page.browser()
+    const extension = extensions.filter(({name}) => name === 'pocket')[0]
+    if (!extension.version) throw 'Could not find Pocket extension ID, is it installed?'
+
+    console.log(`[🛠️] Saving URL to Pocket API using extension (${extension.id})...`, 'https://getpocket.com/saves')
+    await page.bringToFront()    // action button acts on the foreground tab, so it has to be in front
+    await extension.dispatchAction()
+    try {
+        const login_window = await browser.waitForTarget(
+            target => target.url().toString().startsWith('https://getpocket.com/'),
+            {timeout: 3_000},
+        )
+        // login window will open if pocket is not signed-in
+        if (login_window) return false
+    } catch(e) {
+        // no new window should open if it saves correctly
+        return true
+    }
+}
+
+/***************** Synchronous Archive Output Tasks ***************************/
+
+async function saveScreenrecording(page, page_state, {save_gif=true}={}) {
+    if (page_state.recorder) {
+        const duration = Date.now() - page_state.start_ts
+        console.log(`[🎥] Saving screen-recording video (${duration/1000}s)...`.padEnd(82), prettyPath(SCREENRECORDING_PATH(page)))
+        const recorder = page_state.recorder
+        page_state.recorder = null
+        await recorder.stop()
+
+        // create symlink for legacy path
+        const snap_dir = page_state.snapshot_dir
+        const legacy_path = path.join(snap_dir, 'media', 'screenrecording.mp4')
+        await overwriteSymlink(SCREENRECORDING_PATH(page), legacy_path, {relative: snap_dir, search_limit: snap_dir})
+
+        // // remove duplicate frames (white frames at start while it loads + static image at end)
+        // const video_path = SCREENRECORDING_PATH(page)
+        // const short_path = video_path.replace('.mp4', '.short.mp4')
+        // try {
+        //     await exec(
+        //         // create a shortened video starting from 0:02s to 0:01s with duplicate frames removed (can look jumpy sometimes)
+        //         `ffmpeg -ss 2 -sseof -1 -y -i ${video_path} -vf mpdecimate,setpts=N/FRAME_RATE/TB ${short_path}`
+        //     )
+        // } catch(err) {
+        //     console.log('[❌] Failed to shorten screenrecording.mp4')
+        // }
+
+        // convert video to GIF
+        if (save_gif) {
+            try {
+                const BIN_NAME = '/Volumes/NVME/Users/squash/bin/ffmpeg'
+                const child = child_process.spawn(
+                    BIN_NAME,
+                    [
+                        '-hide_banner',
+                        '-loglevel', 'error',
+                        '-ss', '3',
+                        '-t', '10',
+                        '-y',
+                        '-i', SCREENRECORDING_PATH(page),
+                        '-vf', "fps=10,scale=1024:-1:flags=bicubic,split[s0][s1];[s0]palettegen[p];[s1][p]paletteuse",
+                        '-loop', '0',
+                        SCREENRECORDGIF_PATH(page),
+                    ],
+                    {
+                        cwd: path.dirname(SCREENRECORDING_PATH(page)),
+                        timeout: 60_000,
+                        // stdio: [null, 'pipe', 'pipe'],
+                        stdio: 'ignore',
+                        detached: true,                          // run in background, don't block on response
+                    },
+                )
+                await blockUntilExists(SCREENRECORDGIF_PATH(page), {min_bytes: 100, timeout: 40_000})
+                console.log(`[🎥] Saved screen-recording GIF with ffmpeg pid=${child.pid} (${duration/1000}s)...`.padEnd(82), prettyPath(SCREENRECORDGIF_PATH(page)))
+
+                const snap_dir = page_state.snapshot_dir
+                const legacy_path = path.join(snap_dir, 'media', 'screenrecording.gif')
+                await overwriteSymlink(SCREENRECORDGIF_PATH(page), legacy_path, {relative: snap_dir, search_limit: snap_dir})
+            } catch(err) {
+                console.log('[❌] Failed to convert video to GIF:', err)
+            }
+        }
+
+        return SCREENRECORDING_PATH(page)
+    }
+    return null
+}
+
+async function saveScreenshot(page, _page_state, {aspect_ratio=SCREENSHOT_ASPECT_RATIO, width=null, height=null, jpg_width=1440, jpg_quality=90, timeout=30_000}={}) {
+    try {await fs.promises.unlink(SCREENSHOT_PATH(page))} catch(err) {}
+    
+    // setup width and height
+    width = width || DEFAULT_VIEWPORT.width
+    assert((typeof width === 'number') && width > 200)
+    height = height || Math.floor(width/aspect_ratio)
+    assert((typeof height === 'number') && height > 200)
+    
+    console.log(`[📸] Saving full-page screenshot (${width}x${height}px)...`.padEnd(82), prettyPath(SCREENSHOT_PATH(page)))
+
+    // set width, height, and deviceScale factor: https://github.com/puppeteer/puppeteer/issues/1576
+    await page.setViewport({ ...DEFAULT_VIEWPORT, width, height, deviceScaleFactor: 2})
+    await page.bringToFront()
+    await wait(1_250)       // page takes a sec settle after foregrounding and viewport update
+
+    // take lossless fullpage screenshot of 1920x1440+px (4:3+) -> ./screenshot.png
+    await page.screenshot({ path: SCREENSHOT_PATH(page), fullPage: true, type: 'png' })
+    
+    // wait for the screenshot to be created, then set the viewport to the next size
+    await blockUntilExists(SCREENSHOT_PATH(page), {min_bytes: 100, timeout})
+    await wait(6_000)       // puppeteer takes a while to finish writing png data when fullPage: true
+    
+    const jpg_height = Math.floor(jpg_width/aspect_ratio)
+    await page.setViewport({ ...DEFAULT_VIEWPORT, width: jpg_width, height: jpg_height, deviceScaleFactor: 2})
+    await wait(1_250)       // page takes a sec settle after foregrounding and viewport update
+
+    // WARNING: make sure you never try to create two screenshots at the same time (especially not fullpage screenshots)
+    // thats why there are all these delays here.
+    // screenshot creation messes up the whole viewport while it's running,
+    // and it writes bad/white empty screenshots if you try to make more than one concurrently
+
+    // take compressed screenshot of jpg_width*jpg_height (4:3) -> ./screenshot.jpg
+    await page.screenshot({
+        path: SCREENSHOT_JPG_PATH(page),
+        type: 'jpeg',
+        quality: jpg_quality,
+        clip: {
+            x: 0,
+            y: 0,
+            width: jpg_width,
+            height: jpg_height,
+        },
+        captureBeyondViewport: false,
+    });
+    await blockUntilExists(SCREENSHOT_JPG_PATH(page), {min_bytes: 100, timeout: timeout/2})
+    console.log(`[📸] Saved screenshot as screenshot.jpg (${jpg_width}x${jpg_height}px)...`.padEnd(82), prettyPath(SCREENSHOT_JPG_PATH(page)))
+
+    // reset viewport back to defaults
+    await wait(1_250)
+    await page.setViewport(DEFAULT_VIEWPORT)
+
+    // ALTERNATIVE METHOD based on cropping fullpage png and converting to jpg manually:
+    // import {PNG} from 'pngjs';
+    // import jpeg from 'jpeg-js';
+    // setTimeout(async () => {
+    //     try {
+    //         const screenshot_png = SCREENSHOT_PATH(page);
+    //         const screenshot_jpg = SCREENSHOT_JPG_PATH(page)
+    //         const jpg_max_height = height
+    //         const jpg_quality = quality; // Adjust the quality as needed (0-100)
+
+    //         fs.createReadStream(screenshot_png)
+    //             .pipe(new PNG())
+    //             .on('parsed', function () {
+    //                 const width = this.width;
+    //                 const height = this.height;
+            
+    //                 let cropped_height = height;
+    //                 if (height > jpg_max_height) {
+    //                   cropped_height = jpg_max_height;
+    //                 }
+            
+    //                 const cropped_bytes = new Uint8Array(width * cropped_height * 4);
+    //                 for (let y = 0; y < cropped_height; y++) {
+    //                   for (let x = 0; x < width; x++) {
+    //                     const idx = (width * y + x) << 2;
+    //                     cropped_bytes[idx] = this.data[idx];
+    //                     cropped_bytes[idx + 1] = this.data[idx + 1];
+    //                     cropped_bytes[idx + 2] = this.data[idx + 2];
+    //                     cropped_bytes[idx + 3] = this.data[idx + 3];
+    //                   }
+    //                 }
+            
+    //                 const jpeg_obj = {
+    //                   data: cropped_bytes,
+    //                   width: width,
+    //                   height: cropped_height,
+    //                 };
+            
+    //                 const jpeg_bytes = jpeg.encode(jpeg_obj, jpg_quality);
+    //                 fs.writeFileSync(screenshot_jpg, jpeg_bytes.data);
+    //                 console.log(`[📸] Saved screenshot as screenshot.jpg (${width}x${jpg_max_height}px)...`.padEnd(82), prettyPath(SCREENSHOT_JPG_PATH(page)))
+    //             });
+    //     } catch(err) {
+    //         console.error('[X] Error while generating JPG screenshot', SCREENSHOT_JPG_PATH(page), err)
+    //     }
+    // }, DELAY_BEFORE_JPG_CONVERSION)  
+
+    // ALTERNATIVE METHOD TO WRITE SCREENSHOT JPG:
+    // await wait(5_000)  // puppeteer takes a while to finish writing png data when fullPage: true
+    // if ((await page.evaluate('document.body.scrollHeight')) > max_height) {
+    //     // if page exceeds max_height, save additional cropped screenshot as screenshot.top.png
+    //     // (needed b.c. uncropped screenshot may have insane 1:20+ aspect ratio that is hard to use elsewhere)
+    //     await page.screenshot({ path: SCREENSHOT_JPG_PATH(page), type: 'jpg', quality: 100})
+    //     await wait(1_000)  // page takes a sec settle after a screenshot
+    // }
+
+    return SCREENSHOT_PATH(page)
+}
+
+async function savePDF(page, _page_state, {timeout=30_000}={}) {
+    const url = page.url() || 'about:blank'
+    if (URL_SCHEMES_IGNORED.includes(url.split(':')[0])) return null
+
+    const out_path = PDF_PATH(page)
+    console.log(`[📓] Saving print-as-PDF export...`.padEnd(82), prettyPath(out_path))
+    await page.bringToFront()
+    try {await fs.promises.unlink(PDF_PATH(page))} catch(err) {}
+
+    // await page.emulateMediaType('screen')           // print as "@media(screen) instead of @media(print)"
+
+    // page.createPDFStream lets us to save larger PDFs than page.pdf() before crashing
+    // (streams to disk in chunks instead of all at once)
+    const pdf_stream = await page.createPDFStream({
+        timeout: timeout,
+        printBackground: true,
+        outline: true,
+        tagged: true,
+        format: 'A4',
+        displayHeaderFooter: false,
+        // margin: { top: '0.5cm', right: '1cm', bottom: '0.8cm', left: '1cm' },
+    })
+    const reader = pdf_stream.getReader()
+
+    // iterate through reader and append chunks to out_path
+    await fs.promises.rm(out_path, {force: true})
+    let num_bytes = 0
+    let error = '0 bytes written'
+    try {
+        while (true) {
+            const {done, value} = await reader.read()
+            if (done) break;
+            await fs.promises.appendFile(out_path, value)
+            num_bytes += value.length;
+        }
+    } catch(error) {
+        num_bytes = 0
+    }
+
+    if (!num_bytes) {
+        console.warn('[❌] Failed to save PDF', JSON.stringify(error, null, 4))
+        await fs.promises.rm(out_path, {force: true})
+        return null
+    }
+
+    return out_path
+}
+
+async function inlineShadowDOM(page, _page_state, {limit=100_000}={}) {
+    console.log(`[😎] Replacing Shadow DOM elements with inline HTML...`)
+
+    try {
+        const num_replaced = await page.evaluate((limit) => {
+            let num_replaced = 0
+
+            // Returns HTML of given shadow DOM.
+            const getShadowDomHtml = (shadowRoot) => {
+                let shadowHTML = '';
+                for (const el of shadowRoot.childNodes) {
+                    shadowHTML += el.nodeValue || el.outerHTML;
+                }
+                return shadowHTML;
+            };
+
+            // Recursively replaces shadow DOMs with their HTML.
+            const replaceShadowDomsWithHtml = (rootElement) => {
+                if (num_replaced > limit) return
+                for (const el of rootElement.querySelectorAll('*')) {
+                    if (el.shadowRoot) {
+                        replaceShadowDomsWithHtml(el.shadowRoot);
+                        el.innerHTML += getShadowDomHtml(el.shadowRoot);
+                    }
+                }
+                num_replaced++
+            };
+
+            replaceShadowDomsWithHtml(document.body);
+
+            return num_replaced
+        }, limit)
+        // console.log('    √ replaced', num_replaced, 'Shadow DOM trees')
+    } catch(err) {
+        console.log('[⚠️] Inlining Shadow DOM failed', err)
+    }
+}
+
+async function saveAIQualityAssuranceResult(page, {original_url, version}) {
+    console.log(`[🧠] Analyzing screenshot with GPT-4o for QA checks...`.padEnd(82), prettyPath(AIQA_PATH(page)))
+    
+    let screenshot_path = SCREENSHOT_PATH(page)
+    const screenshot_cropped_path = SCREENSHOT_JPG_PATH(page)
+
+    if (fs.existsSync(screenshot_cropped_path)) {
+        // screenshot is too tall to pass to openai, send cropped version instead
+        screenshot_path = screenshot_cropped_path
+    }
+    try {
+        await blockUntilExists(screenshot_path, {min_bytes: 100, timeout: 7_500})
+    } catch (err) {
+        console.warn('[❌] Failed to send screenshot to GTP-4o for analysis, no screenshot.{png,jpg} exists', err)
+        return null
+    }
+    var stdout = ''
+    var stderr = ''
+    let result = null
+    const PYTHON_BIN = path.join(__dirname, '.venv/bin/python')
+    const SCRIPT_PATH = path.join(__dirname, 'ai_qa.py')
+    await blockUntilExists(PYTHON_BIN, {min_bytes: 1, timeout: 250})
+    await blockUntilExists(SCRIPT_PATH, {min_bytes: 1, timeout: 250})
+
+    try {
+        var {stdout, stderr} = await exec(
+            `${PYTHON_BIN} ${SCRIPT_PATH} --attach '${screenshot_path}'`
+        )
+        result = JSON.parse(stdout.toString())
+        if (!result) throw 'Got empty result!'
+        result = {
+            TYPE: 'aiqa',
+            VERSION: version,
+            URL: original_url,
+            ...result,
+        }
+    } catch(parse_err) {
+        console.warn('[❌] Failed to get OpenAI analysis for screenshot.png', parse_err, stderr)
+    }
+    if (!(result || stdout)) {
+        return null
+    }
+    await overwriteFile(
+        AIQA_PATH(page),
+        result || stdout.toString(),
+    )
+ 
+    
+
+    return result
+}
+
+async function saveYTDLP(page, {original_url, version}, {max_size='750m'}={}) {
+    console.log(`[🎥] Saving media with YT-DLP (<=${max_size})...`.padEnd(82), prettyPath(YTDLP_PATH(page)))
+
+    await fs.promises.mkdir(YTDLP_PATH(page), {recursive: true})
+
+    const cwd = YTDLP_PATH(page)
+    const bin_name = 'yt-dlp'
+    const timeout = 300_000              // 5min timeout
+    const args = [
+        '--restrict-filenames',
+        '--trim-filenames', '128',
+        '--write-description',
+        '--write-info-json',
+        '--write-annotations',
+        '--write-thumbnail',
+        '--no-call-home',
+        '--write-sub',
+        '--write-auto-subs',
+        '--convert-subs=srt',
+        '--yes-playlist',
+        '--continue',
+        '--no-abort-on-error',
+        '--ignore-errors',
+        '--geo-bypass',
+        '--add-metadata',
+        `--format=(bv*+ba/b)[filesize<=${max_size}][filesize_approx<=?${max_size}]/(bv*+ba/b)`,
+        '--no-check-certificate',
+        '--no-progress',
+        // `--cookies=${COOKIES_TXT_PATH}`,   // using logged in cookies actually makes it fail more often, not sure why
+        original_url,
+    ]
+
+    const {getResult, ...exec_info} = await saveExecResult(bin_name, args, {original_url, version}, {cwd, timeout})
+
+    return {getResult, ...exec_info}
+}
+
+async function saveGALLERYDL(page, {original_url, version}) {
+    console.log(`[🎥] Saving photos with gallery-dl...`.padEnd(82), prettyPath(GALLERYDL_PATH(page)))
+
+    await fs.promises.mkdir(GALLERYDL_PATH(page), {recursive: true})
+
+    const cwd = GALLERYDL_PATH(page)
+    const bin_name = 'gallery-dl'
+    const timeout = 300_000              // 5min timeout
+    const args = [
+        '--verbose',
+        '--write-metadata',
+        '--write-infojson',
+        '--write-tags',
+        '--sleep=1.5-2.5',
+        `--cookies=${COOKIES_TXT_PATH}`,
+        // '--no-check-certificate',
+        // `--directory=media`,
+        original_url,
+    ]
+
+    const {getResult, ...exec_info} = await saveExecResult(bin_name, args, {original_url, version}, {cwd, timeout})
+
+    return {getResult, ...exec_info}
+}
+
+// async function saveWget(page, {original_url, version}) {
+//     console.log(`[⎒] Saving wget site clone...`.padEnd(82), prettyPath(WGET_PATH(page)))
+
+//     const args = [
+//         // ...
+//     ]
+ 
+//     spawn(
+//         'wget',
+//         [
+//             ...args,
+//             original_url,
+//         ],
+//         {
+//             cwd: WGET_PATH(page),
+//             detached: true,         // run in background, don't block on response
+//             stdio: 'ignore',
+//             timeout: 300_000,       // 5min timeout
+//         },
+//     )
+
+//     return {path: WGET_PATH(page)}
+// }
+
+/**************** Asynchronous Archive Output Tasks ***************************/
+
+type FaviconCandidate = {
+    url: string,
+    basename: string,
+    extension: string,
+    expected_mimetype: string,
+}
+
+const faviconFromDomain = (url) => {
+    // https://auth:[email protected]:1234/a/bc123 -> https://auth:[email protected]:1234/favicon.ico
+    const url_origin = (new URL(url)).origin
+    return {
+        url: url_origin ? `${url_origin}/favicon.ico` : null,
+        basename: 'favicon',
+        extension: undefined,                     // auto-detect extension at download time in case it redirects us to a png
+        expected_mimetype: 'image/',              // only accept image/* to avoid saving html/txt error reponses as icon
+    } as FaviconCandidate
+}
+
+const faviconFromGoogle = (url, size=256) => {
+    // https://auth:[email protected]:1234/a/bc123 -> https://www.google.com/s2.favicons?domain=t.co
+    const domain = url && (new URL(url)).hostname
+    return {
+        url: domain?.includes('.') ? `https://www.google.com/s2/favicons?sz=${size},domain=${domain}` : null,
+        basename: 'google_favicon',
+        extension: 'png',
+        expected_mimetype: 'image/png',           // google always provides PNGs in response
+    } as FaviconCandidate
+}
+
+const faviconFromHtml = async (page) => {
+    // <link rel="icon" src="https://example.com/static/images/favicon.png"/> -> https://example.com/static/images/favicon.png
+    let url
+    try {
+        url = await page.$eval('link[rel*="icon"]', (elem) => elem?.href)
+        if (!url || !url.includes('://'))
+            url = null
+    } catch(err) {
+        url = null
+        // console.warn('Failed to find favicon tag in html', JSON.stringify(err, null, 4))
+    }
+
+    return {
+        url,
+        basename: 'favicon',
+        extension: undefined,                    // auto-detect extension at download time
+        expected_mimetype: 'image/',             // accept any image/* mimetype at download time
+    } as FaviconCandidate
+}
+
+type FaviconResult = {
+    url: string,
+    num_bytes: number,
+    abspath?: string,
+    dir?: string,
+    filename?: string,
+    mimeType?: string,
+}
+
+async function saveFavicon(page, {original_url, main_response, version}) {
+    const dir = path.dirname(FAVICON_PATH(page))
+    const response_url = main_response?.url()
+
+    const favicon_downloads_to_try: {[key: string]: FaviconCandidate} = unique([
+        await faviconFromHtml(page),
+        faviconFromDomain(response_url),
+        faviconFromDomain(original_url),
+        faviconFromGoogle(response_url),
+        faviconFromGoogle(original_url),
+    ].filter(({url}) => url), 'url')
+
+    const browser = await page.browser()
+
+    // let logs = []
+    // let errors = []
+    let output_files: {[key: string]: FaviconResult} = {}
+
+    for (const download_options of Object.values(favicon_downloads_to_try)) {
+        let result: FaviconResult = {num_bytes: 0, url: download_options.url}
+        // {url, num_bytes, abspath, dir, filename, basename, extension, mimeType}
+        try {
+            // try getting it with node-fetch first
+            const response = await fetch(download_options.url) as Response
+            const file_options = await detectFilename({...download_options, response, dir})
+            if (response.headers.get("content-length")) {
+                const favicon_stream = Readable.fromWeb(response.body as any)
+                await overwriteFile(file_options.abspath, favicon_stream)
+                result = {
+                    ...file_options,
+                    num_bytes: parseInt(response.headers.get("content-length") || '0'),
+                    mimeType: response.headers.get("content-type"),
+                }
+            } else {
+                throw 'Failed to download favicon with fetch()'
+            }
+        } catch(err) {
+            // console.warn('[!] Failed to get favicon with node-fetch', err)
+            // fallback to getting it by opening a new browser tab
+            result = await download({...download_options, browser, dir, page})
+        }
+
+        // logs.push(...(result.logs || []))
+        // errors.push(...(result.errors || []))
+
+        if (result.num_bytes) {
+            console.log(`[🌠] Saving page favicon (${result.url.substring(0, 35)}... ${result.mimeType})...`.padEnd(82), prettyPath(result.abspath))
+            output_files[result.filename] = result
+            break   // break here stops after the first successful download, comment out to keep going instead
+        }
+    }
+    const output_file = Object.values(output_files).sort(file => file.num_bytes).at(-1)
+    const favicon_info = {
+        TYPE: 'favicon',
+        VERSION: version,
+        URL: original_url,
+        succeeded: !!output_file,
+        // stdout: JSON.stringify(logs),
+        // stderr: JSON.stringify(errors),
+        favicon_url: output_file?.url,
+        favicon_urls: Object.keys(favicon_downloads_to_try),
+        favicon_files: Object.keys(output_files).map(fname => fname.replace(dir, '.')),
+        favicon_filename: output_file?.filename,
+        favicon_num_bytes: output_file?.num_bytes,
+    }
+    await overwriteFile(FAVICON_PATH(page), favicon_info)
+
+    return favicon_info
+}
+
+async function saveTitle(page, {original_url, version}) {
+    const title_from_browser = (await page.title()) || null
+    const title_from_js = await page.evaluate(() => document?.title || null)
+    const title_from_html = await page.evaluate(() => document?.querySelector('title')?.innerText || null)
+    const title_from_og = await page.evaluate(() => document?.querySelector('meta[property="og:title"]')?.getAttribute('content') || null)
+
+    // best guess at best title = longest title
+    const title = ([title_from_html, title_from_og, title_from_js, title_from_browser]
+        .filter(title => title)
+        .sort((a, b) => b.length - a.length)[0] || '')
+        .replaceAll('\n', ' ')
+
+    if (title?.length) {
+        console.log(`[📗] Saving page title (${title.substring(0, 40)})...`.padEnd(82), prettyPath(TITLE_PATH(page)))
+        await overwriteFile(TITLE_PATH(page), title)
+    }
+
+    const title_info = {
+        TYPE: 'title',
+        VERSION: version,
+        URL: original_url,
+        title,
+        title_from_html,
+        title_from_og,
+        title_from_js,
+        title_from_browser,
+    }
+    const title_json_path = TITLE_PATH(page).replace('.txt', '.json')
+    await overwriteFile(title_json_path, title_info)
+
+    return title_info
+}
+
+async function saveRaw(page, {main_response}) {
+    const response = main_response
+    if (!response) {
+        console.warn('[⚠️] Failed to save page RAW bytes, main_response is null', response)
+    }
+    const dir = RAW_PATH(page)
+    await fs.promises.mkdir(dir, {recursive: true})
+
+    const {url, abspath, mimeType} = await detectFilename({page, response, dir})
+
+    console.log(`[🔟] Saving raw response bytes (${mimeType})...`.padEnd(82), prettyPath(abspath))
+
+    await download({page, response, abspath})
+    return abspath
+}
+
+async function saveSourceMaps(page, {original_url, version}) {
+    console.log(`[🐛] Saving source maps to ./responses/all/*.{js,css}.map...`)
+
+    const response_index_path = path.join(RESPONSES_PATH(page), 'index.jsonl')
+    const response_index = await fs.promises.readFile(response_index_path, 'utf-8')
+
+    const urls_to_download = []
+
+    for (const response of response_index.split('\n')) {
+        try {
+            const {url, extension} = JSON.parse(response)
+            if (['css', 'js'].includes(extension?.toLowerCase())) {
+                urls_to_download.push(url + '.map')
+            }
+        } catch(err) { continue }
+    }
+
+    // TODO: fix this, it needs to both after stopSavingMetadata and before stopSavingMetadata
+    // fix is to use traffic_log to get response url list instead of waiting for index.jsonl to be created
+    await page.evaluate(async (urls_to_download) => {
+        const promises = []
+        for (const sourcemap_url in urls_to_download) {
+            promises.push(fetch(sourcemap_url))
+        }
+        return Promise.allSettled(promises)
+    }, urls_to_download)
+
+    return {
+        TYPE: 'sourcemaps',
+        URL: original_url,
+        VERSION: version,
+        sourcemaps: urls_to_download,
+    }
+}
+
+async function saveRequests(page, {original_url, version, traffic_log}) {
+    console.log(`[📼] Saving requests log (${Object.keys(traffic_log).length})...`.padEnd(82), prettyPath(REQUESTS_PATH(page)))
+
+    const requests_info = {
+        TYPE: 'requests',
+        VERSION: version,
+        URL: original_url,
+        requests: traffic_log,
+    }
+
+    await overwriteFile(REQUESTS_PATH(page), requests_info)
+
+    return requests_info
+}
+
+async function saveRedirects(page, {original_url, main_response, traffic_log, redirects, version}) {
+    const main_request_id = Object.keys(traffic_log).filter(id => !id.includes('.'))[0]
+    const main_response_traffic = traffic_log[main_request_id] || {}
+
+    const url_from_browser = await page.url() || null
+    const url_from_request = (
+        main_response?.request()?.url()
+        || main_response_traffic['Network.requestWillBeSent']?.request?.url
+        || null)
+    const url_from_response = (
+        main_response?.url()
+        || main_response_traffic['Network.responseReceived']?.main_response?.url
+        || null)
+
+    const http_redirects = 
+        Object.values(traffic_log)
+            .filter(event => event['Network.requestWillBeSent']?.redirectResponse)
+            .map(event => event['Network.requestWillBeSent'])
+            .map(requestWillBeSent => ({
+                url: requestWillBeSent.request.url,
+                src: requestWillBeSent.redirectResponse.url,
+                status: requestWillBeSent.redirectResponse.status,
+                loaderId: requestWillBeSent.loaderId,
+                requestId: requestWillBeSent.requestId,
+                wallTime: requestWillBeSent.wallTime,
+                initiator: requestWillBeSent.initiator,
+                isMainFrame: (requestWillBeSent.loaderId == main_request_id),
+            }))
+    
+    const url_parsed = new URL(url_from_response || url_from_request || url_from_browser)
+
+    const redirects_info = {
+        TYPE: 'redirects',
+        VERSION: version,
+        URL: original_url,
+        url_parsed,
+        url_from_request,
+        url_from_response,
+        url_from_browser,
+        redirects_from_browser: redirects,
+        redirects_from_http: http_redirects,
+    }
+    console.log(`[🔗] Saving page redirects log (${http_redirects.length})...`.padEnd(82), prettyPath(REDIRECTS_PATH(page)))
+
+    await overwriteFile(REDIRECTS_PATH(page), redirects_info)
+
+    return redirects_info
+}
+
+async function saveHeaders(page, {original_url, version, traffic_log}) {
+    const main_request_id = Object.keys(traffic_log).filter(id => !id.includes('.'))[0]
+    const main_response_traffic = traffic_log[main_request_id] || {}
+
+    // combine base request with browser-added request headers
+    const request = {...main_response_traffic['Network.requestWillBeSent']?.request}
+    const request_extra_headers = main_response_traffic['Network.requestWillBeSentExtraInfo']?.headers || {}
+    request.headers = {...request.headers, ...request_extra_headers}
+
+    // combine base response with browser-added response headers
+    const response = {...main_response_traffic['Network.responseReceived']?.response}
+    const response_extra_headers = main_response_traffic['Network.responseReceivedExtraInfo']?.headers || {}
+    response.headers = {...response.headers, ...response_extra_headers}
+
+    const headers_info = {
+        TYPE: 'headers',
+        VERSION: version,
+        URL: original_url,
+        request,
+        response,
+    }
+
+    const num_headers = Object.keys({...request.headers, ...response.headers}).length
+    if (num_headers) {
+        console.log(`[👾] Saving main request & response headers (${num_headers})...`.padEnd(82), prettyPath(HEADERS_PATH(page)))
+        await overwriteFile(HEADERS_PATH(page), headers_info)
+    }
+
+    return headers_info
+}
+ 
+async function saveSSL(page, {original_url, version, traffic_log}) {
+    const main_request_id = Object.keys(traffic_log).filter(id => !id.includes('.'))[0]
+    const main_response_traffic = traffic_log[main_request_id] || {}
+
+    const relevant_response_keys = [
+        'url',
+        'status',
+        'mimeType',
+        'connectionReused',
+        'remoteIPAddress',
+        'remotePort',
+        'fromServiceWorker',
+        'encodedDataLength',
+        'protocol',
+        'alternateProtocolUsage',
+        'securityState',
+        'securityDetails',
+    ]
+    let ssl_info = Object.entries(main_response_traffic['Network.responseReceived']?.response || {})
+        .reduce((obj, [key, val]) => {
+            if (relevant_response_keys.includes(key)) {
+                obj[key] = val
+            }
+            return obj
+        }, {}) as any
+
+    // TODO: parse SSL certificate sha256 hash from chrome://system/#chrome_root_store
+    // const ssl_certificate = await client.send('Network.getCertificate', {origin: original_url})
+    // ssl_info.sslCertSha256 = '<unknown>'
+
+    ssl_info = {
+        TYPE: 'ssl',
+        VERSION: version,
+        URL: original_url,
+        ...ssl_info,
+    }
+
+    if (Object.keys(ssl_info).length-3) {
+        console.log(`[🔏] Saving page SSL details (${ssl_info?.securityDetails?.protocol})...`.padEnd(82), prettyPath(SSL_PATH(page)))
+        await overwriteFile(SSL_PATH(page), ssl_info)
+    }
+
+    return ssl_info
+}
+
+
+async function saveDOM(page, {original_url, version}) {
+    const html = await page.content();
+    console.log(`[📖] Saving DOM dump (${html.length})...`.padEnd(82), prettyPath(DOM_PATH(page)))
+    const html_with_header = 
+        `<!-- Saved by ArchiveBox TYPE=dom VERSION=${version} URL=${original_url} -->\n${html}`
+    await overwriteFile(DOM_PATH(page), html_with_header)
+    return DOM_PATH(page)
+}
+
+async function saveBodyText(page, _page_state) {
+    const innerText = await page.evaluate(() => document?.body?.innerText);
+
+    if (innerText?.length) {
+        console.log(`[📃] Saving body text (${innerText.length})...`.padEnd(82), prettyPath(BODYTEXT_PATH(page)))
+        await overwriteFile(BODYTEXT_PATH(page), innerText)
+    }
+
+    // // alternative method: emulate Ctrl+A, Ctrl+C (sometimes gets more than body.innerText)
+    // const innerText = await page.$eval('*', (el) => {
+    //     const selection = window.getSelection();
+    //     const range = document.createRange();
+    //     range.selectNode(el);
+    //     selection.removeAllRanges();
+    //     selection.addRange(range);
+    //     return window.getSelection().toString();
+    // });
+
+    return innerText
+}
+
+async function savePandoc(page, { original_url, version }) {
+    console.log(`[📒] Converting DOM HTML to markdown with Pandoc...`.padEnd(82), prettyPath(PANDOC_PATH(page)))
+
+    let dom_paths = [DOM_PATH(page), SINGLEFILE_PATH(page)].filter(fs.existsSync)
+    if (!dom_paths) return null
+    const dom_path = dom_paths[0]
+
+    var stdout: string = ''
+    var stderr: string = ''
+    let result: any = null
+    const BIN_NAME = 'pandoc'
+    // pandoc --from html --to markdown_github --citeproc --wrap=none --highlight-style=kate
+    const args = [
+        BIN_NAME,
+        '--from=html',
+        '--to=markdown_github',
+        '--wrap=none',
+        '--citeproc',
+        '--highlight-style=kate',
+        `--output='${PANDOC_PATH(page)}'`,
+        dom_path,
+    ]
+    try {
+        ;({ stdout, stderr } = await exec(args.join(' ')));
+        stdout = stdout.toString().trim()
+        if (!stdout) throw 'Got empty result!'
+        result = {
+            TYPE: 'pandoc',
+            VERSION: version,
+            URL: original_url,
+            cmd: args,
+            markdown_file: PANDOC_PATH(page),
+        }
+    } catch (parse_err) {
+        console.warn('[❌] Failed to run Pandoc HTML to MD conversion', parse_err, stderr)
+    }
+    if (!stdout) {return null}
+    await overwriteFile(
+        PANDOC_PATH(page),
+        stdout,
+    )
+
+    // pandoc --from markdown_github --to html --citeproc --wrap=none --highlight-style=kate
+    const reverse_conversion_args = [
+        BIN_NAME,
+        '--from=markdown_github',
+        '--to=html',
+        '--wrap=none',
+        '--citeproc',
+        '--highlight-style=kate',
+        `--output='${PANDOC_PATH(page).replace('.md', '.html')}'`,
+        PANDOC_PATH(page),
+    ]
+    try {
+        ; ({ stdout, stderr } = await exec(reverse_conversion_args.join(' ')));
+        stdout = stdout.toString().trim()
+        if (!stdout) throw 'Got empty result!'
+        result = {
+            ...result,
+            html_file: PANDOC_PATH(page).replace('.md', '.html'),
+        }
+    } catch (parse_err) {
+        console.warn('[❌] Failed to run Pandoc MD to HTML conversion', parse_err, stderr)
+    }
+    if (!result) { return null }
+    await overwriteFile(
+        PANDOC_PATH(page).replace('.md', '.html'),
+        result,
+    )
+
+    return result
+}
+
+async function saveReadability(page, {original_url, version}) {
+    const url = await page.url()
+    let html = ''
+    let article = null
+    try {
+        html = await page.content()
+        if (html.length > 14_000_000) {
+            console.warn('[⚠️] Truncating readability article text because html is too long...', html.length)
+            html = html.substring(0, 13_900_000)
+        }
+        const virtualConsole = new VirtualConsole()
+        const dom = new JSDOM(html, {url, virtualConsole})
+        const reader = new Readability(dom.window.document);
+        article = reader.parse()
+    } catch(err) {
+        console.warn(`[❌] Failed to get readability article text`)
+        return null
+    }
+    if (article) {
+        console.log(`[📜] Saving readability article text (${article.textContent?.length})...`.padEnd(82), prettyPath(READABILITY_PATH(page)))
+        const {content, textContent, ...metadata} = article
+        if (content.trim()) {
+            await overwriteFile(READABILITY_PATH(page).replace('.json', '.html'), content);
+        }
+        if (textContent.trim()) {
+            await overwriteFile(READABILITY_PATH(page).replace('.json', '.txt'), textContent);
+        }
+        const readability_info = {
+            TYPE: 'readability',
+            VERSION: version,
+            URL: original_url,
+            ...metadata,
+        }
+        await overwriteFile(READABILITY_PATH(page), readability_info)
+        return readability_info
+    }
+    return null
+}
+
+async function saveAccessibility(page, {original_url, version}) {
+    // get accessibility tree
+    const accessibility_tree = await page.accessibility.snapshot({interestingOnly: true});
+    // console.log(accessibility_tree);
+
+    // get iframe tree
+    const iframes = []
+    function dumpFrameTree(frame, indent='>') {
+        iframes.push(indent + frame.url());
+        for (const child of frame.childFrames()) {
+            dumpFrameTree(child, indent + '>');
+        }
+    }
+    dumpFrameTree(page.mainFrame(), '');
+    // console.log(iframes)
+
+    // generate simple table-of-contents of all the key html elements (e.g. h1, h2, h3, article, main, etc.)
+    const outline = await page.evaluate(() => {
+        const headings = []
+        for (const elem of [...document.querySelectorAll("h1, h2, h3, h4, h5, h6, a, header, footer, article, main, aside, nav, section, figure, summary, table, form, iframe")] as HTMLElement[]) {
+            
+            // skip a tags that aren't named anchors
+            if (elem.tagName.toLowerCase() == 'a' && !(elem as HTMLAnchorElement).name) continue
+
+            // e.g. article #main-article
+            const elem_id = ((typeof elem.id === 'string' && elem.id) || (elem as HTMLAnchorElement).name || elem.ariaLabel || elem.role || '')
+            const elem_classes = elem.className.trim().split(' ').slice(0, 3).join(' .') || ''
+            const elem_action = (elem as any).action?.split('/')?.slice(-1)?.join('/')
+            const summary = elem.innerText.length > 128
+                ? `${elem.innerText?.slice(0, 128)}...`
+                : elem.innerText
+
+            let prefix = ''
+            let title = (elem_id ? `#${elem_id}` : '')
+            if (!title && elem_classes) title = `.${elem_classes}`
+            if (elem_action) title = `${title} /${elem_action}`
+            if (summary) title = `${title}: ${summary}`
+
+            // if elem is a header, prepend a #### prefix based on its level
+            const level = Number(elem.tagName.toLowerCase().replace('h', ''))
+            if (!isNaN(level)) {
+                prefix = '#'.repeat(level)
+                title = elem.innerText || elem_id || elem_classes
+            } else {
+                // set prefix to element's breadcrumb path
+                let node = elem
+                const parents = [elem.tagName?.toLowerCase().trim()]
+                while (node) {
+                    // add each parent element's name to the path
+                    // const elem_type = node.tagName?.toLowerCase().trim() || ''
+                    // if (elem_type && !['div', 'span', 'p', 'body', 'html'].includes(elem_type)) {
+                    //     parents.unshift(elem_type);
+                    // }
+                    parents.unshift('')  // add emptystring to abbreviate path as >>>> istead of main>article>header>div>...
+                    node = node.parentNode as HTMLElement
+                }
+                prefix = parents.join('>')
+            }
+            // strip all repeated whitespace and newlines
+            title = title.replaceAll('\n', ' ').replace(/\s+/g, ' ').trim()
+
+            if (prefix) {
+                headings.push(`${prefix} ${title}`)
+            }
+        }
+        // console.log(headings.join('\n'))
+        return headings
+    })
+
+    console.log(`[🩼] Saving accessibility outline (${Object.keys(accessibility_tree).length})...`.padEnd(82), prettyPath(ACCESIBILITY_PATH(page)))
+    // console.log(outline.filter(line => line.startsWith('#')).join('\n'))
+
+    const accessibility_info = {
+        TYPE: 'accessibility',
+        VERSION: version,
+        URL: original_url,
+        iframes,
+        headings: outline,
+        tree: accessibility_tree,
+    }
+
+    await overwriteFile(
+        ACCESIBILITY_PATH(page),
+        accessibility_info,
+    )
+
+    return accessibility_info
+}
+
+async function saveSEO(page, {original_url, version}) {
+    // collect all <meta name="title" property="og:title" content="Page Title for SEO | Somesite.com"> tags into dict
+    const seo_vars = await page.evaluate(() => 
+        [...document.querySelectorAll('meta')]
+            .map(tag => ({key: tag.getAttribute('name') || tag.getAttribute('property') || '', value: tag.getAttribute('content') || ''}))
+            .filter(obj => obj.key && obj.value)
+            .sort((a, b) => a.value.length - b.value.length)
+            .reduce((acc, node) => {acc[node.key] = node.value; return acc}, {})
+    )
+
+    const seo_info = {
+        TYPE: 'seo',
+        VERSION: version,
+        URL: original_url,
+        ...seo_vars,
+    }
+
+    const num_vars = Object.keys(seo_vars).length
+    if (num_vars) {
+        console.log(`[🔎] Saving page SEO metadata (${num_vars})...`.padEnd(82), prettyPath(SEO_PATH(page)))
+        await overwriteFile(SEO_PATH(page), seo_info)
+    }
+
+    return seo_info
+}
+
+async function saveOutlinks(page, {original_url, version}) {
+    // TODO: slow to iterate over all elements so many times, perhaps we can collapse everything down into one loop
+
+
+    // Regular expression that matches syntax for a link (https://stackoverflow.com/a/3809435/117030):
+    const LINK_REGEX = /https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)/gi;
+
+    const filterW3Urls = (urls) =>
+        urls.filter(url =>
+            url && !url.startsWith('http://www.w3.org/'))
+
+    const filterDataUrls = (urls) =>
+        urls.filter(url =>
+            url && !url.startsWith('data:'))
+
+    const html = await page.content();
+
+    const raw = html?.match(LINK_REGEX) || [];
+
+    const hrefs = await page.$$eval(
+        "pierce/a[href]",
+        elems => elems
+            .map(elem => elem.href)
+            .filter(url => url),
+    );
+
+    const links = await page.$$eval(
+        "pierce/link[href]",
+        elems => elems
+            .map(({rel, href}) => ({rel, href}))
+            .filter(({rel, href}) => rel !== 'stylesheet')
+            .reduce((collection, entry) => {
+                const {rel, href} = entry
+                const non_empty_rel = collection[href]?.rel || rel
+                collection[href] = {rel: non_empty_rel, href}
+                return collection
+            }, {})
+    );
+
+    const iframes = await page.$$eval(
+        "pierce/iframe[src]",
+        elems => elems.map(iframe => iframe.src).filter(url => url)
+    );
+
+    const images = await page.$$eval(
+        "pierce/img[src]",
+        elems => elems.map(img => img.src).filter(url => url && !url.startsWith('data:'))
+    );
+
+
+    const css_images = await page.$$eval(
+        "pierce/*",
+        elems => elems
+            .map(elem => {
+                const css_url_ptn = /url\(\s*?['"]?\s*?(\S+?)\s*?["']?\s*?\)/i;
+                const bg_img = window.getComputedStyle(elem, null).getPropertyValue('background-image')
+                const bg_url = css_url_ptn.exec(bg_img)
+                return bg_url ? bg_url[1] : null
+            })
+    )
+
+    const css_stylesheets = await page.$$eval(
+        "pierce/link[rel=stylesheet]",
+        elems => elems.map(elem => elem.href).filter(url => url)
+    );
+
+    const js_scripts = await page.$$eval(
+        "pierce/script[src]",
+        elems => elems.map(elem => elem.src).filter(url => url)
+    );
+
+    const outlinks_info = {
+        TYPE: 'outlinks',
+        VERSION: version,
+        URL: original_url,
+        raw: [...new Set(filterDataUrls(filterW3Urls(raw)))],
+        hrefs: [...new Set(filterDataUrls(hrefs))],
+        links: [...Object.values(links)],
+        iframes: [...new Set(iframes)],
+        images: [...new Set(filterDataUrls(images))],
+        css_images: [...new Set(filterDataUrls(css_images))],
+        css_stylesheets: [...new Set(filterDataUrls(css_stylesheets))],
+        js_scripts: [...new Set(filterDataUrls(js_scripts))],
+    }
+
+    if (raw?.length || hrefs?.length || links?.length || iframes?.length) {
+        console.log(`[🖇️] Saving page outgoing links (${raw?.length || hrefs?.length})...`.padEnd(82+1), prettyPath(OUTLINKS_PATH(page)))
+
+        await overwriteFile(OUTLINKS_PATH(page), outlinks_info)
+    }
+    return outlinks_info
+}
+
+
+async function saveAuthStorage(page, {client, version, original_url}) {
+    const url = original_url || await page.url()
+    if (URL_SCHEMES_IGNORED.includes(url.split(':')[0])) return null
+    if (!SAVE_AUTH_STORAGE) return null
+
+    // const cookies = JSON.stringify(await page.cookies());  // doesnt include httponly cookies
+    const auth_from_browser = {
+        cookies: (await client.send('Network.getAllCookies')).cookies,
+        localStorage: {},
+        sessionStorage: {},
+    }
+
+    // attempt to load localStorage and sessionStorage from browser (may fail in some cases https://github.com/puppeteer/puppeteer/issues/921)
+    try {
+        auth_from_browser.localStorage = (await page.evaluate(() =>
+            JSON.parse(JSON.stringify({[window.location.origin]: window.localStorage}))))
+    } catch(err) {
+        throw `Failed to get page window.localStorage! ${err}`
+    }
+    try {
+        auth_from_browser.sessionStorage = (await page.evaluate(() =>
+            JSON.parse(JSON.stringify({[window.location.origin]: window.sessionStorage}))))
+    } catch(err) {
+        throw `Failed to get page window.sessionStorage! ${err}`
+    }
+
+    // WARNING: small TOCTTOU gap between this read-before-write and the write below
+    // can possibly overwrite changes made by other processes in this gap
+    const auth_on_disk = await loadAuthStorage(page, {client}, {apply: false})
+
+    const cookies = dedupeCookies([...auth_on_disk.cookies, ...auth_from_browser.cookies])
+
+    const auth_info = {
+        TYPE: 'auth',
+        VERSION: version,
+        URL: original_url,
+        cookies: cookies,
+        sessionStorage: merge(auth_on_disk.sessionStorage, auth_from_browser.sessionStorage),
+        localStorage: merge(auth_on_disk.localStorage, auth_from_browser.localStorage),
+    }
+    // console.log(`[⛙] Merged ${auth_on_disk.cookies.length} existing + ${auth_from_browser.cookies.length} new -> ${auth_info.cookies.length} cookies`)
+  
+    console.log(`[🍪] Saving cookies/localStorage/sessionStorage (${auth_info.cookies.length})...`.padEnd(82), prettyPath(AUTH_JSON_PATH));
+    await overwriteFile(AUTH_JSON_PATH, auth_info);
+  
+    // Write to cookies.txt file using tough-cookie + @root/file-cookie-store
+    await saveCookiesTxt(cookies)
+
+    return auth_info
+}
+
+async function saveCookiesTxt(cookies) {
+    const cookies_store = new FileCookieStore(COOKIES_TXT_PATH, {auto_sync: false, lockfile: false})
+    const cookie_jar = new ToughCookie.CookieJar(cookies_store)
+    cookie_jar.setCookieAsync = util.promisify(cookie_jar.setCookie)
+    cookies_store.saveAsync = util.promisify(cookies_store.save)
+    for (const cookie of cookies) {
+        const cookie_for_tough = {
+            domain: cookie.domain,
+            path: cookie.path,
+            key: cookie.name,
+            value: cookie.value,
+            expires: (new Date(cookie.expires * 1000)).toISOString(),
+            hostOnly: cookie.domain.startsWith('.'),
+            secure: cookie.secure,
+        }
+        // console.log('COOKIE_FOR_TOUGH_TXT', cookie_for_tough)
+        const parsed_cookie = ToughCookie.Cookie.fromJSON(cookie_for_tough)
+        // console.log('COOKIE_FOR_TOUGH_TXT_TO_DUMP', parsed_cookie)
+        try {
+            // assemble a fake URL just to satisfy ToughCookieJar's requirement of having a URL at set time
+            let url = cookie.secure ? 'https://' : 'http://'
+            if (cookie.domain.startsWith('.')) {
+                url = url + cookie.domain.slice(1)
+            } else {
+                url = url + cookie.domain
+            }
+            if (cookie.sourcePort && ![80, 443].includes(cookie.sourcePort)) {
+                url = `${url}:${cookie.sourcePort}`
+            }
+            url = `${url}${cookie.path || ''}`
+            await cookie_jar.setCookieAsync(parsed_cookie, url, {ignoreError: true})
+        } catch(err) {
+            console.error('[❌] Failed to dump browser cookie for cookies.txt...', cookie_for_tough, '->', parsed_cookie, err)
+        }
+    }
+    console.log(`[🍪] Saving cookies TXT (${cookies.length})...`.padEnd(82), prettyPath(COOKIES_TXT_PATH));
+    await cookies_store.saveAsync()
+}
+
+async function saveMetrics(page, {original_url, version, start_time, start_ts, traffic_log, redirects}) {
+    const end_time = (new Date()).toISOString()
+    const end_ts = Date.now()
+    const metrics_info = {
+        TYPE: 'metrics',
+        VERSION: version,
+        URL: original_url,
+        ...(await page.metrics()),
+        start_time,
+        start_ts,
+        end_time,
+        end_ts,
+        duration: (end_ts - start_ts),
+        num_requests: traffic_log.length,
+        num_redirects: Object.keys(redirects).length -1,
+    }
+
+    console.log(`[🏎️] Saving final summary + timing metrics...`.padEnd(82+1), prettyPath(METRICS_PATH(page)))
+    await overwriteFile(METRICS_PATH(page), metrics_info)
+
+    return metrics_info
+}
+
+
+/******************************************************************************/
+/******************************************************************************/
+
+/**************************** Utility Helpers *********************************/
+
+
+function hashCode(str) {
+    // get a simple integer hash for a given string (based on java String#hashCode)
+    // useful only for throwaway nonces / easy deterministic random identifiers, not a replacement for sha256
+    let hash = 0;
+    for (let i=0; i<str.length; i++) {
+       hash = str.charCodeAt(i) + ((hash << 5) - hash);
+    }
+    return Math.abs(hash)
+}
+
+function unique(iter, key: string | ((any, number) => string)='id') {
+    // uniqueify an array of objects by a value within them, key can be name of attr or getter function
+    // > iter = [{id: 1}, {id: 2}, {id: 1}]
+    // > Object.entries(iter) = [
+    //   [ '0',  { id: 1 } ],
+    //   [ '1',  { id: 2 } ],
+    //   [ '2',  { id: 1 } ] ]
+    // > unique(iter, 'id') => {1: {id: 1}, 2: {id: 2}}
+
+    // > iter = {a1: {id: 1}, b2: {id: 2}, a3: {id: 1}}
+    // > Object.entries(iter) = [
+    //   [ 'a1', { id: 1 } ],
+    //   [ 'b2', { id: 2 } ],
+    //   [ 'a3', { id: 1 } ]
+    // ]
+    // > unique(iter, 'id') => {1: {id: 1}, 2: {id: 2}}
+
+    const key_type = (typeof key)
+    if (!['function', 'string'].includes(key_type))
+        throw 'key must be either a string lookup key or a function (obj, idx) => return unique_id'
+
+    const key_func = (key_type === 'string')
+        ? (entry_obj, idx) => entry_obj[(key as string)]
+        : (entry_obj, idx) => (key as Function)(entry_obj, idx)   // otherwise key is a callback func
+
+    const seen = {}
+    for (const [idx, entry_obj] of Object.entries(iter)) {
+        const unique_id = key_func(entry_obj, idx)
+        if (seen[unique_id] === undefined) {
+            seen[unique_id] = entry_obj
+        }
+    }
+
+    return seen
+}
+
+const wait = (ms: number) => new Promise(res => {
+    if (ms > 10_000) {
+        console.debug(`[⏲️] Waiting ${Math.round(ms/1000)}s...`)
+    }
+    setTimeout(res, ms)
+})
+
+const TimeoutError = Symbol()
+const withTimeout = (promise, ms) => {
+    // run a promise with a time limit, raises a TimeoutError if it fails
+    let timer
+    return Promise.race([
+        promise,
+        new Promise((_r, reject) =>
+            timer = setTimeout(reject, ms, TimeoutError)
+        ),
+    ]).finally(() => clearTimeout(timer))
+}
+
+const MAX_VALID_DATE = new Date('2150-01-01T00:00:00.000Z')
+const MIN_VALID_DATE = new Date('2010-01-01T00:00:00.000Z')
+const UNIX_EPOCH_DATE = new Date(0)
+
+const validateDate = (date, {min=MIN_VALID_DATE, max=MAX_VALID_DATE, singleton=UNIX_EPOCH_DATE}={}) => {
+    assert((date instanceof Date), `Got invalid type for Date: ${typeof date} ${date} (expected Date)`)
+    assert(String(date) !== 'Invalid Date', `Got invalid value for Date: ${typeof date} ${date}`)
+    if (Number(date) === Number(singleton)) return date  // epoch singleton is always valid
+    assert(date < max, `Got Date that was higher than MAX_VALID_DATE=${max}`)
+    assert(date > min, `Got Date that was lower than MIN_VALID_DATE=${min}`)
+    return date
+}
+
+const parseVersionDateStr = (yyyymmddtime) => {
+    // YYYYMMDDhhmmssxxx or YYYYMMDDhhmmss or YYYYMMDDhhmm or YYYYMMDD -> Date
+    const is_only_numbers = /^\d+$/.test(yyyymmddtime.replace('.', ''))
+    assert(is_only_numbers, `Non-numeric characters in YYYYMMDD date are not allowed: ${yyyymmddtime} (while trying YYYYMMDDhhmmssxxx format)`)
+
+    const num_digits = String(yyyymmddtime).split('.')[0].length
+    assert([17, 14, 12, 8].includes(num_digits), `Got invalid number of digits (${num_digits}) in YYYYMMDD date: ${yyyymmddtime} (while trying YYYYMMDDhhmmssxxx format)`)
+
+    const [_all, yyyy, mm, dd, hr, min, sec, ms] = /^(\d{4})(\d{2})(\d{2})(\d{2})?(\d{2})?(\d{2})?(\d{3})?$/.exec(yyyymmddtime)
+    assert(yyyy && mm && dd, `Could not find YYYYMMDD`)
+    const time_error_msg = `Detected YYYYMMDD[hhmm[ss[xxxx]]] but time segment is invalid ${hr}:${min || '__'}:${ms || '___'}`
+    if (ms) assert(hr && min && sec, time_error_msg)
+    if (sec) assert(hr && min, time_error_msg)
+    if (min) assert(hr, time_error_msg)
+    if (hr) assert (min, time_error_msg)
+
+    const iso_str = `${yyyy}-${mm}-${dd}T${hr || '00'}:${min || '00'}:${sec || '00'}.${ms || '00'}Z`
+    const parsed_date = new Date(iso_str)
+
+    return validateDate(parsed_date)                        // 1970-01-01T00:00:00.000Z (ISO format)
+}
+
+const parseTimestampDateStr = (timestamp) => {
+    // 1709724291000 or 1709724291000.000 or 1709724291 or 1709724291.000 -> Date
+    timestamp = String(timestamp)
+    const is_only_numbers = /^\d+$/.test(timestamp.replace('.', ''))
+    assert(is_only_numbers, `Got invalid characters in timstamp: ${timestamp} (while trying xxxxxxxxxxxxx format)`)
+
+    const num_digits = String(timestamp).split('.')[0].length
+    assert([13, 10, 1].includes(num_digits), `Got invalid number of digits (${num_digits}) in timestamp: ${timestamp} (while trying xxxxxxxxxxxxx format)`)
+
+    let parsed_date = null
+
+    if (num_digits === 13) {
+        parsed_date = new Date(Number(timestamp))                   // 1709724291000   (unix timestamp w/ milliseconds)
+    } else if (num_digits === 10) {
+        parsed_date = new Date(Number(timestamp) * 1000)            // 1709724291      (unix timestamp w/ seconds)
+    } else if (num_digits === 1) {
+        assert(String(timestamp) === '0', `Got invalid single-digit timestamp: ${timestamp} (while trying xxxxxxxxxxxxx format or 0 for UNIX epoch)`)
+        parsed_date = UNIX_EPOCH_DATE
+    }
+    return validateDate(parsed_date)
+}
+
+const parseISODateStr = (iso_str) => {
+    // 1970-01-01T00:00:00.000Z -> Date
+    const num_digits = String(iso_str).length
+    assert([24, 19, 16, 10].includes(num_digits), `Got invalid number of digits (${num_digits}) in ISO date: ${iso_str} (while trying 1970-01-01T00:00:00.000Z format)`)
+
+    const parsed_date = new Date(iso_str)
+    return validateDate(parsed_date)
+}
+
+const parseDate = (date) => {
+    // date === undefined      => use today/now
+    // date === null           => use unix epoch 0 aka 1970-01-01T00:00:00.000Z
+    // date *= YYYYMMDDHHMMSS  => use a version date string (e.g. 20010131235958)
+    // date *= 1234567...      => use a timestmap (e.g. 1709724291000)
+    // date *= 1970-01-01T...  => use iso datetime (e.g. 1970-01-01T00:00:00.000Z)
+    // returns -> Date
+
+    if (date === undefined) {
+        return (new Date())             // today      (2024-05-29T22:02:34.682Z) aka timestamp=1717020154682
+    }
+    if (date === null || date == 0) {
+        return UNIX_EPOCH_DATE          // unix epoch (1970-01-01T00:00:00.000Z) aka timestamp=0
+    }
+    if (date instanceof Date) {
+        return validateDate(date)       // JS date    Date('1970-01-01T00:00:00.000Z')
+    }
+
+    if ((typeof date) === 'number') {
+        date = String(date)             // unix timestamp e.g. 1717020154682
+    }
+    assert((typeof date) === 'string', `Tried to parse date but got unsupported type ${(typeof date)}: ${date}`)
+
+    const errors = [`Failed to parse Date from string: ${date}`]
+    try {
+        return parseVersionDateStr(date)
+    } catch(err) { errors.push(err) }
+    try {
+        return parseTimestampDateStr(date)
+    } catch(err) { errors.push(err) }
+    try {
+        return parseISODateStr(date)
+    } catch(err) { errors.push(err) }
+    
+    throw errors.join('\n')
+}
+
+const versionStrFromDate = (date, {withDate=true, withTime=true, withSeconds=true, withMilliseconds=false}={}) => {
+    // takes Date, returns YYYYMMDDHHMMSSXXX or YYYYMMDDHHMMSS or YYYYMMDDHHMM or YYYYMMDD
+    const parsed_date = parseDate(date)
+
+    const [date_iso, time_iso] = parsed_date.toISOString().split('T')                       // ['2001-01-31', '23:59:58.090Z']
+
+    const components_to_use = []
+    if (withDate) {                                                                        
+        components_to_use.push(date_iso.replaceAll('-', ''))                                // '20010131'
+    }
+    if (withTime) {
+        const [hr, min, sec, ms] = time_iso.replace('Z', '').replace('.', ':').split(':')   // ['23', '59', '58', '090']
+        components_to_use.push(hr)
+        components_to_use.push(min)
+        if (withSeconds) {
+            components_to_use.push(sec)
+            if (withMilliseconds) {
+                components_to_use.push(ms)
+            }
+        }
+    }
+    assert(components_to_use.length, 'At least one of {withDate, withTime} must be set.')
+
+    const final_str = components_to_use.join('')                                            // 20010131235958
+
+    assert(parseVersionDateStr(final_str))  // sanity check to make sure it parses correctly
+
+    return final_str
+}
+
+// test date functions:
+// console.log(parseDate('20120131'))
+// console.log(versionStrFromDate(parseDate('20120131')))
+// console.log(versionStrFromDate(parseDate('0')))
+// console.log(versionStrFromDate(parseDate(0)))
+// console.log(versionStrFromDate(parseDate(null)))
+// console.log(versionStrFromDate())
+// console.log(versionStrFromDate(parseDate('20120131235859090')))
+// console.log(versionStrFromDate(parseDate('1970-01-01T00:00:00.000Z')))
+// console.log(versionStrFromDate(parseDate('2024-12-01T00:00')))
+// console.log(versionStrFromDate(parseDate('2024-12-01'), {withTime: false}))
+
+const prettyPath = (path) => {
+    // return a pretty-printable path where the abspath of the data dir is replaced with /data for brevity/privacy
+    return path.replace(DATA_DIR, './data')
+}
+
+const pathIsHidden = (relpath) => {
+    // check if a path or any of the directories above it are hidden  (e.g. ./some/.dir/abc or ./.DS_Store)
+    
+    // make sure test path behaves like an abspath (avoids edge-cases messing up relpaths on '' or '.' or './')
+    let test_path = relpath
+    if (test_path.startsWith('./'))
+        test_path = test_path.substring(2)
+    if (!test_path.startsWith('/'))
+        test_path = path.join('/', test_path)
+
+    // iterate through parents, checking if any parent is hidden until we reach /
+    while (test_path !== '/') {
+        const basename = path.basename(test_path)
+        if (basename.startsWith('.')) {
+            // console.log('PATH IS HIDDEN', relpath)
+            return true
+        }
+        // otherwise set test_path to parent dir and repeat
+        test_path = path.dirname(test_path)
+    }
+    return false
+}
+
+const pathDepth = (child_path, relative_to='.') => {
+    // get the number of directory hops deep a child path is relative to '.' (or a given parent)
+    
+    if (child_path.startsWith('/') && !relative_to.startsWith('/')) {
+        // if child_path is absolute, then relative_to must be absolute as well otherwise depth will be depth all the way to the / root
+        relative_to = fs.realpathSync(relative_to)
+    }
+    if (relative_to.startsWith('/') && !child_path.startsWith('/')) {
+        // same deal, either both paths have to be relative, or both have to be absolute
+        child_path = fs.realpathSync(child_path)
+    }
+    const relative_path_to_root = path.relative(relative_to, child_path)
+    const num_hops_down = relative_path_to_root.split('/').length
+    return num_hops_down
+}
+
+interface DirentWithExtras extends fs.Dirent {
+    relpath: string,
+    abspath: string,
+    reldepth: number,
+}
+
+async function getDirEntries(dir_path, {pwd=null, recursive=true, includeHidden=false, includeFiles=true, includeDirs=true, includeLinks=false, filter=null, maxdepth=-1}={}) {
+    // get the list of all sub-paths under a given path recursively
+
+    // console.log('GETTING DIRECTORY ENTRIES', {dir_path, pwd, recursive, includeHidden, includeFiles, includeDirs, maxdepth})
+
+    pwd = pwd || dir_path
+    let dir_abspath = dir_path
+
+    if (!dir_abspath.startsWith(pwd)) {
+        dir_abspath = path.join(pwd, dir_abspath)
+    }
+
+    assert(fs.existsSync(dir_abspath), `Tried to get directory listing for dir that doesn't exist! ${prettyPath(dir_abspath)}`)
+
+    return (await fs.promises.readdir(dir_abspath, { recursive, withFileTypes: true }))
+        .map((dirent: DirentWithExtras) => {
+            // filter combined with map because relpath is re-used in both operations
+            const relpath = path.join(path.relative(pwd, dirent.parentPath), dirent.name)
+            // console.log('CALCULATED RELATIVE PATH', relpath)
+            const abspath = path.join(dir_abspath, relpath)
+            const basename = path.basename(dirent.name)
+            if (!includeLinks && dirent.isSymbolicLink()) return null
+            if (!includeFiles && dirent.isFile()) return null
+            if (!includeDirs && dirent.isDirectory()) return null
+            if (!includeHidden && pathIsHidden(relpath)) return null
+
+            dirent.relpath = relpath
+            dirent.abspath = abspath
+            dirent.reldepth = pathDepth(relpath)
+            // console.log('RELATIVE DEPTH MEASURED', prettyPath(dir_abspath), prettyPath(relpath), dirent.reldepth)
+
+            if (maxdepth >= 0) {
+                if ((dirent.reldepth-1) > maxdepth) return null
+            }
+            
+            if ((typeof filter) === 'function') {
+                const should_keep = filter({abspath, relpath, basename, dirent})
+                if (!should_keep) {
+                    // console.log('FILTER EXCLUDED RESULT', {abspath, relpath, basename, dirent})
+                    return null
+                }
+            }
+
+            return relpath
+        })
+        .filter(Boolean)
+        .sort() as string[]
+}
+
+
+async function getTotalSize(dir_or_file_path, {pwd=null, _cache=null, filter=null, subfiles=null}={}) {
+    // get the total size in bytes of a file or directory (recursively adds up file sizes within directory)
+
+    // check _cache first
+    if (_cache && (dir_or_file_path in _cache))
+        return _cache[dir_or_file_path]
+
+    // make sure dir_or_file_path is under pwd
+    pwd = pwd || path.dirname(dir_or_file_path)
+    let abspath = dir_or_file_path
+    if (!dir_or_file_path.startsWith(pwd)) {
+        abspath = path.join(pwd, dir_or_file_path)
+    }
+
+    // if it's a file, stat it and return the size
+    // console.log('CALCUALTED ABSPATH', {abspath, dir_or_file_path, pwd})
+    const dirent = await fs.promises.stat(abspath)
+    if (dirent.isFile()) {
+        // console.log('CALCULATING FILE SIZE subfile=', prettyPath(abspath))
+        return dirent.size
+    }
+
+    // if it's not a file and not a directory, give up, dont try to size special files like FIFO/socket/etc.
+    if (!dirent.isDirectory()) return 0
+
+    // if it's a directory, size is the sum of all the sizes of files within
+    // console.log('CALCULATING SUBDIR SIZE subdir=', prettyPath(abspath))
+    let total_bytes = 0
+    const files_within = subfiles || await getDirEntries(dir_or_file_path, {
+        pwd,
+        recursive: true,
+        includeDirs: false,
+        includeFiles: true,
+        filter,
+    })
+    for (const subpath of files_within) {
+        total_bytes += await getTotalSize(subpath, {pwd, _cache, filter})
+    }
+    return total_bytes
+}
+
+
+async function getDirSizes(dir_path, {pwd=null, subfiles=null, withRoot=true, filter=null, maxdepth=-1}={}) {
+    // get the size of a directory and all the files within (recursively) as a number of bytes
+    //  dir_path:     path   absolute or relative path of the directory you want size info for
+    //       pwd:     path   (optional) absolute path of the directory you want to interpret dir_path relative to
+    //  subfiles: dirent[]   (optional) instead of reading disk, you can manually provide a getDirEntries results list to use
+    //  withRoot:     bool   include a summary entry for the root dir_path dir in the list as '.'
+    //    filter: function   (optional) provide a filter func for dir entries ({abspath, relpath, basename, dirent}) => true/false
+    //  maxdepth:   number   (optional) does not affect actual calculations, but hides entries below a certain depth in the returned output for brevity
+
+    assert((await fs.promises.stat(dir_path)).isDirectory(), `Tried to calculate directory sizes but path is not a directory! ${dir_path}`)
+    pwd = pwd || dir_path
+
+    // {'.': 246, 'example.json': 123, 'example2.txt': 123}
+    const sizes = {}
+
+    // first collect the list of all sub-files recursively and calculate their sizes individually
+    const files_within = subfiles || await getDirEntries(dir_path, {
+        pwd,
+        recursive: true,
+        includeDirs: false,
+        includeFiles: true,
+        // dont pass maxdepth here, we need the entire file listing to accurately calculate parent dir sizes
+        // it never makes sense to ignore subfiles beyond a certain depth for size calculation
+        filter,  // filter is allowed though, useful to calculcate size of some subset of files that match a pattern
+    })
+    for (const subpath of files_within) {
+        sizes[subpath] = await getTotalSize(subpath, {pwd, _cache: sizes, filter})
+    }
+    
+    // then calculate the top-level directory total as the sum of all the file sizes under it
+    const total_size = Object.values(sizes).reduce((a: number, b: number) => a + b, 0)
+
+    // then calculate the subtotals of all the sub-directories
+    const subdirs_within = await getDirEntries(dir_path, {pwd, recursive: true, includeDirs: true, includeFiles: false, filter, maxdepth})
+    for (const subpath of subdirs_within) {
+        sizes[subpath] = await getTotalSize(subpath, {pwd, _cache: sizes, filter})   // uses _cache to avoid re-computing
+    }
+
+    // if maxdepth is passed, filter results to only include paths shallower than max depth
+    if (maxdepth >= 0) {
+        for (const subpath of Object.keys(sizes)) {
+            if (pathDepth(subpath) > maxdepth) {
+                delete sizes[subpath]
+            }
+        }
+    }
+
+    // set total_size last so it appears at the bottom of the object in logs for convenience
+    if (withRoot) {
+        sizes['.'] = total_size
+    }
+
+    return sizes
+}
+
+
+async function getLargestPath(path_a, path_b) {
+    // compare two files/directories and return the largest one of the two (calculating size recursively)
+    
+    path_a = await fs.promises.realpath(path_a)
+    path_b = await fs.promises.realpath(path_b)
+    const size_a = await getTotalSize(path_a)
+    const size_b = await getTotalSize(path_b)
+
+    // console.log('COMPARING', prettyPath(path_a), size_a, '  ', prettyPath(path_b), size_b)
+
+    if (size_a > size_b) return path_a
+    return path_b
+}
+
+async function findCommonAncestor(target_abspath, symlink_abspath, {relative=true, search_limit=DATA_DIR}: {relative?: boolean | string, search_limit?: string}={}) {
+    // given a target path and a symlink path, find the common ancestor path they both share
+    // (searches recursively through absolute path parent directories until a common dir is found, up to search_limit)
+
+    search_limit = await fs.promises.realpath(search_limit)
+
+    let relative_dir = search_limit
+    if ((typeof relative) === 'boolean') {
+        // if start dir is default, set it to symlinks directory path
+        if (relative) {
+            relative_dir = path.dirname(symlink_abspath)
+        } else {
+            relative_dir = search_limit
+        }
+    } else if ((typeof relative) === 'string') {
+        // if start dir is a string, get its absolute path
+        relative_dir = relative as string
+    } else {
+        throw `Got invalid type for relative path during common ancestor search: ${relative}`
+    }
+
+    if ((await fs.promises.stat(relative_dir)).isFile()) {
+        // if start dir is a file, set it to its parent dir path
+        relative_dir = path.dirname(relative_dir)
+    }
+    assert(
+        (await fs.promises.stat(relative_dir)).isDirectory(),
+        `Tried to find common ancestor starting from invalid search directory:\n    🔗 ${prettyPath(symlink_abspath)}\n    -> ${prettyPath(target_abspath)}\n    Error: search dir does not exist or is not a directory: ❌ ${prettyPath(relative_dir)}`,
+    )
+
+    const symlink_filename = path.basename(symlink_abspath)
+    const target_filename = path.basename(target_abspath)
+    const symlink_parent_abspath = await fs.promises.realpath(path.dirname(symlink_abspath))
+    const target_parent_abspath = await fs.promises.realpath(path.dirname(target_abspath))
+    const search_dir_abspath = await fs.promises.realpath(relative_dir)
+
+    let closest_common_ancestor = search_dir_abspath
+
+    const isAncestorCommon = (ancestor) => (
+        target_parent_abspath.startsWith(ancestor)
+        && symlink_parent_abspath.startsWith(ancestor))
+
+    // check if both src and target start with the same ancestor path
+    while (closest_common_ancestor !== search_limit) {
+        if (isAncestorCommon(closest_common_ancestor)) break
+        else {
+            // otherwise go up one directory and try again
+            // console.log('    ...going up a directory', prettyPath(closest_common_ancestor)+'/..')
+            closest_common_ancestor = path.dirname(closest_common_ancestor)
+        }
+    }
+
+    assert(
+        isAncestorCommon(closest_common_ancestor),
+        `Tried to create relative symlink but could not find common ancestor:\n    🔗 ${prettyPath(symlink_abspath)}\n    -> ${prettyPath(target_abspath)}\n    Error: target path and symlink path are not both under:\n      ❌ ${prettyPath(closest_common_ancestor)}`,
+    )
+    
+    const symlink_to_ancestor_relpath = path.relative(symlink_parent_abspath, closest_common_ancestor)                             // ../../..
+    const target_from_ancestor_relpath = path.join(path.relative(closest_common_ancestor, target_parent_abspath), target_filename)   // 'archive/19999999.23423523'
+    const symlink_to_target_relpath = path.join(symlink_to_ancestor_relpath, target_from_ancestor_relpath)                           // '../../../archive/19999999.23423523'
+
+    return {
+        closest_common_ancestor,
+        search_dir_abspath,
+
+        target_abspath,
+        target_filename,
+        target_from_ancestor_relpath,
+        
+        symlink_abspath,
+        symlink_filename,
+        symlink_to_ancestor_relpath,
+        symlink_to_target_relpath,
+    }
+}
+
+interface StatsWithExtras extends fs.Stats {
+    abspath: string
+    relpath?: string
+    reldepth?: number
+}
+
+async function blockUntilExists(file_path, {timeout=7_500, min_bytes=0}={}) {
+    // wait up to timeout seconds until file we expect to exist appears on the filesystem
+    // (used to handle eventual consistency in network filesystems where we need a delay after writing before reads show up)
+    const interval = 250
+    const max_tries = timeout / interval
+    let tries = 0
+    
+    let abspath = null
+    while (tries < max_tries) {
+        try {
+            const abspath = await fs.promises.realpath(file_path)
+            assert(fs.existsSync(abspath))
+            
+            const dirent = await fs.promises.stat(abspath) as StatsWithExtras
+            dirent.abspath = abspath
+            
+            if (min_bytes && (dirent.size < min_bytes)) {
+                assert(dirent.size >= 1)
+                // this is a valid warning but unfortunately its too common to bother showing:
+                // console.warn(`[⚠️] Expected file to be >=${Math.round(min_bytes/1000)}kb but was only ${dirent.size/1000}kb:`, prettyPath(file_path))
+            }
+            
+            return dirent
+        } catch(err) {
+            const waited = (tries * interval)
+            if (waited === 5_000) {
+                console.warn(`[⚠️] Waited >${waited/1000}s for file to appear (is filesystem or bg task running slow?):`, prettyPath(file_path))
+            }
+            await wait(interval)
+            tries++
+        }
+    }
+    throw `Expected file does not exist after ${timeout/1000}s: ${prettyPath(file_path)}`
+}
+
+async function overwriteSymlink(target_path, symlink_path, {relative=true, mkdirs=false, search_limit=DATA_DIR, timeout=5_000}: {relative?: boolean | string, mkdirs?: boolean, search_limit?: string, timeout?: number}={}) {
+    // create a symlink from symlink_path -> target_path
+    // relative: true => symlink is created as a relative link by default (it will auto-find the closest common ancestor dir, often DATA_DIR)
+    // mkdirs: true   => optionally creates symlink parent dirs automatically)
+
+    // make sure target file actually exists first
+    let target_dirent
+    try {
+        target_dirent = await blockUntilExists(target_path, {timeout})
+    } catch(err) {
+        throw `Tried to create symlink pointing to file that does not exist:\n    🔗 ${prettyPath(symlink_path)}\n    -> ❌ ${prettyPath(target_path)}\n    ${err}`
+    }
+    const target_abspath = target_dirent.abspath
+    const target_filename = path.basename(target_abspath)
+    const target_parent_abspath = path.dirname(target_abspath)
+    
+    // make sure target is a valid file or directory and not a special character/block device/other weird file
+    const target_is_dir = target_dirent.isDirectory()
+    const target_is_file = target_dirent.isFile()
+    assert(target_is_dir || target_is_file, `Tried to create symlink to an unsupported file type:\n    🔗 ${prettyPath(symlink_path)}\n    -> ❌ ${prettyPath(target_path)} (expected file or directory)`)
+
+    // create symlink file parent directories if needed
+    const symlink_filename = path.basename(symlink_path)
+    const symlink_parent_dir = path.dirname(symlink_path)
+    if (mkdirs) {
+        await fs.promises.mkdir(symlink_parent_dir, {recursive: true})
+    }
+    try {
+        assert((await fs.promises.stat(symlink_parent_dir)).isDirectory())
+    } catch(err) {
+        throw `Tried to create symlink in a directory that doesn't exist:\n    🔗 ${symlink_parent_dir}❌/${symlink_filename}\n    -> ${target_path}\n    ${err}`
+    }
+    const symlink_parent_abspath = await fs.promises.realpath(symlink_parent_dir)
+    const symlink_abspath = path.join(symlink_parent_abspath, symlink_filename)
+
+    // determine nearest common ancestor between symlink dir and target dir
+    const {
+        closest_common_ancestor,
+        symlink_to_ancestor_relpath,
+        target_from_ancestor_relpath,
+        symlink_to_target_relpath,
+    } = await findCommonAncestor(target_abspath, symlink_abspath, {relative, search_limit}) 
+    
+    // set final target path to abspath or relative path depending on {relative} options
+    let target_path_final
+    if (relative) {
+        // make symlink into relative link (based on closest common ancestor dir between symlink_abspath and target_abspath)
+        target_path_final = symlink_to_target_relpath
+        // console.log('  🔗', prettyPath(symlink_abspath), '->', prettyPath(target_abspath), `(as relative link: ${target_path_final})`)
+    } else {
+        // make symlink into an absolute path (verbatim passed target_path)
+        target_path_final = target_path
+        // console.log('  🔗', prettyPath(symlink_abspath), '->', prettyPath(target_abspath), '(as absolute path)')
+    }
+
+    // remove any existing symlink at destination if there is already one there
+    const random_nonce = crypto.randomBytes(16).toString('hex').substring(0, 8)
+    const symlink_temp_path = `${symlink_abspath}.${random_nonce}.dup`
+    try { await fs.promises.unlink(symlink_abspath) } catch(err) {}
+    try { await fs.promises.unlink(symlink_temp_path) } catch(err) {}
+
+    // create the symlink and check that it works after creation
+    let created_symlink = null
+    try {
+        created_symlink = symlink_temp_path
+        await fs.promises.symlink(target_path_final, symlink_temp_path)
+        created_symlink = symlink_abspath
+        await fs.promises.rename(symlink_temp_path, symlink_abspath)
+    } catch(err) {
+        if (String(err).includes('EISDIR')) {
+            // console.warn('[⚠️] Tried to create symlink on top of existing directory', prettyPath(symlink_abspath))
+
+            // no real recourse in this situation, and its too noisy to log every time this happens
+            // it's also not always safe to move the dir out of the way, so better to just fail silently here, leaving:
+            // ${symlink_abspath}.${random_nonce}.dup
+        } else {
+            console.warn('[⚠️] Failed to create symlink', prettyPath(created_symlink), err)
+        }
+    }
+
+    let dirent
+    try {
+        dirent = await blockUntilExists(created_symlink, {timeout, min_bytes: 0})
+        // best we can do here is just check that it exists ^, trying to check that it has the exact expected abspath that we set is bad, because its a race condition:
+        // assert(dirent.abspath == target_abspath) // its often already overwritten by later activity, so final abspath may already be different
+    } catch(err) {
+        throw `Symlink created but does not seem to resolve to intended file:\n    🔗 ${symlink_path}\n    -> ❌ ${target_path}\n      actual=${dirent?.abspath}\n    expected=${target_abspath}\n    ${err}`
+    }
+
+    return {
+        symlink_path,
+        symlink_abspath: created_symlink,
+        symlink_filename: path.basename(created_symlink),
+        symlink_parent_abspath,
+        symlink_to_ancestor_relpath,
+        symlink_to_target_relpath,
+        
+        target_path,
+        target_abspath,
+        target_filename,
+        target_parent_abspath,
+        target_from_ancestor_relpath,
+        target_path_final,
+        target_is_dir,
+        target_is_file,
+        target_is_relative: Boolean(relative),
+        
+        closest_common_ancestor,
+    }
+}
+
+// test symlink and common ancestor finding
+// console.log(await findCommonAncestor('/Volumes/NVME/Users/squash/Local/Code/archiveboxes/archivebox-spreadsheet-bot/data/archive/1709724410.19269/seo.json', '/Volumes/NVME/Users/squash/Local/Code/archiveboxes/archivebox-spreadsheet-bot/data/archive/1709724410.19269/seo2.json'))
+// console.log(await findCommonAncestor('/Volumes/NVME/Users/squash/Local/Code/archiveboxes/archivebox-spreadsheet-bot/data/archive/1709724410.19269', '/Volumes/NVME/Users/squash/Local/Code/archiveboxes/archivebox-spreadsheet-bot/data/index/snapshots_by_domain/twitter.com/1709724410.19269', {relative: true, search_limit: '/Volumes/NVME/Users/squash/Local/Code/archiveboxes/archivebox-spreadsheet-bot/data/'}))
+// console.log(await overwriteSymlink('/Volumes/NVME/Users/squash/Local/Code/archiveboxes/archivebox-spreadsheet-bot/data/archive/1709724410.19269', '/Volumes/NVME/Users/squash/Local/Code/archiveboxes/archivebox-spreadsheet-bot/data/index/snapshots_by_domain/twitter.com/1709724410.19269'))
+// console.log(await overwriteSymlink('/Volumes/NVME/Users/squash/Local/Code/archiveboxes/archivebox-spreadsheet-bot/data/index/snapshots_by_domain/twitter.com/1709724410.19269', '/Volumes/NVME/Users/squash/Local/Code/archiveboxes/archivebox-spreadsheet-bot/data/index/favorite_snapshots/1709724410.19269', {relative: false, mkdirs: true, search_limit: '/Volumes/NVME/Users/squash/Local/Code/archiveboxes/archivebox-spreadsheet-bot/data/'}))
+
+
+
+async function overwriteDir(path) {
+    // delete any existing folder at the destination path (important otherwise we may create a folder inside an existing folder/symlink)
+    try {
+        await fs.promises.rm(path, { recursive: true, force: true });
+    } catch(err) {}
+
+    await fs.promises.mkdir(path, {recursive: true})
+
+    return path
+}
+
+async function overwriteFile(path, contents, options={encoding: 'utf8', flag: 'w', flush: false, block: true}) {
+    // write any JS value to a fresh file (e.g. String, Buffer, WritableStream, etc. anything JSON-serializable)
+
+    const block_until_created = options.block || true
+    delete options.block
+
+    try {
+        // delete any existing symlink/file present at the destination path
+        // (important otherwise we may write into an existing symlink by accident)
+        await fs.promises.unlink(path)
+    } catch(err) {}
+
+    try {
+        let nonce = 1
+        while ((await fs.promises.stat(path)).isDirectory()) {
+            // if we try to write a file to a path that already has a directory in that location
+            // (common when trying to write response JSON e.g. http://www.instagram.com/api/graphql returns json and www.instagram.com/api/graphql/abc returns json)
+            path = path.replace(`.${nonce-1}`, '') + `.${nonce}`
+            nonce++;
+            if (nonce > 20) throw `Too many conflicting files while trying to write to ${prettyPath(path)}`
+        }
+    } catch(err) {
+        if (!String(err).includes('no such file or directory')) {
+            console.warn('[⚠️] Warning: Problem with conflicting directory at while trying to write file', err)
+        }
+    }
+
+    // refuse writing undefined/null/function because its likely an error and not intended
+    const content_is_null =  (contents === null) || (contents === undefined)
+    const content_is_func = (typeof contents === 'function')
+    if (content_is_null || content_is_func) {
+        throw `Cannot write ${typeof contents} ${contents} to file: ${path}`
+    }
+
+    // Numbers, BigInts, and Booleans can be cast to strings, then wrt
+    const content_is_primitive = ['number', 'bigint', 'boolean'].includes(typeof contents)
+    if (content_is_primitive) {
+        contents = String(contents)
+        await fs.promises.writeFile(path, contents, options as any)
+        if (block_until_created) await blockUntilExists(path, {min_bytes: Buffer.byteLength(contents)})
+        return path
+    }
+
+    // Strings and Buffers can be written directly to file
+    const content_is_string = (typeof contents === 'string' || contents instanceof String)
+    const content_is_buffer = Buffer.isBuffer(contents)
+    if (content_is_string || content_is_buffer) {
+        await fs.promises.writeFile(path, contents, options as any)
+        if (block_until_created) await blockUntilExists(path, {min_bytes: Buffer.byteLength(contents)})
+        return path
+    }
+    
+    // WritableStream objects can be piped into file
+    const content_is_stream = (contents?.pipe)
+    if (content_is_stream) {
+        const stream_byte_length = contents.writableLength
+        const dest_file = fs.createWriteStream(path);
+        await finished(contents.pipe(dest_file))
+        if (block_until_created) await blockUntilExists(path, {min_bytes: stream_byte_length})
+        return path
+    }
+
+    // Objects and Arrays can be JSON-stringified then written into file
+    const content_is_obj = (Array.isArray(contents) || typeof contents === 'object')
+    if (content_is_obj) {
+        contents = JSON.stringify(contents, null, 4)
+        await fs.promises.writeFile(path, contents, options as any)
+        if (block_until_created) await blockUntilExists(path, {min_bytes: Buffer.byteLength(contents)})
+        return path
+    }
+    throw `Cannot write contents of type ${typeof contents} to file: ${path} < ${contents}`
+}
+
+
+async function saveExecResult(bin, args=null, {original_url, version}, {cwd='.', timeout=60_000, ...spawn_options}={}) {
+    assert(bin)
+    assert(original_url && original_url.includes('://'))
+    assert(version)
+
+    const BIN_NAME = bin                 // 'yt-dlp'
+    const ARGS = args || []              // ['--some-arg', '--some-other-arg']
+    const CWD = cwd || process.cwd()     // '.'
+    const TIMEOUT = 300_000              // 5min timeout
+    const PATH = process.env.PATH
+
+    await fs.promises.mkdir(cwd, {recursive: true})
+
+    // quick-n-dirty dump of cmd to bash script, but this might be better: https://github.com/nodejs/node/issues/34840#issuecomment-677402567
+    const cmd_log_str = `#!/usr/bin/env bash
+TYPE="${BIN_NAME}"
+URL="${original_url}"
+VERSION="${version}"
+
+TIMEOUT=${TIMEOUT}
+CWD="${CWD}"
+PATH="${PATH}:$PATH"
+
+${BIN_NAME} ${ARGS.map(arg => JSON.stringify(arg)).join(' ')}
+`
+    const cmd_log = path.join(cwd, 'cmd.sh')
+    await overwriteFile(cmd_log, cmd_log_str)
+
+    const stdout_log = fs.createWriteStream(path.join(cwd, 'stdout.log'))
+    const stderr_log = fs.createWriteStream(path.join(cwd, 'stderr.log'))
+ 
+    const start_date = new Date()
+    const start_ts = Number(start_date)
+    const start_time = start_date.toISOString()
+
+    const child = child_process.spawn(
+        BIN_NAME,
+        ARGS,
+        {
+            cwd: CWD,
+            timeout: TIMEOUT,                           // 5min timeout
+            stdio: [null, 'pipe', 'pipe'],              // </dev/null >./stdout.log 2>./stderr.log
+            // detached: true,                          // run in background, don't block on response
+            ...(spawn_options || {}),
+        },
+    )
+    child.stdout.setEncoding('utf8')
+    child.stdout.pipe(stdout_log)
+    child.stderr.setEncoding('utf8')
+    child.stderr.pipe(stderr_log)
+
+    const exec_info = {
+        TYPE: BIN_NAME,
+        URL: original_url,
+        VERSION: version,
+        bin_name: BIN_NAME,
+        args: ARGS,
+        timeout: TIMEOUT,
+        hostname: os.hostname(),
+        bin_paths: PATH,
+        ppid: process.pid,
+        pid: child.pid,
+        start_ts,
+        start_time,
+        end_time: null,
+        end_ts: null,
+        duration: null,
+        returncode: null,
+        log_files: {},
+        output_files: {},
+    }
+
+    // promise that resolves when the command is finished executing
+    // TODO: refactor to use withTimeout
+    const getResult = (timeout=TIMEOUT) =>
+        new Promise((resolve, reject) => {
+            const loop = setInterval(() => {
+                if (exec_info.end_time) {
+                    clearInterval(loop)
+                    clearTimeout(timer)
+                    resolve(exec_info)
+                }
+            }, 100)
+
+            const timer = setTimeout(() => {
+                clearInterval(loop)
+                if (!exec_info.end_time) {
+                    reject(new Error(`Process ${BIN_NAME} did not finish within TIMEOUT=${TIMEOUT}`))
+                }
+            }, timeout);
+        })
+
+    const logFilesFilter = ({relpath}) =>
+        ['cmd.sh', 'stdout.log', 'stderr.log'].includes(relpath)
+
+    const outputFilesFilter = ({relpath}) =>
+        !['cmd.sh', 'stdout.log', 'stderr.log', 'index.json'].includes(relpath)
+
+    const getOutputFiles = async (filter=outputFilesFilter) => {
+        return await getDirInfo(CWD, {filter, withHelpers: false, withRoot: false, maxdepth: 6})
+    }
+
+    child.on('close', async (returncode) => {
+        const end_date = new Date()
+        exec_info.returncode = returncode
+        exec_info.pid = child.pid
+        exec_info.end_ts = Number(end_date)
+        exec_info.end_time = end_date.toISOString()
+        exec_info.duration = exec_info.end_ts - exec_info.start_ts
+        exec_info.log_files = await getOutputFiles(logFilesFilter)
+        exec_info.output_files = await getOutputFiles(outputFilesFilter)
+
+        const end_metadata = ` 
+# END_TIME="${exec_info.end_time}"
+# DURATION=${exec_info.duration}
+# RETURNCODE=${exec_info.returncode }
+`
+        await fs.promises.appendFile(cmd_log, end_metadata)
+
+        // write exec_info json (which includes file list) to CWD/index.json
+        await overwriteFile(path.join(CWD, 'index.json'), exec_info)
+    })
+    // child.unref()  // dont wait for child process to close
+    
+    const start_metadata = `
+#################### LAST RUN LOG ####################
+# HOSTNAME="${exec_info.hostname}"
+# PPID=${exec_info.ppid}
+# PID=${exec_info.pid}
+# START_TIME="${exec_info.start_time}"
+`
+    await fs.promises.appendFile(cmd_log, start_metadata)
+
+    return {
+        ...exec_info,
+        getResult,
+    }
+}
+
+const HASH_CACHE = {}
+
+async function sha256File(file_path: string, {pwd=null}: {pwd?: string}={}) {
+    return new Promise((resolve, reject) => {
+        pwd = pwd || path.dirname(file_path);
+        if (!file_path.startsWith(pwd)) {
+            file_path = path.join(pwd, file_path);
+        }
+
+        const dirent = fs.statSync(file_path);
+        const abspath = fs.realpathSync(file_path);
+        const cache_key = `${abspath}:${dirent.size}:${dirent.mtimeMs}`; // PATH:SIZE:LAST_MODIFIED_TIME
+        if (cache_key in HASH_CACHE) {
+            resolve(HASH_CACHE[cache_key]);
+        }
+
+        const hash = crypto.createHash('sha256');
+        const rs = fs.createReadStream(abspath);
+        rs.on('error', reject);
+        rs.on('data', chunk => hash.update(chunk));
+        rs.on('end', () => {
+            const final_hash = hash.digest('hex');
+            HASH_CACHE[cache_key] = final_hash;
+            resolve(final_hash);
+        });
+    }) as Promise<string>
+}
+
+async function getDirSha256(dir_path, {pwd=null, withRoot=true, filter=null, maxdepth=-1, subfiles=null}={}) {
+    // console.log('CALCULATING SHA256 OF FILES IN DIR', dir_path, {withRoot, filter, maxdepth})
+    //  dir_path:     path   absolute or relative path of the directory you want the merkle sha256 for
+    //       pwd:     path   (optional) absolute path of the directory you want to interpret dir_path relative to
+    //  withRoot:     bool   include a summary entry for the root dir_path dir in the list as '.'
+    //    filter: function   (optional) provide a filter func for dir entries ({abspath, relpath, basename, dirent}) => true/false
+    //  maxdepth:   number   (optional) does not affect actual calculations, but hides entries below a certain depth in the returned output for brevity
+    //  subfiles: dirent[]   (optional) instead of reading disk, you can manually provide a getDirEntries results list to use
+
+    pwd = pwd || dir_path
+    if (!dir_path.startsWith(pwd)) {
+        dir_path = path.join(pwd, dir_path)
+    }
+
+    const dirent = await fs.promises.stat(dir_path)
+    assert(dirent.isDirectory(), `Tried to compute sha256 of path but missing or not a directory! ${dir_path}`)
+    assert((maxdepth >= -1), `maxdepth must be -1, 0, or 1, 2, 3, etc... (got ${maxdepth})`)
+    
+    // assert(!(filter && withRoot), `Cannot generate root hash (consistently) when a custom filter is provided!`)
+
+    // get the sha256 of every file in a directory recursively (excluding hidden files and symlinks)
+    // EQUIVALENT TO: find . -type f -not -path '*/.*' -print0 | sort -z | xargs -0 sha256sum
+    const all_subfiles = (subfiles as string[]) || await getDirEntries(dir_path, {
+        pwd,
+        recursive: true,
+        includeFiles: true,
+        includeDirs: false,
+
+        // ~~maxdepth,~~    // dont pass maxdepth here, we need the entire file listing to accurately calculate parent dir hashes.
+                            // it never makes sense to ignore subfiles beyond a certain depth for hash calculation. Hashes are
+                            // only useful IDs if they are consistent+repeatable, hashing to an arbitrary depth will produce 
+                            // many different hashes for the same directory, which is not something we need/want polluting the hash space.
+
+        
+        filter,  // we do however allow passing a manual filter funcs which does actually affect the hash
+                 // this is useful to allow quick checks to see whether a certain subset of files has changed or not
+    })
+    const hashes: {[key: string]: string} = {}
+    let hashable_summary_str = ''
+    for (const subfile of all_subfiles) {
+        // {'versions/20240413144307/screen recording.mp4': '1df4d9c3aca8b36f1f73e327d56038f80a35db407a298edb16c72576d7dd894e', ...}
+        hashes[subfile] = await sha256File(subfile, {pwd})
+        const relpath = path.relative(await fs.promises.realpath(dir_path), await fs.promises.realpath(path.join(pwd, subfile)))
+        hashable_summary_str += `${hashes[subfile]}  ./${relpath}\n`
+    }
+    // console.log('CALCULATED HASHES FOR ALL SUBFILES IN DIR', dir_path, Object.keys(hashes).length)
+
+    // get list of subdirectories and recursively hash every subdirectory
+    // EQUIVALENT TO: find . -type d -not -path '*/.*' -maxdepth ${maxdepth} -print | sort
+    const subdirs = await getDirEntries(dir_path, {pwd, recursive: true, includeHidden: false, includeDirs: true, includeFiles: false, filter, maxdepth})
+
+    // for each subdirectory, get its hash recursively and store it in the hash list
+    for (const subdir of subdirs) {
+        // console.log('GETTING SUBDIR HASH', subdir)
+        // a directory's hash is defined as the hash of all the *files* within (excluding dirs/symlinks/hidden)
+        const subdir_hashes = await getDirSha256(
+            subdir,
+            {pwd, withRoot: true, filter, maxdepth: 0},
+        )
+        hashes[subdir] = subdir_hashes['.']
+    }
+    // console.log('CALCULATED HASHES FOR ALL SUBDIRS IN DIR', dir_path, subdirs.length)
+
+    // filter results if maxdepth is provided
+    if (maxdepth >= 0) {
+        for (const subpath of Object.keys(hashes)) {
+            if (pathDepth(subpath) > maxdepth) {
+                delete hashes[subpath]
+            }
+        }
+    }
+    // console.log('LIMITED OUTPUT DUE TO MAXDEPTH', maxdepth, Object.keys(hashes).length)
+
+    // calculate the hash of the root '.' folder by hashing all of hashes of its contents
+    // EQUIVALENT TO: find . -type f -not -path '*/.*' -print0 | sort -z | xargs -0 sha256sum | sha256sum
+    if (withRoot) {
+        // pass the first command's output containing the file list + hashes into another sha256
+        // to get the final hash of the whole directory combined
+        // console.log('CALCULATING FINAL ROOT HASH for ', dir_path)
+        // console.log(hashable_summary_str)
+        hashes['.'] = crypto.createHash('sha256').update(hashable_summary_str).digest('hex') as string
+        // console.log('--->', hashes['.'])
+    }
+
+    return hashes
+}
+
+
+async function getDirInfo(dir_path, {pwd=null, withRoot=true, withHelpers=true, filter=null, maxdepth=-1, subfiles=null}={}) {
+    // get a detailed JSON/dumpable index of a directory's contents, w/ merkle sha256's, sizes, and mimeTypes
+    //    dir_path:     path   absolute or relative path of the directory you want size info for
+    //         pwd:     path   (optional) absolute path of the directory you want to interpret dir_path relative to
+    //    withRoot:     bool   include a summary entry for the root dir_path dir in the list as '.'
+    // withHelpers:     bool   attach many extra helper attrs/funcs to results (beyond JSON-serializable core data)
+    //      filter: function   (optional) provide a filter func for dir entries ({abspath, relpath, basename, dirent}) => true/false
+    //    maxdepth:   number   (optional) does not affect actual calculations, but hides entries below a certain depth in the returned output for brevity
+    //    subfiles: dirent[]   (optional) instead of reading disk, you can manually provide a getDirEntries results list to use
+
+    // {
+    //   ...
+    //   'example.txt': { ... },
+    //   'foobar/example.mp3': { ... },
+    //   '.': {                                 // this is the fully agumented result when withHelpers=true
+    //     is_file: false,
+    //     is_dir: true,
+    //     filename: '.',
+    //     basename: '1709039915.378868',
+    //     mimeType: 'inode/directory'
+    //     extension: undefined,
+    //     num_bytes: 11540961,
+    //     num_subpaths: 15,
+    //     sha256: '9fc58b3ed887e7139338062ebd49bd6795373759e8acb73d2f7a40f1413789da',
+    //     reldepth: 1,
+    //     relpath: './',
+    //     cwd: '/opt/archivebox/data/archive/1709039915.378868/',
+    //     dirname: '/opt/archivebox/data/archive',
+    //     abspath: '/opt/archivebox/data/archive/1709039915.378868',
+    //     dirent: Stats {
+    //       dev: 16777240,
+    //       mode: 16895,
+    //       uid: 501,
+    //       ...
+    //       mtimeMs: 1717160622956.1357,
+    //       ctimeMs: 1717160622956.1357,
+    //     },
+    //     created: '2024-05-31T13:03:42.956Z',
+    //     modified: '2024-05-31T13:03:42.956Z',
+    //     summary: './data/archive/1709039915.378868 (inode/directory 11541kb 9fc58b3e)',
+    //     helptext: 'Verify these hashes by running:\n' +
+    //       '  cd /opt/archivebox/data/archive/1709039915.378868 \n' +
+    //       "  find . -type f -not -path '*/.*' -print0 | sort -z | xargs -0 sha256sum | sha256sum",
+    //   },
+    // }
+
+    pwd = pwd || dir_path
+    if (!dir_path.startsWith(pwd)) {
+        dir_path = path.join(pwd, dir_path)
+    }
+
+    // calculate hashes and sizes recursively
+    const hashes = await getDirSha256(dir_path, {pwd, withRoot, filter, maxdepth, subfiles})
+    const sizes = await getDirSizes(dir_path, {pwd, withRoot, filter, maxdepth, subfiles})
+
+    const num_total_subpaths = Object.keys(hashes).filter(name => name !== '.').length
+
+    const details = {}
+    for (const [filename, sha256] of Object.entries(hashes)) {
+        if (filename === '.' && !withRoot) continue
+
+        const abspath = await fs.promises.realpath(path.join(dir_path, filename))
+        const dirent = await fs.promises.stat(abspath)
+        const num_subpaths = Object.keys(hashes).filter(subpath => subpath.startsWith(filename + '/')).length
+        const is_file = dirent.isFile()
+        const is_dir = dirent.isDirectory()
+
+        // bare-bones info suitable for JSON dumps/exports
+        const basic_info = {
+            sha256,
+            num_bytes: sizes[filename],
+            created: (new Date(dirent.ctimeMs)).toISOString(),
+            mimeType: undefined,
+            extension: undefined,
+            num_subpaths: undefined,
+        }
+        if (is_dir) {
+            basic_info.mimeType = 'inode/directory'
+            basic_info.extension = undefined
+            basic_info.num_subpaths = (filename === '.') ? num_total_subpaths : num_subpaths
+        }
+        if (is_file) {
+            basic_info.mimeType = mime.lookup(abspath) || null
+            basic_info.extension = path.extname(filename)
+            basic_info.num_subpaths = undefined
+        }
+
+        // extra helpers suitable for usage in other areas of the codebase
+        const info_with_helpers = {
+            ...basic_info,
+            filename,
+            basename: path.basename(abspath),
+            dirname: path.dirname(abspath),
+            cwd: dir_path,
+            relpath: is_dir ? (filename + '/') : filename,
+            reldepth: pathDepth(filename),
+            abspath,
+            is_file,
+            is_dir,
+            dirent,
+            modified: (new Date(dirent.mtimeMs)).toISOString(),
+            summary: `${prettyPath(abspath)} (${basic_info.mimeType} ${Math.round(basic_info.num_bytes/1000)}kb ${sha256.substring(0, 8)})`,
+            helptext: undefined,
+        }
+        if (filename === '.') {
+            info_with_helpers.helptext = `Verify these hashes by running:\n  cd ${prettyPath(abspath)} \n  find . -type f -not -path '*/.*' -print0 | sort -z | xargs -0 sha256sum | sha256sum`
+        }
+
+        if ((typeof filter) === 'function') {
+            if (!filter(info_with_helpers)) continue
+        }
+
+        details[filename] = withHelpers ? info_with_helpers : basic_info
+    }
+    return details
+}
+
+// console.log(await getDirSha256(
+//     '/opt/archivebox/data/archive/1709039915.378868/',
+//     {
+//         withRoot: true,
+//         maxdepth: -1,
+//         filter: ({relpath}) => relpath.startsWith('versions'),
+//     },
+// ))
+// console.log(await getDirSizes(
+//     '/opt/archivebox/data/archive/1709039915.378868/',
+//     {
+//         withRoot: false,
+//         maxdepth: 2,
+//         filter: ({relpath}) => !relpath.startsWith('versions'),
+//     },
+// ))
+// console.log(await getDirInfo(
+//     '/opt/archivebox/data/archive/1709039915.378868/',
+//     {
+//         withRoot: true,
+//         withHelpers: true,
+//         maxdepth: 1,
+//         // filter: ({relpath}) => relpath.startsWith('versions'),
+//     },
+// ))
+
+type DetectFilenameOptions = {
+    url?: string,
+    response?: HTTPResponse | Response,
+    page?: Page,
+    dir?: string,
+    abspath?: string,
+    filename?: string,
+    basename?: string,
+    extension?: string,
+    mimeType?: string,
+    resourceType?: string,
+}
+
+async function detectFilename({ url, response, page, dir, abspath, filename, basename, extension, mimeType, resourceType }: DetectFilenameOptions) {
+    // this function takes a url (and/or response/page), and detects the abspath,dir,filename,basename,extention,mimeType
+    // from the URL (+ any enforced path components passed in via args)
+    // example: detectFilename({url: 'https://example.com/favicon.png', extension: 'ico'}) outputs 'favicon.ico'
+    //
+    // it has some quirks that are specific to archiving and may not behave as you expect
+    // e.g. if visiting the url https://example.com/error.zip returns a 500 text/html error page
+    // this may still save it as a .zip with mimeType=application/x-zip and ignore the response mimeType the url ends in .zip
+    // however, if the url has no extension, e.g. https://example.com/error it will 
+    // auto-detect the mimeType based on the response and append an extension, saving as error.html
+    //
+    // ⚠️ SECURITY WARNING: think carefully about the permissions, shell injection, and RCE implications of any changes made here ⚠️
+    // this function writes untrusted web content to the filesystem using auto-detected mimetype to co-erce the extension,
+    // which can be dangerous (e.g. what if one of these downloads is a malicious ransomware .exe, do we really want to give it .exe?
+    // if we do, how do we make sure it never gets executed? (without damaging the integrity of the copy)
+
+    if (!(response || page)) throw 'Either a page or a response must be provided in order to detect mimeType & URL'
+
+    if (response && (typeof response.headers !== 'function')) {
+        const node_fetch_response: Response = response as Response
+        response = {
+            url: () => node_fetch_response.url,
+            headers: () => node_fetch_response.headers,
+        } as unknown as HTTPResponse
+    }
+    response = response as HTTPResponse
+
+    url = url || response?.url() || (await page.url())
+    if (!url) throw 'URL was not provided and could not be detected from {response, page}'
+
+    // Document, Stylesheet, Image, Media, Font, Script, TextTrack, XHR, Fetch, Prefetch, EventSource, WebSocket, Manifest, SignedExchange, Ping, CSPViolationReport, Preflight, Other
+    try {
+        resourceType = resourceType || response?.request()?.resourceType()
+    } catch(err) {
+        // ignore, sometimes response is null/not available
+    }
+    const resourceTypeToMimeType = {
+        'Stylesheet': 'text/css',
+        'Script': 'application/x-javascript',
+        'WebSocket': 'application/json',
+        'Website': 'text/html',
+    }
+    
+    mimeType = mimeType || resourceTypeToMimeType[resourceType]   // guess extension based on request resourceType
+    extension = extension || (mimeType ? mime.extension(mimeType) : null)
+
+    // handle special url cases (e.g. schemes in URL_SCHEMES_IGNORED)
+    if (url.startsWith('about:blank')) {
+        filename = 'about_blank'
+        mimeType = 'text/html'
+    }
+    else if (url.startsWith('data:')) {
+        filename = `data__${hashCode(url)}`
+    }
+
+    // console.log('detectFilename>', {url, dir, abspath, filename, basename, extension, mimeType, resourceType})
+
+    if (abspath) {
+        if (dir || filename || basename || extension)
+            throw '{abspath} should not be passed with other options (e.g. dir, filename, basename, extension)'
+        var {dir, base: filename, ext: extension, name: basename} = path.parse(abspath)
+        // path.parse('/home/user/dir/file.txt') returns:
+        // { root: '/',
+        //   dir: '/home/user/dir',
+        //   base: 'file.txt',
+        //   ext: '.txt',
+        //   name: 'file' } 
+    } else {
+        dir = dir || path.resolve(process.cwd())
+
+        filename = filename                                         // https://example.com/a.1.zip?e.pdf=2#g.h=3  => a.1.zip
+            || (new URL(url)).pathname.split('/').at(-1)          // https://example.com/file124.rss  => file124.rss    prefers last component of path with no query/hash, falls back to domain name if no path
+            || 'index'                                            // https://example.com/abc/def/     => index.html
+            //|| (new URL(url)).hostname.replaceAll('.', '_')     // https://example.com              => example_com  (but if disabled, this would be index.html)
+    }
+    if (!filename) throw 'filename/abspath were not passed and could not be detected from url'
+    
+    const path_extname = path.extname(filename)
+    const resp_mimetype = response && (
+        (response as any).mimeType
+        || response.headers()['content-type']?.split(';')[0]
+        || resourceTypeToMimeType[resourceType]
+        || 'application/octet-stream'
+    )
+
+    mimeType = mimeType                                         // https://example.com/a.1.zip?e.pdf=2#g.h=3  => application/x-zip    prefers mimetype based on extension in path, falls back to response mimeType
+        || (path_extname && mime.lookup(path_extname))                    // https://example.com/file124.rss  => application/rss+xml
+        || resp_mimetype                                                  // https://example.com/get?type=png => image/png
+        
+    extension = extension
+        || (path_extname && path_extname.replace('.', ''))      // https://example.com/a.1.zip?e.pdf=2#g.h=3  => zip                  prefers extension in path, falls back to response mimeType's suggested extension
+        || (resp_mimetype && mime.extension(resp_mimetype))               // https://example.com              => html
+        || ''                                                             // https://example.com/websocket.1  => 
+    if (extension.startsWith('.'))
+        extension = extension.slice(1)
+
+    basename = basename                                         // https://example.com/a.1.zip?e.pdf=2#g.h=3  => a.1                  prefers to filename in path (without extension), falls back to domain name
+        || (path.parse(filename).name)                          // https://mp4dl.example.com        => mp4dl_example_com
+
+    basename = basename.slice(0, 120)                            // truncate at 120 characters (leaving 8 chars for .ext)
+    basename = basename.replace(/[^a-zA-Z0-9%+?&=@;_ \.-]/g, '') // strip characters not allowed in filenames
+
+    filename = basename + '.' + extension
+
+    if (filename.endsWith('.'))
+        filename = filename.slice(0, -1)
+
+    abspath = abspath || path.join(dir, filename)
+
+    // console.log('detectFilename<', {url, dir, abspath, filename, basename, extension, mimeType, resourceType})
+
+    return {
+        url,
+        dir,
+        abspath,
+        filename,
+        basename,
+        extension,
+        mimeType,
+        resourceType,
+        resp_mimetype,
+    }
+}
+
+interface DowloadOptions extends DetectFilenameOptions {
+    browser?: Browser
+    expected_mimetype?: string
+    timeout?: number
+}
+
+async function download({ url, browser, page, response, dir, abspath, filename, basename, extension, expected_mimetype, timeout }: DowloadOptions) {
+    url = url || (response as HTTPResponse)?.url() || (await page?.url())
+    ALREADY_ARCHIVED.add(url.slice(0, 4096))   // prevent running whole archive task on tabs we create for just for downloading
+
+    browser = browser || (page && (await page.browser()))
+    timeout = timeout || 120_000
+    expected_mimetype = expected_mimetype || ''
+    let newPage = null
+    let errors = []
+    let num_bytes = 0
+    let bytesBuffer = null
+
+
+    // if we need to fetch the url (i.e. it's not already been requested)
+    if (!response) {
+        if (!browser) throw 'No {browser} or {page} was provided to download with'
+        newPage = await browser.newPage()
+        if (page) await page.bringToFront()  // if origin page is provided, make sure it stays in foreground
+        response = await newPage.goto(url, {timeout: timeout, waitUntil: 'networkidle0'})
+        if (page) await page.bringToFront()  // if origin page is provided, make sure it stays in foreground
+    }
+    url = url || (response as HTTPResponse)?.url() || (await newPage?.url()) || (await page?.url());
+    const response_mimetype = (response as HTTPResponse).headers()['content-type']?.split(';')[0] || 'text/html'
+
+    // detect the filename we should write to based on provided url/response/page/filename/extension suggestions
+    var {
+        dir,
+        abspath,
+        filename,
+        basename,
+        extension,
+        mimeType,
+    } = await detectFilename({url, page, response, dir, abspath, filename, basename, extension, mimeType})
+
+    // if mimeType is passed, make sure response matches expected mimetype, otherwise consider download a failure
+    if (!response_mimetype.startsWith(expected_mimetype)) {
+        errors.push(`Expected ${expected_mimetype} but got ${response_mimetype}`)
+    } else {
+
+        // download the file using puppeteer's response.buffer()
+        try {
+            // write the response bytes into the output file
+            bytesBuffer = await (response as HTTPResponse).buffer()
+            await overwriteFile(abspath, bytesBuffer)
+            num_bytes = bytesBuffer.length
+        } catch(err) {
+            errors.push(err)
+        }
+
+        // security check to make sure downloaded file is not executable (random binaries downloaded off the internet = dangerous)
+        fs.access(abspath, fs.constants.X_OK, (err) => {
+            if (!err) console.warn(
+                '[⚠️] SECURITY WARNING: Downloaded file appears to be executable:', prettyPath(abspath),
+                '\n     (be careful running untrusted programs downloaded from the internet!)'
+            )
+        })
+    }
+
+    // if we opened a dedicated page for downloading, close it now
+    if (newPage) {
+        newPage.close()
+    }
+
+    if (errors.length) {
+        // console.warn(`[❌] Downloading ${url} (${mimeType}) to ${abspath} failed:`, JSON.stringify(errors, null, 4))
+    } else {
+        console.log(`[💾] Downloaded ${url.substring(0, 40)}  (${num_bytes} ${mimeType})...`.padEnd(82), prettyPath(abspath))
+    }
+
+    return {
+        url, response, errors,
+        dir, abspath, filename, basename, extension, mimeType,
+        bytesBuffer, num_bytes,
+    }
+}
+
+
+/************************** Puppeteer Launching *******************************/
+
+
+async function startCluster(puppeteer, args=CHROME_ARGS_DEFAULT) {
+    console.log(`[🎭] Launching ${CHROME_CLUSTER_WORKERS}x Chromium browsers with puppeteer-cluster:`.padEnd(82), prettyPath(CHROME_PROFILE_PATH))
+    const cluster = await Cluster.launch({
+        puppeteer,
+        monitor: true,
+        maxConcurrency: CHROME_CLUSTER_WORKERS,
+        sameDomainDelay: 2550,
+        workerCreationDelay: 250,
+        timeout: 300_000,                       // total ms timeout for an entire task (1000ms * 60s * 5m)
+        concurrency: Cluster.CONCURRENCY_PAGE,  // share cookies between all tabs in a given browser
+        puppeteerOptions: {
+            args,                                           // all the chrome launch CLI args
+            ignoreDefaultArgs: true,                        // trust me, we have enough args already...
+            // dumpio: true,                                // full debug log output, super noisy
+        }
+    })
+    console.log('*************************************************************************')
+    return cluster
+}
+
+async function remoteBrowser(puppeteer, {browserURL, browserWSEndpoint}) {
+    console.log('[🎭] Connecting Puppeteer to existing Chromium browser via:', browserURL || browserWSEndpoint)
+    let completed_initial_connection = false
+    const browser = await puppeteer.connect({browserURL, browserWSEndpoint, defaultViewport: null, targetFilter: () => completed_initial_connection})
+    completed_initial_connection = true
+    console.log('*************************************************************************')
+    return browser
+}
+
+async function startBrowser(puppeteer, args=CHROME_ARGS_DEFAULT) {
+    console.log('[🎭] Launching Puppeteer Chromium browser...'.padEnd(82+1), prettyPath(CHROME_PROFILE_PATH))
+
+    const browser = await puppeteer.launch({ignoreDefaultArgs: true, args, dumpio: true})
+    globalThis.browser = browser
+    console.log('*************************************************************************')
+    
+    // store all active tabs on global var by url for easier vscode interactive debugging
+    const storeTabForDebugger = async (target) => {
+        try {
+            globalThis.tabs = globalThis.tabs || {}
+            const url = target.url()
+            const page = await target.page()
+            if (!page || page?.isClosed()) {
+                delete globalThis.tabs[url]
+            } else {
+                globalThis.tab = page
+                globalThis.tabs[url] = page
+            }
+        } catch(err) {console.warn(err)}
+    }
+    browser.on('targetcreated', storeTabForDebugger)
+    browser.on('targetchanged', storeTabForDebugger)
+    browser.on('targetdestroyed', storeTabForDebugger)
+
+    // wait for initial extension background.js/service worker targets to load
+    await wait(3_000)
+
+    // prime the extensions cache
+    const extensions = await getChromeExtensionsFromCache({browser})
+    globalThis.extensions = extensions  // for easier debugging only
+
+    // give the user 2min to check any issues with the initial startup pages (bot profile pages),
+    // solve captchas, re-login, etc. then close them after that to save resources
+    const startup_pages = (await browser.pages())
+    const startup_page_close_delay = 120_000
+    setTimeout(async () => {
+        for (const page of startup_pages) {
+            try { await page.close() } catch(err) { /* page may already be closed by now, which is fine */ }
+        }
+        
+    }, startup_page_close_delay)
+    
+    // setup any extensions that need final runtime configuration using their options pages
+    // await setup2CaptchaExtension({browser, extensions})
+
+    // open a placeholder page so browser window stays open when there are no active archiving pages
+    // (it's annoying to have the entire window open/close/open/close/etc every time an archive task runs)
+    const empty_page = await browser.newPage()
+    await wait(250)
+    await empty_page.goto('chrome://version')
+    await wait(500)
+    console.log('*************************************************************************')
+
+    return browser
+}
+
+async function startAPIServer(port=API_SERVER_PORT, host=API_SERVER_HOST, taskCallback=null) {
+    // taskCallback should be an async function that takes ({url}) => and does something with it
+    assert(taskCallback && (typeof taskCallback === 'function'))
+
+    const server = createServer(async (req, res) => {
+        if (req.method === 'POST') {
+            console.log(`[API][POST] ${req.url}`)
+            let body = '';
+
+            req.on('data', (chunk) => {
+                body += chunk;
+            });
+
+            req.on('end', () => {
+                try {
+                    const jsonData = JSON.parse(body);
+                    // Process the JSON data
+                    console.log(jsonData);
+    
+                    res.writeHead(200, { 'Content-Type': 'application/json' });
+                    res.end(JSON.stringify({ message: 'JSON data received' }));
+                } catch (error) {
+                    res.writeHead(400, { 'Content-Type': 'application/json' });
+                    res.end(JSON.stringify({ error: 'Invalid JSON data' }));
+                }
+            });
+        } else if (req.method === 'GET') {
+            console.log(`[API][GET] ${req.url}`)
+            const parsedUrl = new URL(`http://${host}:${port}${req.url}`)
+            const query = new URLSearchParams(parsedUrl.search);
+            const url = query.get('url');
+            if (url && url.includes('://')) {
+                res.writeHead(200, { 'Content-Type': 'text/plain' });
+                try {
+                    await taskCallback({url})
+                    res.end(`${url}\n${TASK_PATH(url)}`);
+                } catch(err) {
+                    res.end(`${url}\n${TASK_PATH(url)}\n${err}`);
+                }
+            } else {
+                res.writeHead(500, { 'Content-Type': 'text/plain' });
+                res.end(`Bad URL: ${url}\n\nExpected: /?url=https://example.com/url/to/archive`);
+            }
+        } else {
+            res.writeHead(405, { 'Content-Type': 'application/json' });
+            res.end(JSON.stringify({ error: 'Method not allowed' }));
+        }
+    })
+
+    server.listen(port, host, () => {
+        console.log(`[🎰] API Server listening for requests on http://${host}:${port}/?url=...`);
+    })
+    console.log('*************************************************************************')
+
+    return server
+}
+
+async function main(urls, cluster=CHROME_CLUSTER) {
+    process.chdir(DATA_DIR)
+
+    const extensions =      await getChromeExtensionsFromPersona({CHROME_EXTENSIONS, CHROME_EXTENSIONS_DIR})
+    const args =            getChromeArgs({...CHROME_LAUNCH_OPTIONS, CHROME_EXTENSIONS: extensions})
+    const preferences =     getChromePreferences({CHROME_PREFERENCES_DEFAULT, CHROME_PREFERENCES_EXTRA, CHROME_DOWNLOADS_DIR, CHROME_EXTENSIONS: extensions})
+    const Puppeteer =       applyChromePreferences(PupeteerExtra, CHROME_PREFERENCES_PATH, preferences)
+    
+    Puppeteer.use(StealthPlugin());
+    // Puppeteer.use(ReplPlugin());
+    // handled by uBlock Origin & ReCaptcha browser extensions, probably not needed here anymore:
+    // Puppeteer.use(RecaptchaPlugin({
+    //     provider: {id: '2captcha', token: API_KEY_2CAPTCHA},
+    //     visualFeedback: true,
+    // }))
+    // const AdblockerPlugin = require('puppeteer-extra-plugin-adblocker')
+    // puppeteer.use(AdblockerPlugin({ blockTrackers: true }))
+
+    if (cluster) {
+        // launch browser with multiple tabs w/ puppeteer
+        const cluster = await startCluster(Puppeteer, args)
+
+        const handleTask = async ({url}) => cluster.queue(url, botArchiveTask)
+        const server = await startAPIServer(API_SERVER_PORT, API_SERVER_HOST, handleTask)
+
+        console.log('[📋] Running tasks in parallel with puppeteer cluster...')
+        for (const url of urls) {
+            if (fs.existsSync(path.join(TASK_PATH(url), 'aiqa.json'))) {
+                try {
+                    JSON.parse((await fs.promises.readFile(path.join(TASK_PATH(url), 'aiqa.json'))).toString())
+                    console.log('    skipping (already present):', TASK_PATH(url), url)
+                    continue
+                } catch(err) {
+                    // pass
+                }
+            }
+            cluster.queue(url, botArchiveTask)
+            await wait(3_000)
+        }
+
+        await cluster.idle();
+        await cluster.close();
+    } else {
+        // launch single new browser w/ puppeter / connect to remote CDP browser w/ puppeteer
+        const browser = await startBrowser(Puppeteer, args)
+        // const browser = await remoteBrowser(Puppeteer, {browserURL, browserWSEndpoint})
+
+        // run speedtest in the background
+        speedtest({browser})
+
+        const handleTask = async ({url}) => await botArchiveTask({page: (await browser.newPage()), data: url})
+        const server = await startAPIServer(API_SERVER_PORT, API_SERVER_HOST, handleTask)
+
+        // wait for any pre-run setup tasks or server requests
+        await wait(5_000)
+
+        let num_succeeded = 0
+        let num_failed = 0
+
+        console.log(`[📋] Running ${urls.length} tasks sequentially with puppeteer browser...`)
+        for (const url of urls) {
+            const run_count = (num_succeeded + num_failed) || 1
+
+            // check if task should be run or skipped based on existing snapshot data present in directory
+            const metrics_path = path.join(TASK_PATH(url), 'metrics.json')
+            const screenshot_path = path.join(TASK_PATH(url), 'screenrecording.gif')
+            const aiqa_path = path.join(TASK_PATH(url), 'aiqa.json')
+            const versions_path = path.join(TASK_PATH(url), 'versions')
+            if (fs.existsSync(metrics_path) && fs.existsSync(screenshot_path) && fs.existsSync(aiqa_path) && fs.existsSync(versions_path)) {
+                try {
+                    const ai_qa_result = JSON.parse(await fs.promises.readFile(aiqa_path, 'utf-8'))
+                    console.log(prettyPath(TASK_PATH(url)), `${ai_qa_result.pct_visible}%`, ai_qa_result.website_brand_name, url.substring(0, 80))
+                    assert(ai_qa_result.website_brand_name)
+                    continue
+                } catch(err) {
+                    // pass
+                }
+            }
+            let delay = 0
+
+            // create a new browser page and run the archiving task
+            const page = (await browser.newPage())
+            try {
+                console.log(ANSI.black + `◤==============================================================================[${String(run_count).padStart(3)}]/[${urls.length}]◥` + ANSI.reset)
+                await botArchiveTask({page, data: url})
+                delay = 1_000
+                num_succeeded += 1
+            } catch(err) {
+                console.error('[❌] Archiving task failed!', url)
+                console.error(err)
+                num_failed += 1
+                delay = 15_000   // extra delay if there are errors
+            }
+            console.log(ANSI.black + `◣==============================================================================[☑ ${num_succeeded}][🆇 ${num_failed}]◢` + ANSI.reset)
+            
+            // check for abnormally high failure rates and exit early if needed
+            const failure_pct = Math.round((num_failed/run_count) * 100)
+            if (failure_pct > 50) {
+                if (run_count > 5) {
+                    console.warn(`[⚠️] ${failure_pct}% Task failure rate is very high! Will self-cancel after 10 URLs if >50% continue to fail...`)
+                }
+                if (run_count > 10) {
+                    throw `Too many tasks failed in a row! Quitting early after ${run_count}/${urls.length} tasks.`
+                }
+            }
+            
+            // increase the delay between tasks based on the ratio of how many are failing:succeeding
+            delay = Math.pow(4, (num_failed/(num_succeeded + 3))) * delay
+            // e.g. 0:1 failure ratio ==  1  * delay ==   1 ~ 15s
+            //      1:1 failure ratio ==  5  * delay ==   5 ~  1m ... 5^(failed:succeeded) exponential increase
+            //      2:1 failure ratio == 25  * delay == 25s ~  6m
+            //      3:1 failure ratio == 125 * delay ==  2m ~ 31m
+            //      etc...
+            //      up to 1hr+
+            delay = Math.min(delay, 3_600_000)   // 1hr maximum delay between tasks
+            delay = Math.max(delay, 1_000)       // 1s minimum delay between tasks
+            if (delay > 2_500) {
+                console.log('... waiting', Math.round(delay/1000), 'seconds (self rate-limit)...')
+            }
+            await wait(delay)   // base ratelimit
+            console.log()
+        }
+
+
+        if (PASSIVE_ARCHIVING) {
+            // replace these as-needed:
+            const browserURL = 'http://localhost:9222/'
+            const browserWSEndpoint = 'ws://localhost:9222/devtools/browser'
+
+            const driver_browser = browser || await remoteBrowser(Puppeteer, {browserURL, browserWSEndpoint})
+            const archiver_browser = {} //await startBrowser(Puppeteer, args)
+
+            const extensions = await getChromeExtensionsFromCache({browser: driver_browser})
+
+            // close both browsers if either one is closed
+            let browser_is_open = true
+            driver_browser.on('disconnected', async () => {browser_is_open = false})  // await archiver_browser.close()
+            // archiver_browser.on('disconnected', async () => {browser_is_open = false; await driver_browser.close()})
+
+            // handle any tab navigation to a new URL in the driver browser
+            const handleUserNavigation = async (target) => {
+                const url = target.url()
+                const page = await target.page()
+                // const client = await target.createCDPSession()
+
+                if (target.type() == 'page' && page && url) {
+                    console.log(ANSI.black + '==============================================================================' + ANSI.reset)
+                    console.warn('[➕] DRIVER BROWSER NAVIGATED:', ANSI.blue, url, ANSI.reset)
+                    
+                    try {
+                        await passiveArchiveTask({browser: driver_browser, page, url})
+                        await wait(3_000)
+                    } catch(err) {
+                        console.error('[❌] Archiving task failed!', url)
+                        console.error(err)
+                        await wait(10_000)   // base ratelimit
+                    }
+                    console.log(ANSI.black + '==============================================================================' + ANSI.reset)
+                    // await client.send('Page.enable')
+                    // await client.send('Page.setWebLifecycleState', {state: 'active'})
+                }
+                // await client.send('Runtime.runIfWaitingForDebugger')
+            }
+
+            // setup handler to archive new page whenever one is opened
+            driver_browser.on('targetcreated', handleUserNavigation)
+            driver_browser.on('targetchanged', handleUserNavigation)
+
+            console.log('------------------------------------------------------')
+            console.log('[👀] Waiting for browser tabs to be opened by human...')
+            while (browser_is_open) {
+                await wait(2_000)
+            }
+        } else {
+            while (true) {
+                await wait(2_000)
+            }
+        }
+
+        await browser.close()
+    }
+    console.log('[✅] Finished all tasks and stopped browsers.')
+    process.exit(0);
+}
+
+
+/******************************************************************************/
+if (import.meta.main) {
+    main(URLS).catch(console.error);
+}
+
+/******************************************************************************/
+
+// if we want to handle CLI args in the future, minimist is great:
+// var argv = require('minimist')(process.argv.slice(2));
+// console.log(argv); // --url=https://example.com --binpath=/browsers/chromium-1047/bin/chromium --datadir=/Chromium
+// const {url, binpath, datadir} = argv;
+
+
+// OLD CODE, may be useful in the future if we need audio in screenrecordings:
+// async function setupScreenrecordingWithAudio(page, wss) {
+//     console.log('[🎬] Setting up screen-recording plugin...');
+//     const stream_port = (await wss).options.port;
+//     // streamPage = await (page.browser()).newPage()
+//     await page.goto(`chrome-extension://jjndjgheafjngoipoacpjgeicjeomjli/options.html#${stream_port}`)
+//
+//     // puppeteer-stream recording start
+//     streamFile = fs.createWriteStream(SCREENRECORDING_PATH(page))
+//     stream = await getStream(page, {
+//       audio: true,
+//       video: true,
+//       bitsPerSecond: 8000000,       // 1080p video
+//     });
+//     stream.pipe(streamFile);
+//     return {stream, streamFile}
+//
+//     // puppeteer-stream recording stop & cleanup
+//     if (stream && streamFile) {
+//         await stream?.destroy();
+//         streamFile?.close();
+//         // await streamPage.close();
+//     }
+// }
+

+ 13 - 54
archivebox/__init__.py

@@ -14,7 +14,6 @@ __package__ = 'archivebox'
 import os
 import sys
 from pathlib import Path
-from typing import cast
 
 ASCII_LOGO = """
  █████╗ ██████╗  ██████╗██╗  ██╗██╗██╗   ██╗███████╗ ██████╗  ██████╗ ██╗  ██╗
@@ -41,69 +40,29 @@ from .misc.checks import check_not_root, check_io_encoding      # noqa
 check_not_root()
 check_io_encoding()
 
-# print('INSTALLING MONKEY PATCHES')
+# Install monkey patches for third-party libraries
 from .misc.monkey_patches import *                    # noqa
-# print('DONE INSTALLING MONKEY PATCHES')
 
+# Built-in plugin directories
+BUILTIN_PLUGINS_DIR = PACKAGE_DIR / 'plugins'
+USER_PLUGINS_DIR = Path(os.getcwd()) / 'plugins'
 
-# print('LOADING VENDORED LIBRARIES')
-from .pkgs import load_vendored_pkgs             # noqa
-load_vendored_pkgs()
-# print('DONE LOADING VENDORED LIBRARIES')
-
-# print('LOADING ABX PLUGIN SPECIFICATIONS')
-# Load ABX Plugin Specifications + Default Implementations
-import abx                                       # noqa
-import abx_spec_archivebox                       # noqa
-import abx_spec_config                           # noqa
-import abx_spec_abx_pkg                          # noqa
-import abx_spec_django                           # noqa
-import abx_spec_searchbackend                    # noqa
-
-abx.pm.add_hookspecs(abx_spec_config.PLUGIN_SPEC)
-abx.pm.register(abx_spec_config.PLUGIN_SPEC())
-
-abx.pm.add_hookspecs(abx_spec_abx_pkg.PLUGIN_SPEC)
-abx.pm.register(abx_spec_abx_pkg.PLUGIN_SPEC())
-
-abx.pm.add_hookspecs(abx_spec_django.PLUGIN_SPEC)
-abx.pm.register(abx_spec_django.PLUGIN_SPEC())
-
-abx.pm.add_hookspecs(abx_spec_searchbackend.PLUGIN_SPEC)
-abx.pm.register(abx_spec_searchbackend.PLUGIN_SPEC())
-
-# Cast to ArchiveBoxPluginSpec to enable static type checking of pm.hook.call() methods
-abx.pm = cast(abx.ABXPluginManager[abx_spec_archivebox.ArchiveBoxPluginSpec], abx.pm)
-pm = abx.pm
-# print('DONE LOADING ABX PLUGIN SPECIFICATIONS')
-
-# Load all pip-installed ABX-compatible plugins
-ABX_ECOSYSTEM_PLUGINS = abx.get_pip_installed_plugins(group='abx')
-
-# Load all built-in ArchiveBox plugins
-ARCHIVEBOX_BUILTIN_PLUGINS = {
-    'config': PACKAGE_DIR / 'config',
-    'workers': PACKAGE_DIR / 'workers',
-    'core': PACKAGE_DIR / 'core',
-    'crawls': PACKAGE_DIR / 'crawls',
-    # 'machine': PACKAGE_DIR / 'machine'
-    # 'search': PACKAGE_DIR / 'search',
+# These are kept for backwards compatibility with existing code
+# that checks for plugins. The new hook system uses discover_hooks()
+ALL_PLUGINS = {
+    'builtin': BUILTIN_PLUGINS_DIR,
+    'user': USER_PLUGINS_DIR,
 }
-
-# Load all user-defined ArchiveBox plugins
-USER_PLUGINS = abx.find_plugins_in_dir(Path(os.getcwd()) / 'user_plugins')
-
-# Import all plugins and register them with ABX Plugin Manager
-ALL_PLUGINS = {**ABX_ECOSYSTEM_PLUGINS, **ARCHIVEBOX_BUILTIN_PLUGINS, **USER_PLUGINS}
-# print('LOADING ALL PLUGINS')
-LOADED_PLUGINS = abx.load_plugins(ALL_PLUGINS)
-# print('DONE LOADING ALL PLUGINS')
+LOADED_PLUGINS = ALL_PLUGINS
 
 # Setup basic config, constants, paths, and version
 from .config.constants import CONSTANTS                         # noqa
 from .config.paths import PACKAGE_DIR, DATA_DIR, ARCHIVE_DIR    # noqa
 from .config.version import VERSION                             # noqa
 
+# Set MACHINE_ID env var so hook scripts can use it
+os.environ.setdefault('MACHINE_ID', CONSTANTS.MACHINE_ID)
+
 __version__ = VERSION
 __author__ = 'ArchiveBox'
 __license__ = 'MIT'

+ 0 - 3
archivebox/api/apps.py

@@ -2,14 +2,11 @@ __package__ = 'archivebox.api'
 
 from django.apps import AppConfig
 
-import abx
-
 
 class APIConfig(AppConfig):
     name = 'api'
 
 
[email protected]
 def register_admin(admin_site):
     from api.admin import register_admin
     register_admin(admin_site)

+ 37 - 6
archivebox/api/migrations/0001_initial.py

@@ -1,10 +1,11 @@
-# Generated by Django 4.2.11 on 2024-04-25 04:19
+# Generated by Django 5.0.6 on 2024-12-25 (squashed)
 
-import api.models
+from uuid import uuid4
 from django.conf import settings
 from django.db import migrations, models
 import django.db.models.deletion
-import uuid
+
+import api.models
 
 
 class Migration(migrations.Migration):
@@ -19,11 +20,41 @@ class Migration(migrations.Migration):
         migrations.CreateModel(
             name='APIToken',
             fields=[
-                ('id', models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False)),
+                ('id', models.UUIDField(default=uuid4, editable=False, primary_key=True, serialize=False, unique=True)),
+                ('created_by', models.ForeignKey(default=None, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)),
+                ('created_at', models.DateTimeField(auto_now_add=True, db_index=True)),
+                ('modified_at', models.DateTimeField(auto_now=True)),
                 ('token', models.CharField(default=api.models.generate_secret_token, max_length=32, unique=True)),
-                ('created', models.DateTimeField(auto_now_add=True)),
                 ('expires', models.DateTimeField(blank=True, null=True)),
-                ('user', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)),
             ],
+            options={
+                'verbose_name': 'API Key',
+                'verbose_name_plural': 'API Keys',
+            },
+        ),
+        migrations.CreateModel(
+            name='OutboundWebhook',
+            fields=[
+                ('id', models.UUIDField(default=uuid4, editable=False, primary_key=True, serialize=False, unique=True)),
+                ('created_by', models.ForeignKey(default=None, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)),
+                ('created_at', models.DateTimeField(auto_now_add=True, db_index=True)),
+                ('modified_at', models.DateTimeField(auto_now=True)),
+                ('name', models.CharField(blank=True, default='', max_length=255)),
+                ('signal', models.CharField(choices=[], db_index=True, max_length=255)),
+                ('ref', models.CharField(db_index=True, max_length=255)),
+                ('endpoint', models.URLField(max_length=2083)),
+                ('headers', models.JSONField(blank=True, default=dict)),
+                ('auth_token', models.CharField(blank=True, default='', max_length=4000)),
+                ('enabled', models.BooleanField(db_index=True, default=True)),
+                ('keep_last_response', models.BooleanField(default=False)),
+                ('last_response', models.TextField(blank=True, default='')),
+                ('last_success', models.DateTimeField(blank=True, null=True)),
+                ('last_failure', models.DateTimeField(blank=True, null=True)),
+            ],
+            options={
+                'verbose_name': 'API Outbound Webhook',
+                'ordering': ['name', 'ref'],
+                'abstract': False,
+            },
         ),
     ]

+ 0 - 17
archivebox/api/migrations/0002_alter_apitoken_options.py

@@ -1,17 +0,0 @@
-# Generated by Django 5.0.4 on 2024-04-26 05:28
-
-from django.db import migrations
-
-
-class Migration(migrations.Migration):
-
-    dependencies = [
-        ('api', '0001_initial'),
-    ]
-
-    operations = [
-        migrations.AlterModelOptions(
-            name='apitoken',
-            options={'verbose_name': 'API Key', 'verbose_name_plural': 'API Keys'},
-        ),
-    ]

+ 0 - 78
archivebox/api/migrations/0003_rename_user_apitoken_created_by_apitoken_abid_and_more.py

@@ -1,78 +0,0 @@
-# Generated by Django 5.0.6 on 2024-06-03 01:52
-
-import charidfield.fields
-import django.db.models.deletion
-import signal_webhooks.fields
-import signal_webhooks.utils
-import uuid
-from django.conf import settings
-from django.db import migrations, models
-
-import archivebox.base_models.models
-
-
-class Migration(migrations.Migration):
-
-    dependencies = [
-        ('api', '0002_alter_apitoken_options'),
-        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
-    ]
-
-    operations = [
-        migrations.RenameField(
-            model_name='apitoken',
-            old_name='user',
-            new_name='created_by',
-        ),
-        migrations.AddField(
-            model_name='apitoken',
-            name='abid',
-            field=charidfield.fields.CharIDField(blank=True, db_index=True, default=None, help_text='ABID-format identifier for this entity (e.g. snp_01BJQMF54D093DXEAWZ6JYRPAQ)', max_length=30, null=True, prefix='apt_', unique=True),
-        ),
-        migrations.AddField(
-            model_name='apitoken',
-            name='modified',
-            field=models.DateTimeField(auto_now=True),
-        ),
-        migrations.AddField(
-            model_name='apitoken',
-            name='uuid',
-            field=models.UUIDField(blank=True, null=True, unique=True),
-        ),
-        migrations.AlterField(
-            model_name='apitoken',
-            name='id',
-            field=models.UUIDField(default=uuid.uuid4, primary_key=True, serialize=False),
-        ),
-        migrations.CreateModel(
-            name='OutboundWebhook',
-            fields=[
-                ('name', models.CharField(db_index=True, help_text='Give your webhook a descriptive name (e.g. Notify ACME Slack channel of any new ArchiveResults).', max_length=255, unique=True, verbose_name='name')),
-                ('signal', models.CharField(choices=[('CREATE', 'Create'), ('UPDATE', 'Update'), ('DELETE', 'Delete'), ('M2M', 'M2M changed'), ('CREATE_OR_UPDATE', 'Create or Update'), ('CREATE_OR_DELETE', 'Create or Delete'), ('CREATE_OR_M2M', 'Create or M2M changed'), ('UPDATE_OR_DELETE', 'Update or Delete'), ('UPDATE_OR_M2M', 'Update or M2M changed'), ('DELETE_OR_M2M', 'Delete or M2M changed'), ('CREATE_UPDATE_OR_DELETE', 'Create, Update or Delete'), ('CREATE_UPDATE_OR_M2M', 'Create, Update or M2M changed'), ('CREATE_DELETE_OR_M2M', 'Create, Delete or M2M changed'), ('UPDATE_DELETE_OR_M2M', 'Update, Delete or M2M changed'), ('CREATE_UPDATE_DELETE_OR_M2M', 'Create, Update or Delete, or M2M changed')], help_text='The type of event the webhook should fire for (e.g. Create, Update, Delete).', max_length=255, verbose_name='signal')),
-                ('ref', models.CharField(db_index=True, help_text='Dot import notation of the model the webhook should fire for (e.g. core.models.Snapshot or core.models.ArchiveResult).', max_length=1023, validators=[signal_webhooks.utils.model_from_reference], verbose_name='referenced model')),
-                ('endpoint', models.URLField(help_text='External URL to POST the webhook notification to (e.g. https://someapp.example.com/webhook/some-webhook-receiver).', max_length=2047, verbose_name='endpoint')),
-                ('headers', models.JSONField(blank=True, default=dict, help_text='Headers to send with the webhook request.', validators=[signal_webhooks.utils.is_dict], verbose_name='headers')),
-                ('auth_token', signal_webhooks.fields.TokenField(blank=True, default='', help_text='Authentication token to use in an Authorization header.', max_length=8000, validators=[signal_webhooks.utils.decode_cipher_key], verbose_name='authentication token')),
-                ('enabled', models.BooleanField(default=True, help_text='Is this webhook enabled?', verbose_name='enabled')),
-                ('keep_last_response', models.BooleanField(default=False, help_text='Should the webhook keep a log of the latest response it got?', verbose_name='keep last response')),
-                ('updated', models.DateTimeField(auto_now=True, help_text='When the webhook was last updated.', verbose_name='updated')),
-                ('last_response', models.CharField(blank=True, default='', help_text='Latest response to this webhook.', max_length=8000, verbose_name='last response')),
-                ('last_success', models.DateTimeField(default=None, help_text='When the webhook last succeeded.', null=True, verbose_name='last success')),
-                ('last_failure', models.DateTimeField(default=None, help_text='When the webhook last failed.', null=True, verbose_name='last failure')),
-                ('created', models.DateTimeField(auto_now_add=True)),
-                ('modified', models.DateTimeField(auto_now=True)),
-                ('id', models.UUIDField(blank=True, null=True, unique=True)),
-                ('uuid', models.UUIDField(default=uuid.uuid4, primary_key=True, serialize=False)),
-                ('abid', charidfield.fields.CharIDField(blank=True, db_index=True, default=None, help_text='ABID-format identifier for this entity (e.g. snp_01BJQMF54D093DXEAWZ6JYRPAQ)', max_length=30, null=True, prefix='whk_', unique=True)),
-                ('created_by', models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)),
-            ],
-            options={
-                'verbose_name': 'API Outbound Webhook',
-                'abstract': False,
-            },
-        ),
-        migrations.AddConstraint(
-            model_name='outboundwebhook',
-            constraint=models.UniqueConstraint(fields=('ref', 'endpoint'), name='prevent_duplicate_hooks_api_outboundwebhook'),
-        ),
-    ]

+ 0 - 24
archivebox/api/migrations/0004_alter_apitoken_id_alter_apitoken_uuid.py

@@ -1,24 +0,0 @@
-# Generated by Django 5.1 on 2024-08-20 10:44
-
-import uuid
-from django.db import migrations, models
-
-
-class Migration(migrations.Migration):
-
-    dependencies = [
-        ('api', '0003_rename_user_apitoken_created_by_apitoken_abid_and_more'),
-    ]
-
-    operations = [
-        migrations.AlterField(
-            model_name='apitoken',
-            name='id',
-            field=models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False),
-        ),
-        migrations.AlterField(
-            model_name='apitoken',
-            name='uuid',
-            field=models.UUIDField(blank=True, editable=False, null=True, unique=True),
-        ),
-    ]

+ 0 - 22
archivebox/api/migrations/0005_remove_apitoken_uuid_remove_outboundwebhook_uuid_and_more.py

@@ -1,22 +0,0 @@
-# Generated by Django 5.1 on 2024-08-20 22:40
-
-import uuid
-from django.db import migrations, models
-
-
-class Migration(migrations.Migration):
-
-    dependencies = [
-        ('api', '0004_alter_apitoken_id_alter_apitoken_uuid'),
-    ]
-
-    operations = [
-        migrations.RemoveField(
-            model_name='apitoken',
-            name='uuid',
-        ),
-        migrations.RemoveField(
-            model_name='outboundwebhook',
-            name='id',
-        ),
-    ]

+ 0 - 29
archivebox/api/migrations/0006_remove_outboundwebhook_uuid_apitoken_id_and_more.py

@@ -1,29 +0,0 @@
-# Generated by Django 5.1 on 2024-08-20 22:43
-
-import uuid
-from django.db import migrations, models
-
-
-class Migration(migrations.Migration):
-
-    dependencies = [
-        ('api', '0005_remove_apitoken_uuid_remove_outboundwebhook_uuid_and_more'),
-    ]
-
-    operations = [
-        migrations.RenameField(
-            model_name='outboundwebhook',
-            old_name='uuid',
-            new_name='id'
-        ),
-        migrations.AlterField(
-            model_name='outboundwebhook',
-            name='id',
-            field=models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False),
-        ),
-        migrations.AlterField(
-            model_name='apitoken',
-            name='id',
-            field=models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False),
-        ),
-    ]

+ 0 - 23
archivebox/api/migrations/0007_alter_apitoken_created_by.py

@@ -1,23 +0,0 @@
-# Generated by Django 5.1 on 2024-08-20 22:52
-
-import django.db.models.deletion
-from django.conf import settings
-from django.db import migrations, models
-
-import archivebox.base_models.models
-
-
-class Migration(migrations.Migration):
-
-    dependencies = [
-        ('api', '0006_remove_outboundwebhook_uuid_apitoken_id_and_more'),
-        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
-    ]
-
-    operations = [
-        migrations.AlterField(
-            model_name='apitoken',
-            name='created_by',
-            field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
-        ),
-    ]

+ 0 - 48
archivebox/api/migrations/0008_alter_apitoken_created_alter_apitoken_created_by_and_more.py

@@ -1,48 +0,0 @@
-# Generated by Django 5.1 on 2024-09-04 23:32
-
-import django.db.models.deletion
-from django.conf import settings
-from django.db import migrations, models
-
-import archivebox.base_models.models
-
-
-class Migration(migrations.Migration):
-
-    dependencies = [
-        ('api', '0007_alter_apitoken_created_by'),
-        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
-    ]
-
-    operations = [
-        migrations.AlterField(
-            model_name='apitoken',
-            name='created',
-            field=archivebox.base_models.models.AutoDateTimeField(db_index=True, default=None),
-        ),
-        migrations.AlterField(
-            model_name='apitoken',
-            name='created_by',
-            field=models.ForeignKey(default=None, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
-        ),
-        migrations.AlterField(
-            model_name='apitoken',
-            name='id',
-            field=models.UUIDField(default=None, editable=False, primary_key=True, serialize=False, unique=True, verbose_name='ID'),
-        ),
-        migrations.AlterField(
-            model_name='outboundwebhook',
-            name='created',
-            field=archivebox.base_models.models.AutoDateTimeField(db_index=True, default=None),
-        ),
-        migrations.AlterField(
-            model_name='outboundwebhook',
-            name='created_by',
-            field=models.ForeignKey(default=None, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
-        ),
-        migrations.AlterField(
-            model_name='outboundwebhook',
-            name='id',
-            field=models.UUIDField(default=None, editable=False, primary_key=True, serialize=False, unique=True, verbose_name='ID'),
-        ),
-    ]

+ 0 - 40
archivebox/api/migrations/0009_rename_created_apitoken_created_at_and_more.py

@@ -1,40 +0,0 @@
-# Generated by Django 5.1 on 2024-09-05 00:26
-
-from django.db import migrations, models
-
-import archivebox.base_models.models
-
-
-class Migration(migrations.Migration):
-
-    dependencies = [
-        ('api', '0008_alter_apitoken_created_alter_apitoken_created_by_and_more'),
-    ]
-
-    operations = [
-        migrations.RenameField(
-            model_name='apitoken',
-            old_name='created',
-            new_name='created_at',
-        ),
-        migrations.RenameField(
-            model_name='apitoken',
-            old_name='modified',
-            new_name='modified_at',
-        ),
-        migrations.RenameField(
-            model_name='outboundwebhook',
-            old_name='modified',
-            new_name='modified_at',
-        ),
-        migrations.AddField(
-            model_name='outboundwebhook',
-            name='created_at',
-            field=archivebox.base_models.models.AutoDateTimeField(db_index=True, default=None),
-        ),
-        migrations.AlterField(
-            model_name='outboundwebhook',
-            name='created',
-            field=models.DateTimeField(auto_now_add=True, help_text='When the webhook was created.', verbose_name='created'),
-        ),
-    ]

+ 1 - 1
archivebox/api/models.py

@@ -38,7 +38,7 @@ class APIToken(models.Model):
         return not self.expires or self.expires >= (for_date or timezone.now())
 
 
-class OutboundWebhook(models.Model, WebhookBase):
+class OutboundWebhook(WebhookBase):
     id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
     created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False)
     created_at = models.DateTimeField(default=timezone.now, db_index=True)

+ 0 - 1
archivebox/api/v1_api.py

@@ -84,7 +84,6 @@ api = NinjaAPIWithIOCapture(
     title='ArchiveBox API',
     description=html_description,
     version=VERSION,
-    csrf=False,
     auth=API_AUTH_METHODS,
     urls_namespace="api-1",
     docs=Swagger(settings={"persistAuthorization": True}),

+ 68 - 0
archivebox/base_models/admin.py

@@ -3,9 +3,77 @@
 __package__ = 'archivebox.base_models'
 
 from django.contrib import admin
+from django.utils.html import format_html, mark_safe
 from django_object_actions import DjangoObjectActions
 
 
+class ConfigEditorMixin:
+    """
+    Mixin for admin classes with a config JSON field.
+
+    Provides a readonly field that shows available config options
+    from all discovered plugin schemas.
+    """
+
+    @admin.display(description='Available Config Options')
+    def available_config_options(self, obj):
+        """Show documentation for available config keys."""
+        try:
+            from archivebox.hooks import discover_plugin_configs
+            plugin_configs = discover_plugin_configs()
+        except ImportError:
+            return format_html('<i>Plugin config system not available</i>')
+
+        html_parts = [
+            '<details>',
+            '<summary style="cursor: pointer; font-weight: bold; padding: 4px;">',
+            'Click to see available config keys ({})</summary>'.format(
+                sum(len(s.get('properties', {})) for s in plugin_configs.values())
+            ),
+            '<div style="max-height: 400px; overflow-y: auto; padding: 8px; background: #f8f8f8; border-radius: 4px; font-family: monospace; font-size: 11px;">',
+        ]
+
+        for plugin_name, schema in sorted(plugin_configs.items()):
+            properties = schema.get('properties', {})
+            if not properties:
+                continue
+
+            html_parts.append(f'<div style="margin: 8px 0;"><strong style="color: #333;">{plugin_name}</strong></div>')
+            html_parts.append('<table style="width: 100%; border-collapse: collapse; margin-bottom: 12px;">')
+            html_parts.append('<tr style="background: #eee;"><th style="text-align: left; padding: 4px;">Key</th><th style="text-align: left; padding: 4px;">Type</th><th style="text-align: left; padding: 4px;">Default</th><th style="text-align: left; padding: 4px;">Description</th></tr>')
+
+            for key, prop in sorted(properties.items()):
+                prop_type = prop.get('type', 'string')
+                default = prop.get('default', '')
+                description = prop.get('description', '')
+
+                # Truncate long defaults
+                default_str = str(default)
+                if len(default_str) > 30:
+                    default_str = default_str[:27] + '...'
+
+                html_parts.append(
+                    f'<tr style="border-bottom: 1px solid #ddd;">'
+                    f'<td style="padding: 4px; font-weight: bold;">{key}</td>'
+                    f'<td style="padding: 4px; color: #666;">{prop_type}</td>'
+                    f'<td style="padding: 4px; color: #666;">{default_str}</td>'
+                    f'<td style="padding: 4px;">{description}</td>'
+                    f'</tr>'
+                )
+
+            html_parts.append('</table>')
+
+        html_parts.append('</div></details>')
+        html_parts.append(
+            '<p style="margin-top: 8px; color: #666; font-size: 11px;">'
+            '<strong>Usage:</strong> Add key-value pairs in JSON format, e.g., '
+            '<code>{"SAVE_WGET": false, "WGET_TIMEOUT": 120}</code>'
+            '</p>'
+        )
+
+        return mark_safe(''.join(html_parts))
+
+
 class BaseModelAdmin(DjangoObjectActions, admin.ModelAdmin):
     list_display = ('id', 'created_at', 'created_by')
     readonly_fields = ('id', 'created_at', 'modified_at')

+ 2 - 2
archivebox/base_models/apps.py

@@ -1,7 +1,7 @@
 # from django.apps import AppConfig
 
 
-# class AbidUtilsConfig(AppConfig):
+# class BaseModelsConfig(AppConfig):
 #     default_auto_field = 'django.db.models.BigAutoField'
-    
+
 #     name = 'base_models'

+ 15 - 2
archivebox/base_models/models.py

@@ -19,7 +19,7 @@ from django.conf import settings
 from django_stubs_ext.db.models import TypedModelMeta
 
 from archivebox import DATA_DIR
-from archivebox.index.json import to_json
+from archivebox.misc.util import to_json
 from archivebox.misc.hashing import get_dir_info
 
 
@@ -31,6 +31,16 @@ def get_or_create_system_user_pk(username='system'):
     return user.pk
 
 
+class AutoDateTimeField(models.DateTimeField):
+    """DateTimeField that automatically updates on save (legacy compatibility)."""
+    def pre_save(self, model_instance, add):
+        if add or not getattr(model_instance, self.attname):
+            value = timezone.now()
+            setattr(model_instance, self.attname, value)
+            return value
+        return super().pre_save(model_instance, add)
+
+
 class ModelWithUUID(models.Model):
     id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
     created_at = models.DateTimeField(default=timezone.now, db_index=True)
@@ -74,6 +84,7 @@ class ModelWithSerializers(ModelWithUUID):
 
 
 class ModelWithNotes(models.Model):
+    """Mixin for models with a notes field."""
     notes = models.TextField(blank=True, null=False, default='')
 
     class Meta:
@@ -81,6 +92,7 @@ class ModelWithNotes(models.Model):
 
 
 class ModelWithHealthStats(models.Model):
+    """Mixin for models with health tracking fields."""
     num_uses_failed = models.PositiveIntegerField(default=0)
     num_uses_succeeded = models.PositiveIntegerField(default=0)
 
@@ -94,6 +106,7 @@ class ModelWithHealthStats(models.Model):
 
 
 class ModelWithConfig(models.Model):
+    """Mixin for models with a JSON config field."""
     config = models.JSONField(default=dict, null=False, blank=False, editable=True)
 
     class Meta:
@@ -113,7 +126,7 @@ class ModelWithOutputDir(ModelWithSerializers):
 
     @property
     def output_dir_parent(self) -> str:
-        return getattr(self, 'output_dir_parent', f'{self._meta.model_name}s')
+        return f'{self._meta.model_name}s'
 
     @property
     def output_dir_name(self) -> str:

+ 10 - 1
archivebox/cli/__init__.py

@@ -37,7 +37,13 @@ class ArchiveBoxGroup(click.Group):
         'server': 'archivebox.cli.archivebox_server.main',
         'shell': 'archivebox.cli.archivebox_shell.main',
         'manage': 'archivebox.cli.archivebox_manage.main',
+        # Worker/orchestrator commands
+        'orchestrator': 'archivebox.cli.archivebox_orchestrator.main',
         'worker': 'archivebox.cli.archivebox_worker.main',
+        # Task commands (called by workers as subprocesses)
+        'crawl': 'archivebox.cli.archivebox_crawl.main',
+        'snapshot': 'archivebox.cli.archivebox_snapshot.main',
+        'extract': 'archivebox.cli.archivebox_extract.main',
     }
     all_subcommands = {
         **meta_commands,
@@ -118,11 +124,14 @@ def cli(ctx, help=False):
                 raise
             
 
-def main(args=None, prog_name=None):
+def main(args=None, prog_name=None, stdin=None):
     # show `docker run archivebox xyz` in help messages if running in docker
     IN_DOCKER = os.environ.get('IN_DOCKER', False) in ('1', 'true', 'True', 'TRUE', 'yes')
     IS_TTY = sys.stdin.isatty()
     prog_name = prog_name or (f'docker compose run{"" if IS_TTY else " -T"} archivebox' if IN_DOCKER else 'archivebox')
+    
+    # stdin param allows passing input data from caller (used by __main__.py)
+    # currently not used by click-based CLI, but kept for backwards compatibility
 
     try:
         cli(args=args, prog_name=prog_name)

+ 82 - 161
archivebox/cli/archivebox_add.py

@@ -16,214 +16,135 @@ from archivebox.misc.util import enforce_types, docstring
 from archivebox import CONSTANTS
 from archivebox.config.common import ARCHIVING_CONFIG
 from archivebox.config.permissions import USER, HOSTNAME
-from archivebox.parsers import PARSERS
 
 
 if TYPE_CHECKING:
     from core.models import Snapshot
 
 
-ORCHESTRATOR = None
-
 @enforce_types
 def add(urls: str | list[str],
         depth: int | str=0,
         tag: str='',
         parser: str="auto",
-        extract: str="",
+        plugins: str="",
         persona: str='Default',
         overwrite: bool=False,
         update: bool=not ARCHIVING_CONFIG.ONLY_NEW,
         index_only: bool=False,
         bg: bool=False,
         created_by_id: int | None=None) -> QuerySet['Snapshot']:
-    """Add a new URL or list of URLs to your archive"""
+    """Add a new URL or list of URLs to your archive.
+
+    The new flow is:
+    1. Save URLs to sources file
+    2. Create Seed pointing to the file
+    3. Create Crawl with max_depth
+    4. Create root Snapshot pointing to file:// URL (depth=0)
+    5. Orchestrator runs parser extractors on root snapshot
+    6. Parser extractors output to urls.jsonl
+    7. URLs are added to Crawl.urls and child Snapshots are created
+    8. Repeat until max_depth is reached
+    """
 
-    global ORCHESTRATOR
+    from rich import print
 
     depth = int(depth)
 
-    assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)'
-    
+    assert depth in (0, 1, 2, 3, 4), 'Depth must be 0-4'
+
     # import models once django is set up
+    from core.models import Snapshot
     from crawls.models import Seed, Crawl
-    from workers.orchestrator import Orchestrator
     from archivebox.base_models.models import get_or_create_system_user_pk
-
+    from workers.orchestrator import Orchestrator
 
     created_by_id = created_by_id or get_or_create_system_user_pk()
-    
-    # 1. save the provided urls to sources/2024-11-05__23-59-59__cli_add.txt
+
+    # 1. Save the provided URLs to sources/2024-11-05__23-59-59__cli_add.txt
     sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__cli_add.txt'
+    sources_file.parent.mkdir(parents=True, exist_ok=True)
     sources_file.write_text(urls if isinstance(urls, str) else '\n'.join(urls))
-    
-    # 2. create a new Seed pointing to the sources/2024-11-05__23-59-59__cli_add.txt
+
+    # 2. Create a new Seed pointing to the sources file
     cli_args = [*sys.argv]
     if cli_args[0].lower().endswith('archivebox'):
-        cli_args[0] = 'archivebox'  # full path to archivebox bin to just archivebox e.g. /Volumes/NVME/Users/squash/archivebox/.venv/bin/archivebox -> archivebox
+        cli_args[0] = 'archivebox'
     cmd_str = ' '.join(cli_args)
-    seed = Seed.from_file(sources_file, label=f'{USER}@{HOSTNAME} $ {cmd_str}', parser=parser, tag=tag, created_by=created_by_id, config={
-        'ONLY_NEW': not update,
-        'INDEX_ONLY': index_only,
-        'OVERWRITE': overwrite,
-        'EXTRACTORS': extract,
-        'DEFAULT_PERSONA': persona or 'Default',
-    })
-    # 3. create a new Crawl pointing to the Seed
+
+    seed = Seed.from_file(
+        sources_file,
+        label=f'{USER}@{HOSTNAME} $ {cmd_str}',
+        parser=parser,
+        tag=tag,
+        created_by=created_by_id,
+        config={
+            'ONLY_NEW': not update,
+            'INDEX_ONLY': index_only,
+            'OVERWRITE': overwrite,
+            'EXTRACTORS': plugins,
+            'DEFAULT_PERSONA': persona or 'Default',
+        }
+    )
+
+    # 3. Create a new Crawl pointing to the Seed (status=queued)
     crawl = Crawl.from_seed(seed, max_depth=depth)
-    
-    # 4. start the Orchestrator & wait until it completes
-    #    ... orchestrator will create the root Snapshot, which creates pending ArchiveResults, which gets run by the ArchiveResultActors ...
-    # from crawls.actors import CrawlActor
-    # from core.actors import SnapshotActor, ArchiveResultActor
-
-    if not bg:
-        orchestrator = Orchestrator(exit_on_idle=True, max_concurrent_actors=4)
-        orchestrator.start()
-    
-    # 5. return the list of new Snapshots created
+
+    print(f'[green]\\[+] Created Crawl {crawl.id} with max_depth={depth}[/green]')
+    print(f'    [dim]Seed: {seed.uri}[/dim]')
+
+    # 4. The CrawlMachine will create the root Snapshot when started
+    #    Root snapshot URL = file:///path/to/sources/...txt
+    #    Parser extractors will run on it and discover URLs
+    #    Those URLs become child Snapshots (depth=1)
+
+    if index_only:
+        # Just create the crawl but don't start processing
+        print('[yellow]\\[*] Index-only mode - crawl created but not started[/yellow]')
+        # Create root snapshot manually
+        crawl.create_root_snapshot()
+        return crawl.snapshot_set.all()
+
+    # 5. Start the orchestrator to process the queue
+    #    The orchestrator will:
+    #    - Process Crawl -> create root Snapshot
+    #    - Process root Snapshot -> run parser extractors -> discover URLs
+    #    - Create child Snapshots from discovered URLs
+    #    - Process child Snapshots -> run extractors
+    #    - Repeat until max_depth reached
+
+    if bg:
+        # Background mode: start orchestrator and return immediately
+        print('[yellow]\\[*] Running in background mode - starting orchestrator...[/yellow]')
+        orchestrator = Orchestrator(exit_on_idle=True)
+        orchestrator.start()  # Fork to background
+    else:
+        # Foreground mode: run orchestrator until all work is done
+        print(f'[green]\\[*] Starting orchestrator to process crawl...[/green]')
+        orchestrator = Orchestrator(exit_on_idle=True)
+        orchestrator.runloop()  # Block until complete
+
+    # 6. Return the list of Snapshots in this crawl
     return crawl.snapshot_set.all()
 
 
 @click.command()
[email protected]('--depth', '-d', type=click.Choice(('0', '1')), default='0', help='Recursively archive linked pages up to N hops away')
[email protected]('--depth', '-d', type=click.Choice([str(i) for i in range(5)]), default='0', help='Recursively archive linked pages up to N hops away')
 @click.option('--tag', '-t', default='', help='Comma-separated list of tags to add to each snapshot e.g. tag1,tag2,tag3')
[email protected]('--parser', type=click.Choice(['auto', *PARSERS.keys()]), default='auto', help='Parser for reading input URLs')
[email protected]('--extract', '-e', default='', help='Comma-separated list of extractors to use e.g. title,favicon,screenshot,singlefile,...')
[email protected]('--parser', default='auto', help='Parser for reading input URLs (auto, txt, html, rss, json, jsonl, netscape, ...)')
[email protected]('--plugins', '-p', default='', help='Comma-separated list of plugins to run e.g. title,favicon,screenshot,singlefile,...')
 @click.option('--persona', default='Default', help='Authentication profile to use when archiving')
 @click.option('--overwrite', '-F', is_flag=True, help='Overwrite existing data if URLs have been archived previously')
 @click.option('--update', is_flag=True, default=ARCHIVING_CONFIG.ONLY_NEW, help='Retry any previously skipped/failed URLs when re-adding them')
 @click.option('--index-only', is_flag=True, help='Just add the URLs to the index without archiving them now')
-# @click.option('--update-all', is_flag=True, help='Update ALL links in index when finished adding new ones')
[email protected]('--bg', is_flag=True, help='Run crawl in background worker instead of immediately')
[email protected]('--bg', is_flag=True, help='Run archiving in background (start orchestrator and return immediately)')
 @click.argument('urls', nargs=-1, type=click.Path())
 @docstring(add.__doc__)
 def main(**kwargs):
     """Add a new URL or list of URLs to your archive"""
-    
+
     add(**kwargs)
 
 
 if __name__ == '__main__':
     main()
-
-
-
-
-# OLD VERSION:
-# def add(urls: Union[str, List[str]],
-#         tag: str='',
-#         depth: int=0,
-#         update: bool=not ARCHIVING_CONFIG.ONLY_NEW,
-#         update_all: bool=False,
-#         index_only: bool=False,
-#         overwrite: bool=False,
-#         # duplicate: bool=False,  # TODO: reuse the logic from admin.py resnapshot to allow adding multiple snapshots by appending timestamp automatically
-#         init: bool=False,
-#         extractors: str="",
-#         parser: str="auto",
-#         created_by_id: int | None=None,
-#         out_dir: Path=DATA_DIR) -> List[Link]:
-#     """Add a new URL or list of URLs to your archive"""
-
-#     from core.models import Snapshot, Tag
-#     # from workers.supervisord_util import start_cli_workers, tail_worker_logs
-#     # from workers.tasks import bg_archive_link
-    
-
-#     assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)'
-
-#     extractors = extractors.split(",") if extractors else []
-
-#     if init:
-#         run_subcommand('init', stdin=None, pwd=out_dir)
-
-#     # Load list of links from the existing index
-#     check_data_folder()
-
-#     # worker = start_cli_workers()
-    
-#     new_links: List[Link] = []
-#     all_links = load_main_index(out_dir=out_dir)
-
-#     log_importing_started(urls=urls, depth=depth, index_only=index_only)
-#     if isinstance(urls, str):
-#         # save verbatim stdin to sources
-#         write_ahead_log = save_text_as_source(urls, filename='{ts}-import.txt', out_dir=out_dir)
-#     elif isinstance(urls, list):
-#         # save verbatim args to sources
-#         write_ahead_log = save_text_as_source('\n'.join(urls), filename='{ts}-import.txt', out_dir=out_dir)
-    
-
-#     new_links += parse_links_from_source(write_ahead_log, root_url=None, parser=parser)
-
-#     # If we're going one level deeper, download each link and look for more links
-#     new_links_depth = []
-#     if new_links and depth == 1:
-#         log_crawl_started(new_links)
-#         for new_link in new_links:
-#             try:
-#                 downloaded_file = save_file_as_source(new_link.url, filename=f'{new_link.timestamp}-crawl-{new_link.domain}.txt', out_dir=out_dir)
-#                 new_links_depth += parse_links_from_source(downloaded_file, root_url=new_link.url)
-#             except Exception as err:
-#                 stderr('[!] Failed to get contents of URL {new_link.url}', err, color='red')
-
-#     imported_links = list({link.url: link for link in (new_links + new_links_depth)}.values())
-    
-#     new_links = dedupe_links(all_links, imported_links)
-
-#     write_main_index(links=new_links, out_dir=out_dir, created_by_id=created_by_id)
-#     all_links = load_main_index(out_dir=out_dir)
-
-#     tags = [
-#         Tag.objects.get_or_create(name=name.strip(), defaults={'created_by_id': created_by_id})[0]
-#         for name in tag.split(',')
-#         if name.strip()
-#     ]
-#     if tags:
-#         for link in imported_links:
-#             snapshot = Snapshot.objects.get(url=link.url)
-#             snapshot.tags.add(*tags)
-#             snapshot.tags_str(nocache=True)
-#             snapshot.save()
-#         # print(f'    √ Tagged {len(imported_links)} Snapshots with {len(tags)} tags {tags_str}')
-
-#     if index_only:
-#         # mock archive all the links using the fake index_only extractor method in order to update their state
-#         if overwrite:
-#             archive_links(imported_links, overwrite=overwrite, methods=['index_only'], out_dir=out_dir, created_by_id=created_by_id)
-#         else:
-#             archive_links(new_links, overwrite=False, methods=['index_only'], out_dir=out_dir, created_by_id=created_by_id)
-#     else:
-#         # fully run the archive extractor methods for each link
-#         archive_kwargs = {
-#             "out_dir": out_dir,
-#             "created_by_id": created_by_id,
-#         }
-#         if extractors:
-#             archive_kwargs["methods"] = extractors
-
-#         stderr()
-
-#         ts = datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')
-
-#         if update:
-#             stderr(f'[*] [{ts}] Archiving + updating {len(imported_links)}/{len(all_links)}', len(imported_links), 'URLs from added set...', color='green')
-#             archive_links(imported_links, overwrite=overwrite, **archive_kwargs)
-#         elif update_all:
-#             stderr(f'[*] [{ts}] Archiving + updating {len(all_links)}/{len(all_links)}', len(all_links), 'URLs from entire library...', color='green')
-#             archive_links(all_links, overwrite=overwrite, **archive_kwargs)
-#         elif overwrite:
-#             stderr(f'[*] [{ts}] Archiving + overwriting {len(imported_links)}/{len(all_links)}', len(imported_links), 'URLs from added set...', color='green')
-#             archive_links(imported_links, overwrite=True, **archive_kwargs)
-#         elif new_links:
-#             stderr(f'[*] [{ts}] Archiving {len(new_links)}/{len(all_links)} URLs from added set...', color='green')
-#             archive_links(new_links, overwrite=False, **archive_kwargs)
-
-#     # tail_worker_logs(worker['stdout_logfile'])
-
-#     # if CAN_UPGRADE:
-#     #     hint(f"There's a new version of ArchiveBox available! Your current version is {VERSION}. You can upgrade to {VERSIONS_AVAILABLE['recommended_version']['tag_name']} ({VERSIONS_AVAILABLE['recommended_version']['html_url']}). For more on how to upgrade: https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives\n")
-
-#     return new_links
-

+ 4 - 4
archivebox/cli/archivebox_config.py

@@ -20,15 +20,15 @@ def config(*keys,
           **kwargs) -> None:
     """Get and set your ArchiveBox project configuration values"""
 
-    import archivebox
     from archivebox.misc.checks import check_data_folder
     from archivebox.misc.logging_util import printable_config
     from archivebox.config.collection import load_all_config, write_config_file, get_real_name
+    from archivebox.config.configset import get_flat_config, get_all_configs
 
     check_data_folder()
 
-    FLAT_CONFIG = archivebox.pm.hook.get_FLAT_CONFIG()
-    CONFIGS = archivebox.pm.hook.get_CONFIGS()
+    FLAT_CONFIG = get_flat_config()
+    CONFIGS = get_all_configs()
     
     config_options: list[str] = list(kwargs.pop('key=value', []) or keys or [f'{key}={val}' for key, val in kwargs.items()])
     no_args = not (get or set or reset or config_options)
@@ -105,7 +105,7 @@ def config(*keys,
         if new_config:
             before = FLAT_CONFIG
             matching_config = write_config_file(new_config)
-            after = {**load_all_config(), **archivebox.pm.hook.get_FLAT_CONFIG()}
+            after = {**load_all_config(), **get_flat_config()}
             print(printable_config(matching_config))
 
             side_effect_changes = {}

+ 302 - 0
archivebox/cli/archivebox_crawl.py

@@ -0,0 +1,302 @@
+#!/usr/bin/env python3
+
+"""
+archivebox crawl [urls_or_snapshot_ids...] [--depth=N] [--plugin=NAME]
+
+Discover outgoing links from URLs or existing Snapshots.
+
+If a URL is passed, creates a Snapshot for it first, then runs parser plugins.
+If a snapshot_id is passed, runs parser plugins on the existing Snapshot.
+Outputs discovered outlink URLs as JSONL.
+
+Pipe the output to `archivebox snapshot` to archive the discovered URLs.
+
+Input formats:
+    - Plain URLs (one per line)
+    - Snapshot UUIDs (one per line)
+    - JSONL: {"type": "Snapshot", "url": "...", ...}
+    - JSONL: {"type": "Snapshot", "id": "...", ...}
+
+Output (JSONL):
+    {"type": "Snapshot", "url": "https://discovered-url.com", "via_extractor": "...", ...}
+
+Examples:
+    # Discover links from a page (creates snapshot first)
+    archivebox crawl https://example.com
+
+    # Discover links from an existing snapshot
+    archivebox crawl 01234567-89ab-cdef-0123-456789abcdef
+
+    # Full recursive crawl pipeline
+    archivebox crawl https://example.com | archivebox snapshot | archivebox extract
+
+    # Use only specific parser plugin
+    archivebox crawl --plugin=parse_html_urls https://example.com
+
+    # Chain: create snapshot, then crawl its outlinks
+    archivebox snapshot https://example.com | archivebox crawl | archivebox snapshot | archivebox extract
+"""
+
+__package__ = 'archivebox.cli'
+__command__ = 'archivebox crawl'
+
+import sys
+import json
+from pathlib import Path
+from typing import Optional
+
+import rich_click as click
+
+from archivebox.misc.util import docstring
+
+
+def discover_outlinks(
+    args: tuple,
+    depth: int = 1,
+    plugin: str = '',
+    wait: bool = True,
+) -> int:
+    """
+    Discover outgoing links from URLs or existing Snapshots.
+
+    Accepts URLs or snapshot_ids. For URLs, creates Snapshots first.
+    Runs parser plugins, outputs discovered URLs as JSONL.
+    The output can be piped to `archivebox snapshot` to archive the discovered links.
+
+    Exit codes:
+        0: Success
+        1: Failure
+    """
+    from rich import print as rprint
+    from django.utils import timezone
+
+    from archivebox.misc.jsonl import (
+        read_args_or_stdin, write_record,
+        TYPE_SNAPSHOT, get_or_create_snapshot
+    )
+    from archivebox.base_models.models import get_or_create_system_user_pk
+    from core.models import Snapshot, ArchiveResult
+    from crawls.models import Seed, Crawl
+    from archivebox.config import CONSTANTS
+    from workers.orchestrator import Orchestrator
+
+    created_by_id = get_or_create_system_user_pk()
+    is_tty = sys.stdout.isatty()
+
+    # Collect all input records
+    records = list(read_args_or_stdin(args))
+
+    if not records:
+        rprint('[yellow]No URLs or snapshot IDs provided. Pass as arguments or via stdin.[/yellow]', file=sys.stderr)
+        return 1
+
+    # Separate records into existing snapshots vs new URLs
+    existing_snapshot_ids = []
+    new_url_records = []
+
+    for record in records:
+        # Check if it's an existing snapshot (has id but no url, or looks like a UUID)
+        if record.get('id') and not record.get('url'):
+            existing_snapshot_ids.append(record['id'])
+        elif record.get('id'):
+            # Has both id and url - check if snapshot exists
+            try:
+                Snapshot.objects.get(id=record['id'])
+                existing_snapshot_ids.append(record['id'])
+            except Snapshot.DoesNotExist:
+                new_url_records.append(record)
+        elif record.get('url'):
+            new_url_records.append(record)
+
+    # For new URLs, create a Crawl and Snapshots
+    snapshot_ids = list(existing_snapshot_ids)
+
+    if new_url_records:
+        # Create a Crawl to manage this operation
+        sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__crawl.txt'
+        sources_file.parent.mkdir(parents=True, exist_ok=True)
+        sources_file.write_text('\n'.join(r.get('url', '') for r in new_url_records if r.get('url')))
+
+        seed = Seed.from_file(
+            sources_file,
+            label=f'crawl --depth={depth}',
+            created_by=created_by_id,
+        )
+        crawl = Crawl.from_seed(seed, max_depth=depth)
+
+        # Create snapshots for new URLs
+        for record in new_url_records:
+            try:
+                record['crawl_id'] = str(crawl.id)
+                record['depth'] = record.get('depth', 0)
+
+                snapshot = get_or_create_snapshot(record, created_by_id=created_by_id)
+                snapshot_ids.append(str(snapshot.id))
+
+            except Exception as e:
+                rprint(f'[red]Error creating snapshot: {e}[/red]', file=sys.stderr)
+                continue
+
+    if not snapshot_ids:
+        rprint('[red]No snapshots to process[/red]', file=sys.stderr)
+        return 1
+
+    if existing_snapshot_ids:
+        rprint(f'[blue]Using {len(existing_snapshot_ids)} existing snapshots[/blue]', file=sys.stderr)
+    if new_url_records:
+        rprint(f'[blue]Created {len(snapshot_ids) - len(existing_snapshot_ids)} new snapshots[/blue]', file=sys.stderr)
+    rprint(f'[blue]Running parser plugins on {len(snapshot_ids)} snapshots...[/blue]', file=sys.stderr)
+
+    # Create ArchiveResults for plugins
+    # If --plugin is specified, only run that one. Otherwise, run all available plugins.
+    # The orchestrator will handle dependency ordering (plugins declare deps in config.json)
+    for snapshot_id in snapshot_ids:
+        try:
+            snapshot = Snapshot.objects.get(id=snapshot_id)
+
+            if plugin:
+                # User specified a single plugin to run
+                ArchiveResult.objects.get_or_create(
+                    snapshot=snapshot,
+                    extractor=plugin,
+                    defaults={
+                        'status': ArchiveResult.StatusChoices.QUEUED,
+                        'retry_at': timezone.now(),
+                        'created_by_id': snapshot.created_by_id,
+                    }
+                )
+            else:
+                # Create pending ArchiveResults for all enabled plugins
+                # This uses hook discovery to find available plugins dynamically
+                snapshot.create_pending_archiveresults()
+
+            # Mark snapshot as started
+            snapshot.status = Snapshot.StatusChoices.STARTED
+            snapshot.retry_at = timezone.now()
+            snapshot.save()
+
+        except Snapshot.DoesNotExist:
+            continue
+
+    # Run plugins
+    if wait:
+        rprint('[blue]Running outlink plugins...[/blue]', file=sys.stderr)
+        orchestrator = Orchestrator(exit_on_idle=True)
+        orchestrator.runloop()
+
+    # Collect discovered URLs from urls.jsonl files
+    # Uses dynamic discovery - any plugin that outputs urls.jsonl is considered a parser
+    from archivebox.hooks import collect_urls_from_extractors
+
+    discovered_urls = {}
+    for snapshot_id in snapshot_ids:
+        try:
+            snapshot = Snapshot.objects.get(id=snapshot_id)
+            snapshot_dir = Path(snapshot.output_dir)
+
+            # Dynamically collect urls.jsonl from ANY plugin subdirectory
+            for entry in collect_urls_from_extractors(snapshot_dir):
+                url = entry.get('url')
+                if url and url not in discovered_urls:
+                    # Add metadata for crawl tracking
+                    entry['type'] = TYPE_SNAPSHOT
+                    entry['depth'] = snapshot.depth + 1
+                    entry['via_snapshot'] = str(snapshot.id)
+                    discovered_urls[url] = entry
+
+        except Snapshot.DoesNotExist:
+            continue
+
+    rprint(f'[green]Discovered {len(discovered_urls)} URLs[/green]', file=sys.stderr)
+
+    # Output discovered URLs as JSONL (when piped) or human-readable (when TTY)
+    for url, entry in discovered_urls.items():
+        if is_tty:
+            via = entry.get('via_extractor', 'unknown')
+            rprint(f'  [dim]{via}[/dim] {url[:80]}', file=sys.stderr)
+        else:
+            write_record(entry)
+
+    return 0
+
+
+def process_crawl_by_id(crawl_id: str) -> int:
+    """
+    Process a single Crawl by ID (used by workers).
+
+    Triggers the Crawl's state machine tick() which will:
+    - Transition from queued -> started (creates root snapshot)
+    - Transition from started -> sealed (when all snapshots done)
+    """
+    from rich import print as rprint
+    from crawls.models import Crawl
+
+    try:
+        crawl = Crawl.objects.get(id=crawl_id)
+    except Crawl.DoesNotExist:
+        rprint(f'[red]Crawl {crawl_id} not found[/red]', file=sys.stderr)
+        return 1
+
+    rprint(f'[blue]Processing Crawl {crawl.id} (status={crawl.status})[/blue]', file=sys.stderr)
+
+    try:
+        crawl.sm.tick()
+        crawl.refresh_from_db()
+        rprint(f'[green]Crawl complete (status={crawl.status})[/green]', file=sys.stderr)
+        return 0
+    except Exception as e:
+        rprint(f'[red]Crawl error: {type(e).__name__}: {e}[/red]', file=sys.stderr)
+        return 1
+
+
+def is_crawl_id(value: str) -> bool:
+    """Check if value looks like a Crawl UUID."""
+    import re
+    uuid_pattern = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', re.I)
+    if not uuid_pattern.match(value):
+        return False
+    # Verify it's actually a Crawl (not a Snapshot or other object)
+    from crawls.models import Crawl
+    return Crawl.objects.filter(id=value).exists()
+
+
[email protected]()
[email protected]('--depth', '-d', type=int, default=1, help='Max depth for recursive crawling (default: 1)')
[email protected]('--plugin', '-p', default='', help='Use only this parser plugin (e.g., parse_html_urls, parse_dom_outlinks)')
[email protected]('--wait/--no-wait', default=True, help='Wait for plugins to complete (default: wait)')
[email protected]('args', nargs=-1)
+def main(depth: int, plugin: str, wait: bool, args: tuple):
+    """Discover outgoing links from URLs or existing Snapshots, or process Crawl by ID"""
+    from archivebox.misc.jsonl import read_args_or_stdin
+
+    # Read all input
+    records = list(read_args_or_stdin(args))
+
+    if not records:
+        from rich import print as rprint
+        rprint('[yellow]No URLs, Snapshot IDs, or Crawl IDs provided. Pass as arguments or via stdin.[/yellow]', file=sys.stderr)
+        sys.exit(1)
+
+    # Check if input looks like existing Crawl IDs to process
+    # If ALL inputs are Crawl UUIDs, process them
+    all_are_crawl_ids = all(
+        is_crawl_id(r.get('id') or r.get('url', ''))
+        for r in records
+    )
+
+    if all_are_crawl_ids:
+        # Process existing Crawls by ID
+        exit_code = 0
+        for record in records:
+            crawl_id = record.get('id') or record.get('url')
+            result = process_crawl_by_id(crawl_id)
+            if result != 0:
+                exit_code = result
+        sys.exit(exit_code)
+    else:
+        # Default behavior: discover outlinks from input (URLs or Snapshot IDs)
+        sys.exit(discover_outlinks(args, depth=depth, plugin=plugin, wait=wait))
+
+
+if __name__ == '__main__':
+    main()

+ 239 - 26
archivebox/cli/archivebox_extract.py

@@ -1,49 +1,262 @@
 #!/usr/bin/env python3
 
+"""
+archivebox extract [snapshot_ids...] [--plugin=NAME]
+
+Run plugins on Snapshots. Accepts snapshot IDs as arguments, from stdin, or via JSONL.
+
+Input formats:
+    - Snapshot UUIDs (one per line)
+    - JSONL: {"type": "Snapshot", "id": "...", "url": "..."}
+    - JSONL: {"type": "ArchiveResult", "snapshot_id": "...", "plugin": "..."}
+
+Output (JSONL):
+    {"type": "ArchiveResult", "id": "...", "snapshot_id": "...", "plugin": "...", "status": "..."}
+
+Examples:
+    # Extract specific snapshot
+    archivebox extract 01234567-89ab-cdef-0123-456789abcdef
+
+    # Pipe from snapshot command
+    archivebox snapshot https://example.com | archivebox extract
+
+    # Run specific plugin only
+    archivebox extract --plugin=screenshot 01234567-89ab-cdef-0123-456789abcdef
+
+    # Chain commands
+    archivebox crawl https://example.com | archivebox snapshot | archivebox extract
+"""
+
 __package__ = 'archivebox.cli'
 __command__ = 'archivebox extract'
 
-
 import sys
-from typing import TYPE_CHECKING, Generator
+from typing import Optional, List
 
 import rich_click as click
 
-from django.db.models import Q
 
-from archivebox.misc.util import enforce_types, docstring
+def process_archiveresult_by_id(archiveresult_id: str) -> int:
+    """
+    Run extraction for a single ArchiveResult by ID (used by workers).
 
-
-if TYPE_CHECKING:
+    Triggers the ArchiveResult's state machine tick() to run the extractor.
+    """
+    from rich import print as rprint
     from core.models import ArchiveResult
 
+    try:
+        archiveresult = ArchiveResult.objects.get(id=archiveresult_id)
+    except ArchiveResult.DoesNotExist:
+        rprint(f'[red]ArchiveResult {archiveresult_id} not found[/red]', file=sys.stderr)
+        return 1
+
+    rprint(f'[blue]Extracting {archiveresult.extractor} for {archiveresult.snapshot.url}[/blue]', file=sys.stderr)
+
+    try:
+        # Trigger state machine tick - this runs the actual extraction
+        archiveresult.sm.tick()
+        archiveresult.refresh_from_db()
+
+        if archiveresult.status == ArchiveResult.StatusChoices.SUCCEEDED:
+            print(f'[green]Extraction succeeded: {archiveresult.output}[/green]')
+            return 0
+        elif archiveresult.status == ArchiveResult.StatusChoices.FAILED:
+            print(f'[red]Extraction failed: {archiveresult.output}[/red]', file=sys.stderr)
+            return 1
+        else:
+            # Still in progress or backoff - not a failure
+            print(f'[yellow]Extraction status: {archiveresult.status}[/yellow]')
+            return 0
+
+    except Exception as e:
+        print(f'[red]Extraction error: {type(e).__name__}: {e}[/red]', file=sys.stderr)
+        return 1
+
+
+def run_plugins(
+    args: tuple,
+    plugin: str = '',
+    wait: bool = True,
+) -> int:
+    """
+    Run plugins on Snapshots from input.
+
+    Reads Snapshot IDs or JSONL from args/stdin, runs plugins, outputs JSONL.
+
+    Exit codes:
+        0: Success
+        1: Failure
+    """
+    from rich import print as rprint
+    from django.utils import timezone
+
+    from archivebox.misc.jsonl import (
+        read_args_or_stdin, write_record, archiveresult_to_jsonl,
+        TYPE_SNAPSHOT, TYPE_ARCHIVERESULT
+    )
+    from core.models import Snapshot, ArchiveResult
+    from workers.orchestrator import Orchestrator
+
+    is_tty = sys.stdout.isatty()
+
+    # Collect all input records
+    records = list(read_args_or_stdin(args))
 
-ORCHESTRATOR = None
+    if not records:
+        rprint('[yellow]No snapshots provided. Pass snapshot IDs as arguments or via stdin.[/yellow]', file=sys.stderr)
+        return 1
 
-@enforce_types
-def extract(archiveresult_id: str) -> Generator['ArchiveResult', None, None]:
-    archiveresult = ArchiveResult.objects.get(id=archiveresult_id)
-    if not archiveresult:
-        raise Exception(f'ArchiveResult {archiveresult_id} not found')
-    
-    return archiveresult.EXTRACTOR.extract()
+    # Gather snapshot IDs to process
+    snapshot_ids = set()
+    for record in records:
+        record_type = record.get('type')
+
+        if record_type == TYPE_SNAPSHOT:
+            snapshot_id = record.get('id')
+            if snapshot_id:
+                snapshot_ids.add(snapshot_id)
+            elif record.get('url'):
+                # Look up by URL
+                try:
+                    snap = Snapshot.objects.get(url=record['url'])
+                    snapshot_ids.add(str(snap.id))
+                except Snapshot.DoesNotExist:
+                    rprint(f'[yellow]Snapshot not found for URL: {record["url"]}[/yellow]', file=sys.stderr)
+
+        elif record_type == TYPE_ARCHIVERESULT:
+            snapshot_id = record.get('snapshot_id')
+            if snapshot_id:
+                snapshot_ids.add(snapshot_id)
+
+        elif 'id' in record:
+            # Assume it's a snapshot ID
+            snapshot_ids.add(record['id'])
+
+    if not snapshot_ids:
+        rprint('[red]No valid snapshot IDs found in input[/red]', file=sys.stderr)
+        return 1
+
+    # Get snapshots and ensure they have pending ArchiveResults
+    processed_count = 0
+    for snapshot_id in snapshot_ids:
+        try:
+            snapshot = Snapshot.objects.get(id=snapshot_id)
+        except Snapshot.DoesNotExist:
+            rprint(f'[yellow]Snapshot {snapshot_id} not found[/yellow]', file=sys.stderr)
+            continue
+
+        # Create pending ArchiveResults if needed
+        if plugin:
+            # Only create for specific plugin
+            result, created = ArchiveResult.objects.get_or_create(
+                snapshot=snapshot,
+                extractor=plugin,
+                defaults={
+                    'status': ArchiveResult.StatusChoices.QUEUED,
+                    'retry_at': timezone.now(),
+                    'created_by_id': snapshot.created_by_id,
+                }
+            )
+            if not created and result.status in [ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED]:
+                # Reset for retry
+                result.status = ArchiveResult.StatusChoices.QUEUED
+                result.retry_at = timezone.now()
+                result.save()
+        else:
+            # Create all pending plugins
+            snapshot.create_pending_archiveresults()
+
+        # Reset snapshot status to allow processing
+        if snapshot.status == Snapshot.StatusChoices.SEALED:
+            snapshot.status = Snapshot.StatusChoices.STARTED
+            snapshot.retry_at = timezone.now()
+            snapshot.save()
+
+        processed_count += 1
+
+    if processed_count == 0:
+        rprint('[red]No snapshots to process[/red]', file=sys.stderr)
+        return 1
+
+    rprint(f'[blue]Queued {processed_count} snapshots for extraction[/blue]', file=sys.stderr)
+
+    # Run orchestrator if --wait (default)
+    if wait:
+        rprint('[blue]Running plugins...[/blue]', file=sys.stderr)
+        orchestrator = Orchestrator(exit_on_idle=True)
+        orchestrator.runloop()
+
+    # Output results as JSONL (when piped) or human-readable (when TTY)
+    for snapshot_id in snapshot_ids:
+        try:
+            snapshot = Snapshot.objects.get(id=snapshot_id)
+            results = snapshot.archiveresult_set.all()
+            if plugin:
+                results = results.filter(extractor=plugin)
+
+            for result in results:
+                if is_tty:
+                    status_color = {
+                        'succeeded': 'green',
+                        'failed': 'red',
+                        'skipped': 'yellow',
+                    }.get(result.status, 'dim')
+                    rprint(f'  [{status_color}]{result.status}[/{status_color}] {result.extractor} → {result.output or ""}', file=sys.stderr)
+                else:
+                    write_record(archiveresult_to_jsonl(result))
+        except Snapshot.DoesNotExist:
+            continue
+
+    return 0
+
+
+def is_archiveresult_id(value: str) -> bool:
+    """Check if value looks like an ArchiveResult UUID."""
+    import re
+    uuid_pattern = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', re.I)
+    if not uuid_pattern.match(value):
+        return False
+    # Verify it's actually an ArchiveResult (not a Snapshot or other object)
+    from core.models import ArchiveResult
+    return ArchiveResult.objects.filter(id=value).exists()
 
-# <user>@<machine_id>#<datetime>/absolute/path/to/binary
-# 2014.24.01
 
 @click.command()
[email protected]('--plugin', '-p', default='', help='Run only this plugin (e.g., screenshot, singlefile)')
[email protected]('--wait/--no-wait', default=True, help='Wait for plugins to complete (default: wait)')
[email protected]('args', nargs=-1)
+def main(plugin: str, wait: bool, args: tuple):
+    """Run plugins on Snapshots, or process existing ArchiveResults by ID"""
+    from archivebox.misc.jsonl import read_args_or_stdin
+
+    # Read all input
+    records = list(read_args_or_stdin(args))
+
+    if not records:
+        from rich import print as rprint
+        rprint('[yellow]No Snapshot IDs or ArchiveResult IDs provided. Pass as arguments or via stdin.[/yellow]', file=sys.stderr)
+        sys.exit(1)
 
[email protected]('archiveresult_ids', nargs=-1, type=str)
-@docstring(extract.__doc__)
-def main(archiveresult_ids: list[str]):
-    """Add a new URL or list of URLs to your archive"""
-    
-    for archiveresult_id in (archiveresult_ids or sys.stdin):
-        print(f'Extracting {archiveresult_id}...')
-        archiveresult = extract(str(archiveresult_id))
-        print(archiveresult.as_json())
+    # Check if input looks like existing ArchiveResult IDs to process
+    all_are_archiveresult_ids = all(
+        is_archiveresult_id(r.get('id') or r.get('url', ''))
+        for r in records
+    )
+
+    if all_are_archiveresult_ids:
+        # Process existing ArchiveResults by ID
+        exit_code = 0
+        for record in records:
+            archiveresult_id = record.get('id') or record.get('url')
+            result = process_archiveresult_by_id(archiveresult_id)
+            if result != 0:
+                exit_code = result
+        sys.exit(exit_code)
+    else:
+        # Default behavior: run plugins on Snapshots from input
+        sys.exit(run_plugins(args, plugin=plugin, wait=wait))
 
 
 if __name__ == '__main__':
     main()
-

+ 13 - 13
archivebox/cli/archivebox_init.py

@@ -21,10 +21,9 @@ def init(force: bool=False, quick: bool=False, install: bool=False, setup: bool=
     from archivebox.config import CONSTANTS, VERSION, DATA_DIR
     from archivebox.config.common import SERVER_CONFIG
     from archivebox.config.collection import write_config_file
-    from archivebox.index import load_main_index, write_main_index, fix_invalid_folder_locations, get_invalid_folders
-    from archivebox.index.schema import Link
-    from archivebox.index.json import parse_json_main_index, parse_json_links_details
-    from archivebox.index.sql import apply_migrations
+    from archivebox.misc.folders import fix_invalid_folder_locations, get_invalid_folders
+    from archivebox.misc.legacy import parse_json_main_index, parse_json_links_details, SnapshotDict
+    from archivebox.misc.db import apply_migrations
     
     # if os.access(out_dir / CONSTANTS.JSON_INDEX_FILENAME, os.F_OK):
     #     print("[red]:warning: This folder contains a JSON index. It is deprecated, and will no longer be kept up to date automatically.[/red]", file=sys.stderr)
@@ -100,10 +99,10 @@ def init(force: bool=False, quick: bool=False, install: bool=False, setup: bool=
     from core.models import Snapshot
 
     all_links = Snapshot.objects.none()
-    pending_links: dict[str, Link] = {}
+    pending_links: dict[str, SnapshotDict] = {}
 
     if existing_index:
-        all_links = load_main_index(DATA_DIR, warn=False)
+        all_links = Snapshot.objects.all()
         print(f'    √ Loaded {all_links.count()} links from existing main index.')
 
     if quick:
@@ -119,9 +118,9 @@ def init(force: bool=False, quick: bool=False, install: bool=False, setup: bool=
 
             # Links in JSON index but not in main index
             orphaned_json_links = {
-                link.url: link
-                for link in parse_json_main_index(DATA_DIR)
-                if not all_links.filter(url=link.url).exists()
+                link_dict['url']: link_dict
+                for link_dict in parse_json_main_index(DATA_DIR)
+                if not all_links.filter(url=link_dict['url']).exists()
             }
             if orphaned_json_links:
                 pending_links.update(orphaned_json_links)
@@ -129,9 +128,9 @@ def init(force: bool=False, quick: bool=False, install: bool=False, setup: bool=
 
             # Links in data dir indexes but not in main index
             orphaned_data_dir_links = {
-                link.url: link
-                for link in parse_json_links_details(DATA_DIR)
-                if not all_links.filter(url=link.url).exists()
+                link_dict['url']: link_dict
+                for link_dict in parse_json_links_details(DATA_DIR)
+                if not all_links.filter(url=link_dict['url']).exists()
             }
             if orphaned_data_dir_links:
                 pending_links.update(orphaned_data_dir_links)
@@ -159,7 +158,8 @@ def init(force: bool=False, quick: bool=False, install: bool=False, setup: bool=
             print('        archivebox init --quick', file=sys.stderr)
             raise SystemExit(1)
         
-        write_main_index(list(pending_links.values()), DATA_DIR)
+        if pending_links:
+            Snapshot.objects.create_from_dicts(list(pending_links.values()))
 
     print('\n[green]----------------------------------------------------------------------[/green]')
 

+ 60 - 123
archivebox/cli/archivebox_install.py

@@ -4,7 +4,7 @@ __package__ = 'archivebox.cli'
 
 import os
 import sys
-from typing import Optional, List
+import shutil
 
 import rich_click as click
 from rich import print
@@ -13,149 +13,86 @@ from archivebox.misc.util import docstring, enforce_types
 
 
 @enforce_types
-def install(binproviders: Optional[List[str]]=None, binaries: Optional[List[str]]=None, dry_run: bool=False) -> None:
-    """Automatically install all ArchiveBox dependencies and extras"""
-    
-    # if running as root:
-    #    - run init to create index + lib dir
-    #    - chown -R 911 DATA_DIR
-    #    - install all binaries as root
-    #    - chown -R 911 LIB_DIR
-    # else:
-    #    - run init to create index + lib dir as current user
-    #    - install all binaries as current user
-    #    - recommend user re-run with sudo if any deps need to be installed as root
-
-    import abx
-    import archivebox
-    from archivebox.config.permissions import IS_ROOT, ARCHIVEBOX_USER, ARCHIVEBOX_GROUP, SudoPermission
-    from archivebox.config.paths import DATA_DIR, ARCHIVE_DIR, get_or_create_working_lib_dir
+def install(dry_run: bool=False) -> None:
+    """Detect and install ArchiveBox dependencies by running a dependency-check crawl"""
+
+    from archivebox.config.permissions import IS_ROOT, ARCHIVEBOX_USER, ARCHIVEBOX_GROUP
+    from archivebox.config.paths import ARCHIVE_DIR
     from archivebox.misc.logging import stderr
     from archivebox.cli.archivebox_init import init
-    from archivebox.misc.system import run as run_shell
-
 
     if not (os.access(ARCHIVE_DIR, os.R_OK) and ARCHIVE_DIR.is_dir()):
         init()  # must init full index because we need a db to store InstalledBinary entries in
 
-    print('\n[green][+] Installing ArchiveBox dependencies automatically...[/green]')
-    
-    # we never want the data dir to be owned by root, detect owner of existing owner of DATA_DIR to try and guess desired non-root UID
+    print('\n[green][+] Detecting ArchiveBox dependencies...[/green]')
+
     if IS_ROOT:
         EUID = os.geteuid()
-        
-        # if we have sudo/root permissions, take advantage of them just while installing dependencies
         print()
-        print(f'[yellow]:warning:  Running as UID=[blue]{EUID}[/blue] with [red]sudo[/red] only for dependencies that need it.[/yellow]')
-        print(f'    DATA_DIR, LIB_DIR, and TMP_DIR will be owned by [blue]{ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}[/blue].')
+        print(f'[yellow]:warning:  Running as UID=[blue]{EUID}[/blue].[/yellow]')
+        print(f'    DATA_DIR will be owned by [blue]{ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}[/blue].')
         print()
-    
-    LIB_DIR = get_or_create_working_lib_dir()
-    
-    package_manager_names = ', '.join(
-        f'[yellow]{binprovider.name}[/yellow]'
-        for binprovider in reversed(list(abx.as_dict(abx.pm.hook.get_BINPROVIDERS()).values()))
-        if not binproviders or (binproviders and binprovider.name in binproviders)
-    )
-    print(f'[+] Setting up package managers {package_manager_names}...')
-    for binprovider in reversed(list(abx.as_dict(abx.pm.hook.get_BINPROVIDERS()).values())):
-        if binproviders and binprovider.name not in binproviders:
-            continue
-        try:
-            binprovider.setup()
-        except Exception:
-            # it's ok, installing binaries below will automatically set up package managers as needed
-            # e.g. if user does not have npm available we cannot set it up here yet, but once npm Binary is installed
-            # the next package that depends on npm will automatically call binprovider.setup() during its own install
-            pass
-    
-    print()
-    
-    for binary in reversed(list(abx.as_dict(abx.pm.hook.get_BINARIES()).values())):
-        if binary.name in ('archivebox', 'django', 'sqlite', 'python'):
-            # obviously must already be installed if we are running
-            continue
-        
-        if binaries and binary.name not in binaries:
-            continue
-        
-        providers = ' [grey53]or[/grey53] '.join(
-            provider.name for provider in binary.binproviders_supported
-            if not binproviders or (binproviders and provider.name in binproviders)
-        )
-        if not providers:
-            continue
-        print(f'[+] Detecting / Installing [yellow]{binary.name.ljust(22)}[/yellow] using [red]{providers}[/red]...')
-        try:
-            with SudoPermission(uid=0, fallback=True):
-                # print(binary.load_or_install(fresh=True).model_dump(exclude={'overrides', 'bin_dir', 'hook_type'}))
-                if binproviders:
-                    providers_supported_by_binary = [provider.name for provider in binary.binproviders_supported]
-                    for binprovider_name in binproviders:
-                        if binprovider_name not in providers_supported_by_binary:
-                            continue
-                        try:
-                            if dry_run:
-                                # always show install commands when doing a dry run
-                                sys.stderr.write("\033[2;49;90m")  # grey53
-                                result = binary.install(binproviders=[binprovider_name], dry_run=dry_run).model_dump(exclude={'overrides', 'bin_dir', 'hook_type'})
-                                sys.stderr.write("\033[00m\n")     # reset
-                            else:
-                                loaded_binary = archivebox.pm.hook.binary_load_or_install(binary=binary, binproviders=[binprovider_name], fresh=True, dry_run=dry_run, quiet=False)
-                                result = loaded_binary.model_dump(exclude={'overrides', 'bin_dir', 'hook_type'})
-                            if result and result['loaded_version']:
-                                break
-                        except Exception as e:
-                            print(f'[red]:cross_mark: Failed to install {binary.name} as using {binprovider_name} as user {ARCHIVEBOX_USER}: {e}[/red]')
-                else:
-                    if dry_run:
-                        sys.stderr.write("\033[2;49;90m")  # grey53
-                        binary.install(dry_run=dry_run).model_dump(exclude={'overrides', 'bin_dir', 'hook_type'})
-                        sys.stderr.write("\033[00m\n")  # reset
-                    else:
-                        loaded_binary = archivebox.pm.hook.binary_load_or_install(binary=binary, fresh=True, dry_run=dry_run)
-                        result = loaded_binary.model_dump(exclude={'overrides', 'bin_dir', 'hook_type'})
-            if IS_ROOT and LIB_DIR:
-                with SudoPermission(uid=0):
-                    if ARCHIVEBOX_USER == 0:
-                        os.system(f'chmod -R 777 "{LIB_DIR.resolve()}"')
-                    else:    
-                        os.system(f'chown -R {ARCHIVEBOX_USER} "{LIB_DIR.resolve()}"')
-        except Exception as e:
-            print(f'[red]:cross_mark: Failed to install {binary.name} as user {ARCHIVEBOX_USER}: {e}[/red]')
-            if binaries and len(binaries) == 1:
-                # if we are only installing a single binary, raise the exception so the user can see what went wrong
-                raise
-                
+
+    if dry_run:
+        print('[dim]Dry run - would create a crawl to detect dependencies[/dim]')
+        return
+
+    # Set up Django
     from archivebox.config.django import setup_django
     setup_django()
-    
+
+    from django.utils import timezone
+    from crawls.models import Seed, Crawl
+    from archivebox.base_models.models import get_or_create_system_user_pk
+
+    # Create a seed and crawl for dependency detection
+    # Using a minimal crawl that will trigger on_Crawl hooks
+    created_by_id = get_or_create_system_user_pk()
+
+    seed = Seed.objects.create(
+        uri='archivebox://install',
+        label='Dependency detection',
+        created_by_id=created_by_id,
+    )
+
+    crawl = Crawl.objects.create(
+        seed=seed,
+        max_depth=0,
+        created_by_id=created_by_id,
+        status='queued',
+    )
+
+    print(f'[+] Created dependency detection crawl: {crawl.id}')
+    print('[+] Running crawl to detect binaries via on_Crawl hooks...')
+    print()
+
+    # Run the crawl synchronously (this triggers on_Crawl hooks)
+    from workers.orchestrator import Orchestrator
+    orchestrator = Orchestrator(exit_on_idle=True)
+    orchestrator.runloop()
+
+    print()
+
+    # Check for superuser
     from django.contrib.auth import get_user_model
     User = get_user_model()
 
     if not User.objects.filter(is_superuser=True).exclude(username='system').exists():
         stderr('\n[+] Don\'t forget to create a new admin user for the Web UI...', color='green')
         stderr('    archivebox manage createsuperuser')
-        # run_subcommand('manage', subcommand_args=['createsuperuser'], pwd=out_dir)
-    
-    print('\n[green][√] Set up ArchiveBox and its dependencies successfully.[/green]\n', file=sys.stderr)
-    
-    from abx_plugin_pip.binaries import ARCHIVEBOX_BINARY
-    
-    extra_args = []
-    if binproviders:
-        extra_args.append(f'--binproviders={",".join(binproviders)}')
-    if binaries:
-        extra_args.append(f'--binaries={",".join(binaries)}')
-    
-    proc = run_shell([ARCHIVEBOX_BINARY.load().abspath, 'version', *extra_args], capture_output=False, cwd=DATA_DIR)
-    raise SystemExit(proc.returncode)
+
+    print()
+
+    # Run version to show full status
+    archivebox_path = shutil.which('archivebox') or sys.executable
+    if 'python' in archivebox_path:
+        os.system(f'{sys.executable} -m archivebox version')
+    else:
+        os.system(f'{archivebox_path} version')
 
 
 @click.command()
[email protected]('--binproviders', '-p', type=str, help='Select binproviders to use DEFAULT=env,apt,brew,sys_pip,venv_pip,lib_pip,pipx,sys_npm,lib_npm,puppeteer,playwright (all)', default=None)
[email protected]('--binaries', '-b', type=str, help='Select binaries to install DEFAULT=curl,wget,git,yt-dlp,chrome,single-file,readability-extractor,postlight-parser,... (all)', default=None)
[email protected]('--dry-run', '-d', is_flag=True, help='Show what would be installed without actually installing anything', default=False)
[email protected]('--dry-run', '-d', is_flag=True, help='Show what would happen without actually running', default=False)
 @docstring(install.__doc__)
 def main(**kwargs) -> None:
     install(**kwargs)

+ 67 - 0
archivebox/cli/archivebox_orchestrator.py

@@ -0,0 +1,67 @@
+#!/usr/bin/env python3
+
+"""
+archivebox orchestrator [--daemon]
+
+Start the orchestrator process that manages workers.
+
+The orchestrator polls queues for each model type (Crawl, Snapshot, ArchiveResult)
+and lazily spawns worker processes when there is work to be done.
+"""
+
+__package__ = 'archivebox.cli'
+__command__ = 'archivebox orchestrator'
+
+import sys
+
+import rich_click as click
+
+from archivebox.misc.util import docstring
+
+
+def orchestrator(daemon: bool = False, watch: bool = False) -> int:
+    """
+    Start the orchestrator process.
+    
+    The orchestrator:
+    1. Polls each model queue (Crawl, Snapshot, ArchiveResult)
+    2. Spawns worker processes when there is work to do
+    3. Monitors worker health and restarts failed workers
+    4. Exits when all queues are empty (unless --daemon)
+    
+    Args:
+        daemon: Run forever (don't exit when idle)
+        watch: Just watch the queues without spawning workers (for debugging)
+    
+    Exit codes:
+        0: All work completed successfully
+        1: Error occurred
+    """
+    from workers.orchestrator import Orchestrator
+    
+    if Orchestrator.is_running():
+        print('[yellow]Orchestrator is already running[/yellow]')
+        return 0
+    
+    try:
+        orchestrator_instance = Orchestrator(exit_on_idle=not daemon)
+        orchestrator_instance.runloop()
+        return 0
+    except KeyboardInterrupt:
+        return 0
+    except Exception as e:
+        print(f'[red]Orchestrator error: {type(e).__name__}: {e}[/red]', file=sys.stderr)
+        return 1
+
+
[email protected]()
[email protected]('--daemon', '-d', is_flag=True, help="Run forever (don't exit on idle)")
[email protected]('--watch', '-w', is_flag=True, help="Watch queues without spawning workers")
+@docstring(orchestrator.__doc__)
+def main(daemon: bool, watch: bool):
+    """Start the ArchiveBox orchestrator process"""
+    sys.exit(orchestrator(daemon=daemon, watch=watch))
+
+
+if __name__ == '__main__':
+    main()

+ 8 - 11
archivebox/cli/archivebox_remove.py

@@ -12,10 +12,7 @@ import rich_click as click
 from django.db.models import QuerySet
 
 from archivebox.config import DATA_DIR
-from archivebox.index.schema import Link
 from archivebox.config.django import setup_django
-from archivebox.index import load_main_index
-from archivebox.index.sql import remove_from_sql_main_index
 from archivebox.misc.util import enforce_types, docstring
 from archivebox.misc.checks import check_data_folder
 from archivebox.misc.logging_util import (
@@ -35,7 +32,7 @@ def remove(filter_patterns: Iterable[str]=(),
           before: float | None=None,
           yes: bool=False,
           delete: bool=False,
-          out_dir: Path=DATA_DIR) -> Iterable[Link]:
+          out_dir: Path=DATA_DIR) -> QuerySet:
     """Remove the specified URLs from the archive"""
     
     setup_django()
@@ -63,27 +60,27 @@ def remove(filter_patterns: Iterable[str]=(),
         log_removal_finished(0, 0)
         raise SystemExit(1)
 
-    log_links = [link.as_link() for link in snapshots]
-    log_list_finished(log_links)
-    log_removal_started(log_links, yes=yes, delete=delete)
+    log_list_finished(snapshots)
+    log_removal_started(snapshots, yes=yes, delete=delete)
 
     timer = TimedProgress(360, prefix='      ')
     try:
         for snapshot in snapshots:
             if delete:
-                shutil.rmtree(snapshot.as_link().link_dir, ignore_errors=True)
+                shutil.rmtree(snapshot.output_dir, ignore_errors=True)
     finally:
         timer.end()
 
     to_remove = snapshots.count()
 
     from archivebox.search import flush_search_index
+    from core.models import Snapshot
 
     flush_search_index(snapshots=snapshots)
-    remove_from_sql_main_index(snapshots=snapshots, out_dir=out_dir)
-    all_snapshots = load_main_index(out_dir=out_dir)
+    snapshots.delete()
+    all_snapshots = Snapshot.objects.all()
     log_removal_finished(all_snapshots.count(), to_remove)
-    
+
     return all_snapshots
 
 

+ 5 - 2
archivebox/cli/archivebox_schedule.py

@@ -35,9 +35,12 @@ def schedule(add: bool=False,
  
     depth = int(depth)
     
+    import shutil
     from crontab import CronTab, CronSlices
     from archivebox.misc.system import dedupe_cron_jobs
-    from abx_plugin_pip.binaries import ARCHIVEBOX_BINARY
+    
+    # Find the archivebox binary path
+    ARCHIVEBOX_ABSPATH = shutil.which('archivebox') or sys.executable.replace('python', 'archivebox')
 
     Path(CONSTANTS.LOGS_DIR).mkdir(exist_ok=True)
 
@@ -58,7 +61,7 @@ def schedule(add: bool=False,
             'cd',
             quoted(out_dir),
             '&&',
-            quoted(ARCHIVEBOX_BINARY.load().abspath),
+            quoted(ARCHIVEBOX_ABSPATH),
             *([
                 'add',
                 *(['--overwrite'] if overwrite else []),

+ 43 - 32
archivebox/cli/archivebox_search.py

@@ -4,7 +4,7 @@ __package__ = 'archivebox.cli'
 __command__ = 'archivebox search'
 
 from pathlib import Path
-from typing import Optional, List, Iterable
+from typing import Optional, List, Any
 
 import rich_click as click
 from rich import print
@@ -12,11 +12,19 @@ from rich import print
 from django.db.models import QuerySet
 
 from archivebox.config import DATA_DIR
-from archivebox.index import LINK_FILTERS
-from archivebox.index.schema import Link
 from archivebox.misc.logging import stderr
 from archivebox.misc.util import enforce_types, docstring
 
+# Filter types for URL matching
+LINK_FILTERS = {
+    'exact': lambda pattern: {'url': pattern},
+    'substring': lambda pattern: {'url__icontains': pattern},
+    'regex': lambda pattern: {'url__iregex': pattern},
+    'domain': lambda pattern: {'url__istartswith': f'http://{pattern}'},
+    'tag': lambda pattern: {'tags__name': pattern},
+    'timestamp': lambda pattern: {'timestamp': pattern},
+}
+
 STATUS_CHOICES = [
     'indexed', 'archived', 'unarchived', 'present', 'valid', 'invalid',
     'duplicate', 'orphaned', 'corrupted', 'unrecognized'
@@ -24,38 +32,37 @@ STATUS_CHOICES = [
 
 
 
-def list_links(snapshots: Optional[QuerySet]=None,
-               filter_patterns: Optional[List[str]]=None,
-               filter_type: str='substring',
-               after: Optional[float]=None,
-               before: Optional[float]=None,
-               out_dir: Path=DATA_DIR) -> Iterable[Link]:
-    
-    from archivebox.index import load_main_index
-    from archivebox.index import snapshot_filter
+def get_snapshots(snapshots: Optional[QuerySet]=None,
+                  filter_patterns: Optional[List[str]]=None,
+                  filter_type: str='substring',
+                  after: Optional[float]=None,
+                  before: Optional[float]=None,
+                  out_dir: Path=DATA_DIR) -> QuerySet:
+    """Filter and return Snapshots matching the given criteria."""
+    from core.models import Snapshot
 
     if snapshots:
-        all_snapshots = snapshots
+        result = snapshots
     else:
-        all_snapshots = load_main_index(out_dir=out_dir)
+        result = Snapshot.objects.all()
 
     if after is not None:
-        all_snapshots = all_snapshots.filter(timestamp__gte=after)
+        result = result.filter(timestamp__gte=after)
     if before is not None:
-        all_snapshots = all_snapshots.filter(timestamp__lt=before)
+        result = result.filter(timestamp__lt=before)
     if filter_patterns:
-        all_snapshots = snapshot_filter(all_snapshots, filter_patterns, filter_type)
+        result = Snapshot.objects.filter_by_patterns(filter_patterns, filter_type)
 
-    if not all_snapshots:
+    if not result:
         stderr('[!] No Snapshots matched your filters:', filter_patterns, f'({filter_type})', color='lightyellow')
 
-    return all_snapshots
+    return result
 
 
-def list_folders(links: list[Link], status: str, out_dir: Path=DATA_DIR) -> dict[str, Link | None]:
-    
+def list_folders(snapshots: QuerySet, status: str, out_dir: Path=DATA_DIR) -> dict[str, Any]:
+
     from archivebox.misc.checks import check_data_folder
-    from archivebox.index import (
+    from archivebox.misc.folders import (
         get_indexed_folders,
         get_archived_folders,
         get_unarchived_folders,
@@ -67,7 +74,7 @@ def list_folders(links: list[Link], status: str, out_dir: Path=DATA_DIR) -> dict
         get_corrupted_folders,
         get_unrecognized_folders,
     )
-    
+
     check_data_folder()
 
     STATUS_FUNCTIONS = {
@@ -84,7 +91,7 @@ def list_folders(links: list[Link], status: str, out_dir: Path=DATA_DIR) -> dict
     }
 
     try:
-        return STATUS_FUNCTIONS[status](links, out_dir=out_dir)
+        return STATUS_FUNCTIONS[status](snapshots, out_dir=out_dir)
     except KeyError:
         raise ValueError('Status not recognized.')
 
@@ -109,7 +116,7 @@ def search(filter_patterns: list[str] | None=None,
         stderr('[X] --with-headers requires --json, --html or --csv\n', color='red')
         raise SystemExit(2)
 
-    snapshots = list_links(
+    snapshots = get_snapshots(
         filter_patterns=list(filter_patterns) if filter_patterns else None,
         filter_type=filter_type,
         before=before,
@@ -120,20 +127,24 @@ def search(filter_patterns: list[str] | None=None,
         snapshots = snapshots.order_by(sort)
 
     folders = list_folders(
-        links=snapshots,
+        snapshots=snapshots,
         status=status,
         out_dir=DATA_DIR,
     )
 
     if json:
-        from archivebox.index.json import generate_json_index_from_links
-        output = generate_json_index_from_links(folders.values(), with_headers)
+        from core.models import Snapshot
+        # Filter for non-None snapshots
+        valid_snapshots = [s for s in folders.values() if s is not None]
+        output = Snapshot.objects.filter(pk__in=[s.pk for s in valid_snapshots]).to_json(with_headers=with_headers)
     elif html:
-        from archivebox.index.html import generate_index_from_links
-        output = generate_index_from_links(folders.values(), with_headers) 
+        from core.models import Snapshot
+        valid_snapshots = [s for s in folders.values() if s is not None]
+        output = Snapshot.objects.filter(pk__in=[s.pk for s in valid_snapshots]).to_html(with_headers=with_headers)
     elif csv:
-        from archivebox.index.csv import links_to_csv
-        output = links_to_csv(folders.values(), csv.split(','), with_headers)
+        from core.models import Snapshot
+        valid_snapshots = [s for s in folders.values() if s is not None]
+        output = Snapshot.objects.filter(pk__in=[s.pk for s in valid_snapshots]).to_csv(cols=csv.split(','), header=with_headers)
     else:
         from archivebox.misc.logging_util import printable_folders
         output = printable_folders(folders, with_headers)

+ 218 - 0
archivebox/cli/archivebox_snapshot.py

@@ -0,0 +1,218 @@
+#!/usr/bin/env python3
+
+"""
+archivebox snapshot [urls...] [--depth=N] [--tag=TAG] [--plugins=...]
+
+Create Snapshots from URLs. Accepts URLs as arguments, from stdin, or via JSONL.
+
+Input formats:
+    - Plain URLs (one per line)
+    - JSONL: {"type": "Snapshot", "url": "...", "title": "...", "tags": "..."}
+
+Output (JSONL):
+    {"type": "Snapshot", "id": "...", "url": "...", "status": "queued", ...}
+
+Examples:
+    # Create snapshots from URLs
+    archivebox snapshot https://example.com https://foo.com
+
+    # Pipe from stdin
+    echo 'https://example.com' | archivebox snapshot
+
+    # Chain with extract
+    archivebox snapshot https://example.com | archivebox extract
+
+    # With crawl depth
+    archivebox snapshot --depth=1 https://example.com
+"""
+
+__package__ = 'archivebox.cli'
+__command__ = 'archivebox snapshot'
+
+import sys
+from typing import Optional
+
+import rich_click as click
+
+from archivebox.misc.util import docstring
+
+
+def process_snapshot_by_id(snapshot_id: str) -> int:
+    """
+    Process a single Snapshot by ID (used by workers).
+
+    Triggers the Snapshot's state machine tick() which will:
+    - Transition from queued -> started (creates pending ArchiveResults)
+    - Transition from started -> sealed (when all ArchiveResults done)
+    """
+    from rich import print as rprint
+    from core.models import Snapshot
+
+    try:
+        snapshot = Snapshot.objects.get(id=snapshot_id)
+    except Snapshot.DoesNotExist:
+        rprint(f'[red]Snapshot {snapshot_id} not found[/red]', file=sys.stderr)
+        return 1
+
+    rprint(f'[blue]Processing Snapshot {snapshot.id} {snapshot.url[:50]} (status={snapshot.status})[/blue]', file=sys.stderr)
+
+    try:
+        snapshot.sm.tick()
+        snapshot.refresh_from_db()
+        rprint(f'[green]Snapshot complete (status={snapshot.status})[/green]', file=sys.stderr)
+        return 0
+    except Exception as e:
+        rprint(f'[red]Snapshot error: {type(e).__name__}: {e}[/red]', file=sys.stderr)
+        return 1
+
+
+def create_snapshots(
+    urls: tuple,
+    depth: int = 0,
+    tag: str = '',
+    plugins: str = '',
+    created_by_id: Optional[int] = None,
+) -> int:
+    """
+    Create Snapshots from URLs or JSONL records.
+
+    Reads from args or stdin, creates Snapshot objects, outputs JSONL.
+    If --plugins is passed, also runs specified plugins (blocking).
+
+    Exit codes:
+        0: Success
+        1: Failure
+    """
+    from rich import print as rprint
+    from django.utils import timezone
+
+    from archivebox.misc.jsonl import (
+        read_args_or_stdin, write_record, snapshot_to_jsonl,
+        TYPE_SNAPSHOT, TYPE_TAG, get_or_create_snapshot
+    )
+    from archivebox.base_models.models import get_or_create_system_user_pk
+    from core.models import Snapshot
+    from crawls.models import Seed, Crawl
+    from archivebox.config import CONSTANTS
+
+    created_by_id = created_by_id or get_or_create_system_user_pk()
+    is_tty = sys.stdout.isatty()
+
+    # Collect all input records
+    records = list(read_args_or_stdin(urls))
+
+    if not records:
+        rprint('[yellow]No URLs provided. Pass URLs as arguments or via stdin.[/yellow]', file=sys.stderr)
+        return 1
+
+    # If depth > 0, we need a Crawl to manage recursive discovery
+    crawl = None
+    if depth > 0:
+        # Create a seed for this batch
+        sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__snapshot.txt'
+        sources_file.parent.mkdir(parents=True, exist_ok=True)
+        sources_file.write_text('\n'.join(r.get('url', '') for r in records if r.get('url')))
+
+        seed = Seed.from_file(
+            sources_file,
+            label=f'snapshot --depth={depth}',
+            created_by=created_by_id,
+        )
+        crawl = Crawl.from_seed(seed, max_depth=depth)
+
+    # Process each record
+    created_snapshots = []
+    for record in records:
+        if record.get('type') != TYPE_SNAPSHOT and 'url' not in record:
+            continue
+
+        try:
+            # Add crawl info if we have one
+            if crawl:
+                record['crawl_id'] = str(crawl.id)
+                record['depth'] = record.get('depth', 0)
+
+            # Add tags if provided via CLI
+            if tag and not record.get('tags'):
+                record['tags'] = tag
+
+            # Get or create the snapshot
+            snapshot = get_or_create_snapshot(record, created_by_id=created_by_id)
+            created_snapshots.append(snapshot)
+
+            # Output JSONL record (only when piped)
+            if not is_tty:
+                write_record(snapshot_to_jsonl(snapshot))
+
+        except Exception as e:
+            rprint(f'[red]Error creating snapshot: {e}[/red]', file=sys.stderr)
+            continue
+
+    if not created_snapshots:
+        rprint('[red]No snapshots created[/red]', file=sys.stderr)
+        return 1
+
+    rprint(f'[green]Created {len(created_snapshots)} snapshots[/green]', file=sys.stderr)
+
+    # If TTY, show human-readable output
+    if is_tty:
+        for snapshot in created_snapshots:
+            rprint(f'  [dim]{snapshot.id}[/dim] {snapshot.url[:60]}', file=sys.stderr)
+
+    # If --plugins is passed, run the orchestrator for those plugins
+    if plugins:
+        from workers.orchestrator import Orchestrator
+        rprint(f'[blue]Running plugins: {plugins or "all"}...[/blue]', file=sys.stderr)
+        orchestrator = Orchestrator(exit_on_idle=True)
+        orchestrator.runloop()
+
+    return 0
+
+
+def is_snapshot_id(value: str) -> bool:
+    """Check if value looks like a Snapshot UUID."""
+    import re
+    uuid_pattern = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', re.I)
+    return bool(uuid_pattern.match(value))
+
+
[email protected]()
[email protected]('--depth', '-d', type=int, default=0, help='Recursively crawl linked pages up to N levels deep')
[email protected]('--tag', '-t', default='', help='Comma-separated tags to add to each snapshot')
[email protected]('--plugins', '-p', default='', help='Comma-separated list of plugins to run after creating snapshots (e.g. title,screenshot)')
[email protected]('args', nargs=-1)
+def main(depth: int, tag: str, plugins: str, args: tuple):
+    """Create Snapshots from URLs, or process existing Snapshots by ID"""
+    from archivebox.misc.jsonl import read_args_or_stdin
+
+    # Read all input
+    records = list(read_args_or_stdin(args))
+
+    if not records:
+        from rich import print as rprint
+        rprint('[yellow]No URLs or Snapshot IDs provided. Pass as arguments or via stdin.[/yellow]', file=sys.stderr)
+        sys.exit(1)
+
+    # Check if input looks like existing Snapshot IDs to process
+    # If ALL inputs are UUIDs with no URL, assume we're processing existing Snapshots
+    all_are_ids = all(
+        (r.get('id') and not r.get('url')) or is_snapshot_id(r.get('url', ''))
+        for r in records
+    )
+
+    if all_are_ids:
+        # Process existing Snapshots by ID
+        exit_code = 0
+        for record in records:
+            snapshot_id = record.get('id') or record.get('url')
+            result = process_snapshot_by_id(snapshot_id)
+            if result != 0:
+                exit_code = result
+        sys.exit(exit_code)
+    else:
+        # Create new Snapshots from URLs
+        sys.exit(create_snapshots(args, depth=depth, tag=tag, plugins=plugins))
+
+
+if __name__ == '__main__':
+    main()

+ 4 - 5
archivebox/cli/archivebox_status.py

@@ -10,9 +10,8 @@ from rich import print
 from archivebox.misc.util import enforce_types, docstring
 from archivebox.config import DATA_DIR, CONSTANTS, ARCHIVE_DIR
 from archivebox.config.common import SHELL_CONFIG
-from archivebox.index.json import parse_json_links_details
-from archivebox.index import (
-    load_main_index,
+from archivebox.misc.legacy import parse_json_links_details
+from archivebox.misc.folders import (
     get_indexed_folders,
     get_archived_folders,
     get_invalid_folders,
@@ -33,7 +32,7 @@ def status(out_dir: Path=DATA_DIR) -> None:
     """Print out some info and statistics about the archive collection"""
 
     from django.contrib.auth import get_user_model
-    from archivebox.index.sql import get_admins
+    from archivebox.misc.db import get_admins
     from core.models import Snapshot
     User = get_user_model()
 
@@ -44,7 +43,7 @@ def status(out_dir: Path=DATA_DIR) -> None:
     print(f'    Index size: {size} across {num_files} files')
     print()
 
-    links = load_main_index(out_dir=out_dir)
+    links = Snapshot.objects.all()
     num_sql_links = links.count()
     num_link_details = sum(1 for link in parse_json_links_details(out_dir=out_dir))
     print(f'    > SQL Main Index: {num_sql_links} links'.ljust(36), f'(found in {CONSTANTS.SQL_INDEX_FILENAME})')

+ 68 - 7
archivebox/cli/archivebox_update.py

@@ -8,8 +8,7 @@ import rich_click as click
 from typing import Iterable
 
 from archivebox.misc.util import enforce_types, docstring
-from archivebox.index import (
-    LINK_FILTERS,
+from archivebox.misc.folders import (
     get_indexed_folders,
     get_archived_folders,
     get_unarchived_folders,
@@ -22,6 +21,16 @@ from archivebox.index import (
     get_unrecognized_folders,
 )
 
+# Filter types for URL matching
+LINK_FILTERS = {
+    'exact': lambda pattern: {'url': pattern},
+    'substring': lambda pattern: {'url__icontains': pattern},
+    'regex': lambda pattern: {'url__iregex': pattern},
+    'domain': lambda pattern: {'url__istartswith': f'http://{pattern}'},
+    'tag': lambda pattern: {'tags__name': pattern},
+    'timestamp': lambda pattern: {'timestamp': pattern},
+}
+
 
 @enforce_types
 def update(filter_patterns: Iterable[str]=(),
@@ -33,15 +42,66 @@ def update(filter_patterns: Iterable[str]=(),
           after: float | None=None,
           status: str='indexed',
           filter_type: str='exact',
-          extract: str="") -> None:
+          plugins: str="",
+          max_workers: int=4) -> None:
     """Import any new links from subscriptions and retry any previously failed/skipped links"""
     
+    from rich import print
+    
     from archivebox.config.django import setup_django
     setup_django()
+
+    from django.utils import timezone
+    from core.models import Snapshot
+    from workers.orchestrator import parallel_archive
+    
+    # Get snapshots to update based on filters
+    snapshots = Snapshot.objects.all()
+    
+    if filter_patterns:
+        snapshots = Snapshot.objects.filter_by_patterns(list(filter_patterns), filter_type)
+    
+    if status == 'unarchived':
+        snapshots = snapshots.filter(downloaded_at__isnull=True)
+    elif status == 'archived':
+        snapshots = snapshots.filter(downloaded_at__isnull=False)
+    
+    if before:
+        from datetime import datetime
+        snapshots = snapshots.filter(bookmarked_at__lt=datetime.fromtimestamp(before))
+    if after:
+        from datetime import datetime
+        snapshots = snapshots.filter(bookmarked_at__gt=datetime.fromtimestamp(after))
+    
+    if resume:
+        snapshots = snapshots.filter(timestamp__gte=str(resume))
+    
+    snapshot_ids = list(snapshots.values_list('pk', flat=True))
+    
+    if not snapshot_ids:
+        print('[yellow]No snapshots found matching the given filters[/yellow]')
+        return
+    
+    print(f'[green]\\[*] Found {len(snapshot_ids)} snapshots to update[/green]')
+    
+    if index_only:
+        print('[yellow]Index-only mode - skipping archiving[/yellow]')
+        return
     
-    from workers.orchestrator import Orchestrator
-    orchestrator = Orchestrator(exit_on_idle=False)
-    orchestrator.start()
+    methods = plugins.split(',') if plugins else None
+
+    # Queue snapshots for archiving via the state machine system
+    # Workers will pick them up and run the plugins
+    if len(snapshot_ids) > 1 and max_workers > 1:
+        parallel_archive(snapshot_ids, max_workers=max_workers, overwrite=overwrite, methods=methods)
+    else:
+        # Queue snapshots by setting status to queued
+        for snapshot in snapshots:
+            Snapshot.objects.filter(id=snapshot.id).update(
+                status=Snapshot.StatusChoices.QUEUED,
+                retry_at=timezone.now(),
+            )
+        print(f'[green]Queued {len(snapshot_ids)} snapshots for archiving[/green]')
 
 
 @click.command()
@@ -71,7 +131,8 @@ Update only links or data directories that have the given status:
     unrecognized  {get_unrecognized_folders.__doc__}
 ''')
 @click.option('--filter-type', '-t', type=click.Choice([*LINK_FILTERS.keys(), 'search']), default='exact', help='Type of pattern matching to use when filtering URLs')
[email protected]('--extract', '-e', default='', help='Comma-separated list of extractors to use e.g. title,favicon,screenshot,singlefile,...')
[email protected]('--plugins', '-p', default='', help='Comma-separated list of plugins to use e.g. title,favicon,screenshot,singlefile,...')
[email protected]('--max-workers', '-j', type=int, default=4, help='Number of parallel worker processes for archiving')
 @click.argument('filter_patterns', nargs=-1)
 @docstring(update.__doc__)
 def main(**kwargs):

+ 109 - 96
archivebox/cli/archivebox_version.py

@@ -3,7 +3,10 @@
 __package__ = 'archivebox.cli'
 
 import sys
-from typing import Iterable
+import os
+import platform
+from pathlib import Path
+from typing import Iterable, Optional
 
 import rich_click as click
 
@@ -12,7 +15,6 @@ from archivebox.misc.util import docstring, enforce_types
 
 @enforce_types
 def version(quiet: bool=False,
-            binproviders: Iterable[str]=(),
             binaries: Iterable[str]=()) -> list[str]:
     """Print the ArchiveBox version, debug metadata, and installed dependency versions"""
     
@@ -22,37 +24,24 @@ def version(quiet: bool=False,
     if quiet or '--version' in sys.argv:
         return []
     
-    # Only do slower imports when getting full version info
-    import os
-    import platform
-    from pathlib import Path
-    
     from rich.panel import Panel
     from rich.console import Console
-    from abx_pkg import Binary
     
-    import abx
-    import archivebox
     from archivebox.config import CONSTANTS, DATA_DIR
     from archivebox.config.version import get_COMMIT_HASH, get_BUILD_TIME
     from archivebox.config.permissions import ARCHIVEBOX_USER, ARCHIVEBOX_GROUP, RUNNING_AS_UID, RUNNING_AS_GID, IN_DOCKER
     from archivebox.config.paths import get_data_locations, get_code_locations
     from archivebox.config.common import SHELL_CONFIG, STORAGE_CONFIG, SEARCH_BACKEND_CONFIG
     from archivebox.misc.logging_util import printable_folder_status
-    
-    from abx_plugin_default_binproviders import apt, brew, env
+    from archivebox.config.configset import get_config
     
     console = Console()
     prnt = console.print
     
-    LDAP_ENABLED = archivebox.pm.hook.get_SCOPE_CONFIG().LDAP_ENABLED
+    # Check if LDAP is enabled (simple config lookup)
+    config = get_config()
+    LDAP_ENABLED = config.get('LDAP_ENABLED', False)
 
-    # 0.7.1
-    # ArchiveBox v0.7.1+editable COMMIT_HASH=951bba5 BUILD_TIME=2023-12-17 16:46:05 1702860365
-    # IN_DOCKER=False IN_QEMU=False ARCH=arm64 OS=Darwin PLATFORM=macOS-14.2-arm64-arm-64bit PYTHON=Cpython
-    # FS_ATOMIC=True FS_REMOTE=False FS_USER=501:20 FS_PERMS=644
-    # DEBUG=False IS_TTY=True TZ=UTC SEARCH_BACKEND=ripgrep LDAP=False
-    
     p = platform.uname()
     COMMIT_HASH = get_COMMIT_HASH()
     prnt(
@@ -68,15 +57,26 @@ def version(quiet: bool=False,
         f'PLATFORM={platform.platform()}',
         f'PYTHON={sys.implementation.name.title()}' + (' (venv)' if CONSTANTS.IS_INSIDE_VENV else ''),
     )
-    OUTPUT_IS_REMOTE_FS = get_data_locations().DATA_DIR.is_mount or get_data_locations().ARCHIVE_DIR.is_mount
-    DATA_DIR_STAT = CONSTANTS.DATA_DIR.stat()
-    prnt(
-        f'EUID={os.geteuid()}:{os.getegid()} UID={RUNNING_AS_UID}:{RUNNING_AS_GID} PUID={ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}',
-        f'FS_UID={DATA_DIR_STAT.st_uid}:{DATA_DIR_STAT.st_gid}',
-        f'FS_PERMS={STORAGE_CONFIG.OUTPUT_PERMISSIONS}',
-        f'FS_ATOMIC={STORAGE_CONFIG.ENFORCE_ATOMIC_WRITES}',
-        f'FS_REMOTE={OUTPUT_IS_REMOTE_FS}',
-    )
+    
+    try:
+        OUTPUT_IS_REMOTE_FS = get_data_locations().DATA_DIR.is_mount or get_data_locations().ARCHIVE_DIR.is_mount
+    except Exception:
+        OUTPUT_IS_REMOTE_FS = False
+        
+    try:
+        DATA_DIR_STAT = CONSTANTS.DATA_DIR.stat()
+        prnt(
+            f'EUID={os.geteuid()}:{os.getegid()} UID={RUNNING_AS_UID}:{RUNNING_AS_GID} PUID={ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}',
+            f'FS_UID={DATA_DIR_STAT.st_uid}:{DATA_DIR_STAT.st_gid}',
+            f'FS_PERMS={STORAGE_CONFIG.OUTPUT_PERMISSIONS}',
+            f'FS_ATOMIC={STORAGE_CONFIG.ENFORCE_ATOMIC_WRITES}',
+            f'FS_REMOTE={OUTPUT_IS_REMOTE_FS}',
+        )
+    except Exception:
+        prnt(
+            f'EUID={os.geteuid()}:{os.getegid()} UID={RUNNING_AS_UID}:{RUNNING_AS_GID} PUID={ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}',
+        )
+        
     prnt(
         f'DEBUG={SHELL_CONFIG.DEBUG}',
         f'IS_TTY={SHELL_CONFIG.IS_TTY}',
@@ -84,14 +84,11 @@ def version(quiet: bool=False,
         f'ID={CONSTANTS.MACHINE_ID}:{CONSTANTS.COLLECTION_ID}',
         f'SEARCH_BACKEND={SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE}',
         f'LDAP={LDAP_ENABLED}',
-        #f'DB=django.db.backends.sqlite3 (({CONFIG["SQLITE_JOURNAL_MODE"]})',  # add this if we have more useful info to show eventually
     )
     prnt()
     
     if not (os.access(CONSTANTS.ARCHIVE_DIR, os.R_OK) and os.access(CONSTANTS.CONFIG_FILE, os.R_OK)):
         PANEL_TEXT = '\n'.join((
-            # '',
-            # f'[yellow]CURRENT DIR =[/yellow] [red]{os.getcwd()}[/red]',
             '',
             '[violet]Hint:[/violet] [green]cd[/green] into a collection [blue]DATA_DIR[/blue] and run [green]archivebox version[/green] again...',
             '      [grey53]OR[/grey53] run [green]archivebox init[/green] to create a new collection in the current dir.',
@@ -105,77 +102,94 @@ def version(quiet: bool=False,
 
     prnt('[pale_green1][i] Binary Dependencies:[/pale_green1]')
     failures = []
-    BINARIES = abx.as_dict(archivebox.pm.hook.get_BINARIES())
-    for name, binary in list(BINARIES.items()):
-        if binary.name == 'archivebox':
-            continue
-        
-        # skip if the binary is not in the requested list of binaries
-        if binaries and binary.name not in binaries:
-            continue
-        
-        # skip if the binary is not supported by any of the requested binproviders
-        if binproviders and binary.binproviders_supported and not any(provider.name in binproviders for provider in binary.binproviders_supported):
-            continue
-        
-        err = None
-        try:
-            loaded_bin = binary.load()
-        except Exception as e:
-            err = e
-            loaded_bin = binary
-        provider_summary = f'[dark_sea_green3]{loaded_bin.binprovider.name.ljust(10)}[/dark_sea_green3]' if loaded_bin.binprovider else '[grey23]not found[/grey23] '
-        if loaded_bin.abspath:
-            abspath = str(loaded_bin.abspath).replace(str(DATA_DIR), '[light_slate_blue].[/light_slate_blue]').replace(str(Path('~').expanduser()), '~')
-            if ' ' in abspath:
-                abspath = abspath.replace(' ', r'\ ')
-        else:
-            abspath = f'[red]{err}[/red]'
-        prnt('', '[green]√[/green]' if loaded_bin.is_valid else '[red]X[/red]', '', loaded_bin.name.ljust(21), str(loaded_bin.version).ljust(12), provider_summary, abspath, overflow='ignore', crop=False)
-        if not loaded_bin.is_valid:
-            failures.append(loaded_bin.name)
-            
-    prnt()
-    prnt('[gold3][i] Package Managers:[/gold3]')
-    BINPROVIDERS = abx.as_dict(archivebox.pm.hook.get_BINPROVIDERS())
-    for name, binprovider in list(BINPROVIDERS.items()):
-        err = None
-        
-        if binproviders and binprovider.name not in binproviders:
-            continue
-        
-        # TODO: implement a BinProvider.BINARY() method that gets the loaded binary for a binprovider's INSTALLER_BIN
-        loaded_bin = binprovider.INSTALLER_BINARY or Binary(name=binprovider.INSTALLER_BIN, binproviders=[env, apt, brew])
-        
-        abspath = str(loaded_bin.abspath).replace(str(DATA_DIR), '[light_slate_blue].[/light_slate_blue]').replace(str(Path('~').expanduser()), '~')
-        abspath = None
-        if loaded_bin.abspath:
-            abspath = str(loaded_bin.abspath).replace(str(DATA_DIR), '.').replace(str(Path('~').expanduser()), '~')
-            if ' ' in abspath:
-                abspath = abspath.replace(' ', r'\ ')
-                
-        PATH = str(binprovider.PATH).replace(str(DATA_DIR), '[light_slate_blue].[/light_slate_blue]').replace(str(Path('~').expanduser()), '~')
-        ownership_summary = f'UID=[blue]{str(binprovider.EUID).ljust(4)}[/blue]'
-        provider_summary = f'[dark_sea_green3]{str(abspath).ljust(52)}[/dark_sea_green3]' if abspath else f'[grey23]{"not available".ljust(52)}[/grey23]'
-        prnt('', '[green]√[/green]' if binprovider.is_valid else '[grey53]-[/grey53]', '', binprovider.name.ljust(11), provider_summary, ownership_summary, f'PATH={PATH}', overflow='ellipsis', soft_wrap=True)
-
-    if not (binaries or binproviders):
-        # dont show source code / data dir info if we just want to get version info for a binary or binprovider
-        
+
+    # Setup Django before importing models
+    from archivebox.config.django import setup_django
+    setup_django()
+
+    from machine.models import Machine, InstalledBinary
+
+    machine = Machine.current()
+
+    # Get all *_BINARY config values
+    binary_config_keys = [key for key in config.keys() if key.endswith('_BINARY')]
+
+    if not binary_config_keys:
+        prnt('', '[grey53]No binary dependencies defined in config.[/grey53]')
+    else:
+        for key in sorted(set(binary_config_keys)):
+            # Get the actual binary name/path from config value
+            bin_value = config.get(key, '').strip()
+            if not bin_value:
+                continue
+
+            # Check if it's a path (has slashes) or just a name
+            is_path = '/' in bin_value
+
+            if is_path:
+                # It's a full path - match against abspath
+                bin_name = Path(bin_value).name
+                # Skip if user specified specific binaries and this isn't one
+                if binaries and bin_name not in binaries:
+                    continue
+                # Find InstalledBinary where abspath ends with this path
+                installed = InstalledBinary.objects.filter(
+                    machine=machine,
+                    abspath__endswith=bin_value,
+                ).exclude(abspath='').exclude(abspath__isnull=True).order_by('-modified_at').first()
+            else:
+                # It's just a binary name - match against name
+                bin_name = bin_value
+                # Skip if user specified specific binaries and this isn't one
+                if binaries and bin_name not in binaries:
+                    continue
+                # Find InstalledBinary by name
+                installed = InstalledBinary.objects.filter(
+                    machine=machine,
+                    name__iexact=bin_name,
+                ).exclude(abspath='').exclude(abspath__isnull=True).order_by('-modified_at').first()
+
+            if installed and installed.is_valid:
+                display_path = installed.abspath.replace(str(DATA_DIR), '.').replace(str(Path('~').expanduser()), '~')
+                version_str = (installed.version or 'unknown')[:15]
+                provider = (installed.binprovider or 'env')[:8]
+                prnt('', '[green]√[/green]', '', bin_name.ljust(18), version_str.ljust(16), provider.ljust(8), display_path, overflow='ignore', crop=False)
+            else:
+                prnt('', '[red]X[/red]', '', bin_name.ljust(18), '[grey53]not installed[/grey53]', overflow='ignore', crop=False)
+                failures.append(bin_name)
+
+    # Show hint if no binaries are installed yet
+    has_any_installed = InstalledBinary.objects.filter(machine=machine).exclude(abspath='').exists()
+    if not has_any_installed:
+        prnt()
+        prnt('', '[grey53]Run [green]archivebox install[/green] to detect and install dependencies.[/grey53]')
+
+    if not binaries:
+        # Show code and data locations
         prnt()
         prnt('[deep_sky_blue3][i] Code locations:[/deep_sky_blue3]')
-        for name, path in get_code_locations().items():
-            prnt(printable_folder_status(name, path), overflow='ignore', crop=False)
+        try:
+            for name, path in get_code_locations().items():
+                if isinstance(path, dict):
+                    prnt(printable_folder_status(name, path), overflow='ignore', crop=False)
+        except Exception as e:
+            prnt(f'  [red]Error getting code locations: {e}[/red]')
 
         prnt()
         if os.access(CONSTANTS.ARCHIVE_DIR, os.R_OK) or os.access(CONSTANTS.CONFIG_FILE, os.R_OK):
             prnt('[bright_yellow][i] Data locations:[/bright_yellow]')
-            for name, path in get_data_locations().items():
-                prnt(printable_folder_status(name, path), overflow='ignore', crop=False)
-        
-            from archivebox.misc.checks import check_data_dir_permissions
+            try:
+                for name, path in get_data_locations().items():
+                    if isinstance(path, dict):
+                        prnt(printable_folder_status(name, path), overflow='ignore', crop=False)
+            except Exception as e:
+                prnt(f'  [red]Error getting data locations: {e}[/red]')
             
-            check_data_dir_permissions()
+            try:
+                from archivebox.misc.checks import check_data_dir_permissions
+                check_data_dir_permissions()
+            except Exception:
+                pass
         else:
             prnt()
             prnt('[red][i] Data locations:[/red] (not in a data directory)')
@@ -194,7 +208,6 @@ def version(quiet: bool=False,
 
 @click.command()
 @click.option('--quiet', '-q', is_flag=True, help='Only print ArchiveBox version number and nothing else. (equivalent to archivebox --version)')
[email protected]('--binproviders', '-p', help='Select binproviders to detect DEFAULT=env,apt,brew,sys_pip,venv_pip,lib_pip,pipx,sys_npm,lib_npm,puppeteer,playwright (all)')
 @click.option('--binaries', '-b', help='Select binaries to detect DEFAULT=curl,wget,git,yt-dlp,chrome,single-file,readability-extractor,postlight-parser,... (all)')
 @docstring(version.__doc__)
 def main(**kwargs):

+ 35 - 18
archivebox/cli/archivebox_worker.py

@@ -4,29 +4,46 @@ __package__ = 'archivebox.cli'
 __command__ = 'archivebox worker'
 
 import sys
-import json
 
 import rich_click as click
 
+from archivebox.misc.util import docstring
+
+
+def worker(worker_type: str, daemon: bool = False, plugin: str | None = None):
+    """
+    Start a worker process to process items from the queue.
+
+    Worker types:
+        - crawl: Process Crawl objects (parse seeds, create snapshots)
+        - snapshot: Process Snapshot objects (create archive results)
+        - archiveresult: Process ArchiveResult objects (run plugins)
+
+    Workers poll the database for queued items, claim them atomically,
+    and spawn subprocess tasks to handle each item.
+    """
+    from workers.worker import get_worker_class
+
+    WorkerClass = get_worker_class(worker_type)
+
+    # Build kwargs
+    kwargs = {'daemon': daemon}
+    if plugin and worker_type == 'archiveresult':
+        kwargs['extractor'] = plugin  # internal field still called extractor
+
+    # Create and run worker
+    worker_instance = WorkerClass(**kwargs)
+    worker_instance.runloop()
+
 
 @click.command()
[email protected]('worker_type')
[email protected]('--wait-for-first-event', is_flag=True)
[email protected]('--exit-on-idle', is_flag=True)
-def main(worker_type: str, wait_for_first_event: bool, exit_on_idle: bool):
-    """Start an ArchiveBox worker process of the given type"""
-    
-    from workers.worker import get_worker_type
-    
-    # allow piping in events to process from stdin
-    # if not sys.stdin.isatty():
-    #     for line in sys.stdin.readlines():
-    #         Event.dispatch(event=json.loads(line), parent=None)
-
-    # run the actor
-    Worker = get_worker_type(worker_type)
-    for event in Worker.run(wait_for_first_event=wait_for_first_event, exit_on_idle=exit_on_idle):
-        print(event)
[email protected]('worker_type', type=click.Choice(['crawl', 'snapshot', 'archiveresult']))
[email protected]('--daemon', '-d', is_flag=True, help="Run forever (don't exit on idle)")
[email protected]('--plugin', '-p', default=None, help='Filter by plugin (archiveresult only)')
+@docstring(worker.__doc__)
+def main(worker_type: str, daemon: bool, plugin: str | None):
+    """Start an ArchiveBox worker process"""
+    worker(worker_type, daemon=daemon, plugin=plugin)
 
 
 if __name__ == '__main__':

+ 0 - 1
archivebox/cli/tests.py

@@ -31,7 +31,6 @@ DATA_DIR = 'data.tests'
 os.environ.update(TEST_CONFIG)
 
 from ..main import init
-from ..index import load_main_index
 from archivebox.config.constants import (
     SQL_INDEX_FILENAME,
     JSON_INDEX_FILENAME,

+ 966 - 0
archivebox/cli/tests_piping.py

@@ -0,0 +1,966 @@
+#!/usr/bin/env python3
+"""
+Tests for CLI piping workflow: crawl | snapshot | extract
+
+This module tests the JSONL-based piping between CLI commands as described in:
+https://github.com/ArchiveBox/ArchiveBox/issues/1363
+
+Workflows tested:
+    archivebox snapshot URL | archivebox extract
+    archivebox crawl URL | archivebox snapshot | archivebox extract
+    archivebox crawl --plugin=PARSER URL | archivebox snapshot | archivebox extract
+
+Each command should:
+    - Accept URLs, snapshot_ids, or JSONL as input (args or stdin)
+    - Output JSONL to stdout when piped (not TTY)
+    - Output human-readable to stderr when TTY
+"""
+
+__package__ = 'archivebox.cli'
+
+import os
+import sys
+import json
+import shutil
+import tempfile
+import unittest
+from io import StringIO
+from pathlib import Path
+from unittest.mock import patch, MagicMock
+
+# Test configuration - disable slow extractors
+TEST_CONFIG = {
+    'USE_COLOR': 'False',
+    'SHOW_PROGRESS': 'False',
+    'SAVE_ARCHIVE_DOT_ORG': 'False',
+    'SAVE_TITLE': 'True',  # Fast extractor
+    'SAVE_FAVICON': 'False',
+    'SAVE_WGET': 'False',
+    'SAVE_WARC': 'False',
+    'SAVE_PDF': 'False',
+    'SAVE_SCREENSHOT': 'False',
+    'SAVE_DOM': 'False',
+    'SAVE_SINGLEFILE': 'False',
+    'SAVE_READABILITY': 'False',
+    'SAVE_MERCURY': 'False',
+    'SAVE_GIT': 'False',
+    'SAVE_MEDIA': 'False',
+    'SAVE_HEADERS': 'False',
+    'USE_CURL': 'False',
+    'USE_WGET': 'False',
+    'USE_GIT': 'False',
+    'USE_CHROME': 'False',
+    'USE_YOUTUBEDL': 'False',
+    'USE_NODE': 'False',
+}
+
+os.environ.update(TEST_CONFIG)
+
+
+# =============================================================================
+# JSONL Utility Tests
+# =============================================================================
+
+class TestJSONLParsing(unittest.TestCase):
+    """Test JSONL input parsing utilities."""
+
+    def test_parse_plain_url(self):
+        """Plain URLs should be parsed as Snapshot records."""
+        from archivebox.misc.jsonl import parse_line, TYPE_SNAPSHOT
+
+        result = parse_line('https://example.com')
+        self.assertIsNotNone(result)
+        self.assertEqual(result['type'], TYPE_SNAPSHOT)
+        self.assertEqual(result['url'], 'https://example.com')
+
+    def test_parse_jsonl_snapshot(self):
+        """JSONL Snapshot records should preserve all fields."""
+        from archivebox.misc.jsonl import parse_line, TYPE_SNAPSHOT
+
+        line = '{"type": "Snapshot", "url": "https://example.com", "tags": "test,demo"}'
+        result = parse_line(line)
+        self.assertIsNotNone(result)
+        self.assertEqual(result['type'], TYPE_SNAPSHOT)
+        self.assertEqual(result['url'], 'https://example.com')
+        self.assertEqual(result['tags'], 'test,demo')
+
+    def test_parse_jsonl_with_id(self):
+        """JSONL with id field should be recognized."""
+        from archivebox.misc.jsonl import parse_line, TYPE_SNAPSHOT
+
+        line = '{"type": "Snapshot", "id": "abc123", "url": "https://example.com"}'
+        result = parse_line(line)
+        self.assertIsNotNone(result)
+        self.assertEqual(result['id'], 'abc123')
+        self.assertEqual(result['url'], 'https://example.com')
+
+    def test_parse_uuid_as_snapshot_id(self):
+        """Bare UUIDs should be parsed as snapshot IDs."""
+        from archivebox.misc.jsonl import parse_line, TYPE_SNAPSHOT
+
+        uuid = '01234567-89ab-cdef-0123-456789abcdef'
+        result = parse_line(uuid)
+        self.assertIsNotNone(result)
+        self.assertEqual(result['type'], TYPE_SNAPSHOT)
+        self.assertEqual(result['id'], uuid)
+
+    def test_parse_empty_line(self):
+        """Empty lines should return None."""
+        from archivebox.misc.jsonl import parse_line
+
+        self.assertIsNone(parse_line(''))
+        self.assertIsNone(parse_line('   '))
+        self.assertIsNone(parse_line('\n'))
+
+    def test_parse_comment_line(self):
+        """Comment lines should return None."""
+        from archivebox.misc.jsonl import parse_line
+
+        self.assertIsNone(parse_line('# This is a comment'))
+        self.assertIsNone(parse_line('  # Indented comment'))
+
+    def test_parse_invalid_url(self):
+        """Invalid URLs should return None."""
+        from archivebox.misc.jsonl import parse_line
+
+        self.assertIsNone(parse_line('not-a-url'))
+        self.assertIsNone(parse_line('ftp://example.com'))  # Only http/https/file
+
+    def test_parse_file_url(self):
+        """file:// URLs should be parsed."""
+        from archivebox.misc.jsonl import parse_line, TYPE_SNAPSHOT
+
+        result = parse_line('file:///path/to/file.txt')
+        self.assertIsNotNone(result)
+        self.assertEqual(result['type'], TYPE_SNAPSHOT)
+        self.assertEqual(result['url'], 'file:///path/to/file.txt')
+
+
+class TestJSONLOutput(unittest.TestCase):
+    """Test JSONL output formatting."""
+
+    def test_snapshot_to_jsonl(self):
+        """Snapshot model should serialize to JSONL correctly."""
+        from archivebox.misc.jsonl import snapshot_to_jsonl, TYPE_SNAPSHOT
+
+        # Create a mock snapshot
+        mock_snapshot = MagicMock()
+        mock_snapshot.id = 'test-uuid-1234'
+        mock_snapshot.url = 'https://example.com'
+        mock_snapshot.title = 'Example Title'
+        mock_snapshot.tags_str.return_value = 'tag1,tag2'
+        mock_snapshot.bookmarked_at = None
+        mock_snapshot.created_at = None
+        mock_snapshot.timestamp = '1234567890'
+        mock_snapshot.depth = 0
+        mock_snapshot.status = 'queued'
+
+        result = snapshot_to_jsonl(mock_snapshot)
+        self.assertEqual(result['type'], TYPE_SNAPSHOT)
+        self.assertEqual(result['id'], 'test-uuid-1234')
+        self.assertEqual(result['url'], 'https://example.com')
+        self.assertEqual(result['title'], 'Example Title')
+
+    def test_archiveresult_to_jsonl(self):
+        """ArchiveResult model should serialize to JSONL correctly."""
+        from archivebox.misc.jsonl import archiveresult_to_jsonl, TYPE_ARCHIVERESULT
+
+        mock_result = MagicMock()
+        mock_result.id = 'result-uuid-5678'
+        mock_result.snapshot_id = 'snapshot-uuid-1234'
+        mock_result.extractor = 'title'
+        mock_result.status = 'succeeded'
+        mock_result.output = 'Example Title'
+        mock_result.start_ts = None
+        mock_result.end_ts = None
+
+        result = archiveresult_to_jsonl(mock_result)
+        self.assertEqual(result['type'], TYPE_ARCHIVERESULT)
+        self.assertEqual(result['id'], 'result-uuid-5678')
+        self.assertEqual(result['snapshot_id'], 'snapshot-uuid-1234')
+        self.assertEqual(result['extractor'], 'title')
+        self.assertEqual(result['status'], 'succeeded')
+
+
+class TestReadArgsOrStdin(unittest.TestCase):
+    """Test reading from args or stdin."""
+
+    def test_read_from_args(self):
+        """Should read URLs from command line args."""
+        from archivebox.misc.jsonl import read_args_or_stdin
+
+        args = ('https://example1.com', 'https://example2.com')
+        records = list(read_args_or_stdin(args))
+
+        self.assertEqual(len(records), 2)
+        self.assertEqual(records[0]['url'], 'https://example1.com')
+        self.assertEqual(records[1]['url'], 'https://example2.com')
+
+    def test_read_from_stdin(self):
+        """Should read URLs from stdin when no args provided."""
+        from archivebox.misc.jsonl import read_args_or_stdin
+
+        stdin_content = 'https://example1.com\nhttps://example2.com\n'
+        stream = StringIO(stdin_content)
+
+        # Mock isatty to return False (simulating piped input)
+        stream.isatty = lambda: False
+
+        records = list(read_args_or_stdin((), stream=stream))
+
+        self.assertEqual(len(records), 2)
+        self.assertEqual(records[0]['url'], 'https://example1.com')
+        self.assertEqual(records[1]['url'], 'https://example2.com')
+
+    def test_read_jsonl_from_stdin(self):
+        """Should read JSONL from stdin."""
+        from archivebox.misc.jsonl import read_args_or_stdin
+
+        stdin_content = '{"type": "Snapshot", "url": "https://example.com", "tags": "test"}\n'
+        stream = StringIO(stdin_content)
+        stream.isatty = lambda: False
+
+        records = list(read_args_or_stdin((), stream=stream))
+
+        self.assertEqual(len(records), 1)
+        self.assertEqual(records[0]['url'], 'https://example.com')
+        self.assertEqual(records[0]['tags'], 'test')
+
+    def test_skip_tty_stdin(self):
+        """Should not read from TTY stdin (would block)."""
+        from archivebox.misc.jsonl import read_args_or_stdin
+
+        stream = StringIO('https://example.com')
+        stream.isatty = lambda: True  # Simulate TTY
+
+        records = list(read_args_or_stdin((), stream=stream))
+        self.assertEqual(len(records), 0)
+
+
+# =============================================================================
+# Unit Tests for Individual Commands
+# =============================================================================
+
+class TestCrawlCommand(unittest.TestCase):
+    """Unit tests for archivebox crawl command."""
+
+    def setUp(self):
+        """Set up test environment."""
+        self.test_dir = tempfile.mkdtemp()
+        os.environ['DATA_DIR'] = self.test_dir
+
+    def tearDown(self):
+        """Clean up test environment."""
+        shutil.rmtree(self.test_dir, ignore_errors=True)
+
+    def test_crawl_accepts_url(self):
+        """crawl should accept URLs as input."""
+        from archivebox.misc.jsonl import read_args_or_stdin
+
+        args = ('https://example.com',)
+        records = list(read_args_or_stdin(args))
+
+        self.assertEqual(len(records), 1)
+        self.assertEqual(records[0]['url'], 'https://example.com')
+
+    def test_crawl_accepts_snapshot_id(self):
+        """crawl should accept snapshot IDs as input."""
+        from archivebox.misc.jsonl import read_args_or_stdin
+
+        uuid = '01234567-89ab-cdef-0123-456789abcdef'
+        args = (uuid,)
+        records = list(read_args_or_stdin(args))
+
+        self.assertEqual(len(records), 1)
+        self.assertEqual(records[0]['id'], uuid)
+
+    def test_crawl_accepts_jsonl(self):
+        """crawl should accept JSONL with snapshot info."""
+        from archivebox.misc.jsonl import read_args_or_stdin
+
+        stdin = StringIO('{"type": "Snapshot", "id": "abc123", "url": "https://example.com"}\n')
+        stdin.isatty = lambda: False
+
+        records = list(read_args_or_stdin((), stream=stdin))
+
+        self.assertEqual(len(records), 1)
+        self.assertEqual(records[0]['id'], 'abc123')
+        self.assertEqual(records[0]['url'], 'https://example.com')
+
+    def test_crawl_separates_existing_vs_new(self):
+        """crawl should identify existing snapshots vs new URLs."""
+        # This tests the logic in discover_outlinks() that separates
+        # records with 'id' (existing) from records with just 'url' (new)
+
+        records = [
+            {'type': 'Snapshot', 'id': 'existing-id-1'},  # Existing (id only)
+            {'type': 'Snapshot', 'url': 'https://new-url.com'},  # New (url only)
+            {'type': 'Snapshot', 'id': 'existing-id-2', 'url': 'https://existing.com'},  # Existing (has id)
+        ]
+
+        existing = []
+        new = []
+
+        for record in records:
+            if record.get('id') and not record.get('url'):
+                existing.append(record['id'])
+            elif record.get('id'):
+                existing.append(record['id'])  # Has both id and url - treat as existing
+            elif record.get('url'):
+                new.append(record)
+
+        self.assertEqual(len(existing), 2)
+        self.assertEqual(len(new), 1)
+        self.assertEqual(new[0]['url'], 'https://new-url.com')
+
+
+class TestSnapshotCommand(unittest.TestCase):
+    """Unit tests for archivebox snapshot command."""
+
+    def setUp(self):
+        """Set up test environment."""
+        self.test_dir = tempfile.mkdtemp()
+        os.environ['DATA_DIR'] = self.test_dir
+
+    def tearDown(self):
+        """Clean up test environment."""
+        shutil.rmtree(self.test_dir, ignore_errors=True)
+
+    def test_snapshot_accepts_url(self):
+        """snapshot should accept URLs as input."""
+        from archivebox.misc.jsonl import read_args_or_stdin
+
+        args = ('https://example.com',)
+        records = list(read_args_or_stdin(args))
+
+        self.assertEqual(len(records), 1)
+        self.assertEqual(records[0]['url'], 'https://example.com')
+
+    def test_snapshot_accepts_jsonl_with_metadata(self):
+        """snapshot should accept JSONL with tags and other metadata."""
+        from archivebox.misc.jsonl import read_args_or_stdin
+
+        stdin = StringIO('{"type": "Snapshot", "url": "https://example.com", "tags": "tag1,tag2", "title": "Test"}\n')
+        stdin.isatty = lambda: False
+
+        records = list(read_args_or_stdin((), stream=stdin))
+
+        self.assertEqual(len(records), 1)
+        self.assertEqual(records[0]['url'], 'https://example.com')
+        self.assertEqual(records[0]['tags'], 'tag1,tag2')
+        self.assertEqual(records[0]['title'], 'Test')
+
+    def test_snapshot_output_format(self):
+        """snapshot output should include id and url."""
+        from archivebox.misc.jsonl import snapshot_to_jsonl
+
+        mock_snapshot = MagicMock()
+        mock_snapshot.id = 'test-id'
+        mock_snapshot.url = 'https://example.com'
+        mock_snapshot.title = 'Test'
+        mock_snapshot.tags_str.return_value = ''
+        mock_snapshot.bookmarked_at = None
+        mock_snapshot.created_at = None
+        mock_snapshot.timestamp = '123'
+        mock_snapshot.depth = 0
+        mock_snapshot.status = 'queued'
+
+        output = snapshot_to_jsonl(mock_snapshot)
+
+        self.assertIn('id', output)
+        self.assertIn('url', output)
+        self.assertEqual(output['type'], 'Snapshot')
+
+
+class TestExtractCommand(unittest.TestCase):
+    """Unit tests for archivebox extract command."""
+
+    def setUp(self):
+        """Set up test environment."""
+        self.test_dir = tempfile.mkdtemp()
+        os.environ['DATA_DIR'] = self.test_dir
+
+    def tearDown(self):
+        """Clean up test environment."""
+        shutil.rmtree(self.test_dir, ignore_errors=True)
+
+    def test_extract_accepts_snapshot_id(self):
+        """extract should accept snapshot IDs as input."""
+        from archivebox.misc.jsonl import read_args_or_stdin
+
+        uuid = '01234567-89ab-cdef-0123-456789abcdef'
+        args = (uuid,)
+        records = list(read_args_or_stdin(args))
+
+        self.assertEqual(len(records), 1)
+        self.assertEqual(records[0]['id'], uuid)
+
+    def test_extract_accepts_jsonl_snapshot(self):
+        """extract should accept JSONL Snapshot records."""
+        from archivebox.misc.jsonl import read_args_or_stdin, TYPE_SNAPSHOT
+
+        stdin = StringIO('{"type": "Snapshot", "id": "abc123", "url": "https://example.com"}\n')
+        stdin.isatty = lambda: False
+
+        records = list(read_args_or_stdin((), stream=stdin))
+
+        self.assertEqual(len(records), 1)
+        self.assertEqual(records[0]['type'], TYPE_SNAPSHOT)
+        self.assertEqual(records[0]['id'], 'abc123')
+
+    def test_extract_gathers_snapshot_ids(self):
+        """extract should gather snapshot IDs from various input formats."""
+        from archivebox.misc.jsonl import TYPE_SNAPSHOT, TYPE_ARCHIVERESULT
+
+        records = [
+            {'type': TYPE_SNAPSHOT, 'id': 'snap-1'},
+            {'type': TYPE_SNAPSHOT, 'id': 'snap-2', 'url': 'https://example.com'},
+            {'type': TYPE_ARCHIVERESULT, 'snapshot_id': 'snap-3'},
+            {'id': 'snap-4'},  # Bare id
+        ]
+
+        snapshot_ids = set()
+        for record in records:
+            record_type = record.get('type')
+
+            if record_type == TYPE_SNAPSHOT:
+                snapshot_id = record.get('id')
+                if snapshot_id:
+                    snapshot_ids.add(snapshot_id)
+            elif record_type == TYPE_ARCHIVERESULT:
+                snapshot_id = record.get('snapshot_id')
+                if snapshot_id:
+                    snapshot_ids.add(snapshot_id)
+            elif 'id' in record:
+                snapshot_ids.add(record['id'])
+
+        self.assertEqual(len(snapshot_ids), 4)
+        self.assertIn('snap-1', snapshot_ids)
+        self.assertIn('snap-2', snapshot_ids)
+        self.assertIn('snap-3', snapshot_ids)
+        self.assertIn('snap-4', snapshot_ids)
+
+
+# =============================================================================
+# URL Collection Tests
+# =============================================================================
+
+class TestURLCollection(unittest.TestCase):
+    """Test collecting urls.jsonl from extractor output."""
+
+    def setUp(self):
+        """Create test directory structure."""
+        self.test_dir = Path(tempfile.mkdtemp())
+
+        # Create fake extractor output directories with urls.jsonl
+        (self.test_dir / 'wget').mkdir()
+        (self.test_dir / 'wget' / 'urls.jsonl').write_text(
+            '{"url": "https://wget-link-1.com"}\n'
+            '{"url": "https://wget-link-2.com"}\n'
+        )
+
+        (self.test_dir / 'parse_html_urls').mkdir()
+        (self.test_dir / 'parse_html_urls' / 'urls.jsonl').write_text(
+            '{"url": "https://html-link-1.com"}\n'
+            '{"url": "https://html-link-2.com", "title": "HTML Link 2"}\n'
+        )
+
+        (self.test_dir / 'screenshot').mkdir()
+        # No urls.jsonl in screenshot dir - not a parser
+
+    def tearDown(self):
+        """Clean up test directory."""
+        shutil.rmtree(self.test_dir, ignore_errors=True)
+
+    def test_collect_urls_from_extractors(self):
+        """Should collect urls.jsonl from all extractor subdirectories."""
+        from archivebox.hooks import collect_urls_from_extractors
+
+        urls = collect_urls_from_extractors(self.test_dir)
+
+        self.assertEqual(len(urls), 4)
+
+        # Check that via_extractor is set
+        extractors = {u['via_extractor'] for u in urls}
+        self.assertIn('wget', extractors)
+        self.assertIn('parse_html_urls', extractors)
+        self.assertNotIn('screenshot', extractors)  # No urls.jsonl
+
+    def test_collect_urls_preserves_metadata(self):
+        """Should preserve metadata from urls.jsonl entries."""
+        from archivebox.hooks import collect_urls_from_extractors
+
+        urls = collect_urls_from_extractors(self.test_dir)
+
+        # Find the entry with title
+        titled = [u for u in urls if u.get('title') == 'HTML Link 2']
+        self.assertEqual(len(titled), 1)
+        self.assertEqual(titled[0]['url'], 'https://html-link-2.com')
+
+    def test_collect_urls_empty_dir(self):
+        """Should handle empty or non-existent directories."""
+        from archivebox.hooks import collect_urls_from_extractors
+
+        empty_dir = self.test_dir / 'nonexistent'
+        urls = collect_urls_from_extractors(empty_dir)
+
+        self.assertEqual(len(urls), 0)
+
+
+# =============================================================================
+# Integration Tests
+# =============================================================================
+
+class TestPipingWorkflowIntegration(unittest.TestCase):
+    """
+    Integration tests for the complete piping workflow.
+
+    These tests require Django to be set up and use the actual database.
+    """
+
+    @classmethod
+    def setUpClass(cls):
+        """Set up Django and test database."""
+        cls.test_dir = tempfile.mkdtemp()
+        os.environ['DATA_DIR'] = cls.test_dir
+
+        # Initialize Django
+        from archivebox.config.django import setup_django
+        setup_django()
+
+        # Initialize the archive
+        from archivebox.cli.archivebox_init import init
+        init()
+
+    @classmethod
+    def tearDownClass(cls):
+        """Clean up test database."""
+        shutil.rmtree(cls.test_dir, ignore_errors=True)
+
+    def test_snapshot_creates_and_outputs_jsonl(self):
+        """
+        Test: archivebox snapshot URL
+        Should create a Snapshot and output JSONL when piped.
+        """
+        from core.models import Snapshot
+        from archivebox.misc.jsonl import (
+            read_args_or_stdin, write_record, snapshot_to_jsonl,
+            TYPE_SNAPSHOT, get_or_create_snapshot
+        )
+        from archivebox.base_models.models import get_or_create_system_user_pk
+
+        created_by_id = get_or_create_system_user_pk()
+
+        # Simulate input
+        url = 'https://test-snapshot-1.example.com'
+        records = list(read_args_or_stdin((url,)))
+
+        self.assertEqual(len(records), 1)
+        self.assertEqual(records[0]['url'], url)
+
+        # Create snapshot
+        snapshot = get_or_create_snapshot(records[0], created_by_id=created_by_id)
+
+        self.assertIsNotNone(snapshot.id)
+        self.assertEqual(snapshot.url, url)
+
+        # Verify output format
+        output = snapshot_to_jsonl(snapshot)
+        self.assertEqual(output['type'], TYPE_SNAPSHOT)
+        self.assertIn('id', output)
+        self.assertEqual(output['url'], url)
+
+    def test_extract_accepts_snapshot_from_previous_command(self):
+        """
+        Test: archivebox snapshot URL | archivebox extract
+        Extract should accept JSONL output from snapshot command.
+        """
+        from core.models import Snapshot, ArchiveResult
+        from archivebox.misc.jsonl import (
+            snapshot_to_jsonl, read_args_or_stdin, get_or_create_snapshot,
+            TYPE_SNAPSHOT
+        )
+        from archivebox.base_models.models import get_or_create_system_user_pk
+
+        created_by_id = get_or_create_system_user_pk()
+
+        # Step 1: Create snapshot (simulating 'archivebox snapshot')
+        url = 'https://test-extract-1.example.com'
+        snapshot = get_or_create_snapshot({'url': url}, created_by_id=created_by_id)
+        snapshot_output = snapshot_to_jsonl(snapshot)
+
+        # Step 2: Parse snapshot output as extract input
+        stdin = StringIO(json.dumps(snapshot_output) + '\n')
+        stdin.isatty = lambda: False
+
+        records = list(read_args_or_stdin((), stream=stdin))
+
+        self.assertEqual(len(records), 1)
+        self.assertEqual(records[0]['type'], TYPE_SNAPSHOT)
+        self.assertEqual(records[0]['id'], str(snapshot.id))
+
+        # Step 3: Gather snapshot IDs (as extract does)
+        snapshot_ids = set()
+        for record in records:
+            if record.get('type') == TYPE_SNAPSHOT and record.get('id'):
+                snapshot_ids.add(record['id'])
+
+        self.assertIn(str(snapshot.id), snapshot_ids)
+
+    def test_crawl_outputs_discovered_urls(self):
+        """
+        Test: archivebox crawl URL
+        Should create snapshot, run plugins, output discovered URLs.
+        """
+        from archivebox.hooks import collect_urls_from_extractors
+        from archivebox.misc.jsonl import TYPE_SNAPSHOT
+
+        # Create a mock snapshot directory with urls.jsonl
+        test_snapshot_dir = Path(self.test_dir) / 'archive' / 'test-crawl-snapshot'
+        test_snapshot_dir.mkdir(parents=True, exist_ok=True)
+
+        # Create mock extractor output
+        (test_snapshot_dir / 'parse_html_urls').mkdir()
+        (test_snapshot_dir / 'parse_html_urls' / 'urls.jsonl').write_text(
+            '{"url": "https://discovered-1.com"}\n'
+            '{"url": "https://discovered-2.com", "title": "Discovered 2"}\n'
+        )
+
+        # Collect URLs (as crawl does)
+        discovered = collect_urls_from_extractors(test_snapshot_dir)
+
+        self.assertEqual(len(discovered), 2)
+
+        # Add crawl metadata (as crawl does)
+        for entry in discovered:
+            entry['type'] = TYPE_SNAPSHOT
+            entry['depth'] = 1
+            entry['via_snapshot'] = 'test-crawl-snapshot'
+
+        # Verify output format
+        self.assertEqual(discovered[0]['type'], TYPE_SNAPSHOT)
+        self.assertEqual(discovered[0]['depth'], 1)
+        self.assertEqual(discovered[0]['url'], 'https://discovered-1.com')
+
+    def test_full_pipeline_snapshot_extract(self):
+        """
+        Test: archivebox snapshot URL | archivebox extract
+
+        This is equivalent to: archivebox add URL
+        """
+        from core.models import Snapshot
+        from archivebox.misc.jsonl import (
+            get_or_create_snapshot, snapshot_to_jsonl, read_args_or_stdin,
+            TYPE_SNAPSHOT
+        )
+        from archivebox.base_models.models import get_or_create_system_user_pk
+
+        created_by_id = get_or_create_system_user_pk()
+
+        # === archivebox snapshot https://example.com ===
+        url = 'https://test-pipeline-1.example.com'
+        snapshot = get_or_create_snapshot({'url': url}, created_by_id=created_by_id)
+        snapshot_jsonl = json.dumps(snapshot_to_jsonl(snapshot))
+
+        # === | archivebox extract ===
+        stdin = StringIO(snapshot_jsonl + '\n')
+        stdin.isatty = lambda: False
+
+        records = list(read_args_or_stdin((), stream=stdin))
+
+        # Extract should receive the snapshot ID
+        self.assertEqual(len(records), 1)
+        self.assertEqual(records[0]['id'], str(snapshot.id))
+
+        # Verify snapshot exists in DB
+        db_snapshot = Snapshot.objects.get(id=snapshot.id)
+        self.assertEqual(db_snapshot.url, url)
+
+    def test_full_pipeline_crawl_snapshot_extract(self):
+        """
+        Test: archivebox crawl URL | archivebox snapshot | archivebox extract
+
+        This is equivalent to: archivebox add --depth=1 URL
+        """
+        from core.models import Snapshot
+        from archivebox.misc.jsonl import (
+            get_or_create_snapshot, snapshot_to_jsonl, read_args_or_stdin,
+            TYPE_SNAPSHOT
+        )
+        from archivebox.base_models.models import get_or_create_system_user_pk
+        from archivebox.hooks import collect_urls_from_extractors
+
+        created_by_id = get_or_create_system_user_pk()
+
+        # === archivebox crawl https://example.com ===
+        # Step 1: Create snapshot for starting URL
+        start_url = 'https://test-crawl-pipeline.example.com'
+        start_snapshot = get_or_create_snapshot({'url': start_url}, created_by_id=created_by_id)
+
+        # Step 2: Simulate extractor output with discovered URLs
+        snapshot_dir = Path(self.test_dir) / 'archive' / str(start_snapshot.timestamp)
+        snapshot_dir.mkdir(parents=True, exist_ok=True)
+        (snapshot_dir / 'parse_html_urls').mkdir(exist_ok=True)
+        (snapshot_dir / 'parse_html_urls' / 'urls.jsonl').write_text(
+            '{"url": "https://outlink-1.example.com"}\n'
+            '{"url": "https://outlink-2.example.com"}\n'
+        )
+
+        # Step 3: Collect discovered URLs (crawl output)
+        discovered = collect_urls_from_extractors(snapshot_dir)
+        crawl_output = []
+        for entry in discovered:
+            entry['type'] = TYPE_SNAPSHOT
+            entry['depth'] = 1
+            crawl_output.append(json.dumps(entry))
+
+        # === | archivebox snapshot ===
+        stdin = StringIO('\n'.join(crawl_output) + '\n')
+        stdin.isatty = lambda: False
+
+        records = list(read_args_or_stdin((), stream=stdin))
+        self.assertEqual(len(records), 2)
+
+        # Create snapshots for discovered URLs
+        created_snapshots = []
+        for record in records:
+            snap = get_or_create_snapshot(record, created_by_id=created_by_id)
+            created_snapshots.append(snap)
+
+        self.assertEqual(len(created_snapshots), 2)
+
+        # === | archivebox extract ===
+        snapshot_jsonl_lines = [json.dumps(snapshot_to_jsonl(s)) for s in created_snapshots]
+        stdin = StringIO('\n'.join(snapshot_jsonl_lines) + '\n')
+        stdin.isatty = lambda: False
+
+        records = list(read_args_or_stdin((), stream=stdin))
+        self.assertEqual(len(records), 2)
+
+        # Verify all snapshots exist in DB
+        for record in records:
+            db_snapshot = Snapshot.objects.get(id=record['id'])
+            self.assertIn(db_snapshot.url, [
+                'https://outlink-1.example.com',
+                'https://outlink-2.example.com'
+            ])
+
+
+class TestDepthWorkflows(unittest.TestCase):
+    """Test various depth crawl workflows."""
+
+    @classmethod
+    def setUpClass(cls):
+        """Set up Django and test database."""
+        cls.test_dir = tempfile.mkdtemp()
+        os.environ['DATA_DIR'] = cls.test_dir
+
+        from archivebox.config.django import setup_django
+        setup_django()
+
+        from archivebox.cli.archivebox_init import init
+        init()
+
+    @classmethod
+    def tearDownClass(cls):
+        """Clean up test database."""
+        shutil.rmtree(cls.test_dir, ignore_errors=True)
+
+    def test_depth_0_workflow(self):
+        """
+        Test: archivebox snapshot URL | archivebox extract
+
+        Depth 0: Only archive the specified URL, no crawling.
+        """
+        from core.models import Snapshot
+        from archivebox.misc.jsonl import get_or_create_snapshot
+        from archivebox.base_models.models import get_or_create_system_user_pk
+
+        created_by_id = get_or_create_system_user_pk()
+
+        # Create snapshot
+        url = 'https://depth0-test.example.com'
+        snapshot = get_or_create_snapshot({'url': url}, created_by_id=created_by_id)
+
+        # Verify only one snapshot created
+        self.assertEqual(Snapshot.objects.filter(url=url).count(), 1)
+        self.assertEqual(snapshot.url, url)
+
+    def test_depth_1_workflow(self):
+        """
+        Test: archivebox crawl URL | archivebox snapshot | archivebox extract
+
+        Depth 1: Archive URL + all outlinks from that URL.
+        """
+        # This is tested in test_full_pipeline_crawl_snapshot_extract
+        pass
+
+    def test_depth_metadata_propagation(self):
+        """Test that depth metadata propagates through the pipeline."""
+        from archivebox.misc.jsonl import TYPE_SNAPSHOT
+
+        # Simulate crawl output with depth metadata
+        crawl_output = [
+            {'type': TYPE_SNAPSHOT, 'url': 'https://hop1.com', 'depth': 1, 'via_snapshot': 'root'},
+            {'type': TYPE_SNAPSHOT, 'url': 'https://hop2.com', 'depth': 2, 'via_snapshot': 'hop1'},
+        ]
+
+        # Verify depth is preserved
+        for entry in crawl_output:
+            self.assertIn('depth', entry)
+            self.assertIn('via_snapshot', entry)
+
+
+class TestParserPluginWorkflows(unittest.TestCase):
+    """Test workflows with specific parser plugins."""
+
+    @classmethod
+    def setUpClass(cls):
+        """Set up Django and test database."""
+        cls.test_dir = tempfile.mkdtemp()
+        os.environ['DATA_DIR'] = cls.test_dir
+
+        from archivebox.config.django import setup_django
+        setup_django()
+
+        from archivebox.cli.archivebox_init import init
+        init()
+
+    @classmethod
+    def tearDownClass(cls):
+        """Clean up test database."""
+        shutil.rmtree(cls.test_dir, ignore_errors=True)
+
+    def test_html_parser_workflow(self):
+        """
+        Test: archivebox crawl --plugin=parse_html_urls URL | archivebox snapshot | archivebox extract
+        """
+        from archivebox.hooks import collect_urls_from_extractors
+        from archivebox.misc.jsonl import TYPE_SNAPSHOT
+
+        # Create mock output directory
+        snapshot_dir = Path(self.test_dir) / 'archive' / 'html-parser-test'
+        snapshot_dir.mkdir(parents=True, exist_ok=True)
+        (snapshot_dir / 'parse_html_urls').mkdir(exist_ok=True)
+        (snapshot_dir / 'parse_html_urls' / 'urls.jsonl').write_text(
+            '{"url": "https://html-discovered.com", "title": "HTML Link"}\n'
+        )
+
+        # Collect URLs
+        discovered = collect_urls_from_extractors(snapshot_dir)
+
+        self.assertEqual(len(discovered), 1)
+        self.assertEqual(discovered[0]['url'], 'https://html-discovered.com')
+        self.assertEqual(discovered[0]['via_extractor'], 'parse_html_urls')
+
+    def test_rss_parser_workflow(self):
+        """
+        Test: archivebox crawl --plugin=parse_rss_urls URL | archivebox snapshot | archivebox extract
+        """
+        from archivebox.hooks import collect_urls_from_extractors
+
+        # Create mock output directory
+        snapshot_dir = Path(self.test_dir) / 'archive' / 'rss-parser-test'
+        snapshot_dir.mkdir(parents=True, exist_ok=True)
+        (snapshot_dir / 'parse_rss_urls').mkdir(exist_ok=True)
+        (snapshot_dir / 'parse_rss_urls' / 'urls.jsonl').write_text(
+            '{"url": "https://rss-item-1.com", "title": "RSS Item 1"}\n'
+            '{"url": "https://rss-item-2.com", "title": "RSS Item 2"}\n'
+        )
+
+        # Collect URLs
+        discovered = collect_urls_from_extractors(snapshot_dir)
+
+        self.assertEqual(len(discovered), 2)
+        self.assertTrue(all(d['via_extractor'] == 'parse_rss_urls' for d in discovered))
+
+    def test_multiple_parsers_dedupe(self):
+        """
+        Multiple parsers may discover the same URL - should be deduplicated.
+        """
+        from archivebox.hooks import collect_urls_from_extractors
+
+        # Create mock output with duplicate URLs from different parsers
+        snapshot_dir = Path(self.test_dir) / 'archive' / 'dedupe-test'
+        snapshot_dir.mkdir(parents=True, exist_ok=True)
+
+        (snapshot_dir / 'parse_html_urls').mkdir(exist_ok=True)
+        (snapshot_dir / 'parse_html_urls' / 'urls.jsonl').write_text(
+            '{"url": "https://same-url.com"}\n'
+        )
+
+        (snapshot_dir / 'wget').mkdir(exist_ok=True)
+        (snapshot_dir / 'wget' / 'urls.jsonl').write_text(
+            '{"url": "https://same-url.com"}\n'  # Same URL, different extractor
+        )
+
+        # Collect URLs
+        all_discovered = collect_urls_from_extractors(snapshot_dir)
+
+        # Both entries are returned (deduplication happens at the crawl command level)
+        self.assertEqual(len(all_discovered), 2)
+
+        # Verify both extractors found the same URL
+        urls = {d['url'] for d in all_discovered}
+        self.assertEqual(urls, {'https://same-url.com'})
+
+
+class TestEdgeCases(unittest.TestCase):
+    """Test edge cases and error handling."""
+
+    def test_empty_input(self):
+        """Commands should handle empty input gracefully."""
+        from archivebox.misc.jsonl import read_args_or_stdin
+
+        # Empty args, TTY stdin (should not block)
+        stdin = StringIO('')
+        stdin.isatty = lambda: True
+
+        records = list(read_args_or_stdin((), stream=stdin))
+        self.assertEqual(len(records), 0)
+
+    def test_malformed_jsonl(self):
+        """Should skip malformed JSONL lines."""
+        from archivebox.misc.jsonl import read_args_or_stdin
+
+        stdin = StringIO(
+            '{"url": "https://good.com"}\n'
+            'not valid json\n'
+            '{"url": "https://also-good.com"}\n'
+        )
+        stdin.isatty = lambda: False
+
+        records = list(read_args_or_stdin((), stream=stdin))
+
+        self.assertEqual(len(records), 2)
+        urls = {r['url'] for r in records}
+        self.assertEqual(urls, {'https://good.com', 'https://also-good.com'})
+
+    def test_mixed_input_formats(self):
+        """Should handle mixed URLs and JSONL."""
+        from archivebox.misc.jsonl import read_args_or_stdin
+
+        stdin = StringIO(
+            'https://plain-url.com\n'
+            '{"type": "Snapshot", "url": "https://jsonl-url.com", "tags": "test"}\n'
+            '01234567-89ab-cdef-0123-456789abcdef\n'  # UUID
+        )
+        stdin.isatty = lambda: False
+
+        records = list(read_args_or_stdin((), stream=stdin))
+
+        self.assertEqual(len(records), 3)
+
+        # Plain URL
+        self.assertEqual(records[0]['url'], 'https://plain-url.com')
+
+        # JSONL with metadata
+        self.assertEqual(records[1]['url'], 'https://jsonl-url.com')
+        self.assertEqual(records[1]['tags'], 'test')
+
+        # UUID
+        self.assertEqual(records[2]['id'], '01234567-89ab-cdef-0123-456789abcdef')
+
+
+if __name__ == '__main__':
+    unittest.main()

+ 227 - 25
archivebox/config/__init__.py

@@ -1,6 +1,17 @@
+"""
+ArchiveBox config exports.
+
+This module provides backwards-compatible config exports for extractors
+and other modules that expect to import config values directly.
+"""
+
 __package__ = 'archivebox.config'
 __order__ = 200
 
+import shutil
+from pathlib import Path
+from typing import Dict, List, Optional
+
 from .paths import (
     PACKAGE_DIR,                                    # noqa
     DATA_DIR,                                       # noqa
@@ -9,28 +20,219 @@ from .paths import (
 from .constants import CONSTANTS, CONSTANTS_CONFIG, PACKAGE_DIR, DATA_DIR, ARCHIVE_DIR      # noqa
 from .version import VERSION                        # noqa
 
-# import abx
-
-# @abx.hookimpl
-# def get_CONFIG():
-#     from .common import (
-#         SHELL_CONFIG,
-#         STORAGE_CONFIG,
-#         GENERAL_CONFIG,
-#         SERVER_CONFIG,
-#         ARCHIVING_CONFIG,
-#         SEARCH_BACKEND_CONFIG,
-#     )
-#     return {
-#         'SHELL_CONFIG': SHELL_CONFIG,
-#         'STORAGE_CONFIG': STORAGE_CONFIG,
-#         'GENERAL_CONFIG': GENERAL_CONFIG,
-#         'SERVER_CONFIG': SERVER_CONFIG,
-#         'ARCHIVING_CONFIG': ARCHIVING_CONFIG,
-#         'SEARCHBACKEND_CONFIG': SEARCH_BACKEND_CONFIG,
-#     }
-
-# @abx.hookimpl
-# def ready():
-#     for config in get_CONFIG().values():
-#         config.validate()
+
+###############################################################################
+# Config value exports for extractors
+# These provide backwards compatibility with extractors that import from ..config
+###############################################################################
+
+def _get_config():
+    """Lazy import to avoid circular imports."""
+    from .common import ARCHIVING_CONFIG, STORAGE_CONFIG
+    return ARCHIVING_CONFIG, STORAGE_CONFIG
+
+# Direct exports (evaluated at import time for backwards compat)
+# These are recalculated each time the module attribute is accessed
+
+def __getattr__(name: str):
+    """Module-level __getattr__ for lazy config loading."""
+    
+    # Timeout settings
+    if name == 'TIMEOUT':
+        cfg, _ = _get_config()
+        return cfg.TIMEOUT
+    if name == 'MEDIA_TIMEOUT':
+        cfg, _ = _get_config()
+        return cfg.MEDIA_TIMEOUT
+    
+    # SSL/Security settings
+    if name == 'CHECK_SSL_VALIDITY':
+        cfg, _ = _get_config()
+        return cfg.CHECK_SSL_VALIDITY
+    
+    # Storage settings  
+    if name == 'RESTRICT_FILE_NAMES':
+        _, storage = _get_config()
+        return storage.RESTRICT_FILE_NAMES
+    
+    # User agent / cookies
+    if name == 'COOKIES_FILE':
+        cfg, _ = _get_config()
+        return cfg.COOKIES_FILE
+    if name == 'USER_AGENT':
+        cfg, _ = _get_config()
+        return cfg.USER_AGENT
+    if name == 'CURL_USER_AGENT':
+        cfg, _ = _get_config()
+        return cfg.USER_AGENT
+    if name == 'WGET_USER_AGENT':
+        cfg, _ = _get_config()
+        return cfg.USER_AGENT
+    if name == 'CHROME_USER_AGENT':
+        cfg, _ = _get_config()
+        return cfg.USER_AGENT
+    
+    # Archive method toggles (SAVE_*)
+    if name == 'SAVE_TITLE':
+        return True
+    if name == 'SAVE_FAVICON':
+        return True
+    if name == 'SAVE_WGET':
+        return True
+    if name == 'SAVE_WARC':
+        return True
+    if name == 'SAVE_WGET_REQUISITES':
+        return True
+    if name == 'SAVE_SINGLEFILE':
+        return True
+    if name == 'SAVE_READABILITY':
+        return True
+    if name == 'SAVE_MERCURY':
+        return True
+    if name == 'SAVE_HTMLTOTEXT':
+        return True
+    if name == 'SAVE_PDF':
+        return True
+    if name == 'SAVE_SCREENSHOT':
+        return True
+    if name == 'SAVE_DOM':
+        return True
+    if name == 'SAVE_HEADERS':
+        return True
+    if name == 'SAVE_GIT':
+        return True
+    if name == 'SAVE_MEDIA':
+        return True
+    if name == 'SAVE_ARCHIVE_DOT_ORG':
+        return True
+    
+    # Extractor-specific settings
+    if name == 'RESOLUTION':
+        cfg, _ = _get_config()
+        return cfg.RESOLUTION
+    if name == 'GIT_DOMAINS':
+        return 'github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht'
+    if name == 'MEDIA_MAX_SIZE':
+        cfg, _ = _get_config()
+        return cfg.MEDIA_MAX_SIZE
+    if name == 'FAVICON_PROVIDER':
+        return 'https://www.google.com/s2/favicons?domain={}'
+    
+    # Binary paths (use shutil.which for detection)
+    if name == 'CURL_BINARY':
+        return shutil.which('curl') or 'curl'
+    if name == 'WGET_BINARY':
+        return shutil.which('wget') or 'wget'
+    if name == 'GIT_BINARY':
+        return shutil.which('git') or 'git'
+    if name == 'YOUTUBEDL_BINARY':
+        return shutil.which('yt-dlp') or shutil.which('youtube-dl') or 'yt-dlp'
+    if name == 'CHROME_BINARY':
+        for chrome in ['chromium', 'chromium-browser', 'google-chrome', 'google-chrome-stable', 'chrome']:
+            path = shutil.which(chrome)
+            if path:
+                return path
+        return 'chromium'
+    if name == 'NODE_BINARY':
+        return shutil.which('node') or 'node'
+    if name == 'SINGLEFILE_BINARY':
+        return shutil.which('single-file') or shutil.which('singlefile') or 'single-file'
+    if name == 'READABILITY_BINARY':
+        return shutil.which('readability-extractor') or 'readability-extractor'
+    if name == 'MERCURY_BINARY':
+        return shutil.which('mercury-parser') or shutil.which('postlight-parser') or 'mercury-parser'
+    
+    # Binary versions (return placeholder, actual version detection happens elsewhere)
+    if name == 'CURL_VERSION':
+        return 'curl'
+    if name == 'WGET_VERSION':
+        return 'wget'
+    if name == 'GIT_VERSION':
+        return 'git'
+    if name == 'YOUTUBEDL_VERSION':
+        return 'yt-dlp'
+    if name == 'CHROME_VERSION':
+        return 'chromium'
+    if name == 'SINGLEFILE_VERSION':
+        return 'singlefile'
+    if name == 'READABILITY_VERSION':
+        return 'readability'
+    if name == 'MERCURY_VERSION':
+        return 'mercury'
+    
+    # Binary arguments
+    if name == 'CURL_ARGS':
+        return ['--silent', '--location', '--compressed']
+    if name == 'WGET_ARGS':
+        return [
+            '--no-verbose',
+            '--adjust-extension',
+            '--convert-links',
+            '--force-directories',
+            '--backup-converted',
+            '--span-hosts',
+            '--no-parent',
+            '-e', 'robots=off',
+        ]
+    if name == 'GIT_ARGS':
+        return ['--recursive']
+    if name == 'YOUTUBEDL_ARGS':
+        cfg, _ = _get_config()
+        return [
+            '--write-description',
+            '--write-info-json',
+            '--write-annotations',
+            '--write-thumbnail',
+            '--no-call-home',
+            '--write-sub',
+            '--write-auto-subs',
+            '--convert-subs=srt',
+            '--yes-playlist',
+            '--continue',
+            '--no-abort-on-error',
+            '--ignore-errors',
+            '--geo-bypass',
+            '--add-metadata',
+            f'--format=(bv*+ba/b)[filesize<={cfg.MEDIA_MAX_SIZE}][filesize_approx<=?{cfg.MEDIA_MAX_SIZE}]/(bv*+ba/b)',
+        ]
+    if name == 'SINGLEFILE_ARGS':
+        return None  # Uses defaults
+    if name == 'CHROME_ARGS':
+        return []
+    
+    # Other settings
+    if name == 'WGET_AUTO_COMPRESSION':
+        return True
+    if name == 'DEPENDENCIES':
+        return {}  # Legacy, not used anymore
+    
+    # Allowlist/Denylist patterns (compiled regexes)
+    if name == 'SAVE_ALLOWLIST_PTN':
+        cfg, _ = _get_config()
+        return cfg.SAVE_ALLOWLIST_PTNS
+    if name == 'SAVE_DENYLIST_PTN':
+        cfg, _ = _get_config()
+        return cfg.SAVE_DENYLIST_PTNS
+    
+    raise AttributeError(f"module 'archivebox.config' has no attribute '{name}'")
+
+
+# Re-export common config classes for direct imports
+def get_CONFIG():
+    """Get all config sections as a dict."""
+    from .common import (
+        SHELL_CONFIG,
+        STORAGE_CONFIG,
+        GENERAL_CONFIG,
+        SERVER_CONFIG,
+        ARCHIVING_CONFIG,
+        SEARCH_BACKEND_CONFIG,
+    )
+    return {
+        'SHELL_CONFIG': SHELL_CONFIG,
+        'STORAGE_CONFIG': STORAGE_CONFIG,
+        'GENERAL_CONFIG': GENERAL_CONFIG,
+        'SERVER_CONFIG': SERVER_CONFIG,
+        'ARCHIVING_CONFIG': ARCHIVING_CONFIG,
+        'SEARCHBACKEND_CONFIG': SEARCH_BACKEND_CONFIG,
+    }

+ 29 - 14
archivebox/config/collection.py

@@ -18,13 +18,8 @@ from archivebox.misc.logging import stderr
 
 def get_real_name(key: str) -> str:
     """get the up-to-date canonical name for a given old alias or current key"""
-    CONFIGS = archivebox.pm.hook.get_CONFIGS()
-    
-    for section in CONFIGS.values():
-        try:
-            return section.aliases[key]
-        except (KeyError, AttributeError):
-            pass
+    # Config aliases are no longer used with the simplified config system
+    # Just return the key as-is since we no longer have a complex alias mapping
     return key
 
 
@@ -117,9 +112,20 @@ def load_config_file() -> Optional[benedict]:
 
 
 def section_for_key(key: str) -> Any:
-    for config_section in archivebox.pm.hook.get_CONFIGS().values():
-        if hasattr(config_section, key):
-            return config_section
+    """Find the config section containing a given key."""
+    from archivebox.config.common import (
+        SHELL_CONFIG,
+        STORAGE_CONFIG,
+        GENERAL_CONFIG,
+        SERVER_CONFIG,
+        ARCHIVING_CONFIG,
+        SEARCH_BACKEND_CONFIG,
+    )
+    
+    for section in [SHELL_CONFIG, STORAGE_CONFIG, GENERAL_CONFIG, 
+                    SERVER_CONFIG, ARCHIVING_CONFIG, SEARCH_BACKEND_CONFIG]:
+        if hasattr(section, key):
+            return section
     raise ValueError(f'No config section found for key: {key}')
 
 
@@ -178,7 +184,8 @@ def write_config_file(config: Dict[str, str]) -> benedict:
     updated_config = {}
     try:
         # validate the updated_config by attempting to re-parse it
-        updated_config = {**load_all_config(), **archivebox.pm.hook.get_FLAT_CONFIG()}
+        from archivebox.config.configset import get_flat_config
+        updated_config = {**load_all_config(), **get_flat_config()}
     except BaseException:                                                       # lgtm [py/catch-base-exception]
         # something went horribly wrong, revert to the previous version
         with open(f'{config_path}.bak', 'r', encoding='utf-8') as old:
@@ -236,12 +243,20 @@ def load_config(defaults: Dict[str, Any],
     return benedict(extended_config)
 
 def load_all_config():
-    import abx
+    """Load all config sections and return as a flat dict."""
+    from archivebox.config.common import (
+        SHELL_CONFIG,
+        STORAGE_CONFIG,
+        GENERAL_CONFIG,
+        SERVER_CONFIG,
+        ARCHIVING_CONFIG,
+        SEARCH_BACKEND_CONFIG,
+    )
     
     flat_config = benedict()
     
-    for config_section in abx.pm.hook.get_CONFIGS().values():
-        config_section.__init__()
+    for config_section in [SHELL_CONFIG, STORAGE_CONFIG, GENERAL_CONFIG, 
+                           SERVER_CONFIG, ARCHIVING_CONFIG, SEARCH_BACKEND_CONFIG]:
         flat_config.update(dict(config_section))
         
     return flat_config

+ 121 - 91
archivebox/config/common.py

@@ -1,4 +1,4 @@
-__package__ = 'archivebox.config'
+__package__ = "archivebox.config"
 
 import re
 import sys
@@ -10,7 +10,7 @@ from rich import print
 from pydantic import Field, field_validator
 from django.utils.crypto import get_random_string
 
-from abx_spec_config.base_configset import BaseConfigSet
+from archivebox.config.configset import BaseConfigSet
 
 from .constants import CONSTANTS
 from .version import get_COMMIT_HASH, get_BUILD_TIME, VERSION
@@ -20,109 +20,127 @@ from .permissions import IN_DOCKER
 
 
 class ShellConfig(BaseConfigSet):
-    DEBUG: bool                         = Field(default=lambda: '--debug' in sys.argv)
-    
-    IS_TTY: bool                        = Field(default=sys.stdout.isatty())
-    USE_COLOR: bool                     = Field(default=lambda c: c.IS_TTY)
-    SHOW_PROGRESS: bool                 = Field(default=lambda c: c.IS_TTY)
-    
-    IN_DOCKER: bool                     = Field(default=IN_DOCKER)
-    IN_QEMU: bool                       = Field(default=False)
+    toml_section_header: str = "SHELL_CONFIG"
 
-    ANSI: Dict[str, str]                = Field(default=lambda c: CONSTANTS.DEFAULT_CLI_COLORS if c.USE_COLOR else CONSTANTS.DISABLED_CLI_COLORS)
+    DEBUG: bool = Field(default="--debug" in sys.argv)
+
+    IS_TTY: bool = Field(default=sys.stdout.isatty())
+    USE_COLOR: bool = Field(default=sys.stdout.isatty())
+    SHOW_PROGRESS: bool = Field(default=sys.stdout.isatty())
+
+    IN_DOCKER: bool = Field(default=IN_DOCKER)
+    IN_QEMU: bool = Field(default=False)
+
+    ANSI: Dict[str, str] = Field(
+        default_factory=lambda: CONSTANTS.DEFAULT_CLI_COLORS if sys.stdout.isatty() else CONSTANTS.DISABLED_CLI_COLORS
+    )
 
     @property
     def TERM_WIDTH(self) -> int:
         if not self.IS_TTY:
             return 200
         return shutil.get_terminal_size((140, 10)).columns
-    
+
     @property
     def COMMIT_HASH(self) -> Optional[str]:
         return get_COMMIT_HASH()
-    
+
     @property
     def BUILD_TIME(self) -> str:
         return get_BUILD_TIME()
- 
+
 
 SHELL_CONFIG = ShellConfig()
 
 
 class StorageConfig(BaseConfigSet):
+    toml_section_header: str = "STORAGE_CONFIG"
+
     # TMP_DIR must be a local, fast, readable/writable dir by archivebox user,
     # must be a short path due to unix path length restrictions for socket files (<100 chars)
     # must be a local SSD/tmpfs for speed and because bind mounts/network mounts/FUSE dont support unix sockets
-    TMP_DIR: Path                       = Field(default=CONSTANTS.DEFAULT_TMP_DIR)
-    
+    TMP_DIR: Path = Field(default=CONSTANTS.DEFAULT_TMP_DIR)
+
     # LIB_DIR must be a local, fast, readable/writable dir by archivebox user,
     # must be able to contain executable binaries (up to 5GB size)
     # should not be a remote/network/FUSE mount for speed reasons, otherwise extractors will be slow
-    LIB_DIR: Path                       = Field(default=CONSTANTS.DEFAULT_LIB_DIR)
-    
-    OUTPUT_PERMISSIONS: str             = Field(default='644')
-    RESTRICT_FILE_NAMES: str            = Field(default='windows')
-    ENFORCE_ATOMIC_WRITES: bool         = Field(default=True)
-    
+    LIB_DIR: Path = Field(default=CONSTANTS.DEFAULT_LIB_DIR)
+
+    OUTPUT_PERMISSIONS: str = Field(default="644")
+    RESTRICT_FILE_NAMES: str = Field(default="windows")
+    ENFORCE_ATOMIC_WRITES: bool = Field(default=True)
+
     # not supposed to be user settable:
-    DIR_OUTPUT_PERMISSIONS: str         = Field(default=lambda c: c['OUTPUT_PERMISSIONS'].replace('6', '7').replace('4', '5'))
+    DIR_OUTPUT_PERMISSIONS: str = Field(default="755")  # computed from OUTPUT_PERMISSIONS
 
 
 STORAGE_CONFIG = StorageConfig()
 
 
 class GeneralConfig(BaseConfigSet):
-    TAG_SEPARATOR_PATTERN: str          = Field(default=r'[,]')
+    toml_section_header: str = "GENERAL_CONFIG"
+
+    TAG_SEPARATOR_PATTERN: str = Field(default=r"[,]")
+
 
 GENERAL_CONFIG = GeneralConfig()
 
 
 class ServerConfig(BaseConfigSet):
-    SECRET_KEY: str                     = Field(default=lambda: get_random_string(50, 'abcdefghijklmnopqrstuvwxyz0123456789_'))
-    BIND_ADDR: str                      = Field(default=lambda: ['127.0.0.1:8000', '0.0.0.0:8000'][SHELL_CONFIG.IN_DOCKER])
-    ALLOWED_HOSTS: str                  = Field(default='*')
-    CSRF_TRUSTED_ORIGINS: str           = Field(default=lambda c: 'http://localhost:8000,http://127.0.0.1:8000,http://0.0.0.0:8000,http://{}'.format(c.BIND_ADDR))
-    
-    SNAPSHOTS_PER_PAGE: int             = Field(default=40)
-    PREVIEW_ORIGINALS: bool             = Field(default=True)
-    FOOTER_INFO: str                    = Field(default='Content is hosted for personal archiving purposes only.  Contact server owner for any takedown requests.')
+    toml_section_header: str = "SERVER_CONFIG"
+
+    SECRET_KEY: str = Field(default_factory=lambda: get_random_string(50, "abcdefghijklmnopqrstuvwxyz0123456789_"))
+    BIND_ADDR: str = Field(default="127.0.0.1:8000")
+    ALLOWED_HOSTS: str = Field(default="*")
+    CSRF_TRUSTED_ORIGINS: str = Field(default="http://localhost:8000,http://127.0.0.1:8000,http://0.0.0.0:8000")
+
+    SNAPSHOTS_PER_PAGE: int = Field(default=40)
+    PREVIEW_ORIGINALS: bool = Field(default=True)
+    FOOTER_INFO: str = Field(
+        default="Content is hosted for personal archiving purposes only.  Contact server owner for any takedown requests."
+    )
     # CUSTOM_TEMPLATES_DIR: Path          = Field(default=None)  # this is now a constant
 
-    PUBLIC_INDEX: bool                  = Field(default=True)
-    PUBLIC_SNAPSHOTS: bool              = Field(default=True)
-    PUBLIC_ADD_VIEW: bool               = Field(default=False)
-    
-    ADMIN_USERNAME: str                 = Field(default=None)
-    ADMIN_PASSWORD: str                 = Field(default=None)
-    
-    REVERSE_PROXY_USER_HEADER: str      = Field(default='Remote-User')
-    REVERSE_PROXY_WHITELIST: str        = Field(default='')
-    LOGOUT_REDIRECT_URL: str            = Field(default='/')
-    
+    PUBLIC_INDEX: bool = Field(default=True)
+    PUBLIC_SNAPSHOTS: bool = Field(default=True)
+    PUBLIC_ADD_VIEW: bool = Field(default=False)
+
+    ADMIN_USERNAME: Optional[str] = Field(default=None)
+    ADMIN_PASSWORD: Optional[str] = Field(default=None)
+
+    REVERSE_PROXY_USER_HEADER: str = Field(default="Remote-User")
+    REVERSE_PROXY_WHITELIST: str = Field(default="")
+    LOGOUT_REDIRECT_URL: str = Field(default="/")
+
+
 SERVER_CONFIG = ServerConfig()
 
 
 class ArchivingConfig(BaseConfigSet):
-    ONLY_NEW: bool                        = Field(default=True)
-    OVERWRITE: bool                       = Field(default=False)
-    
-    TIMEOUT: int                          = Field(default=60)
-    MEDIA_TIMEOUT: int                    = Field(default=3600)
-
-    MEDIA_MAX_SIZE: str                   = Field(default='750m')
-    RESOLUTION: str                       = Field(default='1440,2000')
-    CHECK_SSL_VALIDITY: bool              = Field(default=True)
-    USER_AGENT: str                       = Field(default=f'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)')
-    COOKIES_FILE: Path | None             = Field(default=None)
-    
-    URL_DENYLIST: str                     = Field(default=r'\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$', alias='URL_BLACKLIST')
-    URL_ALLOWLIST: str | None             = Field(default=None, alias='URL_WHITELIST')
-    
-    SAVE_ALLOWLIST: Dict[str, List[str]]  = Field(default={})  # mapping of regex patterns to list of archive methods
-    SAVE_DENYLIST: Dict[str, List[str]]   = Field(default={})
-    
-    DEFAULT_PERSONA: str                  = Field(default='Default')
-    
+    toml_section_header: str = "ARCHIVING_CONFIG"
+
+    ONLY_NEW: bool = Field(default=True)
+    OVERWRITE: bool = Field(default=False)
+
+    TIMEOUT: int = Field(default=60)
+    MEDIA_TIMEOUT: int = Field(default=3600)
+
+    MEDIA_MAX_SIZE: str = Field(default="750m")
+    RESOLUTION: str = Field(default="1440,2000")
+    CHECK_SSL_VALIDITY: bool = Field(default=True)
+    USER_AGENT: str = Field(
+        default=f"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)"
+    )
+    COOKIES_FILE: Path | None = Field(default=None)
+
+    URL_DENYLIST: str = Field(default=r"\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$", alias="URL_BLACKLIST")
+    URL_ALLOWLIST: str | None = Field(default=None, alias="URL_WHITELIST")
+
+    SAVE_ALLOWLIST: Dict[str, List[str]] = Field(default={})  # mapping of regex patterns to list of archive methods
+    SAVE_DENYLIST: Dict[str, List[str]] = Field(default={})
+
+    DEFAULT_PERSONA: str = Field(default="Default")
+
     # GIT_DOMAINS: str                    = Field(default='github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht')
     # WGET_USER_AGENT: str                = Field(default=lambda c: c['USER_AGENT'] + ' wget/{WGET_VERSION}')
     # CURL_USER_AGENT: str                = Field(default=lambda c: c['USER_AGENT'] + ' curl/{CURL_VERSION}')
@@ -134,58 +152,70 @@ class ArchivingConfig(BaseConfigSet):
 
     def validate(self):
         if int(self.TIMEOUT) < 5:
-            print(f'[red][!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={self.TIMEOUT} seconds)[/red]', file=sys.stderr)
-            print('    You must allow *at least* 5 seconds for indexing and archive methods to run succesfully.', file=sys.stderr)
-            print('    (Setting it to somewhere between 30 and 3000 seconds is recommended)', file=sys.stderr)
+            print(f"[red][!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={self.TIMEOUT} seconds)[/red]", file=sys.stderr)
+            print("    You must allow *at least* 5 seconds for indexing and archive methods to run succesfully.", file=sys.stderr)
+            print("    (Setting it to somewhere between 30 and 3000 seconds is recommended)", file=sys.stderr)
             print(file=sys.stderr)
-            print('    If you want to make ArchiveBox run faster, disable specific archive methods instead:', file=sys.stderr)
-            print('        https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles', file=sys.stderr)
+            print("    If you want to make ArchiveBox run faster, disable specific archive methods instead:", file=sys.stderr)
+            print("        https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles", file=sys.stderr)
             print(file=sys.stderr)
-    
-    @field_validator('CHECK_SSL_VALIDITY', mode='after')
+
+    @field_validator("CHECK_SSL_VALIDITY", mode="after")
     def validate_check_ssl_validity(cls, v):
         """SIDE EFFECT: disable "you really shouldnt disable ssl" warnings emitted by requests"""
         if not v:
             import requests
             import urllib3
+
             requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)
             urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
         return v
-    
+
     @property
     def URL_ALLOWLIST_PTN(self) -> re.Pattern | None:
         return re.compile(self.URL_ALLOWLIST, CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS) if self.URL_ALLOWLIST else None
-    
+
     @property
     def URL_DENYLIST_PTN(self) -> re.Pattern:
         return re.compile(self.URL_DENYLIST, CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS)
-    
+
     @property
     def SAVE_ALLOWLIST_PTNS(self) -> Dict[re.Pattern, List[str]]:
-        return {
-            # regexp: methods list
-            re.compile(key, CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS): val
-            for key, val in self.SAVE_ALLOWLIST.items()
-        } if self.SAVE_ALLOWLIST else {}
-    
+        return (
+            {
+                # regexp: methods list
+                re.compile(key, CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS): val
+                for key, val in self.SAVE_ALLOWLIST.items()
+            }
+            if self.SAVE_ALLOWLIST
+            else {}
+        )
+
     @property
     def SAVE_DENYLIST_PTNS(self) -> Dict[re.Pattern, List[str]]:
-        return {
-            # regexp: methods list
-            re.compile(key, CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS): val
-            for key, val in self.SAVE_DENYLIST.items()
-        } if self.SAVE_DENYLIST else {}
+        return (
+            {
+                # regexp: methods list
+                re.compile(key, CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS): val
+                for key, val in self.SAVE_DENYLIST.items()
+            }
+            if self.SAVE_DENYLIST
+            else {}
+        )
+
 
 ARCHIVING_CONFIG = ArchivingConfig()
 
 
 class SearchBackendConfig(BaseConfigSet):
-    USE_INDEXING_BACKEND: bool          = Field(default=True)
-    USE_SEARCHING_BACKEND: bool         = Field(default=True)
-    
-    SEARCH_BACKEND_ENGINE: str          = Field(default='ripgrep')
-    SEARCH_PROCESS_HTML: bool           = Field(default=True)
-    SEARCH_BACKEND_TIMEOUT: int         = Field(default=10)
+    toml_section_header: str = "SEARCH_BACKEND_CONFIG"
 
-SEARCH_BACKEND_CONFIG = SearchBackendConfig()
+    USE_INDEXING_BACKEND: bool = Field(default=True)
+    USE_SEARCHING_BACKEND: bool = Field(default=True)
+
+    SEARCH_BACKEND_ENGINE: str = Field(default="ripgrep")
+    SEARCH_PROCESS_HTML: bool = Field(default=True)
+    SEARCH_BACKEND_TIMEOUT: int = Field(default=10)
 
+
+SEARCH_BACKEND_CONFIG = SearchBackendConfig()

+ 266 - 0
archivebox/config/configset.py

@@ -0,0 +1,266 @@
+"""
+Simplified config system for ArchiveBox.
+
+This replaces the complex abx_spec_config/base_configset.py with a simpler
+approach that still supports environment variables, config files, and
+per-object overrides.
+"""
+
+__package__ = "archivebox.config"
+
+import os
+import json
+from pathlib import Path
+from typing import Any, Dict, Optional, List, Type, TYPE_CHECKING, cast
+from configparser import ConfigParser
+
+from pydantic import Field
+from pydantic_settings import BaseSettings
+
+
+class BaseConfigSet(BaseSettings):
+    """
+    Base class for config sections.
+
+    Automatically loads values from:
+    1. Environment variables (highest priority)
+    2. ArchiveBox.conf file (if exists)
+    3. Default values (lowest priority)
+
+    Subclasses define fields with defaults and types:
+
+        class ShellConfig(BaseConfigSet):
+            DEBUG: bool = Field(default=False)
+            USE_COLOR: bool = Field(default=True)
+    """
+
+    class Config:
+        # Use env vars with ARCHIVEBOX_ prefix or raw name
+        env_prefix = ""
+        extra = "ignore"
+        validate_default = True
+
+    @classmethod
+    def load_from_file(cls, config_path: Path) -> Dict[str, str]:
+        """Load config values from INI file."""
+        if not config_path.exists():
+            return {}
+
+        parser = ConfigParser()
+        parser.optionxform = lambda x: x  # type: ignore  # preserve case
+        parser.read(config_path)
+
+        # Flatten all sections into single namespace
+        return {key.upper(): value for section in parser.sections() for key, value in parser.items(section)}
+
+    def update_in_place(self, warn: bool = True, persist: bool = False, **kwargs) -> None:
+        """
+        Update config values in place.
+
+        This allows runtime updates to config without reloading.
+        """
+        for key, value in kwargs.items():
+            if hasattr(self, key):
+                # Use object.__setattr__ to bypass pydantic's frozen model
+                object.__setattr__(self, key, value)
+
+
+def get_config(
+    scope: str = "global",
+    defaults: Optional[Dict] = None,
+    user: Any = None,
+    crawl: Any = None,
+    snapshot: Any = None,
+) -> Dict[str, Any]:
+    """
+    Get merged config from all sources.
+
+    Priority (highest to lowest):
+    1. Per-snapshot config (snapshot.config JSON field)
+    2. Per-crawl config (crawl.config JSON field)
+    3. Per-user config (user.config JSON field)
+    4. Environment variables
+    5. Config file (ArchiveBox.conf)
+    6. Plugin schema defaults (config.json)
+    7. Core config defaults
+
+    Args:
+        scope: Config scope ('global', 'crawl', 'snapshot', etc.)
+        defaults: Default values to start with
+        user: User object with config JSON field
+        crawl: Crawl object with config JSON field
+        snapshot: Snapshot object with config JSON field
+
+    Returns:
+        Merged config dict
+    """
+    from archivebox.config.constants import CONSTANTS
+    from archivebox.config.common import (
+        SHELL_CONFIG,
+        STORAGE_CONFIG,
+        GENERAL_CONFIG,
+        SERVER_CONFIG,
+        ARCHIVING_CONFIG,
+        SEARCH_BACKEND_CONFIG,
+    )
+
+    # Start with defaults
+    config = dict(defaults or {})
+
+    # Add plugin config defaults from JSONSchema config.json files
+    try:
+        from archivebox.hooks import get_config_defaults_from_plugins
+        plugin_defaults = get_config_defaults_from_plugins()
+        config.update(plugin_defaults)
+    except ImportError:
+        pass  # hooks not available yet during early startup
+
+    # Add all core config sections
+    config.update(dict(SHELL_CONFIG))
+    config.update(dict(STORAGE_CONFIG))
+    config.update(dict(GENERAL_CONFIG))
+    config.update(dict(SERVER_CONFIG))
+    config.update(dict(ARCHIVING_CONFIG))
+    config.update(dict(SEARCH_BACKEND_CONFIG))
+
+    # Load from config file
+    config_file = CONSTANTS.CONFIG_FILE
+    if config_file.exists():
+        file_config = BaseConfigSet.load_from_file(config_file)
+        config.update(file_config)
+
+    # Override with environment variables
+    for key in config:
+        env_val = os.environ.get(key)
+        if env_val is not None:
+            config[key] = _parse_env_value(env_val, config.get(key))
+
+    # Also check plugin config aliases in environment
+    try:
+        from archivebox.hooks import discover_plugin_configs
+        plugin_configs = discover_plugin_configs()
+        for plugin_name, schema in plugin_configs.items():
+            for key, prop_schema in schema.get('properties', {}).items():
+                # Check x-aliases
+                for alias in prop_schema.get('x-aliases', []):
+                    if alias in os.environ and key not in os.environ:
+                        config[key] = _parse_env_value(os.environ[alias], config.get(key))
+                        break
+                # Check x-fallback
+                fallback = prop_schema.get('x-fallback')
+                if fallback and fallback in config and key not in config:
+                    config[key] = config[fallback]
+    except ImportError:
+        pass
+
+    # Apply user config overrides
+    if user and hasattr(user, "config") and user.config:
+        config.update(user.config)
+
+    # Apply crawl config overrides
+    if crawl and hasattr(crawl, "config") and crawl.config:
+        config.update(crawl.config)
+
+    # Apply snapshot config overrides (highest priority)
+    if snapshot and hasattr(snapshot, "config") and snapshot.config:
+        config.update(snapshot.config)
+
+    return config
+
+
+def get_flat_config() -> Dict[str, Any]:
+    """
+    Get a flat dictionary of all config values.
+
+    Replaces abx.pm.hook.get_FLAT_CONFIG()
+    """
+    return get_config(scope="global")
+
+
+def get_all_configs() -> Dict[str, BaseConfigSet]:
+    """
+    Get all config section objects as a dictionary.
+
+    Replaces abx.pm.hook.get_CONFIGS()
+    """
+    from archivebox.config.common import (
+        SHELL_CONFIG, SERVER_CONFIG, ARCHIVING_CONFIG, SEARCH_BACKEND_CONFIG
+    )
+    return {
+        'SHELL_CONFIG': SHELL_CONFIG,
+        'SERVER_CONFIG': SERVER_CONFIG,
+        'ARCHIVING_CONFIG': ARCHIVING_CONFIG,
+        'SEARCH_BACKEND_CONFIG': SEARCH_BACKEND_CONFIG,
+    }
+
+
+def _parse_env_value(value: str, default: Any = None) -> Any:
+    """Parse an environment variable value based on expected type."""
+    if default is None:
+        # Try to guess the type
+        if value.lower() in ("true", "false", "yes", "no", "1", "0"):
+            return value.lower() in ("true", "yes", "1")
+        try:
+            return int(value)
+        except ValueError:
+            pass
+        try:
+            return json.loads(value)
+        except (json.JSONDecodeError, ValueError):
+            pass
+        return value
+
+    # Parse based on default's type
+    if isinstance(default, bool):
+        return value.lower() in ("true", "yes", "1")
+    elif isinstance(default, int):
+        return int(value)
+    elif isinstance(default, float):
+        return float(value)
+    elif isinstance(default, (list, dict)):
+        return json.loads(value)
+    elif isinstance(default, Path):
+        return Path(value)
+    else:
+        return value
+
+
+# Default worker concurrency settings
+DEFAULT_WORKER_CONCURRENCY = {
+    "crawl": 2,
+    "snapshot": 3,
+    "wget": 2,
+    "ytdlp": 2,
+    "screenshot": 3,
+    "singlefile": 2,
+    "title": 5,
+    "favicon": 5,
+    "headers": 5,
+    "archive_org": 2,
+    "readability": 3,
+    "mercury": 3,
+    "git": 2,
+    "pdf": 2,
+    "dom": 3,
+}
+
+
+def get_worker_concurrency() -> Dict[str, int]:
+    """
+    Get worker concurrency settings.
+
+    Can be configured via WORKER_CONCURRENCY env var as JSON dict.
+    """
+    config = get_config()
+
+    # Start with defaults
+    concurrency = DEFAULT_WORKER_CONCURRENCY.copy()
+
+    # Override with config
+    if "WORKER_CONCURRENCY" in config:
+        custom = config["WORKER_CONCURRENCY"]
+        if isinstance(custom, str):
+            custom = json.loads(custom)
+        concurrency.update(custom)
+
+    return concurrency

+ 173 - 179
archivebox/config/views.py

@@ -1,6 +1,7 @@
-__package__ = 'abx.archivebox'
+__package__ = 'archivebox.config'
 
 import os
+import shutil
 import inspect
 from pathlib import Path
 from typing import Any, List, Dict, cast
@@ -13,14 +14,22 @@ from django.utils.html import format_html, mark_safe
 from admin_data_views.typing import TableContext, ItemContext
 from admin_data_views.utils import render_with_table_view, render_with_item_view, ItemLink
 
-import abx
-import archivebox
 from archivebox.config import CONSTANTS
 from archivebox.misc.util import parse_date
 
 from machine.models import InstalledBinary
 
 
+# Common binaries to check for
+KNOWN_BINARIES = [
+    'wget', 'curl', 'chromium', 'chrome', 'google-chrome', 'google-chrome-stable',
+    'node', 'npm', 'npx', 'yt-dlp', 'ytdlp', 'youtube-dl',
+    'git', 'singlefile', 'readability-extractor', 'mercury-parser',
+    'python3', 'python', 'bash', 'zsh',
+    'ffmpeg', 'ripgrep', 'rg', 'sonic', 'archivebox',
+]
+
+
 def obj_to_yaml(obj: Any, indent: int=0) -> str:
     indent_str = "  " * indent
     if indent == 0:
@@ -62,65 +71,92 @@ def obj_to_yaml(obj: Any, indent: int=0) -> str:
     else:
         return f" {str(obj)}"
 
+
+def get_detected_binaries() -> Dict[str, Dict[str, Any]]:
+    """Detect available binaries using shutil.which."""
+    binaries = {}
+    
+    for name in KNOWN_BINARIES:
+        path = shutil.which(name)
+        if path:
+            binaries[name] = {
+                'name': name,
+                'abspath': path,
+                'version': None,  # Could add version detection later
+                'is_available': True,
+            }
+    
+    return binaries
+
+
+def get_filesystem_plugins() -> Dict[str, Dict[str, Any]]:
+    """Discover plugins from filesystem directories."""
+    from archivebox.hooks import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR
+    
+    plugins = {}
+    
+    for base_dir, source in [(BUILTIN_PLUGINS_DIR, 'builtin'), (USER_PLUGINS_DIR, 'user')]:
+        if not base_dir.exists():
+            continue
+        
+        for plugin_dir in base_dir.iterdir():
+            if plugin_dir.is_dir() and not plugin_dir.name.startswith('_'):
+                plugin_id = f'{source}.{plugin_dir.name}'
+                
+                # Find hook scripts
+                hooks = []
+                for ext in ('sh', 'py', 'js'):
+                    hooks.extend(plugin_dir.glob(f'on_*__*.{ext}'))
+                
+                plugins[plugin_id] = {
+                    'id': plugin_id,
+                    'name': plugin_dir.name,
+                    'path': str(plugin_dir),
+                    'source': source,
+                    'hooks': [str(h.name) for h in hooks],
+                }
+    
+    return plugins
+
+
 @render_with_table_view
 def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
-    FLAT_CONFIG = archivebox.pm.hook.get_FLAT_CONFIG()
     assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
 
     rows = {
         "Binary Name": [],
         "Found Version": [],
-        "From Plugin": [],
         "Provided By": [],
         "Found Abspath": [],
-        "Related Configuration": [],
-        # "Overrides": [],
-        # "Description": [],
-    }
-
-    relevant_configs = {
-        key: val
-        for key, val in FLAT_CONFIG.items()
-        if '_BINARY' in key or '_VERSION' in key
     }
 
-    for plugin_id, plugin in abx.get_all_plugins().items():
-        plugin = benedict(plugin)
-        if not hasattr(plugin.plugin, 'get_BINARIES'):
-            continue
+    # Get binaries from database (previously detected/installed)
+    db_binaries = {b.name: b for b in InstalledBinary.objects.all()}
+    
+    # Get currently detectable binaries  
+    detected = get_detected_binaries()
+    
+    # Merge and display
+    all_binary_names = sorted(set(list(db_binaries.keys()) + list(detected.keys())))
+    
+    for name in all_binary_names:
+        db_binary = db_binaries.get(name)
+        detected_binary = detected.get(name)
         
-        for binary in plugin.plugin.get_BINARIES().values():
-            try:
-                installed_binary = InstalledBinary.objects.get_from_db_or_cache(binary)
-                binary = installed_binary.load_from_db()
-            except Exception as e:
-                print(e)
-
-            rows['Binary Name'].append(ItemLink(binary.name, key=binary.name))
-            rows['Found Version'].append(f'✅ {binary.loaded_version}' if binary.loaded_version else '❌ missing')
-            rows['From Plugin'].append(plugin.package)
-            rows['Provided By'].append(
-                ', '.join(
-                    f'[{binprovider.name}]' if binprovider.name == getattr(binary.loaded_binprovider, 'name', None) else binprovider.name
-                    for binprovider in binary.binproviders_supported
-                    if binprovider
-                )
-                # binary.loaded_binprovider.name
-                # if binary.loaded_binprovider else
-                # ', '.join(getattr(provider, 'name', str(provider)) for provider in binary.binproviders_supported)
-            )
-            rows['Found Abspath'].append(str(binary.loaded_abspath or '❌ missing'))
-            rows['Related Configuration'].append(mark_safe(', '.join(
-                f'<a href="/admin/environment/config/{config_key}/">{config_key}</a>'
-                for config_key, config_value in relevant_configs.items()
-                    if str(binary.name).lower().replace('-', '').replace('_', '').replace('ytdlp', 'youtubedl') in config_key.lower()
-                    or config_value.lower().endswith(binary.name.lower())
-                    # or binary.name.lower().replace('-', '').replace('_', '') in str(config_value).lower()
-            )))
-            # if not binary.overrides:
-                # import ipdb; ipdb.set_trace()
-            # rows['Overrides'].append(str(obj_to_yaml(binary.overrides) or str(binary.overrides))[:200])
-            # rows['Description'].append(binary.description)
+        rows['Binary Name'].append(ItemLink(name, key=name))
+        
+        if db_binary:
+            rows['Found Version'].append(f'✅ {db_binary.version}' if db_binary.version else '✅ found')
+            rows['Provided By'].append(db_binary.binprovider or 'PATH')
+            rows['Found Abspath'].append(str(db_binary.abspath or ''))
+        elif detected_binary:
+            rows['Found Version'].append('✅ found')
+            rows['Provided By'].append('PATH')
+            rows['Found Abspath'].append(detected_binary['abspath'])
+        else:
+            rows['Found Version'].append('❌ missing')
+            rows['Provided By'].append('-')
+            rows['Found Abspath'].append('-')
 
     return TableContext(
         title="Binaries",
@@ -132,43 +168,65 @@ def binary_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
 
     assert request.user and request.user.is_superuser, 'Must be a superuser to view configuration settings.'
 
-    binary = None
-    plugin = None
-    for plugin_id, plugin in abx.get_all_plugins().items():
-        try:
-            for loaded_binary in plugin['hooks'].get_BINARIES().values():
-                if loaded_binary.name == key:
-                    binary = loaded_binary
-                    plugin = plugin
-                    # break  # last write wins
-        except Exception as e:
-            print(e)
-
-    assert plugin and binary, f'Could not find a binary matching the specified name: {key}'
-
+    # Try database first
     try:
-        binary = binary.load()
-    except Exception as e:
-        print(e)
-
+        binary = InstalledBinary.objects.get(name=key)
+        return ItemContext(
+            slug=key,
+            title=key,
+            data=[
+                {
+                    "name": binary.name,
+                    "description": str(binary.abspath or ''),
+                    "fields": {
+                        'name': binary.name,
+                        'binprovider': binary.binprovider,
+                        'abspath': str(binary.abspath),
+                        'version': binary.version,
+                        'sha256': binary.sha256,
+                    },
+                    "help_texts": {},
+                },
+            ],
+        )
+    except InstalledBinary.DoesNotExist:
+        pass
+    
+    # Try to detect from PATH
+    path = shutil.which(key)
+    if path:
+        return ItemContext(
+            slug=key,
+            title=key,
+            data=[
+                {
+                    "name": key,
+                    "description": path,
+                    "fields": {
+                        'name': key,
+                        'binprovider': 'PATH',
+                        'abspath': path,
+                        'version': 'unknown',
+                    },
+                    "help_texts": {},
+                },
+            ],
+        )
+    
     return ItemContext(
         slug=key,
         title=key,
         data=[
             {
-                "name": binary.name,
-                "description": binary.abspath,
+                "name": key,
+                "description": "Binary not found",
                 "fields": {
-                    'plugin': plugin['package'],
-                    'binprovider': binary.loaded_binprovider,
-                    'abspath': binary.loaded_abspath,
-                    'version': binary.loaded_version,
-                    'overrides': obj_to_yaml(binary.overrides),
-                    'providers': obj_to_yaml(binary.binproviders_supported),
-                },
-                "help_texts": {
-                    # TODO
+                    'name': key,
+                    'binprovider': 'not installed',
+                    'abspath': 'not found',
+                    'version': 'N/A',
                 },
+                "help_texts": {},
             },
         ],
     )
@@ -180,66 +238,26 @@ def plugins_list_view(request: HttpRequest, **kwargs) -> TableContext:
     assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
 
     rows = {
-        "Label": [],
-        "Version": [],
-        "Author": [],
-        "Package": [],
-        "Source Code": [],
-        "Config": [],
-        "Binaries": [],
-        "Package Managers": [],
-        # "Search Backends": [],
+        "Name": [],
+        "Source": [],
+        "Path": [],
+        "Hooks": [],
     }
 
-    config_colors = {
-        '_BINARY': '#339',
-        'USE_': 'green',
-        'SAVE_': 'green',
-        '_ARGS': '#33e',
-        'KEY': 'red',
-        'COOKIES': 'red',
-        'AUTH': 'red',
-        'SECRET': 'red',
-        'TOKEN': 'red',
-        'PASSWORD': 'red',
-        'TIMEOUT': '#533',
-        'RETRIES': '#533',
-        'MAX': '#533',
-        'MIN': '#533',
-    }
-    def get_color(key):
-        for pattern, color in config_colors.items():
-            if pattern in key:
-                return color
-        return 'black'
-
-    for plugin_id, plugin in abx.get_all_plugins().items():
-        plugin.hooks.get_BINPROVIDERS = getattr(plugin.plugin, 'get_BINPROVIDERS', lambda: {})
-        plugin.hooks.get_BINARIES = getattr(plugin.plugin, 'get_BINARIES', lambda: {})
-        plugin.hooks.get_CONFIG = getattr(plugin.plugin, 'get_CONFIG', lambda: {})
-        
-        rows['Label'].append(ItemLink(plugin.label, key=plugin.package))
-        rows['Version'].append(str(plugin.version))
-        rows['Author'].append(mark_safe(f'<a href="{plugin.homepage}" target="_blank">{plugin.author}</a>'))
-        rows['Package'].append(ItemLink(plugin.package, key=plugin.package))
-        rows['Source Code'].append(format_html('<code>{}</code>', str(plugin.source_code).replace(str(Path('~').expanduser()), '~')))
-        rows['Config'].append(mark_safe(''.join(
-            f'<a href="/admin/environment/config/{key}/"><b><code style="color: {get_color(key)};">{key}</code></b>=<code>{value}</code></a><br/>'
-            for configdict in plugin.hooks.get_CONFIG().values()
-                for key, value in benedict(configdict).items()
-        )))
-        rows['Binaries'].append(mark_safe(', '.join(
-            f'<a href="/admin/environment/binaries/{binary.name}/"><code>{binary.name}</code></a>'
-            for binary in plugin.hooks.get_BINARIES().values()
-        )))
-        rows['Package Managers'].append(mark_safe(', '.join(
-            f'<a href="/admin/environment/binproviders/{binprovider.name}/"><code>{binprovider.name}</code></a>'
-            for binprovider in plugin.hooks.get_BINPROVIDERS().values()
-        )))
-        # rows['Search Backends'].append(mark_safe(', '.join(
-        #     f'<a href="/admin/environment/searchbackends/{searchbackend.name}/"><code>{searchbackend.name}</code></a>'
-        #     for searchbackend in plugin.SEARCHBACKENDS.values()
-        # )))
+    plugins = get_filesystem_plugins()
+
+    for plugin_id, plugin in plugins.items():
+        rows['Name'].append(ItemLink(plugin['name'], key=plugin_id))
+        rows['Source'].append(plugin['source'])
+        rows['Path'].append(format_html('<code>{}</code>', plugin['path']))
+        rows['Hooks'].append(', '.join(plugin['hooks']) or '(none)')
+
+    if not plugins:
+        # Show a helpful message when no plugins found
+        rows['Name'].append('(no plugins found)')
+        rows['Source'].append('-')
+        rows['Path'].append(format_html('<code>archivebox/plugins/</code> or <code>data/plugins/</code>'))
+        rows['Hooks'].append('-')
 
     return TableContext(
         title="Installed plugins",
@@ -251,39 +269,31 @@ def plugin_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
 
     assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
 
-    plugins = abx.get_all_plugins()
-
-    plugin_id = None
-    for check_plugin_id, loaded_plugin in plugins.items():
-        if check_plugin_id.split('.')[-1] == key.split('.')[-1]:
-            plugin_id = check_plugin_id
-            break
-
-    assert plugin_id, f'Could not find a plugin matching the specified name: {key}'
-
-    plugin = abx.get_plugin(plugin_id)
+    plugins = get_filesystem_plugins()
+    
+    plugin = plugins.get(key)
+    if not plugin:
+        return ItemContext(
+            slug=key,
+            title=f'Plugin not found: {key}',
+            data=[],
+        )
 
     return ItemContext(
         slug=key,
-        title=key,
+        title=plugin['name'],
         data=[
             {
-                "name": plugin.package,
-                "description": plugin.label,
+                "name": plugin['name'],
+                "description": plugin['path'],
                 "fields": {
-                    "id": plugin.id,
-                    "package": plugin.package,
-                    "label": plugin.label,
-                    "version": plugin.version,
-                    "author": plugin.author,
-                    "homepage": plugin.homepage,
-                    "dependencies": getattr(plugin, 'DEPENDENCIES', []),
-                    "source_code": plugin.source_code,
-                    "hooks": plugin.hooks,
-                },
-                "help_texts": {
-                    # TODO
+                    "id": plugin['id'],
+                    "name": plugin['name'],
+                    "source": plugin['source'],
+                    "path": plugin['path'],
+                    "hooks": plugin['hooks'],
                 },
+                "help_texts": {},
             },
         ],
     )
@@ -333,22 +343,6 @@ def worker_list_view(request: HttpRequest, **kwargs) -> TableContext:
     # Add a row for each worker process managed by supervisord
     for proc in cast(List[Dict[str, Any]], supervisor.getAllProcessInfo()):
         proc = benedict(proc)
-        # {
-        #     "name": "daphne",
-        #     "group": "daphne",
-        #     "start": 1725933056,
-        #     "stop": 0,
-        #     "now": 1725933438,
-        #     "state": 20,
-        #     "statename": "RUNNING",
-        #     "spawnerr": "",
-        #     "exitstatus": 0,
-        #     "logfile": "logs/server.log",
-        #     "stdout_logfile": "logs/server.log",
-        #     "stderr_logfile": "",
-        #     "pid": 33283,
-        #     "description": "pid 33283, uptime 0:06:22",
-        # }
         rows["Name"].append(ItemLink(proc.name, key=proc.name))
         rows["State"].append(proc.statename)
         rows['PID'].append(proc.description.replace('pid ', ''))

+ 3 - 7
archivebox/core/__init__.py

@@ -1,16 +1,13 @@
 __package__ = 'archivebox.core'
 __order__ = 100
-import abx
 
-@abx.hookimpl
+
 def register_admin(admin_site):
     """Register the core.models views (Snapshot, ArchiveResult, Tag, etc.) with the admin site"""
-    from core.admin import register_admin
-    register_admin(admin_site)
-
+    from core.admin import register_admin as do_register
+    do_register(admin_site)
 
 
[email protected]
 def get_CONFIG():
     from archivebox.config.common import (
         SHELL_CONFIG,
@@ -28,4 +25,3 @@ def get_CONFIG():
         'ARCHIVING_CONFIG': ARCHIVING_CONFIG,
         'SEARCHBACKEND_CONFIG': SEARCH_BACKEND_CONFIG,
     }
-

+ 0 - 3
archivebox/core/admin.py

@@ -9,10 +9,7 @@ from core.admin_snapshots import SnapshotAdmin
 from core.admin_archiveresults import ArchiveResultAdmin
 from core.admin_users import UserAdmin
 
-import abx
 
-
[email protected]
 def register_admin(admin_site):
     admin_site.register(get_user_model(), UserAdmin)
     admin_site.register(ArchiveResult, ArchiveResultAdmin)

+ 1 - 5
archivebox/core/admin_archiveresults.py

@@ -11,8 +11,6 @@ from django.utils import timezone
 
 from huey_monitor.admin import TaskModel
 
-import abx
-
 from archivebox.config import DATA_DIR
 from archivebox.config.common import SERVER_CONFIG
 from archivebox.misc.paginators import AccelleratedPaginator
@@ -43,7 +41,6 @@ class ArchiveResultInline(admin.TabularInline):
     ordering = ('end_ts',)
     show_change_link = True
     # # classes = ['collapse']
-    # # list_display_links = ['abid']
 
     def get_parent_object_from_request(self, request):
         resolved = resolve(request.path_info)
@@ -80,7 +77,7 @@ class ArchiveResultInline(admin.TabularInline):
         formset.form.base_fields['start_ts'].initial = timezone.now()
         formset.form.base_fields['end_ts'].initial = timezone.now()
         formset.form.base_fields['cmd_version'].initial = '-'
-        formset.form.base_fields['pwd'].initial = str(snapshot.link_dir)
+        formset.form.base_fields['pwd'].initial = str(snapshot.output_dir)
         formset.form.base_fields['created_by'].initial = request.user
         formset.form.base_fields['cmd'].initial = '["-"]'
         formset.form.base_fields['output'].initial = 'Manually recorded cmd output...'
@@ -193,6 +190,5 @@ class ArchiveResultAdmin(BaseModelAdmin):
 
 
 
[email protected]
 def register_admin(admin_site):
     admin_site.register(ArchiveResult, ArchiveResultAdmin)

+ 2 - 2
archivebox/core/admin_site.py

@@ -36,7 +36,7 @@ def register_admin_site():
     admin.site = archivebox_admin
     sites.site = archivebox_admin
     
-    # register all plugins admin classes
-    archivebox.pm.hook.register_admin(admin_site=archivebox_admin)
+    # Plugin admin registration is now handled by individual app admins
+    # No longer using archivebox.pm.hook.register_admin()
     
     return archivebox_admin

+ 22 - 27
archivebox/core/admin_snapshots.py

@@ -19,11 +19,9 @@ from archivebox.misc.util import htmldecode, urldecode
 from archivebox.misc.paginators import AccelleratedPaginator
 from archivebox.misc.logging_util import printable_filesize
 from archivebox.search.admin import SearchResultsAdminMixin
-from archivebox.index.html import snapshot_icons
-from archivebox.extractors import archive_links
 
-from archivebox.base_models.admin import BaseModelAdmin
-from archivebox.workers.tasks import bg_archive_links, bg_add
+from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin
+from archivebox.workers.tasks import bg_archive_snapshots, bg_add
 
 from core.models import Tag
 from core.admin_tags import TagInline
@@ -53,13 +51,13 @@ class SnapshotActionForm(ActionForm):
     # )
 
 
-class SnapshotAdmin(SearchResultsAdminMixin, BaseModelAdmin):
+class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
     list_display = ('created_at', 'title_str', 'status', 'files', 'size', 'url_str')
     sort_fields = ('title_str', 'url_str', 'created_at', 'status', 'crawl')
-    readonly_fields = ('admin_actions', 'status_info', 'tags_str', 'imported_timestamp', 'created_at', 'modified_at', 'downloaded_at', 'link_dir')
+    readonly_fields = ('admin_actions', 'status_info', 'tags_str', 'imported_timestamp', 'created_at', 'modified_at', 'downloaded_at', 'link_dir', 'available_config_options')
     search_fields = ('id', 'url', 'timestamp', 'title', 'tags__name')
     list_filter = ('created_at', 'downloaded_at', 'archiveresult__status', 'created_by', 'tags__name')
-    fields = ('url', 'title', 'created_by', 'bookmarked_at', 'status', 'retry_at', 'crawl', *readonly_fields)
+    fields = ('url', 'title', 'created_by', 'bookmarked_at', 'status', 'retry_at', 'crawl', 'config', 'available_config_options', *readonly_fields[:-1])
     ordering = ['-created_at']
     actions = ['add_tags', 'remove_tags', 'update_titles', 'update_snapshots', 'resnapshot_snapshot', 'overwrite_snapshots', 'delete_snapshots']
     inlines = [TagInline, ArchiveResultInline]
@@ -196,14 +194,14 @@ class SnapshotAdmin(SearchResultsAdminMixin, BaseModelAdmin):
     )
     def files(self, obj):
         # return '-'
-        return snapshot_icons(obj)
+        return obj.icons()
 
 
     @admin.display(
         # ordering='archiveresult_count'
     )
     def size(self, obj):
-        archive_size = os.access(Path(obj.link_dir) / 'index.html', os.F_OK) and obj.archive_size
+        archive_size = os.access(Path(obj.output_dir) / 'index.html', os.F_OK) and obj.archive_size
         if archive_size:
             size_txt = printable_filesize(archive_size)
             if archive_size > 52428800:
@@ -261,30 +259,27 @@ class SnapshotAdmin(SearchResultsAdminMixin, BaseModelAdmin):
         description="ℹ️ Get Title"
     )
     def update_titles(self, request, queryset):
-        links = [snapshot.as_link() for snapshot in queryset]
-        if len(links) < 3:
-            # run syncronously if there are only 1 or 2 links
-            archive_links(links, overwrite=True, methods=('title','favicon'), out_dir=DATA_DIR)
-            messages.success(request, f"Title and favicon have been fetched and saved for {len(links)} URLs.")
-        else:
-            # otherwise run in a background worker
-            result = bg_archive_links((links,), kwargs={"overwrite": True, "methods": ["title", "favicon"], "out_dir": DATA_DIR})
-            messages.success(
-                request,
-                mark_safe(f"Title and favicon are updating in the background for {len(links)} URLs. {result_url(result)}"),
-            )
+        from core.models import Snapshot
+        count = queryset.count()
+
+        # Queue snapshots for archiving via the state machine system
+        result = bg_archive_snapshots(queryset, kwargs={"overwrite": True, "methods": ["title", "favicon"], "out_dir": DATA_DIR})
+        messages.success(
+            request,
+            mark_safe(f"Title and favicon are updating in the background for {count} URLs. {result_url(result)}"),
+        )
 
     @admin.action(
         description="⬇️ Get Missing"
     )
     def update_snapshots(self, request, queryset):
-        links = [snapshot.as_link() for snapshot in queryset]
+        count = queryset.count()
 
-        result = bg_archive_links((links,), kwargs={"overwrite": False, "out_dir": DATA_DIR})
+        result = bg_archive_snapshots(queryset, kwargs={"overwrite": False, "out_dir": DATA_DIR})
 
         messages.success(
             request,
-            mark_safe(f"Re-trying any previously failed methods for {len(links)} URLs in the background. {result_url(result)}"),
+            mark_safe(f"Re-trying any previously failed methods for {count} URLs in the background. {result_url(result)}"),
         )
 
 
@@ -307,13 +302,13 @@ class SnapshotAdmin(SearchResultsAdminMixin, BaseModelAdmin):
         description="🔄 Redo"
     )
     def overwrite_snapshots(self, request, queryset):
-        links = [snapshot.as_link() for snapshot in queryset]
+        count = queryset.count()
 
-        result = bg_archive_links((links,), kwargs={"overwrite": True, "out_dir": DATA_DIR})
+        result = bg_archive_snapshots(queryset, kwargs={"overwrite": True, "out_dir": DATA_DIR})
 
         messages.success(
             request,
-            mark_safe(f"Clearing all previous results and re-downloading {len(links)} URLs in the background. {result_url(result)}"),
+            mark_safe(f"Clearing all previous results and re-downloading {count} URLs in the background. {result_url(result)}"),
         )
 
     @admin.action(

+ 1 - 4
archivebox/core/admin_tags.py

@@ -3,8 +3,6 @@ __package__ = 'archivebox.core'
 from django.contrib import admin
 from django.utils.html import format_html, mark_safe
 
-import abx
-
 from archivebox.misc.paginators import AccelleratedPaginator
 from archivebox.base_models.admin import BaseModelAdmin
 
@@ -150,7 +148,7 @@ class TagAdmin(BaseModelAdmin):
 
 
 # @admin.register(SnapshotTag, site=archivebox_admin)
-# class SnapshotTagAdmin(ABIDModelAdmin):
+# class SnapshotTagAdmin(BaseModelAdmin):
 #     list_display = ('id', 'snapshot', 'tag')
 #     sort_fields = ('id', 'snapshot', 'tag')
 #     search_fields = ('id', 'snapshot_id', 'tag_id')
@@ -159,7 +157,6 @@ class TagAdmin(BaseModelAdmin):
 #     ordering = ['-id']
 
 
[email protected]
 def register_admin(admin_site):
     admin_site.register(Tag, TagAdmin)
 

+ 0 - 3
archivebox/core/admin_users.py

@@ -5,8 +5,6 @@ from django.contrib.auth.admin import UserAdmin
 from django.utils.html import format_html, mark_safe
 from django.contrib.auth import get_user_model
 
-import abx
-
 
 class CustomUserAdmin(UserAdmin):
     sort_fields = ['id', 'email', 'username', 'is_superuser', 'last_login', 'date_joined']
@@ -86,6 +84,5 @@ class CustomUserAdmin(UserAdmin):
 
 
 
[email protected]
 def register_admin(admin_site):
     admin_site.register(get_user_model(), CustomUserAdmin)

+ 0 - 5
archivebox/core/apps.py

@@ -2,17 +2,12 @@ __package__ = 'archivebox.core'
 
 from django.apps import AppConfig
 
-import archivebox
-
 
 class CoreConfig(AppConfig):
     name = 'core'
 
     def ready(self):
         """Register the archivebox.core.admin_site as the main django admin site"""
-        from django.conf import settings
-        archivebox.pm.hook.ready(settings=settings)
-        
         from core.admin_site import register_admin_site
         register_admin_site()
         

+ 9 - 12
archivebox/core/forms.py

@@ -3,37 +3,34 @@ __package__ = 'archivebox.core'
 from django import forms
 
 from archivebox.misc.util import URL_REGEX
-from ..parsers import PARSERS
 from taggit.utils import edit_string_for_tags, parse_tags
 
-PARSER_CHOICES = [
-    (parser_key, parser[0])
-    for parser_key, parser in PARSERS.items()
-]
 DEPTH_CHOICES = (
     ('0', 'depth = 0 (archive just these URLs)'),
     ('1', 'depth = 1 (archive these URLs and all URLs one hop away)'),
 )
 
-from ..extractors import get_default_archive_methods
+from archivebox.hooks import get_extractors
 
-ARCHIVE_METHODS = [
-    (name, name)
-    for name, _, _ in get_default_archive_methods()
-]
+def get_archive_methods():
+    """Get available archive methods from discovered hooks."""
+    return [(name, name) for name in get_extractors()]
 
 
 class AddLinkForm(forms.Form):
     url = forms.RegexField(label="URLs (one per line)", regex=URL_REGEX, min_length='6', strip=True, widget=forms.Textarea, required=True)
-    parser = forms.ChoiceField(label="URLs format", choices=[('auto', 'Auto-detect parser'), *PARSER_CHOICES], initial='auto')
     tag = forms.CharField(label="Tags (comma separated tag1,tag2,tag3)", strip=True, required=False)
     depth = forms.ChoiceField(label="Archive depth", choices=DEPTH_CHOICES, initial='0', widget=forms.RadioSelect(attrs={"class": "depth-selection"}))
     archive_methods = forms.MultipleChoiceField(
         label="Archive methods (select at least 1, otherwise all will be used by default)",
         required=False,
         widget=forms.SelectMultiple,
-        choices=ARCHIVE_METHODS,
+        choices=[],  # populated dynamically in __init__
     )
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.fields['archive_methods'].choices = get_archive_methods()
     # TODO: hook these up to the view and put them 
     # in a collapsible UI section labeled "Advanced"
     #

+ 5 - 7
archivebox/core/migrations/0007_archiveresult.py

@@ -1,18 +1,14 @@
 # Generated by Django 3.0.8 on 2020-11-04 12:25
 
-import os
 import json
 from pathlib import Path
 
 from django.db import migrations, models
 import django.db.models.deletion
 
+from config import CONFIG
 from index.json import to_json
 
-DATA_DIR = Path(os.getcwd()).resolve()                    # archivebox user data dir
-ARCHIVE_DIR = DATA_DIR / 'archive'                      # archivebox snapshot data dir
-
-
 try:
     JSONField = models.JSONField
 except AttributeError:
@@ -21,12 +17,14 @@ except AttributeError:
 
 
 def forwards_func(apps, schema_editor):
+    from core.models import EXTRACTORS
+
     Snapshot = apps.get_model("core", "Snapshot")
     ArchiveResult = apps.get_model("core", "ArchiveResult")
 
     snapshots = Snapshot.objects.all()
     for snapshot in snapshots:
-        out_dir = ARCHIVE_DIR / snapshot.timestamp
+        out_dir = Path(CONFIG['ARCHIVE_DIR']) / snapshot.timestamp
 
         try:
             with open(out_dir / "index.json", "r") as f:
@@ -61,7 +59,7 @@ def forwards_func(apps, schema_editor):
 
 def verify_json_index_integrity(snapshot):
     results = snapshot.archiveresult_set.all()
-    out_dir = ARCHIVE_DIR / snapshot.timestamp
+    out_dir = Path(CONFIG['ARCHIVE_DIR']) / snapshot.timestamp
     with open(out_dir / "index.json", "r") as f:
         index = json.load(f)
 

+ 0 - 58
archivebox/core/migrations/0023_alter_archiveresult_options_archiveresult_abid_and_more.py

@@ -1,58 +0,0 @@
-# Generated by Django 5.0.6 on 2024-05-13 10:56
-
-import charidfield.fields
-from django.db import migrations, models
-
-
-class Migration(migrations.Migration):
-
-    dependencies = [
-        ('core', '0022_auto_20231023_2008'),
-    ]
-
-    operations = [
-        migrations.AlterModelOptions(
-            name='archiveresult',
-            options={'verbose_name': 'Result'},
-        ),
-        migrations.AddField(
-            model_name='archiveresult',
-            name='abid',
-            field=charidfield.fields.CharIDField(blank=True, db_index=True, default=None, help_text='ABID-format identifier for this entity (e.g. snp_01BJQMF54D093DXEAWZ6JYRPAQ)', max_length=30, null=True, prefix='res_', unique=True),
-        ),
-        migrations.AddField(
-            model_name='snapshot',
-            name='abid',
-            field=charidfield.fields.CharIDField(blank=True, db_index=True, default=None, help_text='ABID-format identifier for this entity (e.g. snp_01BJQMF54D093DXEAWZ6JYRPAQ)', max_length=30, null=True, prefix='snp_', unique=True),
-        ),
-        migrations.AddField(
-            model_name='snapshot',
-            name='uuid',
-            field=models.UUIDField(blank=True, null=True, unique=True),
-        ),
-        migrations.AddField(
-            model_name='tag',
-            name='abid',
-            field=charidfield.fields.CharIDField(blank=True, db_index=True, default=None, help_text='ABID-format identifier for this entity (e.g. snp_01BJQMF54D093DXEAWZ6JYRPAQ)', max_length=30, null=True, prefix='tag_', unique=True),
-        ),
-        migrations.AlterField(
-            model_name='archiveresult',
-            name='extractor',
-            field=models.CharField(choices=(
-                ('htmltotext', 'htmltotext'),
-                ('git', 'git'),
-                ('singlefile', 'singlefile'),
-                ('media', 'media'),
-                ('archive_org', 'archive_org'),
-                ('readability', 'readability'),
-                ('mercury', 'mercury'),
-                ('favicon', 'favicon'),
-                ('pdf', 'pdf'),
-                ('headers', 'headers'),
-                ('screenshot', 'screenshot'),
-                ('dom', 'dom'),
-                ('title', 'title'),
-                ('wget', 'wget'),
-            ), max_length=32),
-        ),
-    ]

+ 466 - 0
archivebox/core/migrations/0023_new_schema.py

@@ -0,0 +1,466 @@
+# Generated by Django 5.0.6 on 2024-12-25
+# Transforms schema from 0022 to new simplified schema (ABID system removed)
+
+from uuid import uuid4
+from django.conf import settings
+from django.db import migrations, models
+import django.db.models.deletion
+import django.utils.timezone
+
+
+def get_or_create_system_user_pk(apps, schema_editor):
+    """Get or create system user for migrations."""
+    User = apps.get_model('auth', 'User')
+    user, _ = User.objects.get_or_create(
+        username='system',
+        defaults={'is_active': False, 'password': '!'}
+    )
+    return user.pk
+
+
+def populate_created_by_snapshot(apps, schema_editor):
+    """Populate created_by for existing snapshots."""
+    User = apps.get_model('auth', 'User')
+    Snapshot = apps.get_model('core', 'Snapshot')
+
+    system_user, _ = User.objects.get_or_create(
+        username='system',
+        defaults={'is_active': False, 'password': '!'}
+    )
+
+    Snapshot.objects.filter(created_by__isnull=True).update(created_by=system_user)
+
+
+def populate_created_by_archiveresult(apps, schema_editor):
+    """Populate created_by for existing archive results."""
+    User = apps.get_model('auth', 'User')
+    ArchiveResult = apps.get_model('core', 'ArchiveResult')
+
+    system_user, _ = User.objects.get_or_create(
+        username='system',
+        defaults={'is_active': False, 'password': '!'}
+    )
+
+    ArchiveResult.objects.filter(created_by__isnull=True).update(created_by=system_user)
+
+
+def populate_created_by_tag(apps, schema_editor):
+    """Populate created_by for existing tags."""
+    User = apps.get_model('auth', 'User')
+    Tag = apps.get_model('core', 'Tag')
+
+    system_user, _ = User.objects.get_or_create(
+        username='system',
+        defaults={'is_active': False, 'password': '!'}
+    )
+
+    Tag.objects.filter(created_by__isnull=True).update(created_by=system_user)
+
+
+def generate_uuid_for_archiveresults(apps, schema_editor):
+    """Generate UUIDs for archive results that don't have them."""
+    ArchiveResult = apps.get_model('core', 'ArchiveResult')
+    for ar in ArchiveResult.objects.filter(uuid__isnull=True).iterator(chunk_size=500):
+        ar.uuid = uuid4()
+        ar.save(update_fields=['uuid'])
+
+
+def generate_uuid_for_tags(apps, schema_editor):
+    """Generate UUIDs for tags that don't have them."""
+    Tag = apps.get_model('core', 'Tag')
+    for tag in Tag.objects.filter(uuid__isnull=True).iterator(chunk_size=500):
+        tag.uuid = uuid4()
+        tag.save(update_fields=['uuid'])
+
+
+def copy_bookmarked_at_from_added(apps, schema_editor):
+    """Copy added timestamp to bookmarked_at."""
+    Snapshot = apps.get_model('core', 'Snapshot')
+    Snapshot.objects.filter(bookmarked_at__isnull=True).update(
+        bookmarked_at=models.F('added')
+    )
+
+
+def copy_created_at_from_added(apps, schema_editor):
+    """Copy added timestamp to created_at for snapshots."""
+    Snapshot = apps.get_model('core', 'Snapshot')
+    Snapshot.objects.filter(created_at__isnull=True).update(
+        created_at=models.F('added')
+    )
+
+
+def copy_created_at_from_start_ts(apps, schema_editor):
+    """Copy start_ts to created_at for archive results."""
+    ArchiveResult = apps.get_model('core', 'ArchiveResult')
+    ArchiveResult.objects.filter(created_at__isnull=True).update(
+        created_at=models.F('start_ts')
+    )
+
+
+class Migration(migrations.Migration):
+    """
+    This migration transforms the schema from the main branch (0022) to the new
+    simplified schema without the ABID system.
+
+    For dev branch users who had ABID migrations (0023-0074), this replaces them
+    with a clean transformation.
+    """
+
+    replaces = [
+        ('core', '0023_alter_archiveresult_options_archiveresult_abid_and_more'),
+        ('core', '0024_auto_20240513_1143'),
+        ('core', '0025_alter_archiveresult_uuid'),
+        ('core', '0026_archiveresult_created_archiveresult_created_by_and_more'),
+        ('core', '0027_update_snapshot_ids'),
+        ('core', '0028_alter_archiveresult_uuid'),
+        ('core', '0029_alter_archiveresult_id'),
+        ('core', '0030_alter_archiveresult_uuid'),
+        ('core', '0031_alter_archiveresult_id_alter_archiveresult_uuid_and_more'),
+        ('core', '0032_alter_archiveresult_id'),
+        ('core', '0033_rename_id_archiveresult_old_id'),
+        ('core', '0034_alter_archiveresult_old_id_alter_archiveresult_uuid'),
+        ('core', '0035_remove_archiveresult_uuid_archiveresult_id'),
+        ('core', '0036_alter_archiveresult_id_alter_archiveresult_old_id'),
+        ('core', '0037_rename_id_snapshot_old_id'),
+        ('core', '0038_rename_uuid_snapshot_id'),
+        ('core', '0039_rename_snapshot_archiveresult_snapshot_old'),
+        ('core', '0040_archiveresult_snapshot'),
+        ('core', '0041_alter_archiveresult_snapshot_and_more'),
+        ('core', '0042_remove_archiveresult_snapshot_old'),
+        ('core', '0043_alter_archiveresult_snapshot_alter_snapshot_id_and_more'),
+        ('core', '0044_alter_archiveresult_snapshot_alter_tag_uuid_and_more'),
+        ('core', '0045_alter_snapshot_old_id'),
+        ('core', '0046_alter_archiveresult_snapshot_alter_snapshot_id_and_more'),
+        ('core', '0047_alter_snapshottag_unique_together_and_more'),
+        ('core', '0048_alter_archiveresult_snapshot_and_more'),
+        ('core', '0049_rename_snapshot_snapshottag_snapshot_old_and_more'),
+        ('core', '0050_alter_snapshottag_snapshot_old'),
+        ('core', '0051_snapshottag_snapshot_alter_snapshottag_snapshot_old'),
+        ('core', '0052_alter_snapshottag_unique_together_and_more'),
+        ('core', '0053_remove_snapshottag_snapshot_old'),
+        ('core', '0054_alter_snapshot_timestamp'),
+        ('core', '0055_alter_tag_slug'),
+        ('core', '0056_remove_tag_uuid'),
+        ('core', '0057_rename_id_tag_old_id'),
+        ('core', '0058_alter_tag_old_id'),
+        ('core', '0059_tag_id'),
+        ('core', '0060_alter_tag_id'),
+        ('core', '0061_rename_tag_snapshottag_old_tag_and_more'),
+        ('core', '0062_alter_snapshottag_old_tag'),
+        ('core', '0063_snapshottag_tag_alter_snapshottag_old_tag'),
+        ('core', '0064_alter_snapshottag_unique_together_and_more'),
+        ('core', '0065_remove_snapshottag_old_tag'),
+        ('core', '0066_alter_snapshottag_tag_alter_tag_id_alter_tag_old_id'),
+        ('core', '0067_alter_snapshottag_tag'),
+        ('core', '0068_alter_archiveresult_options'),
+        ('core', '0069_alter_archiveresult_created_alter_snapshot_added_and_more'),
+        ('core', '0070_alter_archiveresult_created_by_alter_snapshot_added_and_more'),
+        ('core', '0071_remove_archiveresult_old_id_remove_snapshot_old_id_and_more'),
+        ('core', '0072_rename_added_snapshot_bookmarked_at_and_more'),
+        ('core', '0073_rename_created_archiveresult_created_at_and_more'),
+        ('core', '0074_alter_snapshot_downloaded_at'),
+    ]
+
+    dependencies = [
+        ('core', '0022_auto_20231023_2008'),
+        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
+    ]
+
+    operations = [
+        # === SNAPSHOT CHANGES ===
+
+        # Add new fields to Snapshot
+        migrations.AddField(
+            model_name='snapshot',
+            name='created_by',
+            field=models.ForeignKey(
+                default=None, null=True, blank=True,
+                on_delete=django.db.models.deletion.CASCADE,
+                related_name='snapshot_set',
+                to=settings.AUTH_USER_MODEL,
+            ),
+        ),
+        migrations.AddField(
+            model_name='snapshot',
+            name='created_at',
+            field=models.DateTimeField(default=django.utils.timezone.now, db_index=True, null=True),
+        ),
+        migrations.AddField(
+            model_name='snapshot',
+            name='modified_at',
+            field=models.DateTimeField(auto_now=True),
+        ),
+        migrations.AddField(
+            model_name='snapshot',
+            name='bookmarked_at',
+            field=models.DateTimeField(default=django.utils.timezone.now, db_index=True, null=True),
+        ),
+        migrations.AddField(
+            model_name='snapshot',
+            name='downloaded_at',
+            field=models.DateTimeField(default=None, null=True, blank=True, db_index=True),
+        ),
+        migrations.AddField(
+            model_name='snapshot',
+            name='depth',
+            field=models.PositiveSmallIntegerField(default=0, db_index=True),
+        ),
+        migrations.AddField(
+            model_name='snapshot',
+            name='status',
+            field=models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('sealed', 'Sealed')], default='queued', max_length=15, db_index=True),
+        ),
+        migrations.AddField(
+            model_name='snapshot',
+            name='retry_at',
+            field=models.DateTimeField(default=django.utils.timezone.now, null=True, blank=True, db_index=True),
+        ),
+        migrations.AddField(
+            model_name='snapshot',
+            name='config',
+            field=models.JSONField(default=dict, blank=False),
+        ),
+        migrations.AddField(
+            model_name='snapshot',
+            name='notes',
+            field=models.TextField(blank=True, default=''),
+        ),
+        migrations.AddField(
+            model_name='snapshot',
+            name='output_dir',
+            field=models.CharField(max_length=256, default=None, null=True, blank=True),
+        ),
+
+        # Copy data from old fields to new
+        migrations.RunPython(copy_bookmarked_at_from_added, migrations.RunPython.noop),
+        migrations.RunPython(copy_created_at_from_added, migrations.RunPython.noop),
+        migrations.RunPython(populate_created_by_snapshot, migrations.RunPython.noop),
+
+        # Make created_by non-nullable after population
+        migrations.AlterField(
+            model_name='snapshot',
+            name='created_by',
+            field=models.ForeignKey(
+                on_delete=django.db.models.deletion.CASCADE,
+                related_name='snapshot_set',
+                to=settings.AUTH_USER_MODEL,
+                db_index=True,
+            ),
+        ),
+
+        # Update timestamp field constraints
+        migrations.AlterField(
+            model_name='snapshot',
+            name='timestamp',
+            field=models.CharField(max_length=32, unique=True, db_index=True, editable=False),
+        ),
+
+        # Update title field size
+        migrations.AlterField(
+            model_name='snapshot',
+            name='title',
+            field=models.CharField(max_length=512, null=True, blank=True, db_index=True),
+        ),
+
+        # Remove old 'added' and 'updated' fields
+        migrations.RemoveField(model_name='snapshot', name='added'),
+        migrations.RemoveField(model_name='snapshot', name='updated'),
+
+        # Remove old 'tags' CharField (now M2M via Tag model)
+        migrations.RemoveField(model_name='snapshot', name='tags'),
+
+        # === TAG CHANGES ===
+
+        # Add uuid field to Tag temporarily for ID migration
+        migrations.AddField(
+            model_name='tag',
+            name='uuid',
+            field=models.UUIDField(default=uuid4, null=True, blank=True),
+        ),
+        migrations.AddField(
+            model_name='tag',
+            name='created_by',
+            field=models.ForeignKey(
+                default=None, null=True, blank=True,
+                on_delete=django.db.models.deletion.CASCADE,
+                related_name='tag_set',
+                to=settings.AUTH_USER_MODEL,
+            ),
+        ),
+        migrations.AddField(
+            model_name='tag',
+            name='created_at',
+            field=models.DateTimeField(default=django.utils.timezone.now, db_index=True, null=True),
+        ),
+        migrations.AddField(
+            model_name='tag',
+            name='modified_at',
+            field=models.DateTimeField(auto_now=True),
+        ),
+
+        # Populate UUIDs for tags
+        migrations.RunPython(generate_uuid_for_tags, migrations.RunPython.noop),
+        migrations.RunPython(populate_created_by_tag, migrations.RunPython.noop),
+
+        # Make created_by non-nullable
+        migrations.AlterField(
+            model_name='tag',
+            name='created_by',
+            field=models.ForeignKey(
+                on_delete=django.db.models.deletion.CASCADE,
+                related_name='tag_set',
+                to=settings.AUTH_USER_MODEL,
+            ),
+        ),
+
+        # Update slug field
+        migrations.AlterField(
+            model_name='tag',
+            name='slug',
+            field=models.SlugField(unique=True, max_length=100, editable=False),
+        ),
+
+        # === ARCHIVERESULT CHANGES ===
+
+        # Add uuid field for new ID
+        migrations.AddField(
+            model_name='archiveresult',
+            name='uuid',
+            field=models.UUIDField(default=uuid4, null=True, blank=True),
+        ),
+        migrations.AddField(
+            model_name='archiveresult',
+            name='created_by',
+            field=models.ForeignKey(
+                default=None, null=True, blank=True,
+                on_delete=django.db.models.deletion.CASCADE,
+                related_name='archiveresult_set',
+                to=settings.AUTH_USER_MODEL,
+            ),
+        ),
+        migrations.AddField(
+            model_name='archiveresult',
+            name='created_at',
+            field=models.DateTimeField(default=django.utils.timezone.now, db_index=True, null=True),
+        ),
+        migrations.AddField(
+            model_name='archiveresult',
+            name='modified_at',
+            field=models.DateTimeField(auto_now=True),
+        ),
+        migrations.AddField(
+            model_name='archiveresult',
+            name='retry_at',
+            field=models.DateTimeField(default=django.utils.timezone.now, null=True, blank=True, db_index=True),
+        ),
+        migrations.AddField(
+            model_name='archiveresult',
+            name='notes',
+            field=models.TextField(blank=True, default=''),
+        ),
+        migrations.AddField(
+            model_name='archiveresult',
+            name='output_dir',
+            field=models.CharField(max_length=256, default=None, null=True, blank=True),
+        ),
+
+        # Populate UUIDs and data for archive results
+        migrations.RunPython(generate_uuid_for_archiveresults, migrations.RunPython.noop),
+        migrations.RunPython(copy_created_at_from_start_ts, migrations.RunPython.noop),
+        migrations.RunPython(populate_created_by_archiveresult, migrations.RunPython.noop),
+
+        # Make created_by non-nullable
+        migrations.AlterField(
+            model_name='archiveresult',
+            name='created_by',
+            field=models.ForeignKey(
+                on_delete=django.db.models.deletion.CASCADE,
+                related_name='archiveresult_set',
+                to=settings.AUTH_USER_MODEL,
+                db_index=True,
+            ),
+        ),
+
+        # Update extractor choices
+        migrations.AlterField(
+            model_name='archiveresult',
+            name='extractor',
+            field=models.CharField(
+                choices=[
+                    ('htmltotext', 'htmltotext'), ('git', 'git'), ('singlefile', 'singlefile'),
+                    ('media', 'media'), ('archive_org', 'archive_org'), ('readability', 'readability'),
+                    ('mercury', 'mercury'), ('favicon', 'favicon'), ('pdf', 'pdf'),
+                    ('headers', 'headers'), ('screenshot', 'screenshot'), ('dom', 'dom'),
+                    ('title', 'title'), ('wget', 'wget'),
+                ],
+                max_length=32, db_index=True,
+            ),
+        ),
+
+        # Update status field
+        migrations.AlterField(
+            model_name='archiveresult',
+            name='status',
+            field=models.CharField(
+                choices=[
+                    ('queued', 'Queued'), ('started', 'Started'), ('backoff', 'Waiting to retry'),
+                    ('succeeded', 'Succeeded'), ('failed', 'Failed'), ('skipped', 'Skipped'),
+                ],
+                max_length=16, default='queued', db_index=True,
+            ),
+        ),
+
+        # Update output field size
+        migrations.AlterField(
+            model_name='archiveresult',
+            name='output',
+            field=models.CharField(max_length=1024, default=None, null=True, blank=True),
+        ),
+
+        # Update cmd_version field size
+        migrations.AlterField(
+            model_name='archiveresult',
+            name='cmd_version',
+            field=models.CharField(max_length=128, default=None, null=True, blank=True),
+        ),
+
+        # Make start_ts and end_ts nullable
+        migrations.AlterField(
+            model_name='archiveresult',
+            name='start_ts',
+            field=models.DateTimeField(default=None, null=True, blank=True),
+        ),
+        migrations.AlterField(
+            model_name='archiveresult',
+            name='end_ts',
+            field=models.DateTimeField(default=None, null=True, blank=True),
+        ),
+
+        # Make pwd nullable
+        migrations.AlterField(
+            model_name='archiveresult',
+            name='pwd',
+            field=models.CharField(max_length=256, default=None, null=True, blank=True),
+        ),
+
+        # Make cmd nullable
+        migrations.AlterField(
+            model_name='archiveresult',
+            name='cmd',
+            field=models.JSONField(default=None, null=True, blank=True),
+        ),
+
+        # Update model options
+        migrations.AlterModelOptions(
+            name='archiveresult',
+            options={'verbose_name': 'Archive Result', 'verbose_name_plural': 'Archive Results Log'},
+        ),
+        migrations.AlterModelOptions(
+            name='snapshot',
+            options={'verbose_name': 'Snapshot', 'verbose_name_plural': 'Snapshots'},
+        ),
+        migrations.AlterModelOptions(
+            name='tag',
+            options={'verbose_name': 'Tag', 'verbose_name_plural': 'Tags'},
+        ),
+    ]

+ 0 - 101
archivebox/core/migrations/0024_auto_20240513_1143.py

@@ -1,101 +0,0 @@
-# Generated by Django 5.0.6 on 2024-05-13 11:43
-
-from django.db import migrations
-from datetime import datetime
-
-from archivebox.base_models.abid import abid_from_values, DEFAULT_ABID_URI_SALT
-
-
-def calculate_abid(self):
-    """
-    Return a freshly derived ABID (assembled from attrs defined in ABIDModel.abid_*_src).
-    """
-    prefix = self.abid_prefix
-    ts = eval(self.abid_ts_src)
-    uri = eval(self.abid_uri_src)
-    subtype = eval(self.abid_subtype_src)
-    rand = eval(self.abid_rand_src)
-
-    if (not prefix) or prefix == 'obj_':
-        suggested_abid = self.__class__.__name__[:3].lower()
-        raise Exception(f'{self.__class__.__name__}.abid_prefix must be defined to calculate ABIDs (suggested: {suggested_abid})')
-
-    if not ts:
-        ts = datetime.utcfromtimestamp(0)
-        print(f'[!] WARNING: Generating ABID with ts=0000000000 placeholder because {self.__class__.__name__}.abid_ts_src={self.abid_ts_src} is unset!', ts.isoformat())
-
-    if not uri:
-        uri = str(self)
-        print(f'[!] WARNING: Generating ABID with uri=str(self) placeholder because {self.__class__.__name__}.abid_uri_src={self.abid_uri_src} is unset!', uri)
-
-    if not subtype:
-        subtype = self.__class__.__name__
-        print(f'[!] WARNING: Generating ABID with subtype={subtype} placeholder because {self.__class__.__name__}.abid_subtype_src={self.abid_subtype_src} is unset!', subtype)
-
-    if not rand:
-        rand = getattr(self, 'uuid', None) or getattr(self, 'id', None) or getattr(self, 'pk')
-        print(f'[!] WARNING: Generating ABID with rand=self.id placeholder because {self.__class__.__name__}.abid_rand_src={self.abid_rand_src} is unset!', rand)
-
-    abid = abid_from_values(
-        prefix=prefix,
-        ts=ts,
-        uri=uri,
-        subtype=subtype,
-        rand=rand,
-        salt=DEFAULT_ABID_URI_SALT,
-    )
-    assert abid.ulid and abid.uuid and abid.typeid, f'Failed to calculate {prefix}_ABID for {self.__class__.__name__}'
-    return abid
-
-
-def copy_snapshot_uuids(apps, schema_editor):
-    print('   Copying snapshot.id -> snapshot.uuid...')
-    Snapshot = apps.get_model("core", "Snapshot")
-    for snapshot in Snapshot.objects.all():
-        snapshot.uuid = snapshot.id
-        snapshot.save(update_fields=["uuid"])
-
-def generate_snapshot_abids(apps, schema_editor):
-    print('   Generating snapshot.abid values...')
-    Snapshot = apps.get_model("core", "Snapshot")
-    for snapshot in Snapshot.objects.all():
-        snapshot.abid_prefix = 'snp_'
-        snapshot.abid_ts_src = 'self.added'
-        snapshot.abid_uri_src = 'self.url'
-        snapshot.abid_subtype_src = '"01"'
-        snapshot.abid_rand_src = 'self.uuid'
-
-        snapshot.abid = calculate_abid(snapshot)
-        snapshot.uuid = snapshot.abid.uuid
-        snapshot.save(update_fields=["abid", "uuid"])
-
-def generate_archiveresult_abids(apps, schema_editor):
-    print('   Generating ArchiveResult.abid values... (may take an hour or longer for large collections...)')
-    ArchiveResult = apps.get_model("core", "ArchiveResult")
-    Snapshot = apps.get_model("core", "Snapshot")
-    for result in ArchiveResult.objects.all():
-        result.abid_prefix = 'res_'
-        result.snapshot = Snapshot.objects.get(pk=result.snapshot_id)
-        result.snapshot_added = result.snapshot.added
-        result.snapshot_url = result.snapshot.url
-        result.abid_ts_src = 'self.snapshot_added'
-        result.abid_uri_src = 'self.snapshot_url'
-        result.abid_subtype_src = 'self.extractor'
-        result.abid_rand_src = 'self.id'
-
-        result.abid = calculate_abid(result)
-        result.uuid = result.abid.uuid
-        result.save(update_fields=["abid", "uuid"])
-
-
-class Migration(migrations.Migration):
-
-    dependencies = [
-        ('core', '0023_alter_archiveresult_options_archiveresult_abid_and_more'),
-    ]
-
-    operations = [
-        migrations.RunPython(copy_snapshot_uuids, reverse_code=migrations.RunPython.noop),
-        migrations.RunPython(generate_snapshot_abids, reverse_code=migrations.RunPython.noop),
-        migrations.RunPython(generate_archiveresult_abids, reverse_code=migrations.RunPython.noop),
-    ]

+ 0 - 19
archivebox/core/migrations/0025_alter_archiveresult_uuid.py

@@ -1,19 +0,0 @@
-# Generated by Django 5.0.6 on 2024-05-13 12:08
-
-import uuid
-from django.db import migrations, models
-
-
-class Migration(migrations.Migration):
-
-    dependencies = [
-        ('core', '0024_auto_20240513_1143'),
-    ]
-
-    operations = [
-        migrations.AlterField(
-            model_name='archiveresult',
-            name='uuid',
-            field=models.UUIDField(default=uuid.uuid4, editable=False, unique=True),
-        ),
-    ]

+ 0 - 117
archivebox/core/migrations/0026_archiveresult_created_archiveresult_created_by_and_more.py

@@ -1,117 +0,0 @@
-# Generated by Django 5.0.6 on 2024-05-13 13:01
-
-import django.db.models.deletion
-import django.utils.timezone
-from django.conf import settings
-from django.db import migrations, models
-
-import archivebox.base_models.models
-
-
-def updated_created_by_ids(apps, schema_editor):
-    """Get or create a system user with is_superuser=True to be the default owner for new DB rows"""
-
-    User = apps.get_model("auth", "User")
-    ArchiveResult = apps.get_model("core", "ArchiveResult")
-    Snapshot = apps.get_model("core", "Snapshot")
-    Tag = apps.get_model("core", "Tag")
-
-    # if only one user exists total, return that user
-    if User.objects.filter(is_superuser=True).count() == 1:
-        user_id = User.objects.filter(is_superuser=True).values_list('pk', flat=True)[0]
-
-    # otherwise, create a dedicated "system" user
-    user_id = User.objects.get_or_create(username='system', is_staff=True, is_superuser=True, defaults={'email': '', 'password': ''})[0].pk
-    
-    ArchiveResult.objects.all().update(created_by_id=user_id)
-    Snapshot.objects.all().update(created_by_id=user_id)
-    Tag.objects.all().update(created_by_id=user_id)
-
-
-
-class Migration(migrations.Migration):
-
-    dependencies = [
-        ('core', '0025_alter_archiveresult_uuid'),
-        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
-    ]
-
-    operations = [
-        migrations.AddField(
-            model_name='archiveresult',
-            name='created',
-            field=models.DateTimeField(auto_now_add=True, default=django.utils.timezone.now),
-            preserve_default=False,
-        ),
-        migrations.AddField(
-            model_name='archiveresult',
-            name='created_by',
-            field=models.ForeignKey(null=True, default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
-        ),
-        migrations.AddField(
-            model_name='archiveresult',
-            name='modified',
-            field=models.DateTimeField(auto_now=True),
-        ),
-        migrations.AddField(
-            model_name='snapshot',
-            name='created',
-            field=models.DateTimeField(auto_now_add=True, default=django.utils.timezone.now),
-            preserve_default=False,
-        ),
-        migrations.AddField(
-            model_name='snapshot',
-            name='created_by',
-            field=models.ForeignKey(null=True, default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
-        ),
-        migrations.AddField(
-            model_name='snapshot',
-            name='modified',
-            field=models.DateTimeField(auto_now=True),
-        ),
-        migrations.AddField(
-            model_name='tag',
-            name='created',
-            field=models.DateTimeField(auto_now_add=True, default=django.utils.timezone.now),
-            preserve_default=False,
-        ),
-        migrations.AddField(
-            model_name='tag',
-            name='created_by',
-            field=models.ForeignKey(null=True, default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
-        ),
-        migrations.AddField(
-            model_name='tag',
-            name='modified',
-            field=models.DateTimeField(auto_now=True),
-        ),
-        migrations.AddField(
-            model_name='tag',
-            name='uuid',
-            field=models.UUIDField(blank=True, null=True, unique=True),
-        ),
-        migrations.AlterField(
-            model_name='archiveresult',
-            name='uuid',
-            field=models.UUIDField(blank=True, null=True, unique=True),
-        ),
-
-
-        migrations.RunPython(updated_created_by_ids, reverse_code=migrations.RunPython.noop),
-
-        migrations.AddField(
-            model_name='snapshot',
-            name='created_by',
-            field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
-        ),
-        migrations.AlterField(
-            model_name='archiveresult',
-            name='created_by',
-            field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
-        ),
-        migrations.AddField(
-            model_name='tag',
-            name='created_by',
-            field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
-        ),
-    ]

+ 0 - 105
archivebox/core/migrations/0027_update_snapshot_ids.py

@@ -1,105 +0,0 @@
-# Generated by Django 5.0.6 on 2024-08-18 02:48
-
-from django.db import migrations
-
-from datetime import datetime
-from archivebox.base_models.abid import ABID, abid_from_values, DEFAULT_ABID_URI_SALT
-
-
-def calculate_abid(self):
-    """
-    Return a freshly derived ABID (assembled from attrs defined in ABIDModel.abid_*_src).
-    """
-    prefix = self.abid_prefix
-    ts = eval(self.abid_ts_src)
-    uri = eval(self.abid_uri_src)
-    subtype = eval(self.abid_subtype_src)
-    rand = eval(self.abid_rand_src)
-
-    if (not prefix) or prefix == 'obj_':
-        suggested_abid = self.__class__.__name__[:3].lower()
-        raise Exception(f'{self.__class__.__name__}.abid_prefix must be defined to calculate ABIDs (suggested: {suggested_abid})')
-
-    if not ts:
-        ts = datetime.utcfromtimestamp(0)
-        print(f'[!] WARNING: Generating ABID with ts=0000000000 placeholder because {self.__class__.__name__}.abid_ts_src={self.abid_ts_src} is unset!', ts.isoformat())
-
-    if not uri:
-        uri = str(self)
-        print(f'[!] WARNING: Generating ABID with uri=str(self) placeholder because {self.__class__.__name__}.abid_uri_src={self.abid_uri_src} is unset!', uri)
-
-    if not subtype:
-        subtype = self.__class__.__name__
-        print(f'[!] WARNING: Generating ABID with subtype={subtype} placeholder because {self.__class__.__name__}.abid_subtype_src={self.abid_subtype_src} is unset!', subtype)
-
-    if not rand:
-        rand = getattr(self, 'uuid', None) or getattr(self, 'id', None) or getattr(self, 'pk')
-        print(f'[!] WARNING: Generating ABID with rand=self.id placeholder because {self.__class__.__name__}.abid_rand_src={self.abid_rand_src} is unset!', rand)
-
-    abid = abid_from_values(
-        prefix=prefix,
-        ts=ts,
-        uri=uri,
-        subtype=subtype,
-        rand=rand,
-        salt=DEFAULT_ABID_URI_SALT,
-    )
-    assert abid.ulid and abid.uuid and abid.typeid, f'Failed to calculate {prefix}_ABID for {self.__class__.__name__}'
-    return abid
-
-def update_snapshot_ids(apps, schema_editor):
-    Snapshot = apps.get_model("core", "Snapshot")
-    num_total = Snapshot.objects.all().count()
-    print(f'   Updating {num_total} Snapshot.id, Snapshot.uuid values in place...')
-    for idx, snapshot in enumerate(Snapshot.objects.all().only('abid').iterator(chunk_size=500)):
-        assert snapshot.abid
-        snapshot.abid_prefix = 'snp_'
-        snapshot.abid_ts_src = 'self.added'
-        snapshot.abid_uri_src = 'self.url'
-        snapshot.abid_subtype_src = '"01"'
-        snapshot.abid_rand_src = 'self.uuid'
-
-        snapshot.abid = calculate_abid(snapshot)
-        snapshot.uuid = snapshot.abid.uuid
-        snapshot.save(update_fields=["abid", "uuid"])
-        assert str(ABID.parse(snapshot.abid).uuid) == str(snapshot.uuid)
-        if idx % 1000 == 0:
-            print(f'Migrated {idx}/{num_total} Snapshot objects...')
-
-def update_archiveresult_ids(apps, schema_editor):
-    Snapshot = apps.get_model("core", "Snapshot")
-    ArchiveResult = apps.get_model("core", "ArchiveResult")
-    num_total = ArchiveResult.objects.all().count()
-    print(f'   Updating {num_total} ArchiveResult.id, ArchiveResult.uuid values in place... (may take an hour or longer for large collections...)')
-    for idx, result in enumerate(ArchiveResult.objects.all().only('abid', 'snapshot_id').iterator(chunk_size=500)):
-        assert result.abid
-        result.abid_prefix = 'res_'
-        result.snapshot = Snapshot.objects.get(pk=result.snapshot_id)
-        result.snapshot_added = result.snapshot.added
-        result.snapshot_url = result.snapshot.url
-        result.abid_ts_src = 'self.snapshot_added'
-        result.abid_uri_src = 'self.snapshot_url'
-        result.abid_subtype_src = 'self.extractor'
-        result.abid_rand_src = 'self.id'
-
-        result.abid = calculate_abid(result)
-        result.uuid = result.abid.uuid
-        result.uuid = ABID.parse(result.abid).uuid
-        result.save(update_fields=["abid", "uuid"])
-        assert str(ABID.parse(result.abid).uuid) == str(result.uuid)
-        if idx % 5000 == 0:
-            print(f'Migrated {idx}/{num_total} ArchiveResult objects...')
-
-
-class Migration(migrations.Migration):
-
-    dependencies = [
-        ('core', '0026_archiveresult_created_archiveresult_created_by_and_more'),
-    ]
-
-    operations = [
-        migrations.RunPython(update_snapshot_ids, reverse_code=migrations.RunPython.noop),
-        migrations.RunPython(update_archiveresult_ids, reverse_code=migrations.RunPython.noop),
-    ]
-
-

+ 0 - 19
archivebox/core/migrations/0028_alter_archiveresult_uuid.py

@@ -1,19 +0,0 @@
-# Generated by Django 5.0.6 on 2024-08-18 04:28
-
-import uuid
-from django.db import migrations, models
-
-
-class Migration(migrations.Migration):
-
-    dependencies = [
-        ('core', '0027_update_snapshot_ids'),
-    ]
-
-    operations = [
-        migrations.AlterField(
-            model_name='archiveresult',
-            name='uuid',
-            field=models.UUIDField(default=uuid.uuid4),
-        ),
-    ]

+ 0 - 18
archivebox/core/migrations/0029_alter_archiveresult_id.py

@@ -1,18 +0,0 @@
-# Generated by Django 5.0.6 on 2024-08-18 04:28
-
-from django.db import migrations, models
-
-
-class Migration(migrations.Migration):
-
-    dependencies = [
-        ('core', '0028_alter_archiveresult_uuid'),
-    ]
-
-    operations = [
-        migrations.AlterField(
-            model_name='archiveresult',
-            name='id',
-            field=models.BigIntegerField(primary_key=True, serialize=False, verbose_name='ID'),
-        ),
-    ]

+ 0 - 18
archivebox/core/migrations/0030_alter_archiveresult_uuid.py

@@ -1,18 +0,0 @@
-# Generated by Django 5.0.6 on 2024-08-18 05:00
-
-from django.db import migrations, models
-
-
-class Migration(migrations.Migration):
-
-    dependencies = [
-        ('core', '0029_alter_archiveresult_id'),
-    ]
-
-    operations = [
-        migrations.AlterField(
-            model_name='archiveresult',
-            name='uuid',
-            field=models.UUIDField(unique=True),
-        ),
-    ]

+ 0 - 34
archivebox/core/migrations/0031_alter_archiveresult_id_alter_archiveresult_uuid_and_more.py

@@ -1,34 +0,0 @@
-# Generated by Django 5.0.6 on 2024-08-18 05:09
-
-import uuid
-from django.db import migrations, models
-
-
-class Migration(migrations.Migration):
-
-    dependencies = [
-        ('core', '0030_alter_archiveresult_uuid'),
-    ]
-
-    operations = [
-        migrations.AlterField(
-            model_name='archiveresult',
-            name='id',
-            field=models.IntegerField(default=uuid.uuid4, primary_key=True, serialize=False, verbose_name='ID'),
-        ),
-        migrations.AlterField(
-            model_name='archiveresult',
-            name='uuid',
-            field=models.UUIDField(default=uuid.uuid4, unique=True),
-        ),
-        migrations.AlterField(
-            model_name='snapshot',
-            name='uuid',
-            field=models.UUIDField(default=uuid.uuid4, unique=True),
-        ),
-        migrations.AlterField(
-            model_name='tag',
-            name='uuid',
-            field=models.UUIDField(default=uuid.uuid4, null=True, unique=True),
-        ),
-    ]

+ 0 - 23
archivebox/core/migrations/0032_alter_archiveresult_id.py

@@ -1,23 +0,0 @@
-# Generated by Django 5.0.6 on 2024-08-18 05:20
-
-import core.models
-import random
-from django.db import migrations, models
-
-
-def rand_int_id():
-    return random.getrandbits(32)
-
-class Migration(migrations.Migration):
-
-    dependencies = [
-        ('core', '0031_alter_archiveresult_id_alter_archiveresult_uuid_and_more'),
-    ]
-
-    operations = [
-        migrations.AlterField(
-            model_name='archiveresult',
-            name='id',
-            field=models.BigIntegerField(default=rand_int_id, primary_key=True, serialize=False, verbose_name='ID'),
-        ),
-    ]

+ 0 - 18
archivebox/core/migrations/0033_rename_id_archiveresult_old_id.py

@@ -1,18 +0,0 @@
-# Generated by Django 5.0.6 on 2024-08-18 05:34
-
-from django.db import migrations
-
-
-class Migration(migrations.Migration):
-
-    dependencies = [
-        ('core', '0032_alter_archiveresult_id'),
-    ]
-
-    operations = [
-        migrations.RenameField(
-            model_name='archiveresult',
-            old_name='id',
-            new_name='old_id',
-        ),
-    ]

+ 0 - 45
archivebox/core/migrations/0034_alter_archiveresult_old_id_alter_archiveresult_uuid.py

@@ -1,45 +0,0 @@
-# Generated by Django 5.0.6 on 2024-08-18 05:37
-
-import uuid
-import random
-from django.db import migrations, models
-
-from archivebox.base_models.abid import ABID
-
-
-def rand_int_id():
-    return random.getrandbits(32)
-
-
-def update_archiveresult_ids(apps, schema_editor):
-    ArchiveResult = apps.get_model("core", "ArchiveResult")
-    num_total = ArchiveResult.objects.all().count()
-    print(f'   Updating {num_total} ArchiveResult.id, ArchiveResult.uuid values in place... (may take an hour or longer for large collections...)')
-    for idx, result in enumerate(ArchiveResult.objects.all().only('abid').iterator(chunk_size=500)):
-        assert result.abid
-        result.uuid = ABID.parse(result.abid).uuid
-        result.save(update_fields=["uuid"])
-        assert str(ABID.parse(result.abid).uuid) == str(result.uuid)
-        if idx % 2500 == 0:
-            print(f'Migrated {idx}/{num_total} ArchiveResult objects...')
-
-
-class Migration(migrations.Migration):
-
-    dependencies = [
-        ('core', '0033_rename_id_archiveresult_old_id'),
-    ]
-
-    operations = [
-        migrations.AlterField(
-            model_name='archiveresult',
-            name='old_id',
-            field=models.BigIntegerField(default=rand_int_id, serialize=False, verbose_name='ID'),
-        ),
-        migrations.RunPython(update_archiveresult_ids, reverse_code=migrations.RunPython.noop),
-        migrations.AlterField(
-            model_name='archiveresult',
-            name='uuid',
-            field=models.UUIDField(default=uuid.uuid4, primary_key=True, serialize=False, unique=True),
-        ),
-    ]

+ 0 - 19
archivebox/core/migrations/0035_remove_archiveresult_uuid_archiveresult_id.py

@@ -1,19 +0,0 @@
-# Generated by Django 5.0.6 on 2024-08-18 05:49
-
-import uuid
-from django.db import migrations, models
-
-
-class Migration(migrations.Migration):
-
-    dependencies = [
-        ('core', '0034_alter_archiveresult_old_id_alter_archiveresult_uuid'),
-    ]
-
-    operations = [
-        migrations.RenameField(
-            model_name='archiveresult',
-            old_name='uuid',
-            new_name='id',
-        ),
-    ]

+ 0 - 29
archivebox/core/migrations/0036_alter_archiveresult_id_alter_archiveresult_old_id.py

@@ -1,29 +0,0 @@
-# Generated by Django 5.0.6 on 2024-08-18 05:59
-
-import core.models
-import uuid
-import random
-from django.db import migrations, models
-
-
-def rand_int_id():
-    return random.getrandbits(32)
-
-class Migration(migrations.Migration):
-
-    dependencies = [
-        ('core', '0035_remove_archiveresult_uuid_archiveresult_id'),
-    ]
-
-    operations = [
-        migrations.AlterField(
-            model_name='archiveresult',
-            name='id',
-            field=models.UUIDField(default=uuid.uuid4, primary_key=True, serialize=False, unique=True, verbose_name='ID'),
-        ),
-        migrations.AlterField(
-            model_name='archiveresult',
-            name='old_id',
-            field=models.BigIntegerField(default=rand_int_id, serialize=False, verbose_name='Old ID'),
-        ),
-    ]

+ 0 - 18
archivebox/core/migrations/0037_rename_id_snapshot_old_id.py

@@ -1,18 +0,0 @@
-# Generated by Django 5.0.6 on 2024-08-18 06:08
-
-from django.db import migrations
-
-
-class Migration(migrations.Migration):
-
-    dependencies = [
-        ('core', '0036_alter_archiveresult_id_alter_archiveresult_old_id'),
-    ]
-
-    operations = [
-        migrations.RenameField(
-            model_name='snapshot',
-            old_name='id',
-            new_name='old_id',
-        ),
-    ]

+ 0 - 18
archivebox/core/migrations/0038_rename_uuid_snapshot_id.py

@@ -1,18 +0,0 @@
-# Generated by Django 5.0.6 on 2024-08-18 06:09
-
-from django.db import migrations
-
-
-class Migration(migrations.Migration):
-
-    dependencies = [
-        ('core', '0037_rename_id_snapshot_old_id'),
-    ]
-
-    operations = [
-        migrations.RenameField(
-            model_name='snapshot',
-            old_name='uuid',
-            new_name='id',
-        ),
-    ]

+ 0 - 18
archivebox/core/migrations/0039_rename_snapshot_archiveresult_snapshot_old.py

@@ -1,18 +0,0 @@
-# Generated by Django 5.0.6 on 2024-08-18 06:25
-
-from django.db import migrations
-
-
-class Migration(migrations.Migration):
-
-    dependencies = [
-        ('core', '0038_rename_uuid_snapshot_id'),
-    ]
-
-    operations = [
-        migrations.RenameField(
-            model_name='archiveresult',
-            old_name='snapshot',
-            new_name='snapshot_old',
-        ),
-    ]

+ 0 - 34
archivebox/core/migrations/0040_archiveresult_snapshot.py

@@ -1,34 +0,0 @@
-# Generated by Django 5.0.6 on 2024-08-18 06:46
-
-import django.db.models.deletion
-from django.db import migrations, models
-
-def update_archiveresult_snapshot_ids(apps, schema_editor):
-    ArchiveResult = apps.get_model("core", "ArchiveResult")
-    Snapshot = apps.get_model("core", "Snapshot")
-    num_total = ArchiveResult.objects.all().count()
-    print(f'   Updating {num_total} ArchiveResult.snapshot_id values in place... (may take an hour or longer for large collections...)')
-    for idx, result in enumerate(ArchiveResult.objects.all().only('snapshot_old_id').iterator(chunk_size=5000)):
-        assert result.snapshot_old_id
-        snapshot = Snapshot.objects.only('id').get(old_id=result.snapshot_old_id)
-        result.snapshot_id = snapshot.id
-        result.save(update_fields=["snapshot_id"])
-        assert str(result.snapshot_id) == str(snapshot.id)
-        if idx % 5000 == 0:
-            print(f'Migrated {idx}/{num_total} ArchiveResult objects...')
-
-
-class Migration(migrations.Migration):
-
-    dependencies = [
-        ('core', '0039_rename_snapshot_archiveresult_snapshot_old'),
-    ]
-
-    operations = [
-        migrations.AddField(
-            model_name='archiveresult',
-            name='snapshot',
-            field=models.ForeignKey(null=True, on_delete=django.db.models.deletion.CASCADE, related_name='archiveresults', to='core.snapshot', to_field='id'),
-        ),
-        migrations.RunPython(update_archiveresult_snapshot_ids, reverse_code=migrations.RunPython.noop),
-    ]

+ 0 - 24
archivebox/core/migrations/0041_alter_archiveresult_snapshot_and_more.py

@@ -1,24 +0,0 @@
-# Generated by Django 5.0.6 on 2024-08-18 06:50
-
-import django.db.models.deletion
-from django.db import migrations, models
-
-
-class Migration(migrations.Migration):
-
-    dependencies = [
-        ('core', '0040_archiveresult_snapshot'),
-    ]
-
-    operations = [
-        migrations.AlterField(
-            model_name='archiveresult',
-            name='snapshot',
-            field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='core.snapshot', to_field='id'),
-        ),
-        migrations.AlterField(
-            model_name='archiveresult',
-            name='snapshot_old',
-            field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='archiveresults_old', to='core.snapshot'),
-        ),
-    ]

+ 0 - 17
archivebox/core/migrations/0042_remove_archiveresult_snapshot_old.py

@@ -1,17 +0,0 @@
-# Generated by Django 5.0.6 on 2024-08-18 06:51
-
-from django.db import migrations
-
-
-class Migration(migrations.Migration):
-
-    dependencies = [
-        ('core', '0041_alter_archiveresult_snapshot_and_more'),
-    ]
-
-    operations = [
-        migrations.RemoveField(
-            model_name='archiveresult',
-            name='snapshot_old',
-        ),
-    ]

+ 0 - 20
archivebox/core/migrations/0043_alter_archiveresult_snapshot_alter_snapshot_id_and_more.py

@@ -1,20 +0,0 @@
-# Generated by Django 5.0.6 on 2024-08-18 06:52
-
-import django.db.models.deletion
-import uuid
-from django.db import migrations, models
-
-
-class Migration(migrations.Migration):
-
-    dependencies = [
-        ('core', '0042_remove_archiveresult_snapshot_old'),
-    ]
-
-    operations = [
-        migrations.AlterField(
-            model_name='archiveresult',
-            name='snapshot',
-            field=models.ForeignKey(db_column='snapshot_id', on_delete=django.db.models.deletion.CASCADE, to='core.snapshot', to_field='id'),
-        ),
-    ]

+ 0 - 40
archivebox/core/migrations/0044_alter_archiveresult_snapshot_alter_tag_uuid_and_more.py

@@ -1,40 +0,0 @@
-# Generated by Django 5.0.6 on 2024-08-19 23:01
-
-import django.db.models.deletion
-import uuid
-from django.db import migrations, models
-
-
-
-class Migration(migrations.Migration):
-
-    dependencies = [
-        ('core', '0043_alter_archiveresult_snapshot_alter_snapshot_id_and_more'),
-    ]
-
-    operations = [
-        migrations.SeparateDatabaseAndState(
-            database_operations=[
-                # No-op, SnapshotTag model already exists in DB
-            ],
-            state_operations=[
-                migrations.CreateModel(
-                    name='SnapshotTag',
-                    fields=[
-                        ('id', models.AutoField(primary_key=True, serialize=False)),
-                        ('snapshot', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='core.snapshot')),
-                        ('tag', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='core.tag')),
-                    ],
-                    options={
-                        'db_table': 'core_snapshot_tags',
-                        'unique_together': {('snapshot', 'tag')},
-                    },
-                ),
-                migrations.AlterField(
-                    model_name='snapshot',
-                    name='tags',
-                    field=models.ManyToManyField(blank=True, related_name='snapshot_set', through='core.SnapshotTag', to='core.tag'),
-                ),
-            ],
-        ),
-    ]

+ 0 - 19
archivebox/core/migrations/0045_alter_snapshot_old_id.py

@@ -1,19 +0,0 @@
-# Generated by Django 5.0.6 on 2024-08-20 01:54
-
-import uuid
-from django.db import migrations, models
-
-
-class Migration(migrations.Migration):
-
-    dependencies = [
-        ('core', '0044_alter_archiveresult_snapshot_alter_tag_uuid_and_more'),
-    ]
-
-    operations = [
-        migrations.AlterField(
-            model_name='snapshot',
-            name='old_id',
-            field=models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False, unique=True),
-        ),
-    ]

+ 0 - 30
archivebox/core/migrations/0046_alter_archiveresult_snapshot_alter_snapshot_id_and_more.py

@@ -1,30 +0,0 @@
-# Generated by Django 5.0.6 on 2024-08-20 01:55
-
-import django.db.models.deletion
-import uuid
-from django.db import migrations, models
-
-
-class Migration(migrations.Migration):
-
-    dependencies = [
-        ('core', '0045_alter_snapshot_old_id'),
-    ]
-
-    operations = [
-        migrations.AlterField(
-            model_name='archiveresult',
-            name='snapshot',
-            field=models.ForeignKey(db_column='snapshot_id', on_delete=django.db.models.deletion.CASCADE, to='core.snapshot', to_field='id'),
-        ),
-        migrations.AlterField(
-            model_name='snapshot',
-            name='id',
-            field=models.UUIDField(default=uuid.uuid4, primary_key=True, serialize=False, unique=True),
-        ),
-        migrations.AlterField(
-            model_name='snapshot',
-            name='old_id',
-            field=models.UUIDField(default=uuid.uuid4, editable=False, unique=True),
-        ),
-    ]

+ 0 - 24
archivebox/core/migrations/0047_alter_snapshottag_unique_together_and_more.py

@@ -1,24 +0,0 @@
-# Generated by Django 5.0.6 on 2024-08-20 02:16
-
-import django.db.models.deletion
-from django.db import migrations, models
-
-
-class Migration(migrations.Migration):
-
-    dependencies = [
-        ('core', '0046_alter_archiveresult_snapshot_alter_snapshot_id_and_more'),
-    ]
-
-    operations = [
-        migrations.AlterField(
-            model_name='archiveresult',
-            name='snapshot',
-            field=models.ForeignKey(db_column='snapshot_id', on_delete=django.db.models.deletion.CASCADE, to='core.snapshot', to_field='id'),
-        ),
-        migrations.AlterField(
-            model_name='snapshottag',
-            name='tag',
-            field=models.ForeignKey(db_column='tag_id', on_delete=django.db.models.deletion.CASCADE, to='core.tag'),
-        ),
-    ]

+ 0 - 24
archivebox/core/migrations/0048_alter_archiveresult_snapshot_and_more.py

@@ -1,24 +0,0 @@
-# Generated by Django 5.0.6 on 2024-08-20 02:17
-
-import django.db.models.deletion
-from django.db import migrations, models
-
-
-class Migration(migrations.Migration):
-
-    dependencies = [
-        ('core', '0047_alter_snapshottag_unique_together_and_more'),
-    ]
-
-    operations = [
-        migrations.AlterField(
-            model_name='archiveresult',
-            name='snapshot',
-            field=models.ForeignKey(db_column='snapshot_id', on_delete=django.db.models.deletion.CASCADE, to='core.snapshot'),
-        ),
-        migrations.AlterField(
-            model_name='snapshottag',
-            name='snapshot',
-            field=models.ForeignKey(db_column='snapshot_id', on_delete=django.db.models.deletion.CASCADE, to='core.snapshot', to_field='old_id'),
-        ),
-    ]

+ 0 - 22
archivebox/core/migrations/0049_rename_snapshot_snapshottag_snapshot_old_and_more.py

@@ -1,22 +0,0 @@
-# Generated by Django 5.0.6 on 2024-08-20 02:26
-
-from django.db import migrations
-
-
-class Migration(migrations.Migration):
-
-    dependencies = [
-        ('core', '0048_alter_archiveresult_snapshot_and_more'),
-    ]
-
-    operations = [
-        migrations.RenameField(
-            model_name='snapshottag',
-            old_name='snapshot',
-            new_name='snapshot_old',
-        ),
-        migrations.AlterUniqueTogether(
-            name='snapshottag',
-            unique_together={('snapshot_old', 'tag')},
-        ),
-    ]

+ 0 - 19
archivebox/core/migrations/0050_alter_snapshottag_snapshot_old.py

@@ -1,19 +0,0 @@
-# Generated by Django 5.0.6 on 2024-08-20 02:30
-
-import django.db.models.deletion
-from django.db import migrations, models
-
-
-class Migration(migrations.Migration):
-
-    dependencies = [
-        ('core', '0049_rename_snapshot_snapshottag_snapshot_old_and_more'),
-    ]
-
-    operations = [
-        migrations.AlterField(
-            model_name='snapshottag',
-            name='snapshot_old',
-            field=models.ForeignKey(db_column='snapshot_old_id', on_delete=django.db.models.deletion.CASCADE, to='core.snapshot', to_field='old_id'),
-        ),
-    ]

+ 0 - 40
archivebox/core/migrations/0051_snapshottag_snapshot_alter_snapshottag_snapshot_old.py

@@ -1,40 +0,0 @@
-# Generated by Django 5.0.6 on 2024-08-20 02:31
-
-import django.db.models.deletion
-from django.db import migrations, models
-
-
-def update_snapshottag_ids(apps, schema_editor):
-    Snapshot = apps.get_model("core", "Snapshot")
-    SnapshotTag = apps.get_model("core", "SnapshotTag")
-    num_total = SnapshotTag.objects.all().count()
-    print(f'   Updating {num_total} SnapshotTag.snapshot_id values in place... (may take an hour or longer for large collections...)')
-    for idx, snapshottag in enumerate(SnapshotTag.objects.all().only('snapshot_old_id').iterator(chunk_size=500)):
-        assert snapshottag.snapshot_old_id
-        snapshot = Snapshot.objects.get(old_id=snapshottag.snapshot_old_id)
-        snapshottag.snapshot_id = snapshot.id
-        snapshottag.save(update_fields=["snapshot_id"])
-        assert str(snapshottag.snapshot_id) == str(snapshot.id)
-        if idx % 100 == 0:
-            print(f'Migrated {idx}/{num_total} SnapshotTag objects...')
-
-
-class Migration(migrations.Migration):
-
-    dependencies = [
-        ('core', '0050_alter_snapshottag_snapshot_old'),
-    ]
-
-    operations = [
-        migrations.AddField(
-            model_name='snapshottag',
-            name='snapshot',
-            field=models.ForeignKey(blank=True, db_column='snapshot_id', null=True, on_delete=django.db.models.deletion.CASCADE, to='core.snapshot'),
-        ),
-        migrations.AlterField(
-            model_name='snapshottag',
-            name='snapshot_old',
-            field=models.ForeignKey(db_column='snapshot_old_id', on_delete=django.db.models.deletion.CASCADE, related_name='snapshottag_old_set', to='core.snapshot', to_field='old_id'),
-        ),
-        migrations.RunPython(update_snapshottag_ids, reverse_code=migrations.RunPython.noop),
-    ]

+ 0 - 27
archivebox/core/migrations/0052_alter_snapshottag_unique_together_and_more.py

@@ -1,27 +0,0 @@
-# Generated by Django 5.0.6 on 2024-08-20 02:37
-
-import django.db.models.deletion
-from django.db import migrations, models
-
-
-class Migration(migrations.Migration):
-
-    dependencies = [
-        ('core', '0051_snapshottag_snapshot_alter_snapshottag_snapshot_old'),
-    ]
-
-    operations = [
-        migrations.AlterUniqueTogether(
-            name='snapshottag',
-            unique_together=set(),
-        ),
-        migrations.AlterField(
-            model_name='snapshottag',
-            name='snapshot',
-            field=models.ForeignKey(db_column='snapshot_id', on_delete=django.db.models.deletion.CASCADE, to='core.snapshot'),
-        ),
-        migrations.AlterUniqueTogether(
-            name='snapshottag',
-            unique_together={('snapshot', 'tag')},
-        ),
-    ]

+ 0 - 17
archivebox/core/migrations/0053_remove_snapshottag_snapshot_old.py

@@ -1,17 +0,0 @@
-# Generated by Django 5.0.6 on 2024-08-20 02:38
-
-from django.db import migrations
-
-
-class Migration(migrations.Migration):
-
-    dependencies = [
-        ('core', '0052_alter_snapshottag_unique_together_and_more'),
-    ]
-
-    operations = [
-        migrations.RemoveField(
-            model_name='snapshottag',
-            name='snapshot_old',
-        ),
-    ]

+ 0 - 18
archivebox/core/migrations/0054_alter_snapshot_timestamp.py

@@ -1,18 +0,0 @@
-# Generated by Django 5.0.6 on 2024-08-20 02:40
-
-from django.db import migrations, models
-
-
-class Migration(migrations.Migration):
-
-    dependencies = [
-        ('core', '0053_remove_snapshottag_snapshot_old'),
-    ]
-
-    operations = [
-        migrations.AlterField(
-            model_name='snapshot',
-            name='timestamp',
-            field=models.CharField(db_index=True, editable=False, max_length=32, unique=True),
-        ),
-    ]

+ 0 - 18
archivebox/core/migrations/0055_alter_tag_slug.py

@@ -1,18 +0,0 @@
-# Generated by Django 5.0.6 on 2024-08-20 03:24
-
-from django.db import migrations, models
-
-
-class Migration(migrations.Migration):
-
-    dependencies = [
-        ('core', '0054_alter_snapshot_timestamp'),
-    ]
-
-    operations = [
-        migrations.AlterField(
-            model_name='tag',
-            name='slug',
-            field=models.SlugField(editable=False, max_length=100, unique=True),
-        ),
-    ]

+ 0 - 17
archivebox/core/migrations/0056_remove_tag_uuid.py

@@ -1,17 +0,0 @@
-# Generated by Django 5.0.6 on 2024-08-20 03:25
-
-from django.db import migrations
-
-
-class Migration(migrations.Migration):
-
-    dependencies = [
-        ('core', '0055_alter_tag_slug'),
-    ]
-
-    operations = [
-        migrations.RemoveField(
-            model_name='tag',
-            name='uuid',
-        ),
-    ]

+ 0 - 18
archivebox/core/migrations/0057_rename_id_tag_old_id.py

@@ -1,18 +0,0 @@
-# Generated by Django 5.0.6 on 2024-08-20 03:29
-
-from django.db import migrations
-
-
-class Migration(migrations.Migration):
-
-    dependencies = [
-        ('core', '0056_remove_tag_uuid'),
-    ]
-
-    operations = [
-        migrations.RenameField(
-            model_name='tag',
-            old_name='id',
-            new_name='old_id',
-        ),
-    ]

+ 0 - 22
archivebox/core/migrations/0058_alter_tag_old_id.py

@@ -1,22 +0,0 @@
-# Generated by Django 5.0.6 on 2024-08-20 03:30
-
-import random
-from django.db import migrations, models
-
-
-def rand_int_id():
-    return random.getrandbits(32)
-
-class Migration(migrations.Migration):
-
-    dependencies = [
-        ('core', '0057_rename_id_tag_old_id'),
-    ]
-
-    operations = [
-        migrations.AlterField(
-            model_name='tag',
-            name='old_id',
-            field=models.BigIntegerField(default=rand_int_id, primary_key=True, serialize=False, verbose_name='Old ID'),
-        ),
-    ]

+ 0 - 90
archivebox/core/migrations/0059_tag_id.py

@@ -1,90 +0,0 @@
-# Generated by Django 5.0.6 on 2024-08-20 03:33
-
-from datetime import datetime
-from django.db import migrations, models
-from archivebox.base_models.abid import abid_from_values
-from archivebox.base_models.models import ABID
-
-def calculate_abid(self):
-    """
-    Return a freshly derived ABID (assembled from attrs defined in ABIDModel.abid_*_src).
-    """
-    prefix = self.abid_prefix
-    ts = eval(self.abid_ts_src)
-    uri = eval(self.abid_uri_src)
-    subtype = eval(self.abid_subtype_src)
-    rand = eval(self.abid_rand_src)
-
-    if (not prefix) or prefix == 'obj_':
-        suggested_abid = self.__class__.__name__[:3].lower()
-        raise Exception(f'{self.__class__.__name__}.abid_prefix must be defined to calculate ABIDs (suggested: {suggested_abid})')
-
-    if not ts:
-        ts = datetime.utcfromtimestamp(0)
-        print(f'[!] WARNING: Generating ABID with ts=0000000000 placeholder because {self.__class__.__name__}.abid_ts_src={self.abid_ts_src} is unset!', ts.isoformat())
-
-    if not uri:
-        uri = str(self)
-        print(f'[!] WARNING: Generating ABID with uri=str(self) placeholder because {self.__class__.__name__}.abid_uri_src={self.abid_uri_src} is unset!', uri)
-
-    if not subtype:
-        subtype = self.__class__.__name__
-        print(f'[!] WARNING: Generating ABID with subtype={subtype} placeholder because {self.__class__.__name__}.abid_subtype_src={self.abid_subtype_src} is unset!', subtype)
-
-    if not rand:
-        rand = getattr(self, 'uuid', None) or getattr(self, 'id', None) or getattr(self, 'pk')
-        print(f'[!] WARNING: Generating ABID with rand=self.id placeholder because {self.__class__.__name__}.abid_rand_src={self.abid_rand_src} is unset!', rand)
-
-    abid = abid_from_values(
-        prefix=prefix,
-        ts=ts,
-        uri=uri,
-        subtype=subtype,
-        rand=rand,
-    )
-    assert abid.ulid and abid.uuid and abid.typeid, f'Failed to calculate {prefix}_ABID for {self.__class__.__name__}'
-    return abid
-
-
-def update_archiveresult_ids(apps, schema_editor):
-    Tag = apps.get_model("core", "Tag")
-    num_total = Tag.objects.all().count()
-    print(f'   Updating {num_total} Tag.id, ArchiveResult.uuid values in place...')
-    for idx, tag in enumerate(Tag.objects.all().iterator(chunk_size=500)):
-        if not tag.slug:
-            tag.slug = tag.name.lower().replace(' ', '_')
-        if not tag.name:
-            tag.name = tag.slug
-        if not (tag.name or tag.slug):
-            tag.delete()
-            continue
-
-        assert tag.slug or tag.name, f'Tag.slug must be defined! You have a Tag(id={tag.pk}) missing a slug!'
-        tag.abid_prefix = 'tag_'
-        tag.abid_ts_src = 'self.created'
-        tag.abid_uri_src = 'self.slug'
-        tag.abid_subtype_src = '"03"'
-        tag.abid_rand_src = 'self.old_id'
-        tag.abid = calculate_abid(tag)
-        tag.id = tag.abid.uuid
-        tag.save(update_fields=["abid", "id", "name", "slug"])
-        assert str(ABID.parse(tag.abid).uuid) == str(tag.id)
-        if idx % 10 == 0:
-            print(f'Migrated {idx}/{num_total} Tag objects...')
-
-
-
-class Migration(migrations.Migration):
-
-    dependencies = [
-        ('core', '0058_alter_tag_old_id'),
-    ]
-
-    operations = [
-        migrations.AddField(
-            model_name='tag',
-            name='id',
-            field=models.UUIDField(blank=True, null=True),
-        ),
-        migrations.RunPython(update_archiveresult_ids, reverse_code=migrations.RunPython.noop),
-    ]

+ 0 - 19
archivebox/core/migrations/0060_alter_tag_id.py

@@ -1,19 +0,0 @@
-# Generated by Django 5.0.6 on 2024-08-20 03:42
-
-import uuid
-from django.db import migrations, models
-
-
-class Migration(migrations.Migration):
-
-    dependencies = [
-        ('core', '0059_tag_id'),
-    ]
-
-    operations = [
-        migrations.AlterField(
-            model_name='tag',
-            name='id',
-            field=models.UUIDField(default=uuid.uuid4, editable=False, unique=True),
-        ),
-    ]

+ 0 - 22
archivebox/core/migrations/0061_rename_tag_snapshottag_old_tag_and_more.py

@@ -1,22 +0,0 @@
-# Generated by Django 5.0.6 on 2024-08-20 03:43
-
-from django.db import migrations
-
-
-class Migration(migrations.Migration):
-
-    dependencies = [
-        ('core', '0060_alter_tag_id'),
-    ]
-
-    operations = [
-        migrations.RenameField(
-            model_name='snapshottag',
-            old_name='tag',
-            new_name='old_tag',
-        ),
-        migrations.AlterUniqueTogether(
-            name='snapshottag',
-            unique_together={('snapshot', 'old_tag')},
-        ),
-    ]

+ 0 - 19
archivebox/core/migrations/0062_alter_snapshottag_old_tag.py

@@ -1,19 +0,0 @@
-# Generated by Django 5.0.6 on 2024-08-20 03:44
-
-import django.db.models.deletion
-from django.db import migrations, models
-
-
-class Migration(migrations.Migration):
-
-    dependencies = [
-        ('core', '0061_rename_tag_snapshottag_old_tag_and_more'),
-    ]
-
-    operations = [
-        migrations.AlterField(
-            model_name='snapshottag',
-            name='old_tag',
-            field=models.ForeignKey(db_column='old_tag_id', on_delete=django.db.models.deletion.CASCADE, to='core.tag'),
-        ),
-    ]

+ 0 - 40
archivebox/core/migrations/0063_snapshottag_tag_alter_snapshottag_old_tag.py

@@ -1,40 +0,0 @@
-# Generated by Django 5.0.6 on 2024-08-20 03:45
-
-import django.db.models.deletion
-from django.db import migrations, models
-
-
-def update_snapshottag_ids(apps, schema_editor):
-    Tag = apps.get_model("core", "Tag")
-    SnapshotTag = apps.get_model("core", "SnapshotTag")
-    num_total = SnapshotTag.objects.all().count()
-    print(f'   Updating {num_total} SnapshotTag.tag_id values in place... (may take an hour or longer for large collections...)')
-    for idx, snapshottag in enumerate(SnapshotTag.objects.all().only('old_tag_id').iterator(chunk_size=500)):
-        assert snapshottag.old_tag_id
-        tag = Tag.objects.get(old_id=snapshottag.old_tag_id)
-        snapshottag.tag_id = tag.id
-        snapshottag.save(update_fields=["tag_id"])
-        assert str(snapshottag.tag_id) == str(tag.id)
-        if idx % 100 == 0:
-            print(f'Migrated {idx}/{num_total} SnapshotTag objects...')
-
-
-class Migration(migrations.Migration):
-
-    dependencies = [
-        ('core', '0062_alter_snapshottag_old_tag'),
-    ]
-
-    operations = [
-        migrations.AddField(
-            model_name='snapshottag',
-            name='tag',
-            field=models.ForeignKey(blank=True, db_column='tag_id', null=True, on_delete=django.db.models.deletion.CASCADE, to='core.tag', to_field='id'),
-        ),
-        migrations.AlterField(
-            model_name='snapshottag',
-            name='old_tag',
-            field=models.ForeignKey(db_column='old_tag_id', on_delete=django.db.models.deletion.CASCADE, related_name='snapshottags_old', to='core.tag'),
-        ),
-        migrations.RunPython(update_snapshottag_ids, reverse_code=migrations.RunPython.noop),
-    ]

+ 0 - 27
archivebox/core/migrations/0064_alter_snapshottag_unique_together_and_more.py

@@ -1,27 +0,0 @@
-# Generated by Django 5.0.6 on 2024-08-20 03:50
-
-import django.db.models.deletion
-from django.db import migrations, models
-
-
-class Migration(migrations.Migration):
-
-    dependencies = [
-        ('core', '0063_snapshottag_tag_alter_snapshottag_old_tag'),
-    ]
-
-    operations = [
-        migrations.AlterUniqueTogether(
-            name='snapshottag',
-            unique_together=set(),
-        ),
-        migrations.AlterField(
-            model_name='snapshottag',
-            name='tag',
-            field=models.ForeignKey(db_column='tag_id', on_delete=django.db.models.deletion.CASCADE, to='core.tag', to_field='id'),
-        ),
-        migrations.AlterUniqueTogether(
-            name='snapshottag',
-            unique_together={('snapshot', 'tag')},
-        ),
-    ]

+ 0 - 17
archivebox/core/migrations/0065_remove_snapshottag_old_tag.py

@@ -1,17 +0,0 @@
-# Generated by Django 5.0.6 on 2024-08-20 03:51
-
-from django.db import migrations
-
-
-class Migration(migrations.Migration):
-
-    dependencies = [
-        ('core', '0064_alter_snapshottag_unique_together_and_more'),
-    ]
-
-    operations = [
-        migrations.RemoveField(
-            model_name='snapshottag',
-            name='old_tag',
-        ),
-    ]

+ 0 - 34
archivebox/core/migrations/0066_alter_snapshottag_tag_alter_tag_id_alter_tag_old_id.py

@@ -1,34 +0,0 @@
-# Generated by Django 5.0.6 on 2024-08-20 03:52
-
-import core.models
-import django.db.models.deletion
-import uuid
-import random
-from django.db import migrations, models
-
-def rand_int_id():
-    return random.getrandbits(32)
-
-class Migration(migrations.Migration):
-
-    dependencies = [
-        ('core', '0065_remove_snapshottag_old_tag'),
-    ]
-
-    operations = [
-        migrations.AlterField(
-            model_name='snapshottag',
-            name='tag',
-            field=models.ForeignKey(db_column='tag_id', on_delete=django.db.models.deletion.CASCADE, to='core.tag', to_field='id'),
-        ),
-        migrations.AlterField(
-            model_name='tag',
-            name='id',
-            field=models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False, unique=True),
-        ),
-        migrations.AlterField(
-            model_name='tag',
-            name='old_id',
-            field=models.BigIntegerField(default=rand_int_id, serialize=False, unique=True, verbose_name='Old ID'),
-        ),
-    ]

Some files were not shown because too many files changed in this diff