|
|
@@ -0,0 +1,6109 @@
|
|
|
+import querystring from 'node:querystring';
|
|
|
+import { Readable } from 'node:stream';
|
|
|
+import { finished } from 'node:stream/promises';
|
|
|
+import { URL } from 'node:url';
|
|
|
+import util from 'node:util';
|
|
|
+const exec = util.promisify(child_process.exec);
|
|
|
+
|
|
|
+import { Readability } from '@mozilla/readability';
|
|
|
+import FileCookieStore from '@root/file-cookie-store';
|
|
|
+import merge from 'deepmerge';
|
|
|
+import { createCursor, getRandomPagePoint } from 'ghost-cursor';
|
|
|
+import { JSDOM, VirtualConsole } from 'jsdom';
|
|
|
+import mime from 'mime-types';
|
|
|
+import ToughCookie from 'tough-cookie';
|
|
|
+import unzip from 'unzip-crx-3';
|
|
|
+
|
|
|
+import puppeteer from 'puppeteer';
|
|
|
+import { Browser, Page, Cookie, HTTPResponse } from 'puppeteer';
|
|
|
+import { Cluster } from 'puppeteer-cluster';
|
|
|
+import PupeteerExtra from "puppeteer-extra";
|
|
|
+// #!/usr/bin/env node --env-file .env  -- displaced shebang; must be the first line of the script
|
|
|
+// https://gist.github.com/pirate/d9a350e83025a1e6cf452cddd815d0d4
|
|
|
+
|
|
|
+// npm install request node-request minimist deepmerge mime-types decompress puppeteer-extra puppeteer-extra-plugin-repl puppeteer-extra-plugin-user-preferences puppeteer-extra-plugin-recaptcha puppeteer-extra-plugin-stealth puppeteer-screen-recorder puppeteer-cluster ghost-cursor @mozilla/readability jsdom unzip-crx-3 node-fetch@2
|
|
|
+
|
|
|
+
|
|
|
+import assert from 'node:assert/strict';
|
|
|
+import { Buffer } from 'node:buffer';
|
|
|
+import child_process from 'node:child_process';
|
|
|
+import crypto from 'node:crypto';
|
|
|
+import fs from 'node:fs';
|
|
|
+import { createServer } from 'node:http';
|
|
|
+import os from 'node:os';
|
|
|
+import path from 'node:path';
|
|
|
+import StealthPlugin from "puppeteer-extra-plugin-stealth";
|
|
|
+import PrefsPlugin from 'puppeteer-extra-plugin-user-preferences';
|
|
|
+import { PuppeteerScreenRecorder } from 'puppeteer-screen-recorder';
|
|
|
+// import RecaptchaPlugin from 'puppeteer-extra-plugin-recaptcha';
|
|
|
+// import ReplPlugin from 'puppeteer-extra-plugin-repl';
|
|
|
+
|
|
|
+const __dirname = import.meta.dirname
|
|
|
+
|
|
|
+import { getDatabase } from './models/init-models.js';
|
|
|
+const { Tag, Snapshot, ArchiveResult } = await getDatabase({ dbpath: './index.sqlite3' })
|
|
|
+
|
|
|
+
|
|
|
+// move mitm CA cert into /usr/local/share/ca-certificates/mitmproxy-ca-cert.crt
|
|
|
+// update-ca-certificates
|
|
|
+
|
|
|
+
|
|
|
// Minimal ANSI escape sequences for colorized terminal output.
const ANSI = {
    reset: "\x1b[0m",  // clear all attributes
    blue: "\x1b[34m",  // foreground blue
    black: "\x1b[30m", // foreground black
}
|
|
|
+
|
|
|
+/************************* Main Input Arguments *******************************/
|
|
|
+let URLS = [
|
|
|
+ // 'chrome://about',
|
|
|
+ // 'chrome://system/#chrome_root_store',
|
|
|
+
|
|
|
+ 'https://facebook.com/815781663692514/?comment_id=1508571679703640',
|
|
|
+ 'https://www.instagram.com/p/CrTY1fENHr5/',
|
|
|
+ 'https://www.tiktok.com/@zemmour_eric/video/7342474065598319904?cid=7343316616878490400',
|
|
|
+ 'https://twitter.com/DZasken68678/status/1799833933271687304',
|
|
|
+ 'https://t.me/IONONMIARRENDOGROUP/13598',
|
|
|
+ 'https://www.youtube.com/watch?v=rpD0qgzlCms',
|
|
|
+ 'https://www.aap.com.au/factcheck/aboriginal-lands-claim-a-total-abdication-of-facts/',
|
|
|
+
|
|
|
+
|
|
|
+ 'https://gologin.com/check-browser',
|
|
|
+ 'https://arh.antoinevastel.com/bots/areyouheadless',
|
|
|
+
|
|
|
+ 'https://2captcha.com/demo/hcaptcha',
|
|
|
+ 'https://2captcha.com/demo/cloudflare-turnstile',
|
|
|
+ 'https://2captcha.com/demo/recaptcha-v3',
|
|
|
+ 'https://ipinfo.io/',
|
|
|
+
|
|
|
+ // 'https://2captcha.com/demo/recaptcha-v2',
|
|
|
+ // 'https://2captcha.com/demo/keycaptcha',
|
|
|
+ // 'https://browserleaks.com/canvas',
|
|
|
+ // 'https://bot.incolumitas.com/#botChallenge',
|
|
|
+ // 'https://infosimples.github.io/detect-headless/',
|
|
|
+ // 'https://coveryourtracks.eff.org/',
|
|
|
+ // 'https://fingerprint.com/demo/',
|
|
|
+ // 'https://nowsecure.nl',
|
|
|
+ // 'https://abrahamjuliot.github.io/creepjs/',
|
|
|
+ // 'https://scrapfly.io/web-scraping-tools/http2-fingerprint',
|
|
|
+ // 'https://scrapfly.io/web-scraping-tools/browser-fingerprint',
|
|
|
+ // 'https://scrapfly.io/web-scraping-tools/ja3-fingerprint',
|
|
|
+ // 'https://scrapfly.io/web-scraping-tools/canvas-fingerprint',
|
|
|
+ // 'https://scrapfly.io/web-scraping-tools/webgl-fingerprint',
|
|
|
+ // 'https://scrapfly.io/web-scraping-tools/audio-fingerprint',
|
|
|
+ // 'https://scrapfly.io/web-scraping-tools/screen-fingerprint',
|
|
|
+ // 'https://web-scraping.dev/',
|
|
|
+
|
|
|
+
|
|
|
+ // 'https://example.com',
|
|
|
+ // 'https://www.okta.com/',
|
|
|
+ // 'https://www.webflow.com/',
|
|
|
+ // 'https://docker-compose.archivebox.io',
|
|
|
+ // 'https://www.reddit.com/r/AskReddit/comments/1br0q9b/what_was_ok_10_years_ago_but_isnt_today/',
|
|
|
+ // 'https://www.quora.com/Is-the-website-2Captcha-true-or-fake-with-paying-money-for-working-on-it',
|
|
|
+ // 'https://x.com/yawnzzcalo7/status/1747853178849435894',
|
|
|
+ // 'https://twitter.com/yawnzzcalo7/status/1747853178849435894',
|
|
|
+ // 'https://rachdele.substack.com/p/is-the-job-market-dying',
|
|
|
+ // 'https://www.flowradar.com/cloneables/mouse-image-trail-effect',
|
|
|
+ // 'https://wrong.host.badssl.com/',
|
|
|
+ // 'http://docker-compose.archivebox.io',
|
|
|
+ // 'https://pptr.dev/api/puppeteer.page.setrequestinterception',
|
|
|
+ // 'https://blog.sweeting.me#Writing',
|
|
|
+ // 'https://github.com/yarnpkg/yarn/issues/9005',
|
|
|
+
|
|
|
+ // 'https://archive.md/739Oc',
|
|
|
+ // 'https://archive.md/Oc72d',
|
|
|
+ // 'https://archive.vn/fPUBe',
|
|
|
+ // 'https://archive.vn/mRz4P',
|
|
|
+ // 'https://archive.vn/Qct6Y',
|
|
|
+ // 'https://archive.vn/sv50h',
|
|
|
+ // 'https://facebook.com/815781663692514/?comment_id=1508571679703640',
|
|
|
+ // 'https://facebook.com/815781663692514/?comment_id=924451748966499',
|
|
|
+ // 'https://www.facebook.com/wayne.brennan.528/posts/pfbid02fvxFppng2WsHMavhBa62cXizCBGdmPQRH3CMhac79qzS5C1ADaSNC587d3u6qVbkl',
|
|
|
+ // 'https://www.facebook.com/wildeprods/posts/pfbid02YEPfoB7pZqMNzE4y2MpYSQbRAzASquvHyEMzHqrNngJCSL7onEg2jnsqS6epcQHWl',
|
|
|
+ // 'https://t.me/aubontouite_francais/9493',
|
|
|
+ // 'https://t.me/BC_BLACKMIROR/5044',
|
|
|
+ // 'https://t.me/IONONMIARRENDOGROUP/14004',
|
|
|
+ // 'https://t.me/newsfactory_pl/51014',
|
|
|
+ // 'https://t.me/oliverjanich/132574',
|
|
|
+ // 'https://t.me/tomaszgryguc/10449',
|
|
|
+ // 'https://t.me/amigosDisidentes/123177',
|
|
|
+ // 'https://twitter.com/1nfiltr4do_NN/status/1767238399943991389',
|
|
|
+ // 'https://twitter.com/4lmondcookie/status/1748519205438111914',
|
|
|
+ // 'https://twitter.com/4olll1ke/status/1753796944827199766',
|
|
|
+ // 'https://twitter.com/yeokiloss/status/1754908226179502345',
|
|
|
+ // 'https://twitter.com/YoungWaifLover/status/1735667278090297561',
|
|
|
+ // 'https://twitter.com/Z_Pour_Demain/status/1766133730278605182',
|
|
|
+ // 'https://www.aap.com.au/factcheck/aboriginal-lands-claim-a-total-abdication-of-facts/',
|
|
|
+ // 'https://www.aap.com.au/factcheck/absurd-albanese-clip-fools-voice-voters/',
|
|
|
+ // 'https://www.instagram.com/_the.forgotten.ones/p/CQQDyoqhsF6/',
|
|
|
+ // 'https://www.instagram.com/p/CqSM_f9MR4b/',
|
|
|
+ // 'https://www.instagram.com/p/CqSQgf1sv8B/',
|
|
|
+ // 'https://instagram.com/p/B-Q22Z_pxyC/',
|
|
|
+ // 'https://www.tiktok.com/@zitatezurzeit/photo/7342474065598319904?cid=7343316616878490400',
|
|
|
+ // 'https://tiktok.com/@zitatezurzeit/photo/7342474065598319904?cid=7343316616878490400',
|
|
|
+ // 'https://www.youtube.com/watch?v=rpD0qgzlCms',
|
|
|
+]
|
|
|
+
|
|
|
// Interpret an environment-variable string as a boolean: '1'/'yes'/'true'
// (any case) are truthy; anything else — including undefined — is falsy.
const isTruthy = (env_value) => {
    const normalized = env_value?.toLowerCase() || 'false'
    return ['1', 'yes', 'true'].includes(normalized)
}
|
|
|
+
|
|
|
+/********************** Config: General High-Level Options ********************/
|
|
|
+
|
|
|
+const PASSIVE_ARCHIVING = isTruthy(process.env.PASSIVE_ARCHIVING)
|
|
|
+const CHROME_CLUSTER = isTruthy(process.env.CHROME_CLUSTER)
|
|
|
+const CHROME_CLUSTER_WORKERS = 4
|
|
|
+
|
|
|
+const API_SERVER_HOST = '0.0.0.0'
|
|
|
+const API_SERVER_PORT = 9595
|
|
|
+const CHROME_DEBUG_PORT = 9222 // 9222 is default, or use 0 for random port
|
|
|
+
|
|
|
+/********************** Config: Keys & Secrets ********************************/
|
|
|
+
|
|
|
+const API_KEY_2CAPTCHA = process.env.API_KEY_2CAPTCHA || 'YOUR_API_KEY_HERE'
|
|
|
+const FLARESOLVERR_API_ENDPOINT = process.env.FLARESOLVERR_API_ENDPOINT || "http://localhost:8191/v1"
|
|
|
+
|
|
|
+const ACTIVE_PERSONA = process.env.ACTIVE_PERSONA || 'Default'
|
|
|
+const CHROME_PROFILE_USER = process.env.CHROME_PROFILE_USER || 'Default'
|
|
|
+const LOAD_AUTH_STORAGE = isTruthy(process.env.LOAD_AUTH_STORAGE)
|
|
|
+const SAVE_AUTH_STORAGE = isTruthy(process.env.SAVE_AUTH_STORAGE)
|
|
|
+
|
|
|
+/********************** Config: Data Dir Locations ****************************/
|
|
|
+
|
|
|
+const SRC_DIR = path.resolve(__dirname)
|
|
|
+const DATA_DIR = process.env.DATA_DIR || await fs.promises.realpath(path.join(SRC_DIR, 'data'))
|
|
|
+const INDEXES_DIR = path.join(DATA_DIR, 'index')
|
|
|
+const ARCHIVE_DIR = path.join(DATA_DIR, 'archive')
|
|
|
// Fail fast if the expected data layout is missing (likely wrong working directory).
// BUGFIX: throw an Error instance (carries a stack trace) instead of a bare string.
if (!fs.existsSync(ARCHIVE_DIR))
    throw new Error('Could not find data/archive, are you running in the right pwd?')
|
|
|
+
|
|
|
+const PERSONA_DIR = path.join(DATA_DIR, 'personas', ACTIVE_PERSONA)
|
|
|
+const CHROME_PROFILE_PATH = path.join(PERSONA_DIR, 'chrome_profile')
|
|
|
+const CHROME_DOWNLOADS_DIR = path.join(PERSONA_DIR, 'chrome_downloads')
|
|
|
+const CHROME_EXTENSIONS_DIR = path.join(PERSONA_DIR, 'chrome_extensions')
|
|
|
+const CHROME_EXTENSIONS_JSON_PATH = path.join(CHROME_EXTENSIONS_DIR, 'extensions.json')
|
|
|
+const AUTH_JSON_PATH = path.join(PERSONA_DIR, 'auth.json')
|
|
|
+const COOKIES_TXT_PATH = path.join(PERSONA_DIR, 'cookies.txt')
|
|
|
+const SPEEDTESTS_DIR = path.join(PERSONA_DIR, 'speedtests')
|
|
|
+// const CHROME_PROFILE_IMPORT_USER = 'Profile 1'
|
|
|
+// const CHROME_PROFILE_IMPORT_PATH = '/Volumes/NVME/Users/squash/Library/Application Support/Google/Chrome'
|
|
|
+
|
|
|
+// chrome profile / persona directories
|
|
|
+fs.mkdirSync(PERSONA_DIR, {recursive: true})
|
|
|
+fs.mkdirSync(SPEEDTESTS_DIR, {recursive: true})
|
|
|
+fs.mkdirSync(CHROME_PROFILE_PATH, {recursive: true})
|
|
|
+fs.mkdirSync(CHROME_EXTENSIONS_DIR, {recursive: true})
|
|
|
+fs.mkdirSync(CHROME_DOWNLOADS_DIR, {recursive: true})
|
|
|
+
|
|
|
+// cruft directories
|
|
|
+const ORPHANS_DIR = path.join(DATA_DIR, 'orphans')
|
|
|
+const PARTIALS_DIR = path.join(DATA_DIR, 'partials')
|
|
|
+const DUPLICATES_DIR = path.join(DATA_DIR, 'duplicates')
|
|
|
+await fs.promises.mkdir(ORPHANS_DIR, {recursive: true})
|
|
|
+await fs.promises.mkdir(PARTIALS_DIR, {recursive: true})
|
|
|
+await fs.promises.mkdir(DUPLICATES_DIR, {recursive: true})
|
|
|
+
|
|
|
+/********************** Config: Viewport Setup Opts ***************************/
|
|
|
+
|
|
|
+// Config: Viewport
|
|
|
+const DEFAULT_TIMEOUT = 20_000
|
|
|
+const DEFAULT_GEOLOCATION = {latitude: 59.95, longitude: 30.31667}
|
|
|
+const DEFAULT_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36'
|
|
|
+const DEFAULT_ASPECT_RAIO = 16/9 // recommended: 16:9 (most common desktop window aspect ratio)
|
|
|
+const SCREENSHOT_ASPECT_RATIO = 4/3 // recommended: 4:3 (easier to use as thumbnails when square-ish)
|
|
|
+const DEFAULT_WINDOW_WIDTH = 1920 // recommended: 1920x1080p (1080p screenshots)
|
|
|
+const DEFAULT_WINDOW_HEIGHT = Math.floor(DEFAULT_WINDOW_WIDTH/DEFAULT_ASPECT_RAIO)
|
|
|
+const DEFAULT_VIEWPORT = {
|
|
|
+ width: DEFAULT_WINDOW_WIDTH,
|
|
|
+ height: DEFAULT_WINDOW_HEIGHT,
|
|
|
+ deviceScaleFactor: 2, // 2 gives much sharper text in screenshots/pdfs/etc but uses more CPU/GPU
|
|
|
+ isMobile: false,
|
|
|
+ hasTouch: false,
|
|
|
+ isLandscape: false,
|
|
|
+}
|
|
|
+const DEFAULT_COLOR_SCHEME = 'light'
|
|
|
+const DEFAULT_HEADERS = {
|
|
|
+ // requires frequent tweaking to remain undetected by cloudflare/recaptcha/etc.
|
|
|
+ // 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
|
|
|
+ // 'accept-encoding': 'gzip, deflate, br, zstd',
|
|
|
+ // 'accept-language': accept_language,
|
|
|
+ // 'cache-Control': no_cache ? 'no-cache' : '',
|
|
|
+ // 'dnt': '1',
|
|
|
+ 'sec-ch-ua': '"Google Chrome";v="122", "Not:A-Brand";v="8", "Chromium";v="122"',
|
|
|
+ 'sec-ch-ua-mobile': '?0',
|
|
|
+ 'sec-ch-ua-platform': '"macOS"',
|
|
|
+ 'connection-rtt': '50',
|
|
|
+ // 'pragma': no_cache ? 'no-cache' : '',
|
|
|
+ // 'sec-fetch-dest': 'document',
|
|
|
+ // 'sec-fetch-mode': 'navigate',
|
|
|
+ // 'sec-fetch-site': 'none',
|
|
|
+ // 'sec-fetch-user': '?1',
|
|
|
+ // // 'upgrade-insecure-requests': '1', // breaks some sites, e.g. https://www.flowradar.com/cloneables/mouse-image-trail-effect
|
|
|
+ // 'user-agent': user_agent,
|
|
|
+}
|
|
|
+
|
|
|
+const DEFAULT_REFERRERS = ["https://www.google.com", "https://www.facebook.com", "https://www.instagram.com"]
|
|
|
+
|
|
|
+/****************** Config: Human Behavior Emulation **************************/
|
|
|
+
|
|
|
+const SCROLL_LIMIT = 20; // e.g. 30 = 30 * (1000px/2s) => 30,000px scrolled in 60sec
|
|
|
+const SCROLL_DELAY = 1350; // interval per scroll, e.g. 2000 = 2sec to travel 1 * SCROLL_DISTANCE
|
|
|
+const SCROLL_DISTANCE = DEFAULT_VIEWPORT.height - 100; // make sure this is slightly less than viewport height so there is some overlap to make stitching easier
|
|
|
+
|
|
|
+/********************** Config: URL Rewriting *********************************/
|
|
|
+const URL_REWRITES = [
|
|
|
+ // replacements should come first
|
|
|
+ // {
|
|
|
+ // idx: 0,
|
|
|
+ // pattern: /\/\/(www\.)?x\.com/gi,
|
|
|
+ // replacement: '//$1twitter.com/',
|
|
|
+ // // TODO: scope: 'hostname',
|
|
|
+ // },
|
|
|
+ // {
|
|
|
+ // idx: 1,
|
|
|
+ // pattern: /\/\/(www\.)?twitter\.com/gi,
|
|
|
+ // replacement: '//$1nitter.net',
|
|
|
+ // // TODO: scope: 'hostname',
|
|
|
+ // },
|
|
|
+
|
|
|
+ // // blocks should come at the end
|
|
|
+ // {
|
|
|
+ // idx: 999,
|
|
|
+ // pattern: /\/\/(www\.)?notallowed\.com/gi,
|
|
|
+ // replacement: '',
|
|
|
+ // // TODO: scope: 'href',
|
|
|
+ // },
|
|
|
+]
|
|
|
+const URL_SCHEMES_IGNORED = [
|
|
|
+ '', // no scheme is also invalid (e.g. opening a new tab page without any url yet)
|
|
|
+ 'chrome',
|
|
|
+ 'chrome-extension',
|
|
|
+ 'chrome-untrusted',
|
|
|
+ 'file',
|
|
|
+ 'data',
|
|
|
+ 'about',
|
|
|
+]
|
|
|
+
|
|
|
+
|
|
|
+/**************** Load existing data/archive/<timestamp> snapshots *************/
|
|
|
+
|
|
|
+const snapshots = await Snapshot.findAll({ attributes: ['id', 'timestamp', 'url'] }) // include: { model: ArchiveResult, as: 'archiveresults' }, });
|
|
|
+const results = await ArchiveResult.findAll({ attributes: ['id', 'snapshot_id', 'extractor', 'start_ts'] }) // include: { model: Snapshot, as: 'snapshot' }, });
|
|
|
+globalThis.snapshots = snapshots
|
|
|
+globalThis.results = results
|
|
|
+console.log(`[💿] Found ${snapshots.length} existing snapshots in index.sqlite3...`)
|
|
|
+console.log(`[💿] Found ${results.length} existing results in index.sqlite3...`)
|
|
|
+// debugger;
|
|
|
+
|
|
|
/**
 * Scan data/archive/<timestamp>/ snapshot dirs and build a {url: snapshot_dir} map.
 * Only dirs containing an index.json with a schemeful url ('...://...') are included.
 * @param {string} archive_dir - absolute path of the data/archive directory.
 * @returns {Object<string,string>} map of snapshot url -> absolute snapshot dir path.
 * @throws {Error} if a dir's index.json declares an archive_path that does not match
 *                 the dir it lives in (i.e. a misplaced/corrupt index.json).
 * BUGFIX: throws Error instances (with stack traces) instead of bare strings, and
 * includes a separating space before the dir name in the message.
 */
const locateExistingSnapshots = (archive_dir) => {
    const urls_to_dirs = {}
    for (const snapshot_dir of fs.readdirSync(archive_dir)) {
        const snapshot_json = path.join(archive_dir, snapshot_dir, 'index.json')
        if (!fs.existsSync(snapshot_json)) continue  // not a (complete) snapshot dir

        const {url, archive_path} = JSON.parse(fs.readFileSync(snapshot_json, 'utf-8'))

        // sanity-check: the index.json must belong to the dir it was found in
        if (!snapshot_dir.includes(archive_path.replace('archive/', '')))
            throw new Error(`Found incorrect index.json inside snapshot dir ${snapshot_dir}`)

        if (url && url.includes('://')) {
            urls_to_dirs[url] = path.join(archive_dir, snapshot_dir)
        }
    }
    return urls_to_dirs
}
|
|
|
+
|
|
|
+let SNAPSHOT_DIRS_BY_URL = locateExistingSnapshots(ARCHIVE_DIR)
|
|
|
+
|
|
|
+let all_snap_dirs = (await fs.promises.readdir(ARCHIVE_DIR))
|
|
|
+// const orphan_snap_dirs = all_snap_dirs.filter(dirname => dirname.startsWith('19999'))
|
|
|
+
|
|
|
+// // scan through existing snapshot dirs, move orphans to orphans/ or correct archive/<snapid>
|
|
|
+// for (const snap_id of orphan_snap_dirs) {
|
|
|
+// if (snap_id.startsWith('.')) continue
|
|
|
+// const src_dir = path.join(ARCHIVE_DIR, snap_id)
|
|
|
+// let src_path = src_dir
|
|
|
+
|
|
|
+// assert((await fs.promises.stat(src_dir)).isDirectory())
|
|
|
+// let dest_path = null
|
|
|
+
|
|
|
+// const orphan_metrics_path = path.join(src_dir, 'metrics.json')
|
|
|
+// if (fs.existsSync(orphan_metrics_path)) {
|
|
|
+// const orphan_metrics = JSON.parse(await fs.promises.readFile(orphan_metrics_path, 'utf-8'))
|
|
|
+// const url = orphan_metrics.url || orphan_metrics.URL
|
|
|
+// const version = orphan_metrics.VERSION || versionStrFromDate(orphan_metrics.start_time)
|
|
|
+
|
|
|
+// // move all bare files into ./versions/YYYYMMDD/* and symlink ./* to latest version
|
|
|
+// await symlinkBestSnapshotResults(src_dir)
|
|
|
+
|
|
|
+// dest_path = SNAPSHOT_DIRS_BY_URL[url]
|
|
|
+// const dest_id = dest_path?.split('/').at(-1)
|
|
|
+
|
|
|
+// if (dest_id && (dest_id != snap_id)) {
|
|
|
+// if (fs.existsSync(dest_path)) {
|
|
|
+// console.log(` - moving duplicate snap_dir ${src_dir} -> ${dest_path}`)
|
|
|
+// } else {
|
|
|
+// console.log(` - moving valid snap_dir ${src_dir} -> ${dest_path}`)
|
|
|
+// }
|
|
|
+// } else if (dest_id == snap_id) {
|
|
|
+// continue
|
|
|
+// } else {
|
|
|
+// dest_path = path.join(ORPHANS_DIR, snap_id)
|
|
|
+// console.log(` - moving orphan snap_dir ${src_dir} -> ${dest_path}`)
|
|
|
+// }
|
|
|
+// } else {
|
|
|
+// // corrupt/par
|
|
|
+// dest_path = path.join(PARTIALS_DIR, snap_id)
|
|
|
+// console.log(` - moving parial snap_dir ${src_dir} -> ${dest_path}`)
|
|
|
+// }
|
|
|
+// if (dest_path) {
|
|
|
+// for (const version_dir of (await fs.promises.readdir(path.join(src_path, 'versions')))) {
|
|
|
+// const version_src = path.join(src_path, 'versions', version_dir)
|
|
|
+// const version_dst = path.join(dest_path, 'versions', version_dir)
|
|
|
+
|
|
|
+// // move all bare files into ./versions/YYYYMMDD/* and symlink ./* to latest version
|
|
|
+// await symlinkBestSnapshotResults(dest_path)
|
|
|
+
|
|
|
+// assert(!fs.existsSync(version_dst))
|
|
|
+// await fs.promises.rename(version_src, version_dst)
|
|
|
+// console.log(' - ', version_src, '--->', version_dst)
|
|
|
+// }
|
|
|
+// await fs.promises.rename(src_dir, path.join(PARTIALS_DIR, snap_id))
|
|
|
+// await symlinkBestSnapshotResults(dest_path)
|
|
|
+// }
|
|
|
+// }
|
|
|
+
|
|
|
+// const duplicate_snap_dirs = (await fs.promises.readdir(DUPLICATES_DIR)).filter(dirname => dirname.startsWith('19999'))
|
|
|
+// for (const snap_id of duplicate_snap_dirs) {
|
|
|
+// const src_dir = path.join(DUPLICATES_DIR, snap_id)
|
|
|
+// const metrics = JSON.parse(await fs.promises.readFile(path.join(src_dir, 'metrics.json'), 'utf-8'))
|
|
|
+// }
|
|
|
+
|
|
|
+// all_snap_dirs = (await fs.promises.readdir(ARCHIVE_DIR))
|
|
|
+// for (const snap_id of all_snap_dirs) {
|
|
|
+// if (snap_id.startsWith('.')) continue
|
|
|
+// const snap_dir = path.join(ARCHIVE_DIR, snap_id)
|
|
|
+// const metrics_path = path.join(snap_dir, 'metrics.json')
|
|
|
+// if (fs.existsSync(metrics_path)) {
|
|
|
+// // console.log(' - updating snap_dir', snap_dir)
|
|
|
+// await symlinkBestSnapshotResults(snap_dir)
|
|
|
+// }
|
|
|
+// }
|
|
|
+// SNAPSHOT_DIRS_BY_URL = locateExistingSnapshots(ARCHIVE_DIR)
|
|
|
+
|
|
|
+
|
|
|
+fs.writeFileSync(path.join(DATA_DIR, 'queue.csv'), '')
|
|
|
+
|
|
|
// Extract the snapshot id (the path component after '.../archive/') from a dir path.
// If the path contains no '/archive/' segment the whole path is returned unchanged.
const snapIdFromDir = (dir_path) => {
    const segments = dir_path.split('/archive/')
    return segments[segments.length - 1]
}
|
|
|
+
|
|
|
// All discovered [url, snapshot_dir] pairs ordered oldest-first by numeric snapshot id.
// Sorted newest-first and then reversed (rather than sorted ascending directly) to
// preserve the original's tie/NaN ordering exactly.
const compareBySnapIdDescending = ([_url_a, dir_a], [_url_b, dir_b]) =>
    Number(snapIdFromDir(dir_b)) - Number(snapIdFromDir(dir_a))
const snapshot_dir_list = Object.entries(SNAPSHOT_DIRS_BY_URL)
    .sort(compareBySnapIdDescending)
    .reverse()
|
|
|
+
|
|
|
// Queue every previously-seen snapshot whose URL is not on facebook/instagram for
// re-archiving, by appending "<snapshot_dir>,<url>" rows to data/queue.csv.
for (const [existing_url, snapshot_dir] of snapshot_dir_list) {
    const is_ignored_host = existing_url.includes('facebook.com/') || existing_url.includes('instagram.com/')
    const already_archived = false // TODO: re-enable check for an existing versions/ dir
    if (is_ignored_host || already_archived) continue

    const queue_row = `${snapshot_dir},${existing_url}\n`
    fs.appendFileSync(path.join(DATA_DIR, 'queue.csv'), queue_row, 'utf-8')
}
URLS = [...new Set(URLS)] // dedupe while preserving insertion order
console.log('[+] Added', URLS.length, 'existing urls to queue...')
|
|
|
+
|
|
|
+
|
|
|
+/********************** Config: Output Paths **********************************/
|
|
|
// Resolve the snapshot output dir for a URL: reuse the existing on-disk snapshot
// dir when one was discovered at startup, otherwise fall back to a fresh placeholder
// dir named with a sentinel timestamp plus a hash of the url.
// (hashCode is defined elsewhere in this file.)
const TASK_PATH = (url) => {
    const existing_snap_dir = SNAPSHOT_DIRS_BY_URL[url]
    return existing_snap_dir || path.join(ARCHIVE_DIR, `1999999999.${hashCode(url)}`)
}
|
|
|
+
|
|
|
// Build the absolute path for a named output artifact inside this page's snapshot dir.
// BUGFIX: the template previously interpolated the literal text "$(unknown)" instead
// of the `filename` parameter (which was left unused), so every artifact of a page
// collided on the same path. Restores `${filename}${extname}`.
const OUTPUT_PATH = (page, filename, extname='') =>
    path.join(TASK_PATH(page._original_url), `${filename}${extname}`)
|
|
|
+
|
|
|
// Per-artifact path helpers: each maps a page to the file (or dir) that extractor
// writes inside the page's snapshot directory.

// network / protocol metadata
const SSL_PATH = (page) => OUTPUT_PATH(page, 'ssl.json')
const CONSOLELOG_PATH = (page) => OUTPUT_PATH(page, 'console.log')
const HEADERS_PATH = (page) => OUTPUT_PATH(page, 'headers.json')
const REDIRECTS_PATH = (page) => OUTPUT_PATH(page, 'redirects.json')
const REQUESTS_PATH = (page) => OUTPUT_PATH(page, 'requests.json')
const TRACE_PATH = (page) => OUTPUT_PATH(page, 'trace.json')
const METRICS_PATH = (page) => OUTPUT_PATH(page, 'metrics.json')

// page content / metadata extracts
const OUTLINKS_PATH = (page) => OUTPUT_PATH(page, 'outlinks.json')
const SEO_PATH = (page) => OUTPUT_PATH(page, 'seo.json')
const FAVICON_PATH = (page) => OUTPUT_PATH(page, 'favicon.json')
const TITLE_PATH = (page) => OUTPUT_PATH(page, 'title.txt')
const BODYTEXT_PATH = (page) => OUTPUT_PATH(page, 'body.txt')
const PANDOC_PATH = (page) => OUTPUT_PATH(page, 'pandoc.md')
const READABILITY_PATH = (page) => OUTPUT_PATH(page, 'readability.json')
// NOTE(review): name is misspelled ("ACCESIBILITY") but kept for caller compatibility
const ACCESIBILITY_PATH = (page) => OUTPUT_PATH(page, 'accessibility.json')
const DOM_PATH = (page) => OUTPUT_PATH(page, 'dom.html')

// rendered / media outputs
const PDF_PATH = (page) => OUTPUT_PATH(page, 'output.pdf')
const SCREENSHOT_PATH = (page) => OUTPUT_PATH(page, 'screenshot.png')
const SCREENSHOT_JPG_PATH = (page) => OUTPUT_PATH(page, 'screenshot.jpg')
const AIQA_PATH = (page) => OUTPUT_PATH(page, 'aiqa.json')
const SINGLEFILE_PATH = (page) => OUTPUT_PATH(page, 'singlefile.html')
const YTDLP_PATH = (page) => OUTPUT_PATH(page, 'media/')
const GALLERYDL_PATH = (page) => OUTPUT_PATH(page, 'photos/')
const SCREENRECORDING_PATH = (page) => OUTPUT_PATH(page, 'screenrecording.mp4')
const SCREENRECORDGIF_PATH = (page) => OUTPUT_PATH(page, 'screenrecording.gif')

// raw response dumps (directories)
const RESPONSES_PATH = (page) => OUTPUT_PATH(page, 'responses')
const RAW_PATH = (page) => OUTPUT_PATH(page, 'raw')
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+/********************** Config: Chrome Extensions *****************************/
|
|
|
+
|
|
|
+interface ChromeExtension {
|
|
|
+ name: string
|
|
|
+ webstore_id: string
|
|
|
+}
|
|
|
+interface LoadedChromeExtension extends ChromeExtension {
|
|
|
+ id?: string
|
|
|
+ webstore_url?: string
|
|
|
+ crx_url?: string
|
|
|
+ crx_path?: string
|
|
|
+ unpacked_path?: string
|
|
|
+ read_manifest?: () => any
|
|
|
+ read_version?: () => string | null
|
|
|
+}
|
|
|
+
|
|
|
+const CHROME_EXTENSIONS: LoadedChromeExtension[] = [
|
|
|
+ // Content access / unblocking / blocking plugins
|
|
|
+ {webstore_id: 'ifibfemgeogfhoebkmokieepdoobkbpo', name: 'captcha2'}, // https://2captcha.com/blog/how-to-use-2captcha-solver-extension-in-puppeteer
|
|
|
+ {webstore_id: 'edibdbjcniadpccecjdfdjjppcpchdlm', name: 'istilldontcareaboutcookies'},
|
|
|
+ {webstore_id: 'cjpalhdlnbpafiamejdnhcphjbkeiagm', name: 'ublock'},
|
|
|
+ // {webstore_id: 'mlomiejdfkolichcflejclcbmpeaniij', name: 'ghostery'},
|
|
|
+ // {webstore_id: 'mnjggcdmjocbbbhaepdhchncahnbgone', name: 'sponsorblock'},
|
|
|
+ // {webstore_id: 'iplffkdpngmdjhlpjmppncnlhomiipha', name: 'unpaywall'},
|
|
|
+ // {webstore_id: 'gofocbepaccnkpphbgjpolififgcakhn', name: 'spaywallnews'},
|
|
|
+
|
|
|
+ // Archiving plugins
|
|
|
+ {webstore_id: 'mpiodijhokgodhhofbcjdecpffjipkle', name: 'singlefile'},
|
|
|
+ // {webstore_id: 'fpeoodllldobpkbkabpblcfaogecpndd', name: 'archivewebpage'},
|
|
|
+ // {webstore_id: 'niloccemoadcdkdjlinkgdfekeahmflj', name: 'pocket'},
|
|
|
+ // {webstore_id: 'kenncghfghgolcbmckhiljgaabnpcaaa', name: 'warcreate'},
|
|
|
+ // {webstore_id: 'jjndjgheafjngoipoacpjgeicjeomjli', name: 'puppeteerstream'},
|
|
|
+
|
|
|
+ // Utilities for humans setting up/viewing/debugging the archiving session
|
|
|
+ // {webstore_id: 'aeblfdkhhhdcdjpifhhbdiojplfjncoa', name: '1password'},
|
|
|
+ // {webstore_id: 'fngmhnnpilhplaeedifhccceomclgfbg', name: 'editthiscookie'},
|
|
|
+ // {webstore_id: 'cgfpgnepljlgenjclbekbjdlgcodfmjp', name: 'simpletabsorter'},
|
|
|
+
|
|
|
+ // Scripting/automation plugins
|
|
|
+ // {webstore_id: 'jinjaccalgkegednnccohejagnlnfdag', name: 'violentmonkey'},
|
|
|
+ // {webstore_id: 'infppggnoaenmfagbfknfkancpbljcca', name: 'automa'},
|
|
|
+ // {webstore_id: 'pfegffhjcgkneoemnlniggnhkfioidjg', name: 'screenscraper'},
|
|
|
+]
|
|
|
+
|
|
|
+/******************** Config: Chrome Profile Preferences **********************/
|
|
|
+
|
|
|
+// https://niek.github.io/chrome-features/
|
|
|
+const CHROME_DISABLED_COMPONENTS = [
|
|
|
+ 'Translate',
|
|
|
+ 'AcceptCHFrame',
|
|
|
+ 'OptimizationHints',
|
|
|
+ 'ProcessPerSiteUpToMainFrameThreshold',
|
|
|
+ 'InterestFeedContentSuggestions',
|
|
|
+ 'CalculateNativeWinOcclusion',
|
|
|
+ 'BackForwardCache',
|
|
|
+ 'HeavyAdPrivacyMitigations',
|
|
|
+ 'LazyFrameLoading',
|
|
|
+ 'ImprovedCookieControls',
|
|
|
+ 'PrivacySandboxSettings4',
|
|
|
+ 'AutofillServerCommunication',
|
|
|
+ 'CertificateTransparencyComponentUpdater',
|
|
|
+ 'DestroyProfileOnBrowserClose',
|
|
|
+ 'CrashReporting',
|
|
|
+ 'OverscrollHistoryNavigation',
|
|
|
+ 'InfiniteSessionRestore',
|
|
|
+ //'LockProfileCookieDatabase', // disabling allows multiple chrome instances to concurrently modify profile, but might make chrome much slower https://github.com/yt-dlp/yt-dlp/issues/7271 https://issues.chromium.org/issues/40901624
|
|
|
+]
|
|
|
+
|
|
|
+const CHROME_PREFERENCES_EXTRA = {}
|
|
|
+const CHROME_PREFERENCES_DEFAULT = {
|
|
|
+ // https://chromium.googlesource.com/chromium/src/+/32352ad08ee673a4d43e8593ce988b224f6482d3/chrome/common/pref_names.cc
|
|
|
+ homepage: 'about:blank', // doesn't work here, managed by Secure Preferences
|
|
|
+ homepage_is_newtabpage: false, // doesn't work here, managed by Secure Preferences
|
|
|
+ session: { // doesn't work here, managed by Secure Preferences
|
|
|
+ restore_on_startup: 4, // doesn't work here, managed by Secure Preferences
|
|
|
+ startup_urls: 'about:blank', // doesn't work here, managed by Secure Preferences
|
|
|
+ },
|
|
|
+ default_apps: 'noinstall',
|
|
|
+ browser: {
|
|
|
+ confirm_to_quit: false,
|
|
|
+ enable_spellchecking: false,
|
|
|
+ check_default_browser: false,
|
|
|
+ show_update_promotion_info_bar: false,
|
|
|
+ },
|
|
|
+ profile: {
|
|
|
+ // name: 'ArchiveBox Persona: Default', // doesnt work to change display name, not sure why
|
|
|
+ // using_default_name: false,
|
|
|
+ exited_cleanly: true,
|
|
|
+ default_content_setting_values: {
|
|
|
+ automatic_downloads: 1,
|
|
|
+ },
|
|
|
+ },
|
|
|
+ bookmark_bar: {show_on_all_tabs: false},
|
|
|
+ safebrowsing: {enabled: false},
|
|
|
+ search: {suggest_enabled: false},
|
|
|
+ download: {
|
|
|
+ prompt_for_download: false,
|
|
|
+ open_pdf_in_system_reader: true,
|
|
|
+ // default_directory: CHROME_DOWNLOADS_DIR || path.join(__dirname, 'downloads'),
|
|
|
+ },
|
|
|
+ select_file_dialogs: {allowed: false},
|
|
|
+ autofill: {save_data: false},
|
|
|
+ printing: {enabled: false},
|
|
|
+ message_center: {welcome_notification_dismissed_local: true},
|
|
|
+ extensions: {
|
|
|
+ ui: {
|
|
|
+ developer_mode: true,
|
|
|
+ dismissed_adt_promo: true,
|
|
|
+ },
|
|
|
+ // pinned_extensions: CHROME_EXTENSIONS?.map(({id}) => id) || [],
|
|
|
+ },
|
|
|
+ webkit: {
|
|
|
+ webprefs: {
|
|
|
+ javascript_enabled: true,
|
|
|
+ minimum_font_size: 9,
|
|
|
+ // default_font_size: 12,
|
|
|
+ // web_security_enabled: false,
|
|
|
+ // allow_displaying_insecure_content: true,
|
|
|
+ // allow_running_insecure_content: true,
|
|
|
+ java_enabled: true,
|
|
|
+ loads_images_automatically: true,
|
|
|
+ },
|
|
|
+ },
|
|
|
+ settings: {
|
|
|
+ multi_profile_never_show_intro: true,
|
|
|
+ multi_profile_warning_show_dismissed: true,
|
|
|
+ first_run_tutorial_shown: true,
|
|
|
+ },
|
|
|
+ plugins: {
|
|
|
+ always_open_pdf_externally: true,
|
|
|
+ },
|
|
|
+}
|
|
|
+
|
|
|
+const CHROME_PREFERENCES_PATH = path.join(CHROME_PROFILE_PATH, 'Default', 'Preferences')
|
|
|
+
|
|
|
// Compose the final Chrome Preferences object by deep-merging, in increasing
// precedence: the static defaults, any user-supplied extras, and the values that
// must be computed at runtime (pinned extension ids + this persona's download dir).
const getChromePreferences = ({CHROME_PREFERENCES_DEFAULT, CHROME_PREFERENCES_EXTRA, CHROME_EXTENSIONS, CHROME_DOWNLOADS_DIR}) => {
    const computed_prefs = {
        extensions: {
            pinned_extensions: CHROME_EXTENSIONS?.map(({id}) => id) || [],
        },
        download: {
            default_directory: CHROME_DOWNLOADS_DIR || path.join(__dirname, 'downloads'),
        },
    }
    return merge.all([CHROME_PREFERENCES_DEFAULT, CHROME_PREFERENCES_EXTRA, computed_prefs])
}
|
|
|
+
|
|
|
/**
 * Ensure Chrome launches with the desired preferences.
 * If the profile's Preferences file already exists on disk, deep-merge the desired
 * prefs into it directly; otherwise the profile has not been created yet, so register
 * the user-preferences plugin instead (the plugin only works on first profile creation).
 * Returns the (possibly augmented) puppeteer instance.
 */
function applyChromePreferences(puppeteer, prefs_path, preferences) {
    if (!fs.existsSync(prefs_path)) {
        // no profile yet -> let the plugin write prefs when the profile is first created
        puppeteer.use(PrefsPlugin({userPrefs: preferences}))
        return puppeteer
    }
    const preferences_existing = JSON.parse(fs.readFileSync(prefs_path, 'utf-8'))
    const preferences_merged = merge(preferences_existing, preferences)
    fs.writeFileSync(prefs_path, JSON.stringify(preferences_merged))
    return puppeteer
}
|
|
|
+
|
|
|
+
|
|
|
+/******************** Config: Chrome Launch Args ******************************/
|
|
|
+
|
|
|
+const CHROME_ARGS_DEFAULT = [
|
|
|
+ // Headless behavior tuning, determinstic behavior settings
|
|
|
+ // '--headless=new',
|
|
|
+ '--test-type',
|
|
|
+ '--test-type=gpu', // https://github.com/puppeteer/puppeteer/issues/10516
|
|
|
+ '--deterministic-mode',
|
|
|
+ '--js-flags=--random-seed=1157259159', // make all JS random numbers deterministic by providing a seed
|
|
|
+ '--allow-pre-commit-input', // allow JS mutations before page rendering is complete
|
|
|
+ '--disable-blink-features=AutomationControlled', // hide the signatures that announce browser is being remote-controlled
|
|
|
+ '--enable-automation', // <- DONT USE THIS, it makes you easily detectable / blocked by cloudflare
|
|
|
+ // `--proxy-server=https://43.159.28.126:2334:u7ce652b7568805c4-zone-custom-region-us-session-szGWq3FRU-sessTime-60:u7ce652b7568805c4`, // send all network traffic through a proxy https://2captcha.com/proxy
|
|
|
+ // `--proxy-bypass-list=127.0.0.1`,
|
|
|
+
|
|
|
+ // Docker-specific options
|
|
|
+ // https://github.com/GoogleChrome/lighthouse-ci/tree/main/docs/recipes/docker-client#--no-sandbox-issues-explained
|
|
|
+ // '--no-sandbox', // rely on docker sandboxing in docker, otherwise we need cap_add: SYS_ADM to use host sandboxing
|
|
|
+ // '--disable-gpu-sandbox',
|
|
|
+ // '--disable-setuid-sandbox',
|
|
|
+ // '--disable-dev-shm-usage', // docker 75mb default shm size is not big enough, disabling just uses /tmp instead
|
|
|
+ // '--no-xshm',
|
|
|
+
|
|
|
+ // Profile data dir setup
|
|
|
+ // chrome://profile-internals
|
|
|
+ `--user-data-dir=${CHROME_PROFILE_PATH}`,
|
|
|
+ `--profile-directory=${CHROME_PROFILE_USER}`,
|
|
|
+ '--password-store=basic', // use mock keychain instead of OS-provided keychain (we manage auth.json instead)
|
|
|
+ '--use-mock-keychain',
|
|
|
+ '--disable-cookie-encryption', // we need to be able to write unencrypted cookies to save/load auth.json
|
|
|
+ // '--disable-sync', // don't try to use Google account sync features
|
|
|
+
|
|
|
+ // Extensions
|
|
|
+ // chrome://inspect/#extensions
|
|
|
+ // `--load-extension=${CHROME_EXTENSIONS.map(({unpacked_path}) => unpacked_path).join(',')}`, // not needed when using existing profile that already has extensions installed
|
|
|
+ `--allowlisted-extension-id=${CHROME_EXTENSIONS.map(({ webstore_id }) => webstore_id).join(',')}`,
|
|
|
+ '--allow-legacy-extension-manifests',
|
|
|
+
|
|
|
+ // Browser window and viewport setup
|
|
|
+ // chrome://version
|
|
|
+ // `--user-agent="${DEFAULT_USER_AGENT}"`,
|
|
|
+ // `--window-size=${DEFAULT_VIEWPORT.width},${DEFAULT_VIEWPORT.height}`,
|
|
|
+ '--window-position=0,0',
|
|
|
+ '--hide-scrollbars', // hide scrollbars because otherwise they show up in screenshots
|
|
|
+ '--install-autogenerated-theme=169,32,85', // red border makes it easier to see which chrome window is archivebox's
|
|
|
+ '--virtual-time-budget=60000', // fast-forward all animations & timers by 60s
|
|
|
+ '--autoplay-policy=no-user-gesture-required', // auto-start videos so they trigger network requests + show up in outputs
|
|
|
+ '--disable-gesture-requirement-for-media-playback',
|
|
|
+ '--lang=en-US,en;q=0.9',
|
|
|
+
|
|
|
+ // DANGER: JS isolation security features (to allow easier tampering with pages during archiving)
|
|
|
+ // chrome://net-internals
|
|
|
+ // '--disable-web-security', // <- WARNING, breaks some sites that expect/enforce strict CORS headers (try webflow.com)
|
|
|
+ // '--disable-features=IsolateOrigins,site-per-process', // useful for injecting JS, but some very strict sites can panic / show error pages when isolation is disabled (e.g. webflow.com)
|
|
|
+ // '--allow-running-insecure-content', // Breaks CORS/CSRF/HSTS etc., useful sometimes but very easy to detect
|
|
|
+ // '--allow-file-access-from-files', // <- WARNING, dangerous, allows JS to read filesystem using file:// URLs
|
|
|
+
|
|
|
+ // // DANGER: Disable HTTPS verification
|
|
|
+ // '--ignore-certificate-errors',
|
|
|
+ // '--ignore-ssl-errors',
|
|
|
+ // '--ignore-certificate-errors-spki-list',
|
|
|
+ // '--allow-insecure-localhost',
|
|
|
+
|
|
|
+ // IO: stdin/stdout, debug port config
|
|
|
+ // chrome://inspect
|
|
|
+ '--log-level=2', // 1=DEBUG 2=WARNING 3=ERROR
|
|
|
+ '--enable-logging=stderr',
|
|
|
+ '--remote-debugging-address=0.0.0.0',
|
|
|
+ `--remote-debugging-port=${CHROME_DEBUG_PORT}`,
|
|
|
+
|
|
|
+ // GPU, canvas, text, and pdf rendering config
|
|
|
+ // chrome://gpu
|
|
|
+ '--enable-webgl', // enable web-gl graphics support
|
|
|
+ '--font-render-hinting=none', // make rendering more deterministic by ignoring OS font hints, may also need css override, try: * {text-rendering: geometricprecision !important; -webkit-font-smoothing: antialiased;}
|
|
|
+ '--force-color-profile=srgb', // make rendering more deterministic by using consitent color profile, if browser looks weird, try: generic-rgb
|
|
|
+ '--disable-partial-raster', // make rendering more deterministic (TODO: verify if still needed)
|
|
|
+ '--disable-skia-runtime-opts', // make rendering more deterministic by avoiding Skia hot path runtime optimizations
|
|
|
+ '--disable-2d-canvas-clip-aa', // make rendering more deterministic by disabling antialiasing on 2d canvas clips
|
|
|
+ // '--disable-gpu', // falls back to more consistent software renderer
|
|
|
+ // // '--use-gl=swiftshader', <- DO NOT USE, breaks M1 ARM64. it makes rendering more deterministic by using simpler CPU renderer instead of OS GPU renderer bug: https://groups.google.com/a/chromium.org/g/chromium-dev/c/8eR2GctzGuw
|
|
|
+ // // '--disable-software-rasterizer', <- DO NOT USE, harmless, used in tandem with --disable-gpu
|
|
|
+ // // '--run-all-compositor-stages-before-draw', <- DO NOT USE, makes headful chrome hang on startup (tested v121 Google Chrome.app on macOS)
|
|
|
+ // // '--disable-gl-drawing-for-tests', <- DO NOT USE, disables gl output (makes tests run faster if you dont care about canvas)
|
|
|
+ // // '--blink-settings=imagesEnabled=false', <- DO NOT USE, disables images entirely (only sometimes useful to speed up loading)
|
|
|
+
|
|
|
+ // Process management & performance tuning
|
|
|
+ // chrome://process-internals
|
|
|
+ '--disable-lazy-loading', // make rendering more deterministic by loading all content up-front instead of on-focus
|
|
|
+ '--disable-renderer-backgrounding', // dont throttle tab rendering based on focus/visibility
|
|
|
+ '--disable-background-networking', // dont throttle tab networking based on focus/visibility
|
|
|
+ '--disable-background-timer-throttling', // dont throttle tab timers based on focus/visibility
|
|
|
+ '--disable-backgrounding-occluded-windows', // dont throttle tab window based on focus/visibility
|
|
|
+ '--disable-ipc-flooding-protection', // dont throttle ipc traffic or accessing big request/response/buffer/etc. objects will fail
|
|
|
+ '--disable-extensions-http-throttling', // dont throttle http traffic based on runtime heuristics
|
|
|
+ '--disable-field-trial-config', // disable shared field trial state between browser processes
|
|
|
+ '--disable-back-forward-cache', // disable browsing navigation cache
|
|
|
+ // '--in-process-gpu', <- DONT USE THIS, makes headful startup time ~5-10s slower (tested v121 Google Chrome.app on macOS)
|
|
|
+ // '--disable-component-extensions-with-background-pages', // TODO: check this, disables chrome components that only run in background (could lower startup time)
|
|
|
+
|
|
|
+ // uncomment to disable hardware camera/mic/speaker access + present fake devices to websites
|
|
|
+ // (faster to disable, but disabling breaks recording browser audio in puppeteer-stream screenrecordings)
|
|
|
+ // '--use-fake-device-for-media-stream',
|
|
|
+ // '--use-fake-ui-for-media-stream',
|
|
|
+ // '--disable-features=GlobalMediaControls,MediaRouter,DialMediaRouteProvider',
|
|
|
+
|
|
|
+ // // Output format options (PDF, screenshot, etc.)
|
|
|
+ '--export-tagged-pdf', // include table on contents and tags in printed PDFs
|
|
|
+ '--generate-pdf-document-outline',
|
|
|
+
|
|
|
+ // Suppress first-run features, popups, hints, updates, etc.
|
|
|
+ // chrome://system
|
|
|
+ '--no-pings',
|
|
|
+ '--no-first-run',
|
|
|
+ '--no-default-browser-check',
|
|
|
+ '--disable-default-apps',
|
|
|
+ '--ash-no-nudges',
|
|
|
+ '--disable-infobars',
|
|
|
+ '--disable-search-engine-choice-screen',
|
|
|
+ '--disable-session-crashed-bubble',
|
|
|
+ '--simulate-outdated-no-au="Tue, 31 Dec 2099 23:59:59 GMT"',
|
|
|
+ '--hide-crash-restore-bubble',
|
|
|
+ '--suppress-message-center-popups',
|
|
|
+ '--disable-client-side-phishing-detection',
|
|
|
+ '--disable-domain-reliability',
|
|
|
+ '--disable-component-update',
|
|
|
+ '--disable-datasaver-prompt',
|
|
|
+ '--disable-hang-monitor',
|
|
|
+ '--disable-session-crashed-bubble',
|
|
|
+ '--disable-speech-synthesis-api',
|
|
|
+ '--disable-speech-api',
|
|
|
+ '--disable-print-preview',
|
|
|
+ '--safebrowsing-disable-auto-update',
|
|
|
+ '--deny-permission-prompts',
|
|
|
+ '--disable-external-intent-requests',
|
|
|
+ '--disable-notifications',
|
|
|
+ '--disable-desktop-notifications',
|
|
|
+ '--noerrdialogs',
|
|
|
+ '--disable-popup-blocking',
|
|
|
+ '--disable-prompt-on-repost',
|
|
|
+ '--silent-debugger-extension-api',
|
|
|
+ '--block-new-web-contents',
|
|
|
+ '--metrics-recording-only',
|
|
|
+ '--disable-breakpad',
|
|
|
+
|
|
|
+
|
|
|
+ // other feature flags
|
|
|
+ // chrome://flags chrome://components
|
|
|
+ `--disable-features=${CHROME_DISABLED_COMPONENTS.join(',')}`,
|
|
|
+ '--enable-features=NetworkService',
|
|
|
+]
|
|
|
// extra Chrome CLI args appended after the defaults by getChromeArgs(); empty by default
const CHROME_ARGS_EXTRA = []


// bundle of launch-time settings consumed as the default argument of getChromeArgs() below
// (object property shorthand: each key mirrors the module-level constant of the same name)
const CHROME_LAUNCH_OPTIONS = {
    CHROME_PROFILE_PATH,
    CHROME_PROFILE_USER,
    CHROME_EXTENSIONS,
    CHROME_DEBUG_PORT,
    CHROME_DISABLED_COMPONENTS,
    DEFAULT_VIEWPORT,
    CHROME_ARGS_DEFAULT,
    CHROME_ARGS_EXTRA,
}
|
|
|
+/* Chrome CLI Args Documentation
|
|
|
+ - https://github.com/GoogleChrome/chrome-launcher/blob/main/docs/chrome-flags-for-tools.md
|
|
|
+ - https://chromium.googlesource.com/chromium/chromium/+/master/content/public/common/content_switches.cc
|
|
|
+ - https://jtway.co/optimize-your-chrome-options-for-testing-to-get-x1-25-impact-4f19f071bf45
|
|
|
+ - https://peter.sh/experiments/chromium-command-line-switches/
|
|
|
+ - https://www.chromium.org/developers/how-tos/run-chromium-with-flags/
|
|
|
+ - https://github.com/manoj9788/Chrome-Driver-arguments/blob/master/README.md
|
|
|
+*/
|
|
|
/**
 * Assemble the final Chrome CLI argument list from a set of launch options
 * (defaults to the module-level CHROME_LAUNCH_OPTIONS bundle).
 * Profile/extension/viewport/debug-port flags are appended after
 * CHROME_ARGS_DEFAULT, and CHROME_ARGS_EXTRA is appended last.
 * NOTE(review): this builds --allowlisted-extension-id from ext.id (the
 * locally-computed unpacked-extension id) while CHROME_ARGS_DEFAULT builds it
 * from ext.webstore_id — confirm which id Chrome actually expects here.
 * @returns {string[]} full list of Chrome CLI arguments
 */
const getChromeArgs = ({CHROME_ARGS_DEFAULT, CHROME_ARGS_EXTRA,
                        CHROME_PROFILE_PATH, CHROME_PROFILE_USER,
                        CHROME_EXTENSIONS,
                        CHROME_DEBUG_PORT,
                        CHROME_DISABLED_COMPONENTS,
                        DEFAULT_VIEWPORT}=CHROME_LAUNCH_OPTIONS) => {
    const unpacked_paths = CHROME_EXTENSIONS.map((ext) => ext.unpacked_path).join(',')
    const extension_ids = CHROME_EXTENSIONS.map((ext) => ext.id).join(',')
    const override_args = [
        `--user-data-dir=${CHROME_PROFILE_PATH}`,
        `--profile-directory=${CHROME_PROFILE_USER}`,
        `--load-extension=${unpacked_paths}`,
        `--allowlisted-extension-id=${extension_ids}`,
        `--window-size=${DEFAULT_VIEWPORT.width},${DEFAULT_VIEWPORT.height}`,
        `--remote-debugging-port=${CHROME_DEBUG_PORT}`,
        `--disable-features=${CHROME_DISABLED_COMPONENTS.join(',')}`,
    ]
    return [...CHROME_ARGS_DEFAULT, ...override_args, ...CHROME_ARGS_EXTRA]
}
|
|
|
+
|
|
|
+
|
|
|
+/******************** Chrome Extension Management *****************************/
|
|
|
+
|
|
|
/**
 * Compute the Chrome extension id for an unpacked extension directory.
 * Chrome derives an unpacked extension's id from the SHA-256 hash of its
 * directory path: the first 32 hex digits of the digest are remapped from
 * the hex alphabet [0-9a-f] onto the letters [a-p].
 * @param {string} unpacked_path - directory expected to contain manifest.json
 * @returns {string|null} 32-char extension id, or null if no manifest.json exists
 */
function getExtensionId(unpacked_path) {
    const manifest_path = path.join(unpacked_path, 'manifest.json')
    if (!fs.existsSync(manifest_path)) return null

    const digest_hex = crypto.createHash('sha256')
        .update(Buffer.from(unpacked_path, 'utf-8'))
        .digest('hex')

    // remap each of the first 32 hex digits (0-f) to a letter in 'a'-'p'
    let detected_id = ''
    for (const hex_char of digest_hex.slice(0, 32)) {
        detected_id += String.fromCharCode('a'.charCodeAt(0) + parseInt(hex_char, 16))
    }
    return detected_id
}
|
|
|
+
|
|
|
+async function installExtension(extension) {
|
|
|
+ const manifest_path = path.join(extension.unpacked_path, 'manifest.json')
|
|
|
+
|
|
|
+ // Download extensions using:
|
|
|
+ // curl -fsSL 'https://clients2.google.com/service/update2/crx?response=redirect&prodversion=1230&acceptformat=crx3&x=id%3D$EXTENSION_ID%26uc' > extensionname.crx
|
|
|
+ // unzip -d extensionname extensionname.zip
|
|
|
+
|
|
|
+ if (!fs.existsSync(manifest_path) && !fs.existsSync(extension.crx_path)) {
|
|
|
+ console.log("[🛠️] Downloading missing extension", extension.name, extension.webstore_id, '->', extension.crx_path);
|
|
|
+
|
|
|
+ // Download crx file from ext.crx_url -> ext.crx_path
|
|
|
+ const response = await fetch(extension.crx_url) as Response
|
|
|
+ const crx_file = fs.createWriteStream(extension.crx_path);
|
|
|
+ if (response.headers.get("content-length") && response.body) {
|
|
|
+ // @ts-ignore
|
|
|
+ const crx_stream = Readable.fromWeb(response.body)
|
|
|
+ await finished(crx_stream.pipe(crx_file))
|
|
|
+ } else {
|
|
|
+ console.warn('[⚠️] Failed to download extension', extension.name, extension.webstore_id)
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ var {stdout, stderr} = {stdout: '', stderr: ''}
|
|
|
+
|
|
|
+ // Unzip crx file from ext.crx_url -> ext.unpacked_path
|
|
|
+ await fs.promises.mkdir(extension.unpacked_path, {recursive: true})
|
|
|
+ try {
|
|
|
+ var {stdout, stderr} = await exec(`/usr/bin/unzip ${extension.crx_path} -d ${extension.unpacked_path}`)
|
|
|
+ } catch(err1) {
|
|
|
+ try {
|
|
|
+ await unzip(extension.crx_path, extension.unpacked_path)
|
|
|
+ } catch(err2) {
|
|
|
+ // console.error(`[❌] Failed to install ${extension.crx_path}: could not unzip crx`, err1, err2)
|
|
|
+ // return false
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ if (!fs.existsSync(manifest_path))
|
|
|
+ console.error(`[❌] Failed to install ${extension.crx_path}: could not find manifest.json in unpacked_path`, stdout, stderr)
|
|
|
+
|
|
|
+ return fs.existsSync(manifest_path)
|
|
|
+}
|
|
|
+
|
|
|
+async function loadOrInstallExtension(ext) {
|
|
|
+ if (!(ext.webstore_id || ext.unpacked_path))
|
|
|
+ throw 'Extension must have either {webstore_id} or {unpacked_path}'
|
|
|
+
|
|
|
+ // Set statically computable extension metadata
|
|
|
+ ext.webstore_id = ext.webstore_id || ext.id
|
|
|
+ ext.name = ext.name || ext.webstore_id
|
|
|
+ ext.webstore_url = ext.webstore_url || `https://chromewebstore.google.com/detail/${ext.webstore_id}`
|
|
|
+ ext.crx_url = ext.crx_url || `https://clients2.google.com/service/update2/crx?response=redirect&prodversion=1230&acceptformat=crx3&x=id%3D${ext.webstore_id}%26uc`
|
|
|
+ ext.crx_path = ext.crx_path || path.join(CHROME_EXTENSIONS_DIR, `${ext.webstore_id}__${ext.name}.crx`)
|
|
|
+ ext.unpacked_path = ext.unpacked_path || path.join(CHROME_EXTENSIONS_DIR, `${ext.webstore_id}__${ext.name}`)
|
|
|
+
|
|
|
+ const manifest_path = path.join(ext.unpacked_path, 'manifest.json')
|
|
|
+ ext.read_manifest = () => JSON.parse(fs.readFileSync(manifest_path, 'utf-8'))
|
|
|
+ ext.read_version = () => fs.existsSync(manifest_path) && ext.read_manifest()?.version || null
|
|
|
+
|
|
|
+ // if extension is not installed, download and unpack it
|
|
|
+ if (!ext.read_version()) {
|
|
|
+ await installExtension(ext)
|
|
|
+ }
|
|
|
+
|
|
|
+ // autodetect id from filesystem path (unpacked extensions dont have stable IDs)
|
|
|
+ ext.id = getExtensionId(ext.unpacked_path)
|
|
|
+ ext.version = ext.read_version()
|
|
|
+ if (!ext.version) {
|
|
|
+ console.warn('[❌] Unable to detect ID and version of installed extension', prettyPath(ext.unpacked_path))
|
|
|
+ } else {
|
|
|
+ console.log(`[➕] Installed extension ${ext.name} (${ext.version})...`.padEnd(82), prettyPath(ext.unpacked_path))
|
|
|
+ }
|
|
|
+
|
|
|
+ return ext
|
|
|
+}
|
|
|
+
|
|
|
/**
 * Inspect a puppeteer Target and classify it: is it an extension background
 * context (MV3 service worker or MV2 background page), and if so which
 * extension does it belong to?
 * Targets that vanish mid-inspection (race with tabs closing at startup) are
 * reported harmlessly as {target_type: 'closed', target_url: 'about:closed'}.
 * @param {object} target - a puppeteer Target
 * @returns {Promise<object>} {target_type, target_ctx, target_url, target_is_bg,
 *                             target_is_extension, extension_id, manifest_version}
 */
async function isTargetExtension(target) {
    let target_type
    let target_ctx
    let target_url
    try {
        target_type = target.type()
        target_ctx = (await target.worker()) || (await target.page()) || null
        target_url = target.url() || target_ctx?.url() || null
    } catch(err) {
        // this runs on initial browser startup, so it can race with the initial
        // new tab page closing; a target that disappeared mid-check throws a
        // harmless error — treat it as closed (definitely not an extension bg page)
        if (!String(err).includes('No target with given id found')) {
            throw err
        }
        target_type = 'closed'
        target_ctx = null
        target_url = 'about:closed'
    }

    const bg_types = ['service_worker', 'background_page']
    const target_is_bg = bg_types.includes(target_type)
    const target_is_extension = target_url?.startsWith('chrome-extension://')

    // extension id is the "host" portion of the chrome-extension:// URL
    let extension_id = null
    if (target_is_extension) {
        extension_id = target_url.split('://')[1].split('/')[0] || null
    }

    // MV3 backgrounds run as service workers; everything else is treated as MV2
    const manifest_version = (target_type === 'service_worker') ? '3' : '2'

    return {
        target_type,
        target_ctx,
        target_url,
        target_is_bg,
        target_is_extension,
        extension_id,
        manifest_version,
    }
}
|
|
|
+
|
|
|
// Build a live runtime handle for an extension from a puppeteer Target,
// wiring up dispatch* helpers that execute code inside the extension's
// background context (MV3 service worker or MV2 background page).
// Returns null for any target that is not an extension background context.
// NOTE: mutates the matching entry of `extensions` in-place (Object.assign at the end).
async function loadExtensionFromTarget(extensions, target) {
    const {
        target_is_bg,
        target_is_extension,
        target_type,
        target_ctx,
        target_url,
        extension_id,
        manifest_version,
    } = await isTargetExtension(target)

    // only extension background pages / service workers with a usable JS context qualify
    if (!(target_is_bg && extension_id && target_ctx))
        return null

    // read the manifest from inside the extension's own JS context
    const manifest = await target_ctx.evaluate(() =>
        // @ts-ignore
        chrome.runtime.getManifest())

    const { name, version, homepage_url, options_page, options_ui } = manifest

    if (!version || !extension_id)
        return null

    // resolve the options page (manifest options_page / options_ui.page,
    // defaulting to options.html) into a full chrome-extension:// URL
    const options_url = await target_ctx.evaluate(
        (options_page) => chrome.runtime.getURL(options_page),
        options_page || options_ui?.page || 'options.html',
    )

    // list the extension's registered keyboard-shortcut commands ({} if the API is absent)
    // NOTE(review): `commands` is only read by the commented-out log below — confirm it's still wanted
    const commands = await target_ctx.evaluate(async () =>
        (await new Promise((resolve, reject) => {
            if (chrome.commands)
                chrome.commands.getAll(resolve)
            else
                resolve({})
        }))
    )

    // console.log(`[+] Found Manifest V${manifest_version} Extension:`, extension_id, name, target_url, Object.keys(commands).length)

    // run arbitrary JS inside the extension's background context
    let dispatchEval = async (...args) =>
        await target_ctx.evaluate(...args)
    // open the extension popup (MV3 chrome.action.openPopup, falling back to opening popup.html in a tab)
    let dispatchPopup = async () =>
        await target_ctx.evaluate('chrome.action?.openPopup() || chrome.tabs.create({url: chrome.runtime.getURL("popup.html")})')

    // the remaining dispatchers differ between manifest v3 and v2 APIs
    let dispatchAction
    let dispatchMessage
    let dispatchCommand

    if (manifest_version === '3') {
        dispatchAction = async (tab) => {
            // https://developer.chrome.com/docs/extensions/reference/api/action#event-onClicked
            return await target_ctx.evaluate(async (tab) => {
                // default to the currently-active tab when none is given
                tab = tab || (await new Promise((resolve) =>
                    chrome.tabs.query({currentWindow: true, active: true}, ([tab]) => resolve(tab))))
                // @ts-ignore
                return await chrome.action.onClicked.dispatch(tab)
            }, tab)
        }
        dispatchMessage = async (message, options) => {
            // https://developer.chrome.com/docs/extensions/reference/api/runtime
            return await target_ctx.evaluate(async (extension_id, message, options) => {
                return await chrome.runtime.sendMessage(extension_id, message, options)
            }, extension_id, message, options)
        }
        dispatchCommand = async (command, tab) => {
            // https://developer.chrome.com/docs/extensions/reference/api/commands#event-onCommand
            return await target_ctx.evaluate(async (command, tab) => {
                // @ts-ignore
                return await chrome.commands.onCommand.dispatch(command, tab)
            }, command, tab)
        }
    } else if (manifest_version === '2') {
        dispatchAction = async (tab) => {
            // https://developer.chrome.com/docs/extensions/mv2/reference/browserAction#event-onClicked
            return await target_ctx.evaluate(async (tab) => {
                // default to the currently-active tab when none is given
                tab = tab || (await new Promise((resolve) =>
                    chrome.tabs.query({currentWindow: true, active: true}, ([tab]) => resolve(tab))))
                // @ts-ignore
                return await chrome.browserAction.onClicked.dispatch(tab)
            }, tab)
        }
        dispatchMessage = async (message, options) => {
            // https://developer.chrome.com/docs/extensions/mv2/reference/runtime#method-sendMessage
            return await target_ctx.evaluate(async (extension_id, message, options) => {
                // MV2 sendMessage is callback-based, so adapt it to a Promise
                return await new Promise((resolve) =>
                    chrome.runtime.sendMessage(extension_id, message, options, resolve)
                )
            }, extension_id, message, options)
        }
        dispatchCommand = async (command, tab) => {
            // https://developer.chrome.com/docs/extensions/mv2/reference/commands#event-onCommand
            return await target_ctx.evaluate(async (command, tab) => {
                return await new Promise((resolve) =>
                    // @ts-ignore
                    chrome.commands.onCommand.dispatch(command, tab, resolve)
                )
            }, command, tab)
        }
    }
    // find the filesystem metadata entry (from loadOrInstallExtension) matching this runtime id, if any
    const existing_extension = extensions.filter(({id}) => id === extension_id)[0] || {}

    // merge static filesystem metadata with the live runtime handles
    const new_extension = {
        ...existing_extension,
        id: extension_id,
        webstore_name: name,

        target,
        target_ctx,
        target_type,
        target_url,

        manifest_version,
        manifest,
        version,
        homepage_url,
        options_url,

        dispatchEval, // run some JS in the extension's service worker context
        dispatchPopup, // open the extension popup
        dispatchAction, // trigger an extension menubar icon click
        dispatchMessage, // send a chrome runtime message in the service worker context
        dispatchCommand, // trigger an extension keyboard shortcut command
    }

    console.log(`[➕] Loaded extension ${name.substring(0, 32)} (${version}) ${target_type}...`.padEnd(82), target_url)
    // mutate the caller's entry in-place so the extensions list carries the runtime handles
    Object.assign(existing_extension, new_extension)

    return new_extension
}
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
/**
 * Install (if needed) and load metadata for every configured extension,
 * mutating each entry of CHROME_EXTENSIONS in-place, then dump the resulting
 * metadata to a *.present.json file for debugging. Errors are logged and
 * swallowed so a bad extension never blocks startup.
 * @param {{CHROME_EXTENSIONS: object[], CHROME_EXTENSIONS_DIR: string}} opts
 * @returns {Promise<object[]>} the same (mutated) CHROME_EXTENSIONS list
 */
async function getChromeExtensionsFromPersona({CHROME_EXTENSIONS, CHROME_EXTENSIONS_DIR}) {
    const banner = '*************************************************************************'
    console.log(banner)
    console.log(`[⚙️] Installing ${CHROME_EXTENSIONS.length} chrome extensions from CHROME_EXTENSIONS...`)
    try {
        // install/load sequentially; each call enriches its extension entry in-place
        for (const ext of CHROME_EXTENSIONS) {
            const loaded = await loadOrInstallExtension(ext)
            Object.assign(ext, loaded)
        }

        // for easier debugging, write parsed extension info to filesystem
        const present_json_path = CHROME_EXTENSIONS_JSON_PATH.replace('.json', '.present.json')
        await overwriteFile(present_json_path, CHROME_EXTENSIONS)
    } catch(err) {
        console.error(err)
    }
    console.log(banner)
    return CHROME_EXTENSIONS
}
|
|
|
+
|
|
|
// process-lifetime memo of the loaded-extension list (filled on first call below)
let _EXTENSIONS_CACHE = null

/**
 * Discover which extensions are actually loaded in the running browser and
 * attach runtime handles to their metadata entries (mutated in-place).
 * The first call scans all browser targets for extension service workers /
 * background pages and memoizes the result; later calls return the cache.
 * @param {{browser: object, extensions?: object[], extensions_dir?: string}} opts
 * @returns {Promise<object[]>} the cached, enriched extensions list
 */
async function getChromeExtensionsFromCache({browser, extensions=CHROME_EXTENSIONS, extensions_dir=CHROME_EXTENSIONS_DIR}) {
    if (_EXTENSIONS_CACHE === null) {
        // fix: report the length of the `extensions` argument actually being scanned,
        // not the global CHROME_EXTENSIONS (they differ when a custom list is passed in)
        console.log(`[⚙️] Loading ${extensions.length} chrome extensions from CHROME_EXTENSIONS...`)

        // find loaded Extensions at runtime / browser launch time & connect handlers
        // looks at all the open targets for extension service workers / bg pages
        for (const target of browser.targets()) {
            // mutates extensions object in-place to add metadata loaded from filesystem persona dir
            await loadExtensionFromTarget(extensions, target)
        }
        _EXTENSIONS_CACHE = extensions

        // write installed extension metadata to filesystem extensions.json for easier debugging
        await overwriteFile(
            CHROME_EXTENSIONS_JSON_PATH.replace('.json', '.loaded.json'),
            extensions,
        )
        await overwriteSymlink(
            CHROME_EXTENSIONS_JSON_PATH.replace('.json', '.loaded.json'),
            CHROME_EXTENSIONS_JSON_PATH,
        )
    }

    return _EXTENSIONS_CACHE
}
|
|
|
+
|
|
|
+async function setup2CaptchaExtension({browser, extensions}) {
|
|
|
+ let page = null
|
|
|
+ try {
|
|
|
+ // open a new tab to finish setting up the 2captcha extension manually using its extension options page
|
|
|
+ page = await browser.newPage()
|
|
|
+ const { options_url } = extensions.filter(ext => ext.name === 'captcha2')[0]
|
|
|
+ await page.goto(options_url)
|
|
|
+ await wait(2_500)
|
|
|
+ await page.bringToFront()
|
|
|
+
|
|
|
+ // type in the API key and click the Login button (and auto-close success modal after it pops up)
|
|
|
+ await page.evaluate(() => {
|
|
|
+ const elem = document.querySelector("input[name=apiKey]") as HTMLInputElement
|
|
|
+ elem.value = ""
|
|
|
+ })
|
|
|
+ await page.type('input[name=apiKey]', API_KEY_2CAPTCHA, { delay: 25 })
|
|
|
+
|
|
|
+ // toggle all the important switches to ON
|
|
|
+ await page.evaluate(() => {
|
|
|
+ const checkboxes = Array.from(document.querySelectorAll<HTMLInputElement>('input#isPluginEnabled, input[name*=enabledFor], input[name*=autoSolve]'));
|
|
|
+ for (const checkbox of checkboxes) {
|
|
|
+ if (!checkbox.checked) checkbox.click()
|
|
|
+ }
|
|
|
+ })
|
|
|
+
|
|
|
+ let dialog_opened = false
|
|
|
+ page.on('dialog', async (dialog) => {
|
|
|
+ setTimeout(async () => {
|
|
|
+ await dialog.accept();
|
|
|
+ dialog_opened = true
|
|
|
+ }, 500);
|
|
|
+ })
|
|
|
+ await page.click('button#connect')
|
|
|
+ await wait(2_500)
|
|
|
+ if (!dialog_opened) {
|
|
|
+ throw `2captcha extension login confirmation dialog never opened, please check its options page manually: ${options_url}`
|
|
|
+ }
|
|
|
+ console.log('[🔑] Configured the 2captcha extension using its options page...')
|
|
|
+ } catch(err) {
|
|
|
+ console.warn(`[❌] Failed to configure the 2captcha extension using its options page!`, err)
|
|
|
+ }
|
|
|
+ if (page) await page.close()
|
|
|
+}
|
|
|
+
|
|
|
// Run a network speed test via fast.com and cache the parsed readings to a
// per-day JSON file (speedtest_<date>.json); repeat calls on the same day
// return the cached result instead of re-running the test.
// NOTE(review): calling this with neither {browser} nor {page} throws, since
// `page.browser()` is evaluated on an undefined `page` — confirm all call
// sites pass at least one of the two.
// NOTE(review): the page is closed at the end, including a caller-supplied
// `page` — confirm callers don't reuse that tab afterwards.
async function speedtest({browser, page, measureUpload=true, timeout=25000}: {browser?: Browser, page?: Page, measureUpload?: boolean, timeout?: number}) {
    // run a speedtest using fast.com, printing results once per second

    browser = browser || await page.browser()
    page = page || await browser.newPage()

    // save one speedtest_<date>.json result per day
    const today = versionStrFromDate(new Date(), {withDate: true, withTime: false})
    const SPEEDTEST_PATH = path.join(SPEEDTESTS_DIR, `speedtest_${today}.json`)

    // check if we've already run one today, if so return earlier results and skip running again
    try {
        return JSON.parse(await fs.promises.readFile(SPEEDTEST_PATH, 'utf-8'))
    } catch(err) {
        // otherwise speedtest does not exist yet for today, continue onwards...
    }

    console.log('[🚤] Running Speedtest using Fast.com...'.padEnd(82), prettyPath(SPEEDTEST_PATH))

    await page.goto('https://fast.com', {timeout, waitUntil: 'domcontentloaded'});
    await page.waitForSelector('#speed-value', {timeout})

    let result = null
    let loop_idx = 0

    // poll the fast.com DOM up to 100 times, every 500ms (~50s max)
    while (loop_idx < 100) {
        // scrape the current readings out of fast.com's result widgets
        result = await page.evaluate(() => {
            const $ = document.querySelector.bind(document);

            return {
                downloadSpeed: Number($('#speed-value').textContent),
                downloadUnit: $('#speed-units').textContent.trim(),
                downloaded: Number($('#down-mb-value').textContent.trim()),
                uploadSpeed: Number($('#upload-value').textContent),
                uploadUnit: $('#upload-units').textContent.trim(),
                uploaded: Number($('#up-mb-value').textContent.trim()),
                latency: Number($('#latency-value').textContent.trim()),
                bufferBloat: Number($('#bufferbloat-value').textContent.trim()),
                userLocation: $('#user-location').textContent.trim(),
                userIp: $('#user-ip').textContent.trim(),
                // both the download and upload widgets gain .succeeded when fully finished
                isDone: Boolean($('#speed-value.succeeded') && $('#upload-value.succeeded')),
            };
        })
        if (result.downloadSpeed > 0) {
            // console.log(JSON.stringify(result).replaceAll('"', '').replaceAll(',', ' ').replaceAll('{', '').replaceAll('}', ''))
        }

        // NOTE(review): with measureUpload=false this breaks as soon as uploadSpeed is
        // nonzero (i.e. once the upload phase has started) — confirm that matches intent
        if (result.isDone || (!measureUpload && result.uploadSpeed)) {
            break
        }

        await wait(500)
        loop_idx++
    }

    // close the tab and persist today's result in parallel; individual failures are ignored
    await Promise.allSettled([
        page.close(),
        overwriteFile(SPEEDTEST_PATH, result)
    ])

    return result
}
|
|
|
+
|
|
|
+/******************************************************************************/
|
|
|
+/******************************************************************************/
|
|
|
+
|
|
|
// URLs already handled (or never worth archiving) in this browser session;
// botArchiveTask consults and grows this set to skip duplicates
const ALREADY_ARCHIVED = new Set(['', 'about:blank', 'chrome://newtab', 'chrome://version'])
// max URLs archived per process run; botArchiveTask exits the process once exceeded
const TASKS_PER_RUN_LIMIT = 200
|
|
|
+
|
|
|
+async function botArchiveTask({page, data, url=''}) {
|
|
|
+ url = url || data // puppeteer-cluster passes in the url value via the data: arg
|
|
|
+
|
|
|
+ const is_unarchivable_url = URL_SCHEMES_IGNORED.includes(url.split(':')[0])
|
|
|
+ const is_already_archived = ALREADY_ARCHIVED.has(url.slice(0, 4096))
|
|
|
+ if (is_unarchivable_url || is_already_archived) return null
|
|
|
+ ALREADY_ARCHIVED.add(url.slice(0, 4096))
|
|
|
+
|
|
|
+ if (ALREADY_ARCHIVED.size > TASKS_PER_RUN_LIMIT) {
|
|
|
+ console.warn('[❌] Hit maximum URLs archived per browser session, exiting to free memory.')
|
|
|
+ console.warn(' Run this process again to continue with the next batch...')
|
|
|
+ process.exit(21)
|
|
|
+ }
|
|
|
+
|
|
|
+ const browser = await page.browser()
|
|
|
+ const client = await page.target().createCDPSession()
|
|
|
+ const extensions = await getChromeExtensionsFromCache({browser})
|
|
|
+ const browser_version = await browser.version()
|
|
|
+ const original_url = url.toString()
|
|
|
+ const start_time = (new Date())
|
|
|
+
|
|
|
+ console.log('[0/4]-------------------------------------------------------------------------')
|
|
|
+ const snapshot_dir = await setupSnapshotDir({original_url, start_time})
|
|
|
+ const snapshot = await setupSnapshotDB({original_url, start_time, snapshot_dir})
|
|
|
+ console.log('[1/4]-------------------------------------------------------------------------')
|
|
|
+ console.log(`[🪟] Starting page & viewport setup (${browser_version} ${DEFAULT_VIEWPORT.isMobile ? 'mobile' : 'desktop'} ${DEFAULT_VIEWPORT.width}x${DEFAULT_VIEWPORT.height}px)...`)
|
|
|
+
|
|
|
+
|
|
|
+ const page_state = {
|
|
|
+ // global static state
|
|
|
+ browser,
|
|
|
+ client,
|
|
|
+ browser_version,
|
|
|
+ extensions,
|
|
|
+
|
|
|
+ // per-page static metadata
|
|
|
+ original_url,
|
|
|
+ snapshot,
|
|
|
+ snapshot_dir,
|
|
|
+ start_time: start_time.toISOString(),
|
|
|
+ start_ts: Number(start_time),
|
|
|
+ version: versionStrFromDate(start_time),
|
|
|
+
|
|
|
+ // per-page mutable archiving state
|
|
|
+ main_response: null,
|
|
|
+ recorder: null,
|
|
|
+ console_log: [],
|
|
|
+ traffic_log: {},
|
|
|
+ redirects: {},
|
|
|
+ }
|
|
|
+ page._original_url = original_url
|
|
|
+
|
|
|
+ try {
|
|
|
+ // run all page setup functions in parallel
|
|
|
+ const results = await Promise.allSettled([
|
|
|
+ // loadAuthStorage(page, page_state, { apply: true }),
|
|
|
+ startMetadataRecording(page, page_state),
|
|
|
+ setupURLRewriting(page, page_state),
|
|
|
+ // setupViewport(page, page_state),
|
|
|
+ setupModalAutoClosing(page, page_state),
|
|
|
+ loadCloudflareCookie(page, page_state),
|
|
|
+ startResponseSaving(page, page_state),
|
|
|
+ saveYTDLP(page, page_state),
|
|
|
+ saveGALLERYDL(page, page_state),
|
|
|
+ // saveSourceMaps(page, page_state),
|
|
|
+ // TODO: someday setup https://github.com/osnr/TabFS ?
|
|
|
+ ]);
|
|
|
+ // run all page setup functions in parallel
|
|
|
+ const rejected = results
|
|
|
+ .filter(result => result.status === 'rejected')
|
|
|
+ .map(result => (result as PromiseRejectedResult).reason);
|
|
|
+ if (rejected.length) console.warn('[⚠️] Partial failures during page setup:', rejected);
|
|
|
+ } catch(err) {
|
|
|
+ console.error('[❌] PAGE SETUP ERROR', JSON.stringify(err, null, 4))
|
|
|
+ return
|
|
|
+ }
|
|
|
+
|
|
|
+
|
|
|
+ console.log('[2/4]-------------------------------------------------------------------------')
|
|
|
+
|
|
|
+ console.log('[➡️] NAVIGATION[INI]', ANSI.blue + url + ANSI.reset)
|
|
|
+ const startrecording_promise = startScreenrecording(page, page_state)
|
|
|
+ page_state.main_response = await page.goto(url, {waitUntil: 'load', timeout: 40_000})
|
|
|
+ try {
|
|
|
+ const results = await Promise.allSettled([
|
|
|
+ startrecording_promise,
|
|
|
+ page.bringToFront(),
|
|
|
+ page.waitForNetworkIdle({concurrency: 0, idleTime: 900, timeout: 20_000}),
|
|
|
+ ])
|
|
|
+ const rejected = results
|
|
|
+ .filter(result => result.status === 'rejected')
|
|
|
+ .map(result => (result as PromiseRejectedResult).reason)
|
|
|
+ if (rejected.length) console.warn('[⚠️] Parial failures during page load:', rejected)
|
|
|
+ } catch(err) {
|
|
|
+ console.error('[❌] ERROR DURING PAGE LOAD', JSON.stringify(err, null, 4))
|
|
|
+ return
|
|
|
+ }
|
|
|
+
|
|
|
+ if (page_state.main_response === null) {
|
|
|
+ page_state.main_response = await page.waitForResponse(() => true)
|
|
|
+ }
|
|
|
+ assert(page_state.main_response)
|
|
|
+ if (page_state.main_response.status() == 429) {
|
|
|
+ throw `[⚠️] Got 429 rate-limit response, skipping this URL for now...`
|
|
|
+ }
|
|
|
+
|
|
|
+ // emulate human browsing behavior
|
|
|
+ // await disableAnimations(page, page_state);
|
|
|
+ await jiggleMouse(page, page_state);
|
|
|
+ await solveCaptchas(page, page_state);
|
|
|
+ await blockRedirects(page, page_state);
|
|
|
+ await scrollDown(page, page_state);
|
|
|
+ // await expandComments(page, page_state);
|
|
|
+ await submitForm(page, page_state);
|
|
|
+ // await blockJSExecution(page, page_state);
|
|
|
+
|
|
|
+ console.log('[3/4]-------------------------------------------------------------------------')
|
|
|
+
|
|
|
+ // stop tampering with page requests & JS / recording metadata / traffic log
|
|
|
+ await stopMetadataRecording(page, page_state)
|
|
|
+
|
|
|
+ // do all synchonous archiving steps that need exclusive use of the whole page while doing stuff
|
|
|
+ const saveScreenrecording_promise = saveScreenrecording(page, page_state);
|
|
|
+ await saveScreenshot(page, page_state);
|
|
|
+ await savePDF(page, page_state);
|
|
|
+
|
|
|
+ console.log('[4/4]-------------------------------------------------------------------------')
|
|
|
+
|
|
|
+ // do all async archiving steps that can be run at the same time
|
|
|
+ await inlineShadowDOM(page, page_state);
|
|
|
+ const results = await Promise.allSettled([
|
|
|
+ saveTitle(page, page_state),
|
|
|
+ saveSEO(page, page_state),
|
|
|
+ saveFavicon(page, page_state),
|
|
|
+ saveSSL(page, page_state),
|
|
|
+ saveRequests(page, page_state),
|
|
|
+ saveRedirects(page, page_state),
|
|
|
+ saveHeaders(page, page_state),
|
|
|
+ saveRaw(page, page_state),
|
|
|
+ saveDOM(page, page_state),
|
|
|
+ saveBodyText(page, page_state),
|
|
|
+ // savePandoc(page, page_state),
|
|
|
+ saveReadability(page, page_state),
|
|
|
+ saveAccessibility(page, page_state),
|
|
|
+ saveOutlinks(page, page_state),
|
|
|
+ // saveAuthStorage(page, page_state),
|
|
|
+ saveAIQualityAssuranceResult(page, page_state),
|
|
|
+ ]);
|
|
|
+
|
|
|
+ // do all sync archiving steps that require browser extensions at the very end (they are the buggiest)
|
|
|
+ const bg_results = Promise.allSettled([
|
|
|
+ saveScreenrecording_promise,
|
|
|
+ saveSinglefile(page, page_state),
|
|
|
+ // saveArchiveWebPage(page, page_state),
|
|
|
+ // savePocket(page, page_state),
|
|
|
+ ])
|
|
|
+
|
|
|
+ const {duration} = await saveMetrics(page, page_state);
|
|
|
+
|
|
|
+ const rejected = results
|
|
|
+ .filter(result => result.status === 'rejected')
|
|
|
+ .map(result => (result as PromiseRejectedResult).reason) // not sure why this has a ts-error, .reason does exist on rejected promises
|
|
|
+
|
|
|
+ if (rejected.length)
|
|
|
+ console.warn('[⚠️] Parial failures during archiving:', rejected)
|
|
|
+
|
|
|
+ // Start an interactive REPL here with the `page` instance.
|
|
|
+ // https://github.com/berstend/puppeteer-extra/tree/master/packages/puppeteer-extra-plugin-repl
|
|
|
+ // await page.repl()
|
|
|
+ // await page.browser().repl()
|
|
|
+
|
|
|
+ console.log(`[✅] ${ANSI.blue}Finished archiving in ${duration/1000}s.${ANSI.reset}`)
|
|
|
+
|
|
|
+ try {
|
|
|
+ const rejected = (await bg_results)
|
|
|
+ .filter(result => result.status === 'rejected')
|
|
|
+ .map(result => (result as PromiseRejectedResult).reason) // not sure why this has a ts-error, .reason does exist on rejected promises
|
|
|
+ if (rejected.length)
|
|
|
+ console.warn('[⚠️] Parial failures during wrap-up tasks:', rejected)
|
|
|
+
|
|
|
+ console.log('[🗑️] Resetting to about:blank to ensure memory is freed...')
|
|
|
+ await page.goto('about:blank')
|
|
|
+ await page.close()
|
|
|
+ } catch(err) {
|
|
|
+ console.log(err)
|
|
|
+ }
|
|
|
+
|
|
|
+ // symlink the best results from across all the versions/ into the snapshot dir root
|
|
|
+ await symlinkBestSnapshotResults(snapshot_dir)
|
|
|
+
|
|
|
+ // display latest version screenshot GIF
|
|
|
+ console.log()
|
|
|
+ try {
|
|
|
+ const latest_version_gif = path.join(snapshot_dir, 'versions', page_state.version, path.basename(SCREENRECORDGIF_PATH(page)))
|
|
|
+ const dirent = await blockUntilExists(latest_version_gif, {min_bytes: 100, timeout: 15_000})
|
|
|
+ child_process.spawn('/Users/squash/.iterm2/imgcat', [dirent.abspath], {stdio: [null, 'inherit', 'inherit']})
|
|
|
+ } catch(err) {
|
|
|
+ console.warn('[⚠️] Failed to display screenrecording.gif...', err)
|
|
|
+ console.log()
|
|
|
+ }
|
|
|
+
|
|
|
+ // determine whether task succeeded or failed based on AI QA score
|
|
|
+ const latest_version_aiqa = path.join(snapshot_dir, 'versions', page_state.version, path.basename(AIQA_PATH(page)))
|
|
|
+ const qa_results = JSON.parse((await fs.promises.readFile(latest_version_aiqa)).toString())
|
|
|
+ if (qa_results.pct_visible < 50) {
|
|
|
+ throw `[❌] Task completed with problems, got AI QA score of ${qa_results.pct_visible}%! ${qa_results.warnings.join(', ')} ${qa_results.error_text || ''}`
|
|
|
+ } else {
|
|
|
+ console.log(`[💫] Task completed succesfully: ${qa_results.pct_visible}% ${qa_results.warnings.join(', ') || ''}`)
|
|
|
+ console.log(` Summary: ${(qa_results.main_content_title || qa_results.description || 'No title/description detected').substring(0, 80)}... ${qa_results.main_content_author || ''} ${qa_results.main_content_date || ''}`)
|
|
|
+ return true
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+async function passiveArchiveTask({browser, page, url}) {
|
|
|
+ // archive passively (e.g. a tab that was opened already by a human), without changing the active page
|
|
|
+
|
|
|
+ const is_unarchivable_url = URL_SCHEMES_IGNORED.includes(url.split(':')[0])
|
|
|
+ const is_already_archived = ALREADY_ARCHIVED.has(url.slice(0, 4096))
|
|
|
+ if (is_unarchivable_url || is_already_archived) return null
|
|
|
+ ALREADY_ARCHIVED.add(url.slice(0, 4096))
|
|
|
+
|
|
|
+ // these have to be as early as possible because we're racing with the page load (we might even be too late)
|
|
|
+ // jk nevermind, we now re-open a new bg tab for every tab that's created to re-capture the initial request
|
|
|
+ // await page.setRequestInterception(true);
|
|
|
+ // await page.setCacheEnabled(false);
|
|
|
+
|
|
|
+ const original_url = url.toString()
|
|
|
+ const start_time = (new Date())
|
|
|
+ const browser_version = await browser.version()
|
|
|
+
|
|
|
+ console.log('------------------------------------------------------------------------------')
|
|
|
+ console.log('[➕] Starting archive of new tab opened in driver browser...', await browser.version())
|
|
|
+ const snapshot_dir = await setupSnapshotDir({original_url, start_time})
|
|
|
+ const snapshot = await setupSnapshotDB({ original_url, start_time, snapshot_dir })
|
|
|
+ console.log('------------------------------------------------------------------------------')
|
|
|
+ console.log(`[🪟] Starting page & viewport setup (${browser_version} ${DEFAULT_VIEWPORT.isMobile ? 'mobile' : 'desktop'} ${DEFAULT_VIEWPORT.width}x${DEFAULT_VIEWPORT.height}px)...`)
|
|
|
+
|
|
|
+ // create a new page in the background for archiving
|
|
|
+ const old_page = page
|
|
|
+ page = await browser.newPage()
|
|
|
+ await old_page.bringToFront()
|
|
|
+ const client = await page.target().createCDPSession()
|
|
|
+ const extensions = await getChromeExtensionsFromCache({ browser })
|
|
|
+
|
|
|
+ const page_state = {
|
|
|
+ // global static state
|
|
|
+ browser,
|
|
|
+ client,
|
|
|
+ browser_version,
|
|
|
+ extensions,
|
|
|
+
|
|
|
+ // per-page static metadata
|
|
|
+ original_url,
|
|
|
+ snapshot,
|
|
|
+ snapshot_dir,
|
|
|
+ start_time: start_time.toISOString(),
|
|
|
+ start_ts: Number(start_time),
|
|
|
+ version: versionStrFromDate(start_time),
|
|
|
+
|
|
|
+ // per-page mutable archiving state
|
|
|
+ main_response: null,
|
|
|
+ recorder: null,
|
|
|
+ console_log: [],
|
|
|
+ traffic_log: {},
|
|
|
+ redirects: {},
|
|
|
+ }
|
|
|
+ page._original_url = original_url
|
|
|
+
|
|
|
+ try {
|
|
|
+
|
|
|
+ // run all page setup functions in parallel
|
|
|
+ const results = await Promise.allSettled([
|
|
|
+ // loadAuthStorage(page, page_state, {apply: true}),
|
|
|
+ startMetadataRecording(page, page_state),
|
|
|
+ setupURLRewriting(page, page_state),
|
|
|
+ startResponseSaving(page, page_state),
|
|
|
+ saveYTDLP(page, page_state),
|
|
|
+ saveGALLERYDL(page, page_state),
|
|
|
+ // saveSourceMaps(page, page_state),
|
|
|
+ ]);
|
|
|
+ const rejected = results
|
|
|
+ .filter(result => result.status === 'rejected')
|
|
|
+ .map(result => (result as PromiseRejectedResult).reason)
|
|
|
+ if (rejected.length) console.warn('[⚠️] Parial failures during page setup:', rejected)
|
|
|
+ } catch(err) {
|
|
|
+ console.warn('[❌] ERROR DURING PAGE SETUP', JSON.stringify(err, null, 4))
|
|
|
+ return
|
|
|
+ }
|
|
|
+
|
|
|
+ // load the url in the background page, then switch to it once its loaded and close the original tab
|
|
|
+ console.log('[➡️] NAVIGATION[INI]', ANSI.blue + url + ANSI.reset)
|
|
|
+ const startrecording_promise = startScreenrecording(page, page_state)
|
|
|
+ page_state.main_response = await page.goto(url, {waitUntil: 'load', timeout: 40_000})
|
|
|
+
|
|
|
+ // for debugging
|
|
|
+ globalThis.page = page
|
|
|
+ globalThis.page_state = page_state
|
|
|
+
|
|
|
+ // start loading the page, start screenrecording, close the old page, and wait for loading to finish (all at once, fine for these to race)
|
|
|
+ try {
|
|
|
+ const results = await Promise.allSettled([
|
|
|
+ startrecording_promise,
|
|
|
+ page.bringToFront(),
|
|
|
+ old_page.close(),
|
|
|
+ page.waitForNetworkIdle({concurrency: 0, idleTime: 900, timeout: 20_000}),
|
|
|
+ ])
|
|
|
+ const rejected = results
|
|
|
+ .filter(result => result.status === 'rejected')
|
|
|
+ .map(result => (result as PromiseRejectedResult).reason)
|
|
|
+ if (rejected.length) console.warn('[⚠️] Parial failures during [age load:', rejected)
|
|
|
+ } catch(err) {
|
|
|
+ console.warn('[❌] ERROR DURING PAGE LOAD', JSON.stringify(err, null, 4))
|
|
|
+ return
|
|
|
+ }
|
|
|
+
|
|
|
+ if (page_state.main_response === null) {
|
|
|
+ page_state.main_response = await page.waitForResponse(() => true)
|
|
|
+ }
|
|
|
+ assert(page_state.main_response)
|
|
|
+ if (page_state.main_response.status() == 429) {
|
|
|
+ throw `[⚠️] Got 429 rate-limit response, skipping this URL for now...`
|
|
|
+ }
|
|
|
+
|
|
|
+ // resume page if paused by waitForDebuggerOnStart/dev tools debugger/backgrounding
|
|
|
+ try {
|
|
|
+ await client.send('Page.enable');
|
|
|
+ await client.send('Page.setWebLifecycleState', {state: 'active'});
|
|
|
+ await client.send('Runtime.runIfWaitingForDebugger')
|
|
|
+ } catch(err) { /* console.warn(err) */ }
|
|
|
+
|
|
|
+ // wait a couple seconds for page to finish loading
|
|
|
+ await wait(5_000)
|
|
|
+
|
|
|
+ // emulate human browsing behavior
|
|
|
+ // await disableAnimations(page, page_state);
|
|
|
+ // await jiggleMouse(page, page_state);
|
|
|
+ await solveCaptchas(page, page_state);
|
|
|
+ // await blockRedirects(page, page_state);
|
|
|
+ // await scrollDown(page, page_state);
|
|
|
+ // await expandComments(page, page_state);
|
|
|
+ await submitForm(page, page_state);
|
|
|
+ // await blockJSExecution(page, page_state);
|
|
|
+ await stopMetadataRecording(page, page_state) // stop tampering with page requests & JS
|
|
|
+
|
|
|
+ console.log('[3/4]-------------------------------------------------------------------------')
|
|
|
+
|
|
|
+ // do all synchonous archiving steps that need exclusive use of the whole page while doing stuff
|
|
|
+ const saveScreenrecording_promise = saveScreenrecording(page, page_state);
|
|
|
+ await saveScreenshot(page, page_state);
|
|
|
+ await savePDF(page, page_state);
|
|
|
+
|
|
|
+ console.log('[4/4]-------------------------------------------------------------------------')
|
|
|
+
|
|
|
+ // do all async archiving steps that can be run at the same time
|
|
|
+ await inlineShadowDOM(page, page_state);
|
|
|
+ const results = await Promise.allSettled([
|
|
|
+ saveTitle(page, page_state),
|
|
|
+ saveSEO(page, page_state),
|
|
|
+ saveFavicon(page, page_state),
|
|
|
+ saveSSL(page, page_state),
|
|
|
+ saveRequests(page, page_state),
|
|
|
+ saveRedirects(page, page_state),
|
|
|
+ saveHeaders(page, page_state),
|
|
|
+ saveRaw(page, page_state),
|
|
|
+ saveDOM(page, page_state),
|
|
|
+ saveBodyText(page, page_state),
|
|
|
+ // savePandoc(page, page_state),
|
|
|
+ saveReadability(page, page_state),
|
|
|
+ saveAccessibility(page, page_state),
|
|
|
+ saveOutlinks(page, page_state),
|
|
|
+ // saveAuthStorage(page, page_state),
|
|
|
+ saveAIQualityAssuranceResult(page, page_state),
|
|
|
+ ]);
|
|
|
+
|
|
|
+ // do all sync archiving steps that require browser extensions at the very end (they are the buggiest)
|
|
|
+ const bg_results = Promise.allSettled([
|
|
|
+ saveScreenrecording_promise,
|
|
|
+ saveSinglefile(page, page_state),
|
|
|
+ // saveArchiveWebPage(page, page_state),
|
|
|
+ // savePocket(page, page_state),
|
|
|
+ ])
|
|
|
+
|
|
|
+ const {duration} = await saveMetrics(page, page_state);
|
|
|
+
|
|
|
+ const rejected = results
|
|
|
+ .filter(result => result.status === 'rejected')
|
|
|
+ .map(result => (result as PromiseRejectedResult).reason)
|
|
|
+
|
|
|
+ if (rejected.length)
|
|
|
+ console.warn('[⚠️] Parial failures during page archiving:', rejected)
|
|
|
+
|
|
|
+ // Start an interactive REPL here with the `page` instance.
|
|
|
+ // https://github.com/berstend/puppeteer-extra/tree/master/packages/puppeteer-extra-plugin-repl
|
|
|
+ // await page.repl()
|
|
|
+ // await page.browser().repl()
|
|
|
+
|
|
|
+ console.log(`[✅] Finished archiving in ${duration/1000}s.`,)
|
|
|
+
|
|
|
+ // await page.tracing.stop();
|
|
|
+ try {
|
|
|
+ const rejected = (await bg_results)
|
|
|
+ .filter(result => result.status === 'rejected')
|
|
|
+ .map(result => (result as PromiseRejectedResult).reason)
|
|
|
+ if (rejected.length)
|
|
|
+ console.warn('[⚠️] Parial failures during page wrap-up tasks:', rejected)
|
|
|
+ } catch(err) {
|
|
|
+ console.log(err)
|
|
|
+ }
|
|
|
+ await symlinkBestSnapshotResults(snapshot_dir)
|
|
|
+}
|
|
|
+
|
|
|
+
|
|
|
+/******************************************************************************/
|
|
|
+/************************* Page Setup Tasks ***********************************/
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
// Prepare the archive/<id> snapshot output folder for a new capture run:
// guard against duplicate snapshot dirs at legacy paths, move any prior run's
// loose output into versions/<date>/*, clear stale symlinks, register the dir
// in the indexes, and verify the dir is empty & valid before returning it.
// NOTE: chdir()s into the snapshot dir as a side effect.
// Throws (string) when a potential duplicate snapshot dir exists elsewhere,
// or when snap_dir is not a recognized snapshot path.
async function setupSnapshotDir({original_url, start_time, snapshot_dir=null}) {
    // setup archive/<id> snapshot output folder, move old files into versions/<date>/* + clear any existing symlinks

    // snapshot_dir override wins; otherwise derive the canonical dir from the URL
    const snap_dir = snapshot_dir || TASK_PATH(original_url)

    console.log()
    console.log()
    console.log(ANSI.blue + original_url + ANSI.reset)
    console.log(ANSI.black + snap_dir + ANSI.reset)
    console.log()
    console.log('[📂] Setting up Snapshot output directory...'.padEnd(82), prettyPath(snap_dir))

    // check for existing data at old legacy paths e.g. ./data/archive/1999999999.1723425
    const hacky_dir = path.join(ARCHIVE_DIR, `1999999999.${hashCode(original_url)}`)
    const known_dir = SNAPSHOT_DIRS_BY_URL[original_url]

    const known_dir_exists = fs.existsSync(known_dir)
    const hacky_dir_exists = fs.existsSync(hacky_dir)

    // refuse to proceed if the same URL already has a snapshot at the *other*
    // candidate path — continuing would silently fork the archive into two dirs
    if (snap_dir == hacky_dir) {
        if (known_dir_exists) {
            throw `Tried to create snapshot in ${snap_dir} but potential duplicate exists in ${known_dir}!`
        }
    } else if (snap_dir == known_dir) {
        if (hacky_dir_exists) {
            throw `Tried to create snapshot in ${snap_dir} but potential duplicate exists in ${hacky_dir}!`
        }
    } else {
        // snap_dir matches neither candidate path: always an error, message depends on what exists
        if (known_dir_exists) {
            throw `Tried to create snapshot in ${snap_dir} but potential duplicate exists in ${known_dir}!`
        } else if (hacky_dir_exists) {
            throw `Tried to create snapshot in ${snap_dir} but potential duplicate exists in ${hacky_dir}!`
        } else {
            throw `Tried to create snapshot in ${snap_dir} but its not a recognized snapshot dir path:\n - ${known_dir}\n - ${hacky_dir}`
        }
    }

    // mkdir -p ./data/archive/<snap_id>/versions && cd ./data/archive/<snap_id>
    await fs.promises.mkdir(path.join(snap_dir, 'versions'), {recursive: true})
    process.chdir(snap_dir)

    // clear any /data/archive/<snap_id>/*.* symlinks pointing to existing ./versions/<versionid>/*.* files
    await clearSnapshotDirSymlinks(snap_dir)

    // move /data/archive/<snap_id>/*.* loose output files from any prior run into ./versions/<versionid>/*.*
    await collectSnapshotDirVersionFiles(snap_dir)

    // update /data/indexes/<index_name>/* to include references to /data/archive/<snap_id> as-needed
    await updateSnapshotDirIndexes(snap_dir, {original_url, start_time})

    // assert /data/archive/<snap_id>/ contains no invalid/partial files + is empty/ready to receive new files
    await assertSnapshotDirIsValid(snap_dir, {is_empty: true})

    return snap_dir
}
|
|
|
+
|
|
|
+// ./index/<index_name> : index_getter(page_state) => "<index_key_str>"
|
|
|
const INDEXES = {
    // Bucket snapshots by calendar day (14-char version string truncated to its date part).
    snapshots_by_day({start_time}) {
        return versionStrFromDate(start_time, {withDate: true, withTime: false})
    },
    // Bucket snapshots by site hostname (hostname never includes the :port).
    snapshots_by_domain({original_url}) {
        return new URL(original_url).hostname || ''
    },
}
|
|
|
+
|
|
|
/**
 * Register a snapshot dir in every configured index by creating
 * /data/indexes/<index_name>/<index_key>/<snap_id> symlinks (one per index).
 *
 * @param {string} snap_dir - absolute path of the snapshot dir to index
 * @param {object} page_state - state passed to each index's key getter (e.g. {original_url, start_time})
 * @param {object} indexes - map of index_name -> key-getter function (defaults to INDEXES)
 * @param {string} indexes_dir - root dir that holds all indexes (defaults to INDEXES_DIR)
 */
async function updateSnapshotDirIndexes(snap_dir, page_state, indexes=INDEXES, indexes_dir=INDEXES_DIR) {
    assert(indexes)
    console.log(`[🔎] Linking Snapshot in indexes (${Object.keys(indexes).join(', ')})...`)
    // indexes are updated sequentially: they may share parent dirs on disk
    for (const [index_name, index_key_getter] of Object.entries(indexes)) {
        await indexSnapshotDir(snap_dir, {index_name, index_key_getter, indexes_dir}, page_state)
    }
}
|
|
|
+
|
|
|
/**
 * Place a symlink to this snapshot inside one /indexes/<index_name>/ index.
 * Creates /indexes/<index_name>/<index_key>/<snap_id> -> <snap_dir>.
 *
 * @param {string} snap_dir - snapshot dir to link to (its basename becomes the link name)
 * @param {object} opts - {index_name, index_key_getter, indexes_dir}
 * @param {object} page_state - passed to index_key_getter to compute the bucket key
 * @returns {{index_name: string, index_key: string, symlink_path: string, symlink_abspath: string}}
 */
async function indexSnapshotDir(snap_dir, {index_name, index_key_getter, indexes_dir=INDEXES_DIR}, page_state) {
    // validate all inputs up-front, before touching the filesystem
    assert(index_name && index_key_getter)
    assert(page_state)

    // calculate the index key, e.g. "20010131" or "example.com"
    const index_key = String(index_key_getter(page_state))
    assert(index_key)
    const snap_id = path.parse(snap_dir).base // e.g. '19999999.23423523'
    assert(snap_id)

    const index_dir = path.join(indexes_dir, index_name) // /data/index/snapshots_by_day
    const index_entries_dir = path.join(index_dir, index_key) // /data/index/snapshots_by_day/20010131
    await fs.promises.mkdir(index_entries_dir, {recursive: true}) // also creates index_dir

    const symlink_path = path.join(index_entries_dir, snap_id) // /data/index/snapshots_by_day/20010131/19999999.23423523

    // create symlink index/<index_name>/<index_key>/<snap_id> -> ./archive/<snap_id>
    const {symlink_abspath} = await overwriteSymlink(snap_dir, symlink_path, {relative: true, mkdirs: false})
    return {index_name, index_key, symlink_path, symlink_abspath}
}
|
|
|
+
|
|
|
+
|
|
|
/**
 * Move loose archive/<id>/*.* snapshot output files from a prior run into the
 * dated archive/<id>/versions/<date>/ folder for that run.
 *
 * The version folder name is read from the previous run's metrics.json
 * (VERSION, falling back to start_time, falling back to the unix epoch for
 * legacy output that predates metrics.json).
 *
 * @param {string} snap_dir - the snapshot dir whose root should be emptied into versions/
 */
async function collectSnapshotDirVersionFiles(snap_dir) {
    // detect start time / version info from previous result metrics.json
    const snap_id = snap_dir.split('/archive/').at(-1)
    const existing_metrics = path.join(snap_dir, 'metrics.json')
    let {start_time, VERSION} = {start_time: '1970-01-01T00:00:00.000Z', VERSION: '19700101000000'}
    try {
        ;({start_time, VERSION} = JSON.parse(await fs.promises.readFile(existing_metrics, 'utf-8')));
    } catch(err) {
        // continue normally, overwriting existing files is fine if they're broken to begin with
    }

    // create new version folder based on metrics.json start_time (or epoch time as fallback for legacy output)
    // NOTE: a parsed metrics.json without a VERSION key leaves VERSION undefined, hence the || fallback
    const version_dir_name = VERSION || versionStrFromDate(start_time)
    const version_dir = path.join(snap_dir, 'versions', version_dir_name)
    await fs.promises.mkdir(version_dir, {recursive: true})

    // collect all movable result files from the snapshot_dir root
    const existing_snapshot_files =
        (await fs.promises.readdir(snap_dir, {withFileTypes: true}))
        .filter(dirent => {
            if (dirent.name.startsWith('.')) return false // ignore hidden files, dont version them
            if (dirent.name == 'versions') return false // dont try to move versions folder into itself
            if (dirent.isSymbolicLink()) return false // skip existing symbolic links
            return (dirent.isFile() || dirent.isDirectory()) // dont try to version sockets/FIFOs/devs etc.
        })

    if (existing_snapshot_files.length) {
        // (was ${VERSION}, which printed "undefined" for legacy metrics.json without a VERSION key)
        console.log(`[📅] Moving snapshot results into version dir: ./data/archive/${snap_id}/* ->`.padEnd(82), `./data/archive/${snap_id}/versions/${version_dir_name}/`)
    }

    const snapshot_files = await getDirInfo(snap_dir, {withRoot: false, filter: ({relpath}) => !relpath.startsWith('versions')})
    const version_files = await getDirInfo(version_dir, {withRoot: false})

    for (const {name} of existing_snapshot_files) {
        const snapdir_entry_abspath = path.join(snap_dir, name)
        const versioned_entry_abspath = path.join(version_dir, name)

        const snapshot_entry = snapshot_files[name]
        const version_entry = version_files[name]

        if (snapshot_entry && version_entry) {
            // a same-named entry already exists in the destination version dir
            if (snapshot_entry.sha256 == version_entry.sha256) {
                // exact duplicate: harmless, leave both copies in place
                // (the copy inside the version dir is the one that gets symlinked later)
            } else {
                // conflicting contents: warn and keep both rather than guessing which to delete
                console.warn(`[⚠️] Found conflicting duplicate files with different contents: ${name}`)
                console.warn('    ', snapshot_entry.summary)
                console.warn('    ', version_entry.summary)
            }
        } else {
            // mv ./data/archive/<snap_id>/example.txt -> ./data/archive/<snap_id>/versions/<version_id>/example.txt
            await fs.promises.rename(snapdir_entry_abspath, versioned_entry_abspath)
            console.log(`  ↣ ${prettyPath(snapdir_entry_abspath)} ->`.padEnd(82), prettyPath(versioned_entry_abspath))
        }
    }
}
|
|
|
+
|
|
|
+// Extractor definition
|
|
|
+// {
|
|
|
+// phase: setup | load | sync1 | async1 | sync2 | close
|
|
|
+// name: 'media' | 'photos', 'wget', 'singlefile'
|
|
|
+//
|
|
|
+// shouldRun(page, page_state)
|
|
|
+
|
|
|
+ // pageSetup
|
|
|
+ // pageLoad
|
|
|
+ // pageInteraction clicking around/scrolling
|
|
|
+ // archivePhase1 sync
|
|
|
+ // archivePhase2 async
|
|
|
+ // archivePhase3 async
|
|
|
+ // pageClose
|
|
|
+
|
|
|
+// execute(page, page_state)
|
|
|
+// validateResult(page, page_state)
|
|
|
+// }
|
|
|
+
|
|
|
/**
 * Remove every symlink at the top level of archive/<id>/ so fresh snapshot
 * output can be written there safely.
 *
 * Stale links (e.g. ./media -> ./versions/<olddate>/media) would otherwise
 * cause new output such as screenrecordings to be written straight into a
 * previous version's folder.
 *
 * @param {string} snap_dir - snapshot dir whose root-level symlinks get deleted
 */
async function clearSnapshotDirSymlinks(snap_dir) {
    const dirents = await fs.promises.readdir(snap_dir, {withFileTypes: true})
    for (const dirent of dirents) {
        if (dirent.name.startsWith('.')) continue  // leave hidden entries alone
        if (dirent.name == 'versions') continue    // never touch the versions/ folder itself
        if (!dirent.isSymbolicLink()) continue     // only symlinks get cleared
        await fs.promises.unlink(path.join(snap_dir, dirent.name))
    }
}
|
|
|
+
|
|
|
// Finalize a snapshot dir after a run: sweep loose output into versions/<date>/,
// clear old root symlinks, then symlink the "best" copy of each result file
// from across all versions/ into the snapshot dir root ("best" = largest file,
// on the assumption that the largest capture is the highest quality one).
// NOTE: chdir()s into the snapshot dir as a side effect. Returns snap_dir.
async function symlinkBestSnapshotResults(snap_dir) {
    // move any existing files into versions/<date> folder (clear out main folder)
    // symlink latest files from versions/<date>/* into main folder

    await fs.promises.mkdir(path.join(snap_dir, 'versions'), {recursive: true})
    process.chdir(snap_dir)

    const metrics_file = path.join(snap_dir, 'metrics.json')
    // if (!fs.existsSync(metrics_file) || (await fs.promises.lstat(metrics_file)).isSymbolicLink()) {
    //     console.warn('[⚠️] Warning, found partial dirty snapshot state (did the snapshot get interrupted?)', snap_dir)
    // }

    // move output files into versioned folder
    await collectSnapshotDirVersionFiles(snap_dir)

    // clear any existing symlinks
    await clearSnapshotDirSymlinks(snap_dir)

    // assert task dir is empty and contains no bare files that might get overwritten, also asserts version dirs are valid
    await assertSnapshotDirIsValid(snap_dir, {is_empty: true})


    const version_dirs = (await fs.promises.readdir(path.join(snap_dir, 'versions'))).sort() // earliest to latest
    const most_recent = version_dirs.at(-1)

    // for each version dir in versions/ (oldest -> newest)
    for (const version_dir of version_dirs) {
        if (version_dir.startsWith('.')) continue

        const version_dir_abspath = path.join(snap_dir, 'versions', version_dir)
        const version_dir_files = (
            (await fs.promises.readdir(version_dir_abspath))
            .filter(filename => !filename.startsWith('.')))

        // iterate through all the files/folders in the version dir
        for (const filename of version_dir_files) {
            const snapdir_entry = path.join(snap_dir, filename) // ./data/archive/<snapid>/filename
            const versiondir_entry = path.join(snap_dir, 'versions', version_dir, filename) // ./data/archive/<snapid>/versions/<versionid>/filename

            if (fs.existsSync(snapdir_entry)) {
                // if an entry already exists in the snapshot root for this filename
                if ((await fs.promises.lstat(snapdir_entry)).isSymbolicLink()) {
                    // if a symlink already exists in the root with the same name,
                    // check if the version file we're looking at is a better candidate to replace it

                    const existing_abspath = await fs.promises.realpath(snapdir_entry)
                    const desired_abspath = path.join(version_dir_abspath, filename)
                    if (existing_abspath != desired_abspath) {
                        // check if the new candidate is larger or if the existing symlink is larger (largest file = most likely to be highest quality capture data)
                        const largest_path = await getLargestPath(existing_abspath, desired_abspath)
                        if (largest_path != (await fs.promises.realpath(existing_abspath))) {
                            // the candidate won: re-point the root symlink at its version's copy
                            const larger_version = path.basename(path.dirname(largest_path))
                            const larger_abspath = path.join(snap_dir, 'versions', larger_version, filename)

                            // console.log(' - swapping for larger file:', filename, '->', larger_abspath.split('/archive/').at(-1))
                            await overwriteSymlink(larger_abspath, snapdir_entry, {search_limit: snap_dir})
                        } else {
                            // existing target is at least as large, keep it
                            // console.log(' - leaving larger file:', largest_path.split('/archive/').at(-1))
                        }
                    } else {
                        // leave existing symlink pointing to current version file, nothing to change
                        // console.log(' - leaving current file:', existing_abspath.split('/archive/').at(-1))
                    }
                } else {
                    // clearSnapshotDirSymlinks() should have already cleared these files out!
                    throw `Non-symlink file found in root of snapshot dir! Refusing to overwrite: ${prettyPath(snapdir_entry)}`
                }
            } else {
                // no entry exists in the snapshot root for this filename, create one by linking to the version file
                await overwriteSymlink(versiondir_entry, snapdir_entry, {search_limit: snap_dir})
            }
            // if (version_dir == most_recent) {
            //     // only log most recent links even though we link older ones too (otherwise its too noisy)
            //     console.log(` 🔗 ./$(unknown) -> ./${versiondir_entry} linking...`)
            // }
        }
    }

    return snap_dir
}
|
|
|
+
|
|
|
/**
 * Validate a snapshot dir's on-disk layout:
 *  - versions/ exists and is a real directory (not a symlink)
 *  - optionally (is_empty) no loose entries exist in the root
 *  - every non-hidden root entry is a non-broken symlink into versions/
 *  - every versions/<version_id>/ dir passes assertVersionDirIsValid()
 * Finally writes a .files.json listing (sizes & hashes) into the root.
 * NOTE: chdir()s into the snapshot dir as a side effect.
 *
 * @param {string} snap_dir - snapshot dir to validate
 * @param {{is_empty?: boolean}} opts - require the root to contain no loose entries
 */
async function assertSnapshotDirIsValid(snap_dir, {is_empty=false}={}) {
    process.chdir(snap_dir)
    console.log()
    console.log(`[☑️] Checking that snapshot records are valid...`)

    // get all directory entries in archive/<snapshot_id>/* (hidden files and versions/ excluded)
    const snapshot_dir_entries =
        (await fs.promises.readdir(snap_dir, {withFileTypes: true}))
        .filter(dirent => {
            if (dirent.name.startsWith('.')) return false
            if (dirent.name == 'versions') return false
            return true // FIX: the callback previously fell through returning undefined,
                        // so every entry was filtered out and the checks below never fired
        })

    // assert versions folder exists and is not a symbolic link
    const versions_dir = path.join(snap_dir, 'versions')
    assert(fs.existsSync(versions_dir))
    assert(!(await fs.promises.lstat(versions_dir)).isSymbolicLink())

    // if it should be empty, check that no loose files exist
    if (is_empty) {
        assert(!snapshot_dir_entries.length, `Found loose files in snapshot-dir that shouldn't be there! ${snap_dir}`)
    }

    // assert all non-hidden files in snapshot dir are symbolic links to actual data in versions/<date>/*
    for (const snapshot_dir_entry of snapshot_dir_entries) {
        if (snapshot_dir_entry.name.startsWith('.')) continue
        if (snapshot_dir_entry.name == 'versions') continue
        assert(snapshot_dir_entry.isSymbolicLink(), `Found non-symbolic link in root of snapshot dir! ${snap_dir}/${snapshot_dir_entry.name}`)
        // existsSync follows the link; a dangling symlink reports false (relies on the chdir above)
        assert(fs.existsSync(snapshot_dir_entry.name), `Found broken symbolic link in root of snapshot dir! ${snap_dir}/${snapshot_dir_entry.name}`)
    }

    const version_entries = (
        (await fs.promises.readdir(versions_dir))
        .filter(foldername => !foldername.startsWith('.'))
        .sort())

    console.log(`  √ ${prettyPath(versions_dir)}`, version_entries.length)

    for (const version_dir of version_entries) {
        await assertVersionDirIsValid(path.join(versions_dir, version_dir))
    }

    // write snapshot dir file listing w/ sizes & hashes to .files.json
    const directory_info = await getDirInfo(snap_dir, {withRoot: true, withHelpers: false, maxdepth: 3})
    await overwriteFile(path.join(snap_dir, '.files.json'), directory_info)
}
|
|
|
+
|
|
|
+async function assertVersionDirIsValid(version_dir) {
|
|
|
+ const dirname = path.parse(version_dir).name
|
|
|
+ assert(fs.existsSync(version_dir), `Version dir does not exist: ${prettyPath(version_dir)}`)
|
|
|
+
|
|
|
+ const dirent = await fs.promises.lstat(version_dir)
|
|
|
+ assert(dirent.isDirectory() && !dirent.isSymbolicLink(), `Found non-directory in versions dir! ${prettyPath(version_dir)}`)
|
|
|
+
|
|
|
+ const unix_epoch = '19700101000000'
|
|
|
+ const is_name_valid_datestr = /^\d+$/.test(dirname) && (dirname.length == 14) && (dirname.startsWith('2') || dirname == unix_epoch) && parseVersionDateStr(dirname)
|
|
|
+ assert(is_name_valid_datestr, `Version directories must be a 14-character long date string like 20251231235959! ${dirname}`)
|
|
|
+
|
|
|
+ // get all directory entries in archive/<snapshot_id>/versions/<version_id>/*
|
|
|
+ const version_dir_entries = (
|
|
|
+ (await fs.promises.readdir(version_dir, {withFileTypes: true}))
|
|
|
+ .filter((dirent) => !dirent.name.startsWith('.')))
|
|
|
+
|
|
|
+ // assert version dir contains only actual snapshot output files (not-symbolic links or other version dirs)
|
|
|
+ for (const version_dir_entry of version_dir_entries) {
|
|
|
+ assert(version_dir_entry.name != 'versions', `Version dir cannot contain another versions folder! ${prettyPath(version_dir)}/versions`)
|
|
|
+ assert(!version_dir_entry.isSymbolicLink(), `Version dir cannot contain symbolic link! ${prettyPath(version_dir)}/${version_dir_entry.name}`)
|
|
|
+ }
|
|
|
+
|
|
|
+ // color highlight the unix epoch version in black, and any version created today in blue
|
|
|
+ let pretty_dirname = dirname
|
|
|
+ if (dirname == unix_epoch) {
|
|
|
+ pretty_dirname = ANSI.black + unix_epoch + ANSI.reset
|
|
|
+ }
|
|
|
+ const today = versionStrFromDate(new Date(), {withDate: true, withTime: false})
|
|
|
+ if (dirname.startsWith(today)) {
|
|
|
+ pretty_dirname = ANSI.blue + dirname + ANSI.reset
|
|
|
+ }
|
|
|
+
|
|
|
+ // write version dir file listing w/ sizes & hashes to .files.json
|
|
|
+ const directory_info = await getDirInfo(version_dir, { withRoot: true, withHelpers: false, maxdepth: 3 })
|
|
|
+ await overwriteFile(path.join(version_dir, '.files.json'), directory_info)
|
|
|
+
|
|
|
+ console.log(` √ ./versions/${pretty_dirname} contains`, version_dir_entries.length, 'results')
|
|
|
+}
|
|
|
+
|
|
|
+async function setupSnapshotDB({ original_url, start_time, snapshot_dir }) {
|
|
|
+ // setup Snapshot database row, finding it if it already exists or creating a new one
|
|
|
+
|
|
|
+ const timestamp = snapshot_dir.split('/').at(-1)
|
|
|
+ const search_attrs = { url: original_url, timestamp }
|
|
|
+ const update_attrs = { url: original_url, timestamp, added: start_time, title: null }
|
|
|
+
|
|
|
+ let snapshot = await Snapshot.findOne({ where: search_attrs });
|
|
|
+ let created = false
|
|
|
+ if (!snapshot) {
|
|
|
+ snapshot = await Snapshot.findOne({ where: {url: original_url} });
|
|
|
+ if (snapshot) {
|
|
|
+ // console.warn(`[X] Found DB Snapshot [${timestamp}](${original_url.substring(0, 30)}...) that has different timestamp from existing dir ${prettyPath(snapshot_dir)}!`)
|
|
|
+ // throw 'Snapshot DB record does not match filesystem path!'
|
|
|
+ } else {
|
|
|
+ console.log(`[+] Creating new DB Snapshot [${timestamp}](${original_url.substring(0, 30)}...) for ${prettyPath(snapshot_dir)}...`)
|
|
|
+ // ;([snapshot, created] = await Snapshot.findOrCreate({where: search_attrs, defaults: update_attrs }));
|
|
|
+ // throw 'Wanted to create new Snapshot but refusing to modify DB during testing!'
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ // assert(snapshot && (snapshot instanceof Snapshot))
|
|
|
+ return snapshot
|
|
|
+}
|
|
|
+
|
|
|
+async function setupViewport(page, _page_state) {
|
|
|
+ // setup viewport
|
|
|
+ await page.setViewport(DEFAULT_VIEWPORT);
|
|
|
+ await page.setGeolocation(DEFAULT_GEOLOCATION);
|
|
|
+ // await page.setBypassCSP(true); // bypass CSP restrictions (requires --disable-web-security)
|
|
|
+ page.setDefaultTimeout(DEFAULT_TIMEOUT);
|
|
|
+
|
|
|
+ // Optional: emulate a mobile device
|
|
|
+ // await page.emulate(puppeteer.devices['iPhone 6']);
|
|
|
+
|
|
|
+ // Configure light mode/dark mode & accessibility reduced motion preferences
|
|
|
+ await page.emulateMediaFeatures([
|
|
|
+ {name: 'prefers-color-scheme', value: DEFAULT_COLOR_SCHEME},
|
|
|
+ {name: 'prefers-reduced-motion', value: 'reduce'},
|
|
|
+ ]);
|
|
|
+
|
|
|
+ // Setup headers & deterministically chose a random referrer based on URL
|
|
|
+ const rand_idx = hashCode(await page.url()) % DEFAULT_REFERRERS.length
|
|
|
+ await page.setExtraHTTPHeaders({
|
|
|
+ ...DEFAULT_HEADERS,
|
|
|
+ referrer: DEFAULT_REFERRERS[rand_idx],
|
|
|
+ })
|
|
|
+
|
|
|
+ // Setup alert to trigger if site tries to sniff whether we are a bot
|
|
|
+ function sniffDetector() {
|
|
|
+ const userAgent = window.navigator.userAgent;
|
|
|
+ const platform = window.navigator.platform;
|
|
|
+ // @ts-ignore
|
|
|
+ window.navigator.__defineGetter__('userAgent', function () {
|
|
|
+ // @ts-ignore
|
|
|
+ window.navigator.sniffed = true;
|
|
|
+ return userAgent;
|
|
|
+ });
|
|
|
+ // @ts-ignore
|
|
|
+ window.navigator.__defineGetter__('platform', function () {
|
|
|
+ // @ts-ignore
|
|
|
+ window.navigator.sniffed = true;
|
|
|
+ return platform;
|
|
|
+ });
|
|
|
+ }
|
|
|
+ await page.evaluateOnNewDocument(sniffDetector);
|
|
|
+ // @ts-ignore
|
|
|
+ const was_sniffed = await page.evaluate(() => (!!window.navigator.sniffed))
|
|
|
+ if (was_sniffed) {
|
|
|
+ console.warn('[⚠️] Site tried to sniff if we are a bot! Site may be difficult to archive.')
|
|
|
+ }
|
|
|
+
|
|
|
+ return page
|
|
|
+}
|
|
|
+
|
|
|
+async function setupModalAutoClosing(page, page_state, {timeout=1_250}={}) {
|
|
|
+ page.on('dialog', (dialog) => {
|
|
|
+ console.log(`[👆] Auto-closing modal that popped up: ${dialog.message()}...`)
|
|
|
+ setTimeout(() => {try { dialog.accept() } catch(err) {}}, timeout);
|
|
|
+ })
|
|
|
+
|
|
|
+ // if you expect a file-upload dialog, use this to catch it instead:
|
|
|
+ // const [fileChooser] = await Promise.all([
|
|
|
+ // page.waitForFileChooser(),
|
|
|
+ // ]);
|
|
|
+ // await fileChooser.accept(['/tmp/myfile.pdf']);
|
|
|
+ page.on('close', () => {
|
|
|
+ try {
|
|
|
+ page.off('dialog')
|
|
|
+ } catch(err) {}
|
|
|
+ })
|
|
|
+}
|
|
|
+
|
|
|
+async function startScreenrecording(page, page_state, {duration_limit=60, codec='libx264'}={}) {
|
|
|
+ await fs.promises.mkdir(path.dirname(SCREENRECORDING_PATH(page)), {recursive: true})
|
|
|
+ // console.log(`[🎬] Starting screen-recording stream...`.padEnd(82), prettyPath(SCREENRECORDING_PATH(page)))
|
|
|
+
|
|
|
+ // alternative: interact with low-level puppeteer screencast API directly
|
|
|
+ // using puppeteer.page.screencast: https://pptr.dev/api/puppeteer.page.screencast
|
|
|
+ // const recorder = await page.screencast({path: SCREENRECORDING_PATH(page)});
|
|
|
+
|
|
|
+ // alternative: use puppeteer-stream for .webm/.mp4 screen recordings with tab audio included
|
|
|
+ // works sometimes but has a few issues, e.g.: https://github.com/SamuelScheit/puppeteer-stream/issues/8
|
|
|
+
|
|
|
+ // alternative: puppeteer-screen-recorder (most compatible/stable but doesn't include tab audio output)
|
|
|
+ const recorder = new PuppeteerScreenRecorder(page, {
|
|
|
+ followNewTab: false,
|
|
|
+ recordDurationLimit: duration_limit,
|
|
|
+ // fps: 25,
|
|
|
+ // ffmpeg_Path: '<path of ffmpeg_path>' || null,
|
|
|
+ // videoFrame: {
|
|
|
+ // width: 1024,
|
|
|
+ // height: 768,
|
|
|
+ // },
|
|
|
+ // videoCrf: 18,
|
|
|
+ videoCodec: codec,
|
|
|
+ // videoPreset: 'ultrafast',
|
|
|
+ // videoBitrate: 1000,
|
|
|
+ // autopad: {
|
|
|
+ // color: 'black' | '#35A5FF',
|
|
|
+ // },
|
|
|
+ // aspectRatio: '4:3',
|
|
|
+ });
|
|
|
+ page_state.recorder = recorder
|
|
|
+ await recorder.start(SCREENRECORDING_PATH(page))
|
|
|
+
|
|
|
+ page.on('close', async () => {await saveScreenrecording(page, page_state)});
|
|
|
+ return page_state
|
|
|
+}
|
|
|
+
|
|
|
async function startResponseSaving(page, page_state) {
    // Save the raw bytes of selected network responses for this page, WARC-style
    // but as a plain directory tree under RESPONSES_PATH(page):
    //     ./all/<TS>__<METHOD>__<URL>.<EXT>     actual response bodies
    //     ./<TYPE>/<HOST>/<PATH>/<NAME>.<EXT>   symlinks pointing into ./all/
    //     ./index.jsonl                         one JSON metadata record per response
    // Side effect: also captures the first response matching page_state.original_url
    // onto page_state.main_response if it wasn't caught earlier.
    const dir = RESPONSES_PATH(page)
    await fs.promises.mkdir(dir, {recursive: true})

    console.log(`[🌄] Starting raw response bytes recording...`.padEnd(82), prettyPath(dir) + '/')

    // possible puppeteer resourceType values:
    // Document, Stylesheet, Image, Media, Font, Script, TextTrack, XHR, Fetch, Prefetch, EventSource, WebSocket, Manifest, SignedExchange, Ping, CSPViolationReport, Preflight, Other
    const types_to_save = [
        // 'document',
        'script',
        'stylesheet',
        'font',
        'image',
        'media',
        'xhr',
        'websocket',
    ]

    // reset responses index file to empty
    const responses_log_path = path.join(dir, 'index.jsonl')
    await overwriteFile(responses_log_path, '')

    // add handler to save all matching responses into the output directory
    page.on('response', async (response) => {
        try {

            const timestamp = versionStrFromDate(new Date(), {withDate: true, withTime: true, withSeconds: true, withMilliseconds: true})

            if (!page_state.main_response && (response.request().url() == page_state.original_url)) {
                // save first response as main page response (if we havent already caught it earlier)
                page_state.main_response = response
            }

            // skip redirects (3xx) and client errors (4xx); 2xx and 5xx fall through
            const status = response.status()
            if ((status >= 300) && (status < 500)) {
                // console.log('Got bad response from', response.url(), 'to', response.headers()['location'])
                return
            }
            const request = response.request()
            const resourceType = request.resourceType()
            const url_scheme = (response.url() || request.url()).split(':')[0].toLowerCase()
            // data: urls have no meaningful HTTP method, so label them DATA instead
            const method = (url_scheme === 'data') ? 'DATA' : request.method()

            // console.log(' ', resourceType, response.url())
            if (types_to_save.includes(resourceType)) {
                // create ./responses/xhr/www.facebook.com/static/images/icons/ subdir based on hostname + path
                const resource_type_dir = path.join(dir, resourceType)
                const url = new URL(response.url())
                let subdir = resource_type_dir
                // directory part of the URL path, truncated to 250 chars to stay under filesystem name limits
                const url_path = (url.pathname || '').slice(0, 250).endsWith('/')
                    ? (url.pathname || '').slice(0, 250)
                    : path.dirname((url.pathname || '').slice(0, 250))

                // determine subdirectory based on url type (handles http:,https:,file:,data:,chrome-extension:,about:,etc.)
                if (!URL_SCHEMES_IGNORED.includes(url_scheme)) {
                    // is a normal http:// or https:// url, use the domain + path to construct subdirectory
                    subdir = path.join(resource_type_dir, (url.hostname || 'data').slice(0, 250), url_path)
                } else if (url_scheme == 'data') {
                    // is a data:... url, store in ./data subdirectory
                    subdir = path.join(resource_type_dir, 'data')
                } else {
                    // is a chrome-extension:// or other special url, use the extension id + path to construct subdirectory
                    const url_path = path.dirname((url.pathname || '').slice(0, 999))
                    subdir = path.join(resource_type_dir, url_scheme, (url.hostname || 'data').slice(0, 250), url_path)
                }

                // write response to responses/all/1716861056899__https%3A%2F%2Fwww.instagram.com%2Fgraphql%2Fquery.json
                let abspath = null
                let resp_mimetype = null
                let extension = ''
                let uniq_filename = null
                let uniq_abspath = null
                let symlink_abspath = null
                let responseSha256 = null
                try {
                    await fs.promises.mkdir(path.join(dir, 'all'), {recursive: true})
                    try {
                        await fs.promises.mkdir(subdir, {recursive: true})
                    } catch(err) {
                        // mkdir fails when a path component already exists as a *file*;
                        // retry with a .dir suffix, then fall back to the generic ./data dir
                        subdir = subdir + '.dir' // TODO: apply this workaround to parent path entries too
                        try {
                            await fs.promises.mkdir(subdir, {recursive: true})
                        } catch(err) {
                            subdir = path.join(resource_type_dir, 'data')
                            await fs.promises.mkdir(subdir, {recursive: true})
                        }
                    }
                    ;({abspath: symlink_abspath, resp_mimetype, extension} = await detectFilename({page, response, dir: subdir, resourceType}))

                    // responses/all/1716861056899__https%3A%2F%2Fwww.instagram.com%2Fgraphql%2Fquery.json
                    // NOTE(review): the '.' in `.${extension}$` is an unescaped regex dot (matches any char) — confirm intended
                    uniq_filename = `${timestamp}__${method}__` + [encodeURIComponent(url.href).slice(0, 64).replaceAll('/', '_').replace(new RegExp(`.${extension}$`), ''), extension].filter(s => s.length).join('.')
                    uniq_abspath = path.join(dir, 'all', uniq_filename)


                    let bytesBuffer = null
                    try {
                        bytesBuffer = await response.buffer()
                    } catch(err) {
                        if (String(err).includes("Cannot read properties of undefined (reading 'body')")) {
                            // not sure why it's happening but seems to be too late to capture body sometimes? possible race condition
                        } else {
                            console.warn('[⚠️] Failed to save response bytes for:', response.request().url(), err)
                        }
                    }
                    if (bytesBuffer) {
                        // write response data into ./all/<TS>__<METHOD>__<URL>.<EXT>
                        await overwriteFile(uniq_abspath, bytesBuffer)

                        responseSha256 = crypto.createHash('sha256').update(bytesBuffer).digest('hex')

                        // write symlink file to ./<TYPE>/<DOMAIN>/...<PATH>/<FILENAME>.<EXT> -> ./all/<TS>__<METHOD>__<URL>.<EXT>
                        await overwriteSymlink(uniq_abspath, symlink_abspath, {relative: dir, mkdirs: true, search_limit: dir})
                    }
                    // console.log(' ->', symlink_abspath)
                } catch(err) {
                    // dont do anything for redirect responses, error responses, etc.
                    console.warn(err)
                }

                const urlSha256 = crypto.createHash('sha256').update(String(request.url())).digest('hex')
                // const headersSha256 = crypto.createHash('sha256').update(String(request.headers())) // someday we may want to save headers hashes too

                const truncated_url = (method == 'DATA') ? request.url().slice(0, 128) : request.url() // don't duplicate bytes in data: urls (we already saved them in the file)

                // this is essentially replicating the functionality of a WARC file, but in directory + index.jsonl form
                await fs.promises.appendFile(
                    responses_log_path,
                    JSON.stringify({
                        ts: timestamp,
                        method,
                        url: truncated_url,
                        urlSha256,
                        postData: request.postData(),
                        response_url: ((method != 'DATA') && (url.href != request.url())) ? url.href : undefined,
                        status,
                        resourceType,
                        mimeType: resp_mimetype,
                        responseSha256,
                        path: uniq_abspath?.replace(dir, '.'),
                        symlink_path: symlink_abspath?.replace(dir, '.'),
                        extension,
                    }) + '\n',
                    'utf-8',
                )
            }
        } catch(err) {
            // we should never throw hard errors here because there's nothing above us to catch it
            // and we dont want to crash the entire CDP session / browser / main node process
            console.warn('[❌] Error in response handler (set in startResponseSaving):', err)
        }
    });
    // handled by stopMetadataRecording():
    // page.on('close', () => {
    //     page.off('response')
    // })
}
|
|
|
+
|
|
|
+function dedupeCookies(cookies) {
|
|
|
+ const len_before = cookies.length
|
|
|
+
|
|
|
+ const allowed_cookie_attrs = ['domain', 'path', 'name', 'value', 'expires', 'sameSite', 'sourceScheme', 'url', 'priority', 'secure', 'httpOnly']
|
|
|
+
|
|
|
+ const deduped_cookies = {}
|
|
|
+ for (const cookie of cookies) {
|
|
|
+ try {
|
|
|
+ const unique_id = `${cookie.domain}${cookie.path}${cookie.name}`
|
|
|
+ deduped_cookies[unique_id] = {
|
|
|
+ ...(deduped_cookies[unique_id] || {}),
|
|
|
+ ...cookie,
|
|
|
+ expires: 2147483640, // max allowed expiry time (2038-01-18)
|
|
|
+ session: false, // make sure cookies dont expire at browser close time
|
|
|
+ secure: false, // make cookie restrictions more lax (for archiving scripts)
|
|
|
+ httpOnly: false, // make it easier to tamper with cookies from JS (for archiving scripts)
|
|
|
+
|
|
|
+ // "path": "/",
|
|
|
+ // "expires": 2147483641,
|
|
|
+ // "size": 194,
|
|
|
+ // "httpOnly": false,
|
|
|
+ // "secure": false,
|
|
|
+ // "session": false,
|
|
|
+ // "priority": "High",
|
|
|
+ // "sameParty": false,
|
|
|
+ // "sourceScheme": "Secure",
|
|
|
+ // "sourcePort": 443
|
|
|
+
|
|
|
+ // and more... https://pptr.dev/api/puppeteer.cookieparam
|
|
|
+ } as Cookie
|
|
|
+
|
|
|
+ if (!deduped_cookies[unique_id].value) {
|
|
|
+ delete deduped_cookies[unique_id]
|
|
|
+ continue
|
|
|
+ }
|
|
|
+ if (deduped_cookies[unique_id].name.startsWith('__')) {
|
|
|
+ // cookies that start with __ must be secure, see https://github.com/puppeteer/puppeteer/issues/6806
|
|
|
+ deduped_cookies[unique_id].secure = true
|
|
|
+ deduped_cookies[unique_id].sourceScheme = 'Secure'
|
|
|
+ }
|
|
|
+ if (deduped_cookies[unique_id].domain.startsWith('.')) {
|
|
|
+ deduped_cookies[unique_id].sameParty = false
|
|
|
+ deduped_cookies[unique_id].domain = deduped_cookies[unique_id].domain.slice(1)
|
|
|
+ }
|
|
|
+
|
|
|
+ for (const key of Object.keys(deduped_cookies[unique_id])) {
|
|
|
+ if (!allowed_cookie_attrs.includes(key)) {
|
|
|
+ delete deduped_cookies[unique_id][key]
|
|
|
+ }
|
|
|
+ }
|
|
|
+ } catch(err) {
|
|
|
+ console.error('[❌] Failed to parse cookie during deduping', cookie)
|
|
|
+ throw err
|
|
|
+ }
|
|
|
+ }
|
|
|
+ // console.log(`[🍪] Deduped ${len_before} cookies to ${Object.keys(deduped_cookies).length}...`)
|
|
|
+
|
|
|
+ return Object.values(deduped_cookies) as Cookie[]
|
|
|
+}
|
|
|
+
|
|
|
+async function loadCookiesTxt() {
|
|
|
+ const cookies = [] as Cookie[]
|
|
|
+ return cookies // write-only from chrome -> files for now
|
|
|
+
|
|
|
+ if (fs.existsSync(COOKIES_TXT_PATH)) {
|
|
|
+ // console.log(`[🍪] Loading cookies/localStorage/sessionStorage from ${COOKIES_TXT_PATH}...`)
|
|
|
+
|
|
|
+ // Read from to cookies.txt file using tough-cookie + @root/file-cookie-store
|
|
|
+ const cookies_store = new FileCookieStore(COOKIES_TXT_PATH, {auto_sync: false, lockfile: false});
|
|
|
+ cookies_store.getAllCookiesAsync = util.promisify(cookies_store.getAllCookies);
|
|
|
+ const exported_cookies = await cookies_store.getAllCookiesAsync()
|
|
|
+ for (const cookie of exported_cookies) {
|
|
|
+ const cookie_from_tough = cookie.toJSON()
|
|
|
+ const domain = cookie_from_tough.hostOnly ? `.${cookie_from_tough.domain}` : cookie_from_tough.domain
|
|
|
+ const cookie_for_puppeteer: Cookie = {
|
|
|
+ domain,
|
|
|
+ name: cookie_from_tough.key,
|
|
|
+ path: cookie_from_tough.path,
|
|
|
+ value: cookie_from_tough.value,
|
|
|
+ secure: cookie_from_tough.secure || false,
|
|
|
+ httpOnly: cookie_from_tough.httpOnly || false,
|
|
|
+ session: false,
|
|
|
+ expires: (new Date(cookie_from_tough.expires)).valueOf()/1000,
|
|
|
+ size: undefined,
|
|
|
+ }
|
|
|
+ // console.log('COOKIE_FROM_TOUGH_TXT', cookie_from_tough, cookie_for_puppeteer)
|
|
|
+ cookies.push(cookie_for_puppeteer)
|
|
|
+ }
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
// Shape of the persisted auth.json file: cookies plus per-origin
// sessionStorage/localStorage maps (keyed by origin, then by storage key).
type AuthJSON = {
    cookies: Cookie[],
    sessionStorage: any, // { [origin: string]: { [key: string]: string } } — TODO confirm
    localStorage: any, // { [origin: string]: { [key: string]: string } } — TODO confirm
}
|
|
|
+
|
|
|
async function loadAuthStorage(page, {client}, {apply=true}={}) {
    // Load persisted auth state (cookies + per-origin sessionStorage/localStorage)
    // from cookies.txt and auth.json on disk, dedupe the cookies, and (when
    // apply=true) inject the storage values into the current document and into
    // every future navigation. Always returns {cookies, sessionStorage, localStorage}.
    var {
        cookies,
        sessionStorage,
        localStorage,
    }: AuthJSON = {cookies: [], sessionStorage: {}, localStorage: {}}

    if (!LOAD_AUTH_STORAGE) {
        // dont read auth from filesystem auth.json/cookies.txt, just rely on existing cookies in chrome profile
        return {cookies, sessionStorage, localStorage}
    }

    if (fs.existsSync(COOKIES_TXT_PATH)) {
        try {
            // NOTE: loadCookiesTxt() currently always returns [] (cookies.txt is write-only for now)
            cookies = await loadCookiesTxt()
        } catch(err) {
            // move an unparseable cookies.txt aside rather than crash
            console.warn('[⚠️] Loaded invalid cookies.txt, moved it to cookies.txt.corrupted (did two processes try to change it at the same time?)')
            await fs.promises.rename(COOKIES_TXT_PATH, COOKIES_TXT_PATH + '.corrupted')
        }
        // console.log(`[🍪] Loading cookies from cookies.txt...`, cookies.length)
    }

    if (fs.existsSync(AUTH_JSON_PATH)) {
        try {
            // NOTE(review): if auth.json lacks the sessionStorage/localStorage keys, this
            // re-destructure clobbers the {} defaults with undefined and the [origin]
            // lookups below would throw — confirm auth.json always contains both keys
            var {
                cookies: auth_json_cookies,
                sessionStorage,
                localStorage,
            } = JSON.parse(await fs.promises.readFile(AUTH_JSON_PATH, 'utf-8'));
            // auth.json cookies are appended after cookies.txt ones, so they win in dedupeCookies
            cookies = [...cookies, ...auth_json_cookies]
            // console.log(`[🍪] Loading cookies from auth.json...`, auth_json_cookies.length)
        } catch(err) {
            // move an unparseable auth.json aside rather than crash
            console.warn('[⚠️] Loaded invalid auth.json, moved it to auth.json.corrupted (did two processes try to change it at the same time?)')
            await fs.promises.rename(AUTH_JSON_PATH, AUTH_JSON_PATH + '.corrupted')
        }
    }

    cookies = dedupeCookies(cookies)

    if (apply) {
        console.log(`[🍪] Loading stored cookies/localStorage/sessionStorage into session...`, cookies.length)

        // (cookie injection into the page is currently disabled — cookies are returned but not set)
        // if (cookies?.length) {
        //     try {
        //         // try setting all at once first (much faster)
        //         await page.setCookie(...cookies)
        //     } catch(err) {
        //         // if any errors, fall back to setting one-by-one so that individual error can be caught
        //         for (const cookie of cookies) {
        //             try {
        //                 await page.setCookie(cookie);
        //             } catch(err) {
        //                 console.error('[❌] Failed to set cookie', cookie)
        //                 throw err
        //             }
        //         }
        //     }
        // }

        // apply saved values to the *current* document's storage for its origin
        const origin = await page.evaluate(() => window.location.origin)

        await page.evaluate((savedSessionStorage) => {
            for (const [key, value] of Object.entries(savedSessionStorage)) {
                sessionStorage[key] = value;
            }
        }, sessionStorage[origin] || {});

        await page.evaluate((savedLocalStorage) => {
            for (const [key, value] of Object.entries(savedLocalStorage)) {
                localStorage[key] = value;
            }
        }, localStorage[origin] || {});

        // origin/auth context changes when we do page.goto so we have to hook pageload and apply it then as well
        // https://stackoverflow.com/questions/51789038/set-localstorage-items-before-page-loads-in-puppeteer
        await page.evaluateOnNewDocument(({sessionStorage, localStorage}) => {
            const origin = window.location.origin;

            for (const [key, value] of Object.entries(sessionStorage[origin] || {})) {
                window.sessionStorage.setItem(key, value as string)
            }
            for (const [key, value] of Object.entries(localStorage[origin] || {})) {
                window.localStorage.setItem(key, value as string)
            }

        }, {sessionStorage, localStorage});
    }

    return {cookies, sessionStorage, localStorage}
}
|
|
|
+
|
|
|
+async function loadCloudflareCookie(page, {original_url}, {timeout=20_000}={}) {
|
|
|
+ // make request to FlareSolverr server to get magic cookies that let us bypass cloudflare bot detection
|
|
|
+ // docker run -p 8191:8191 -e LOG_LEVEL=info ghcr.io/flaresolverr/flaresolverr
|
|
|
+
|
|
|
+
|
|
|
+ // alternatives if this stops working:
|
|
|
+ // - https://github.com/omkarcloud/botasaurus
|
|
|
+ // - https://github.com/ultrafunkamsterdam/nodriver
|
|
|
+ // - https://github.com/Akmal-CloudFreed/CloudFreed-CloudFlare-bypass
|
|
|
+ // - https://github.com/VeNoMouS/cloudscraper
|
|
|
+
|
|
|
+ const query = { url: original_url, cmd: "request.get", maxTimeout: timeout }
|
|
|
+ try {
|
|
|
+ const response = await fetch(FLARESOLVERR_API_ENDPOINT, {
|
|
|
+ method: 'POST',
|
|
|
+ headers: {'Content-Type': 'application/json'},
|
|
|
+ body: JSON.stringify(query),
|
|
|
+ });
|
|
|
+ const data = await response.json();
|
|
|
+
|
|
|
+ const new_cookies = (data?.solution?.cookies || []).map(cookie => ({
|
|
|
+ ...cookie,
|
|
|
+ 'expires': 2147483640, // overwrite expiration to 32bit maximum timestamp (2038-01-18)
|
|
|
+ 'secure': false, // cookie value is plain text (not encrypted/encoded)
|
|
|
+ }))
|
|
|
+
|
|
|
+ if (new_cookies.length) {
|
|
|
+ console.log(`[☑️] Got Cloudflare bypass cookies (${new_cookies.length}) from FlareSolverr API...`)
|
|
|
+ await page.setCookie(...new_cookies);
|
|
|
+ return new_cookies
|
|
|
+ } else {
|
|
|
+ const error_str = JSON.stringify(data?.message || data, null, 4)
|
|
|
+ throw `Bad FlareSolverr Response: ${error_str}`
|
|
|
+ }
|
|
|
+
|
|
|
+ } catch (error) {
|
|
|
+ if (JSON.stringify(error).includes('Challenge not detected')) {
|
|
|
+ console.log('[☑️] Page is accessible without FlareSolverr Cloudflare bypass.')
|
|
|
+ } else {
|
|
|
+ console.warn('[❌] Failed to get Cloudflare bypass cookies from FlareSolverr API.', error)
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return []
|
|
|
+}
|
|
|
+
|
|
|
+async function setupURLRewriting(page, page_state) {
|
|
|
+ await page.setRequestInterception(true);
|
|
|
+
|
|
|
+ const rewrites = URL_REWRITES.sort((a, b) => (a.idx || 0) - (b.idx || 0))
|
|
|
+
|
|
|
+ page.on('request', interceptedRequest => {
|
|
|
+ if (interceptedRequest.isInterceptResolutionHandled()) return;
|
|
|
+
|
|
|
+ const original_url = interceptedRequest.url()
|
|
|
+
|
|
|
+ // apply all the rewrites in order to the request URL
|
|
|
+ let url = original_url
|
|
|
+ for (const rewrite of rewrites) {
|
|
|
+ const new_url = url.replace(rewrite.pattern, rewrite.replacement)
|
|
|
+ // console.log(rewrite, url, new_url)
|
|
|
+
|
|
|
+ // if url is rewritten to an emptystring, abort the request
|
|
|
+ if (!new_url) {
|
|
|
+ console.warn('[🟥] Request blocked', rewrite.pattern, ':', url)
|
|
|
+ interceptedRequest.abort()
|
|
|
+ return
|
|
|
+ }
|
|
|
+ else if (new_url && new_url != url) {
|
|
|
+ // console.warn('[📳] Request rewritten', rewrite.pattern, rewrite.replacement, ':', url, '->', new_url)
|
|
|
+ console.warn('[📳] Request rewritten', rewrite.pattern, ':', new_url)
|
|
|
+ url = new_url
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ if (url == original_url) {
|
|
|
+ // if url is unchanged, continue request flow as-is
|
|
|
+ interceptedRequest.continue()
|
|
|
+ } else {
|
|
|
+ // otherwise redirect the browser to our rewritten version
|
|
|
+ interceptedRequest.respond({
|
|
|
+ status: 302,
|
|
|
+ headers: {
|
|
|
+ location: url,
|
|
|
+ 'x-redirect-by': 'ArchiveBox.setupURLRewriting',
|
|
|
+ },
|
|
|
+ })
|
|
|
+ }
|
|
|
+ });
|
|
|
+ // handled by stopMetadataRecording():
|
|
|
+ // page.on('close', () => {
|
|
|
+ // page.off('request')
|
|
|
+ // page.setRequestInterception(false)
|
|
|
+ // })
|
|
|
+}
|
|
|
+
|
|
|
+async function startMetadataRecording(page, {original_url, version, client, traffic_log, console_log, redirects}) {
|
|
|
+ // update helper state on page
|
|
|
+ page._original_url = (original_url || (await page.url())).toString()
|
|
|
+
|
|
|
+ // DEBUGGING: helpers for repl() debugging, dont rely on these (global state is badd mmkay)
|
|
|
+ // page._client = client || page._client || await page.target().createCDPSession()
|
|
|
+ // page._redirects = redirects
|
|
|
+ // page._traffic_log = traffic_log
|
|
|
+
|
|
|
+ // add initial entry to page redirect log
|
|
|
+ redirects[original_url] = {
|
|
|
+ idx: 0,
|
|
|
+ url: original_url,
|
|
|
+ src: null,
|
|
|
+ type: 'Initial',
|
|
|
+ wallTime: Date.now()/1000,
|
|
|
+ frameId: page.mainFrame()._id,
|
|
|
+ requestId: null,
|
|
|
+ initiator: {type: "user"},
|
|
|
+ isMainFrame: true,
|
|
|
+ }
|
|
|
+
|
|
|
+ // DEBUGGING: record optional chrome debug trace with screenshots (heavy)
|
|
|
+ // try {
|
|
|
+ // await page.tracing.stop()
|
|
|
+ // await wait(200)
|
|
|
+ // } catch(err) {}
|
|
|
+ // try {
|
|
|
+ // await page.tracing.start({path: TRACE_PATH(page), screenshots: true});
|
|
|
+ // } catch(err) {}
|
|
|
+
|
|
|
+ let last_main_frame_url = original_url
|
|
|
+
|
|
|
+ // setup network request intercepts handler
|
|
|
+ const addCDPRequestDataListener = (eventName) => {
|
|
|
+ client.on(eventName, event => {
|
|
|
+ try {
|
|
|
+ // save any HTTP/JS redirects to redirects for saveRedirects(page) to use later on
|
|
|
+ const new_url = event.documentURL
|
|
|
+ const http_status = event.redirectResponse?.status || 0
|
|
|
+ const is_new_url = (new_url !== original_url) && !redirects[new_url]
|
|
|
+ const is_main_frame_navigation = (event.frameId == page.mainFrame()._id)
|
|
|
+ const is_http_redirect = (300 < http_status) && (http_status < 400)
|
|
|
+
|
|
|
+ if (new_url && is_new_url && (is_main_frame_navigation || is_http_redirect) && event.type == 'Document') {
|
|
|
+ const new_redirect_entry = {
|
|
|
+ url: new_url,
|
|
|
+ src: event.redirectResponse?.url || last_main_frame_url,
|
|
|
+ type: http_status || 'JS',
|
|
|
+ wallTime: Date.now()/1000,
|
|
|
+ frameId: event.frameId,
|
|
|
+ requestId: event.requestId,
|
|
|
+ initiator: event.initiator,
|
|
|
+ idx: Object.keys(redirects).length,
|
|
|
+ isMainFrame: is_main_frame_navigation,
|
|
|
+ }
|
|
|
+ redirects[new_url] = new_redirect_entry
|
|
|
+ if (is_main_frame_navigation) {
|
|
|
+ ALREADY_ARCHIVED.add(new_redirect_entry.url.slice(0, 4096)) // we're already archiving this tab as it redirects, dont create a duplicate archive for the destination
|
|
|
+ console.warn(`[➡️] NAVIGATION[${new_redirect_entry.type}]${ANSI.blue} ${last_main_frame_url} ${ANSI.reset}\n ->${ANSI.blue} ${new_redirect_entry.url} ${ANSI.reset}`)
|
|
|
+ last_main_frame_url = new_url
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ if (event.loaderId) {
|
|
|
+ traffic_log[event.loaderId] = traffic_log[event.loaderId] || {} // make sure loader is also in requests list first
|
|
|
+ // sometimes it's not in the list if we start archiving too late / after a page's initial request was already made
|
|
|
+ }
|
|
|
+
|
|
|
+ // save to traffic_log as {8BC2087A2CCEF28017099C0E10E87440: {Network.eventWillBeSent: {eventId,loaderId, request|response, ...}}
|
|
|
+ // https://stackoverflow.com/questions/47078655/missing-request-headers-in-puppeteer?noredirect=1&lq=1
|
|
|
+ traffic_log[event.requestId] = traffic_log[event.requestId] || {}
|
|
|
+ Object.assign(traffic_log[event.requestId], { [eventName]: event })
|
|
|
+
|
|
|
+ // DEBUGGING: log page visits and navigation events to console
|
|
|
+ // if (event?.response?.status) {
|
|
|
+ // // if we're expecting an HTML response, then we assume it's a page visit & log it to console
|
|
|
+ // const acceptMimeType = traffic_log[event.requestId]['Network.requestWillBeSentExtraInfo']?.headers?.accept
|
|
|
+ // if (acceptMimeType && acceptMimeType.includes('text/html')) {
|
|
|
+ // // log any HTML page responses (less noisy)
|
|
|
+ // console.log(`[>] GOT ${event.documentURL}: ${event.response.status} ${event.response.url} (${event.response.mimeType})`)
|
|
|
+ // } else {
|
|
|
+ // // log ALL responses, inclusing JS,CSS,Images,etc. (very noisy)
|
|
|
+ // // console.log(` > ${event.response.status} ${event.response.url} (${event.response.mimeType})`)
|
|
|
+ // }
|
|
|
+ // }
|
|
|
+ } catch(err) {
|
|
|
+ console.warn('[X] Error during request/response handler (startMetadataRecording.addCDPRequestDataListener)')
|
|
|
+ console.warn(err)
|
|
|
+ }
|
|
|
+ })
|
|
|
+ }
|
|
|
+ addCDPRequestDataListener('Network.requestWillBeSent')
|
|
|
+ addCDPRequestDataListener('Network.requestWillBeSentExtraInfo')
|
|
|
+ addCDPRequestDataListener('Network.responseReceived')
|
|
|
+ addCDPRequestDataListener('Network.responseReceivedExtraInfo')
|
|
|
+
|
|
|
+ // clear any existing log entries
|
|
|
+ const consolelog_info = {
|
|
|
+ TYPE: 'console',
|
|
|
+ VERSION: version,
|
|
|
+ URL: original_url,
|
|
|
+ }
|
|
|
+ await overwriteFile(CONSOLELOG_PATH(page), JSON.stringify(consolelog_info) + '\n')
|
|
|
+
|
|
|
+ // record console logs from page
|
|
|
+ const appendConsoleLog = async (line) => {
|
|
|
+ if (!line) return
|
|
|
+ console_log.push(line)
|
|
|
+ await fs.promises.appendFile(
|
|
|
+ CONSOLELOG_PATH(page),
|
|
|
+ line + '\n',
|
|
|
+ 'utf-8',
|
|
|
+ )
|
|
|
+ }
|
|
|
+
|
|
|
+ page.on('console', async(message) =>
|
|
|
+ await appendConsoleLog(`${message.type().toUpperCase()} ${message.location()} ${JSON.stringify(message.text())}`))
|
|
|
+ page.on('pageerror', async (error) =>
|
|
|
+ await appendConsoleLog(error.message || JSON.stringify(error)))
|
|
|
+ page.on('requestfailed', async (request) =>
|
|
|
+ await appendConsoleLog(`${request.failure()?.errorText} ${request.url() || JSON.stringify(request)}`))
|
|
|
+
|
|
|
+ // set puppeteer options on page
|
|
|
+ await client.send('Network.enable') // enable network tampering API
|
|
|
+ await client.send('Emulation.clearDeviceMetricsOverride'); // clear timing statistics
|
|
|
+ await client.send('Page.setDownloadBehavior', {
|
|
|
+ behavior: 'allow',
|
|
|
+ downloadPath: CHROME_DOWNLOADS_DIR,
|
|
|
+ })
|
|
|
+
|
|
|
+ // handled by stopMetadataRecording():
|
|
|
+ // page.on('close', () => {
|
|
|
+ // try {
|
|
|
+ // page.off('request')
|
|
|
+ // page.off('console')
|
|
|
+ // page.off('pageerror')
|
|
|
+ // page.off('requestfailed')
|
|
|
+ // page.setRequestInterception(false)
|
|
|
+ // } catch(err) {
|
|
|
+ // // some versions of puppeteer have had race conditions here where page is already closed by now
|
|
|
+ // console.warn('[X] Error in page close handler', err)
|
|
|
+ // }
|
|
|
+ // })
|
|
|
+
|
|
|
+ return {original_url, client, redirects, traffic_log, console_log}
|
|
|
+}
|
|
|
+
|
|
|
/**
 * Detach the event listeners and request interception that the metadata
 * recording / redirect blocking setup attached to the page.
 * Safe to call on a page that is already closing: errors are logged and
 * swallowed so teardown never crashes the archiving run.
 * @param {Page} page - the puppeteer page being torn down
 * @param {object} _page_state - unused, kept for archive-task signature symmetry
 */
async function stopMetadataRecording(page, _page_state) {
    console.log('[🪝] Stopping CDP event hooks and request interception...')
    try {
        page.off('request')
        page.off('response')
        page.off('console')
        page.off('pageerror')
        page.off('requestfailed')
        page.off('hashchange')
        // BUGFIX: await this — it returns a promise, and a fire-and-forget call
        // could reject *after* this try/catch exited, surfacing as an unhandled
        // promise rejection instead of hitting the catch below
        await page.setRequestInterception(false)
        // page.tracing.stop()
    } catch(err) {
        // some versions of puppeteer have had race conditions here where page is already closed by now
        console.warn('[X] Error in page close handler', err)
    }
}
|
|
|
+
|
|
|
+/********************** Human Behavior Emulation ******************************/
|
|
|
+
|
|
|
/**
 * Detect and solve a CAPTCHA on the page using the 2captcha-solver browser
 * extension's `.captcha-solver` widget. If the widget does not appear within
 * 5s we assume the site presented no challenge and return quietly.
 * @param {Page} page
 * @param {object} page_state - unused here
 * @param {object} [opts]
 * @param {number} [opts.timeout=90000] - max ms to wait for 2Captcha to return a solution
 */
async function solveCaptchas(page, page_state, {timeout=90_000}={}) {

    // using puppeteer-extra-plugin-recaptcha auto-solver
    // await page.solveRecaptchas()

    // using 2captcha-solver extension auto-solver
    try {
        // console.log('[🕑] Waiting for CAPTCHA to appear...')
        await page.waitForSelector('.captcha-solver', {timeout: 5_000})

        console.log('[🤖] CAPTCHA challenge found, submitting to 2Captcha for solving...')
        await page.click('.captcha-solver')

        console.log(`[🧠] Waiting up to ${timeout/1000}s for CAPTCHA to be solved...`)
        await page.waitForSelector(`.captcha-solver[data-state="solved"]`, {timeout})

        console.log('[🔓] CAPTCHA solution retrieved from 2captcha.')
    } catch(err) {
        // fixed log typo: "CATPCHA" -> "CAPTCHA"
        console.log('[☑️] No CAPTCHA challenges found, site thinks we are human.')
    }
}
|
|
|
+
|
|
|
/**
 * Wiggle the cursor around the page like a human would, then park it in the
 * middle of the viewport. Uses ghost-cursor for natural bezier-curve movement.
 * @param {Page} page
 * @param {object} page_state - unused
 * @param {object} [opts]
 * @param {number} [opts.timeout=600] - total ms to spend moving the mouse
 */
async function jiggleMouse(page, page_state, {timeout=600}={}) {
    console.log(`[🐁] Moving mouse around randomly for ${timeout/1000}s...`)

    const start_point = await getRandomPagePoint(page)
    const ghost = createCursor(page, start_point, true)

    const half_time = timeout / 2
    const viewport_center = {x: DEFAULT_VIEWPORT.width/2, y: DEFAULT_VIEWPORT.height/2}

    // wander randomly for half the budget, then glide to the center and settle
    ghost.toggleRandomMove(true)
    await wait(half_time);
    await ghost.moveTo(viewport_center);
    await wait(half_time);
    ghost.toggleRandomMove(false)
}
|
|
|
+
|
|
|
/**
 * Prevent the tab from navigating its main frame away from original_url.
 * Attaches a request-interception handler that aborts any top-level navigation
 * to a different URL; sub-frame and sub-resource requests pass through.
 * Listener/interception cleanup is handled by stopMetadataRecording().
 * @param {Page} page
 * @param {{original_url: string}} param1 - the only URL the tab is allowed to show
 */
async function blockRedirects(page, {original_url}) {
    page.on('request', req => {
        // another cooperative interception handler may have already resolved this request
        if (req.isInterceptResolutionHandled()) return;

        // if it's a top-level navigation event to a new url
        if (req.isNavigationRequest() && req.frame() === page.mainFrame() && req.url() !== original_url) {
            // BUGFIX: abort()/continue() return promises that can reject if the
            // request is already gone — catch them so they don't become
            // unhandled promise rejections
            req.abort('aborted').catch(err => console.warn('[X] Failed to abort request', err));
            // fixed log typo: "naviage" -> "navigate"
            console.warn('[🟥] Blocked page attempt to navigate to new URL', req.url())
        } else {
            req.continue().catch(err => console.warn('[X] Failed to continue request', err));
        }
    });
    // handled by stopMetadataRecording():
    // page.on('close', () => {
    //     page.off('request')
    //     page.setRequestInterception(false)
    // })
    await page.setRequestInterception(true);
}
|
|
|
+
|
|
|
/**
 * Freeze all JavaScript execution on the page by evaluating a `debugger`
 * statement in its context, which halts the renderer's JS event loop.
 * @param {Page} page
 * @param {object} _page_state - unused
 */
async function blockJSExecution(page, _page_state) {
    console.warn('[🟥] Stopping all JS execution on page...')
    const freezePage = () => {
        debugger;
    }
    await page.evaluate(freezePage)
    // OR alternatively this (more buggy, breaks many sites):
    // const html = await page.content();
    // page.setJavaScriptEnabled(false);
    // await page.setContent(html, { waitUntil: 'networkidle0' }); // 4
}
|
|
|
+
|
|
|
// Scroll the page from top to bottom in scroll_distance-px increments, pausing
// scroll_delay ms between steps so lazy-loaded / infinite-scroll content has a
// chance to load. Stops after scroll_limit steps or timeout ms of accumulated
// delay, whichever comes first. Returns the final measured page height in px.
// NOTE(review): relies on module-level wait() and on page._original_url being
// set by the caller before this runs — confirm both when reusing elsewhere.
async function scrollDown(page, _page_state, {timeout=120_000, scroll_delay=SCROLL_DELAY, scroll_distance=SCROLL_DISTANCE, scroll_limit=SCROLL_LIMIT}={}) {
    const starting_height = await page.evaluate('document.body.scrollHeight');
    let last_height = starting_height

    let scroll_count = 0;
    let scroll_position = scroll_count * scroll_distance
    // await page.bringToFront()

    // scroll to top
    await page.evaluate(() => { window.scrollTo({ top: 0, left: 0, behavior: 'smooth' }); });

    // step down the page until we hit the step limit or the time budget
    while ((scroll_count < scroll_limit) && ((scroll_delay * scroll_count) < timeout)) {
        console.log(`[⬇️] Scrolling down ${scroll_count}x 1000px... (${scroll_position}/${last_height})`)
        await page.evaluate((y_offset) => { window.scrollTo({ top: y_offset, left: 0, behavior: 'smooth' }); }, scroll_position);
        scroll_count++
        scroll_position = scroll_count * scroll_distance

        // check if any new content was added / if we are infiniscrolling
        let new_height = await page.evaluate('document.body.scrollHeight')
        const added_px = new_height - last_height
        if (added_px > 0) {
            console.log('[✚] Detected infini-scrolling...', `${last_height}+${added_px} => ${new_height}`)
        } else if (scroll_position >= new_height + scroll_distance) {
            // we've reached the bottom, condition isn't true until we've tried to go n+1 past the end (which is fine)
            if (scroll_count > 2)
                break
        }
        last_height = new_height

        // sleep 2s, perform the smooth scroll down by 1000px, and increment the counter
        await wait(scroll_delay);

        // facebook watch pages infiniscroll (more and more recommendations forever), stop them after 3 pages
        if (page._original_url.startsWith('https://www.facebook.com/watch/?v') && scroll_count > 3) break
    }

    // scroll to bottom
    // (only needed if the loop broke out before reaching the measured height)
    if (scroll_position < last_height) {
        await page.evaluate(() => { window.scrollTo({ top: document.body.scrollHeight, left: 0, behavior: 'smooth' }); });
        await wait(scroll_delay)
        await page.evaluate(() => { window.scrollTo({ top: document.body.scrollHeight, left: 0, behavior: 'smooth' }); });
    }

    // Always wait an additional 2sec at the end for scroll animations / loading / rendering to settle down
    console.log('[📉] Reached bottom of the page.', `(${scroll_position}/${last_height})`)
    await wait(scroll_delay);
    await page.evaluate(() => { window.scrollTo({ top: 0, left: 0, behavior: 'smooth' }); });
    await wait(scroll_delay);

    return last_height
}
|
|
|
+
|
|
|
/**
 * Disable all CSS animations/transitions (and hide the text caret) on the
 * current page and on any future navigations, so that screenshots and screen
 * recordings are deterministic instead of catching mid-animation frames.
 * @param {Page} page
 * @param {object} _page_state - unused
 */
async function disableAnimations(page, _page_state) {
    console.log(`[⛄️] Disabling all animations using CSS override...`)

    // https://stackoverflow.com/questions/53167644/injecting-css-into-site-with-puppeteer
    const css_override = `*, *::before, *::after {
        -moz-animation: none !important;
        -moz-transition: none !important;
        animation: none !important;
        transition: none !important;
        caret-color: transparent !important;
    }`

    // inject override into current page
    await page.addStyleTag({content: css_override});

    // inject override into any subsequently navigated pages
    await page.evaluateOnNewDocument((css_override) => {
        const style_tag = document.createElement('style')
        style_tag.type = 'text/css'
        style_tag.innerHTML = css_override
        // BUGFIX: getElementsByTagName('head')[0] is undefined on head-less
        // documents (bare XML/SVG/snapshot pages) and crashed the injection —
        // fall back to the root element instead
        const parent = document.head || document.documentElement
        if (parent) parent.appendChild(style_tag)
    }, css_override);
}
|
|
|
+
|
|
|
/**
 * Click through "load more" / "show replies" style expanders so that collapsed
 * comment threads (reddit, twitter, github, etc.) are present in the DOM before
 * the page is snapshotted. Also force-opens <details> disclosure sections.
 * @param {Page} page
 * @param {object} _page_state - unused
 * @param {object} [opts]
 * @param {number} [opts.timeout=120000] - max total ms to spend clicking expanders
 * @param {number} [opts.limit=15000]    - max number of expanders to click
 * @param {number} [opts.delay=650]      - ms to wait after each click for content to load
 */
async function expandComments(page, _page_state, {timeout=120_000, limit=15_000, delay=650}={}) {
    console.log(`[🗃️] Expanding up to ${limit} comments every ${delay}ms...`)

    // expand all <details> sections in Github READMEs, HedgeDoc pages, etc.
    // BUGFIX: $$eval passes the *array* of matched elements to the callback; the
    // old single-element callbacks set `.open` on the array itself (a no-op)
    await page.$$eval('pierce/article details', elems => elems.forEach(elem => {elem.open = true})) // expand Github README details sections
    await page.$$eval('pierce/div.js-discussion details:not(.details-overlay)', elems => elems.forEach(elem => {elem.open = true})) // expand Github issue discussion hidden comments
    await page.$$eval('pierce/.markdown-body details', elems => elems.forEach(elem => {elem.open = true})) // expand HedgeDoc Markdown details sections

    // bridge in-page hashchange events to a node-side 'hashchange' page event
    await page.exposeFunction('onHashChange', url => page.emit('hashchange', url));
    await page.evaluateOnNewDocument(() => {
        // @ts-ignore
        addEventListener('hashchange', (e) => onHashChange(location.href));
    });

    // Listen for hashchange events in node Puppeteer code.
    page.on('hashchange', url => console.log('Page tried to navigate to:', new URL(url)));


    const num_expanded = await page.evaluate(async ({timeout, limit, delay}) => {
        // BUGFIX: `ctx?` (TypeScript optional-parameter syntax) is a SyntaxError
        // in plain JS; a plain parameter with the existing `ctx || document`
        // fallback behaves identically when the argument is omitted
        function getElementsByXPath(xpath, ctx) {
            var results = [];
            var xpathResult = document.evaluate(
                xpath, // e.g. //*[text()='"+text+"']
                ctx || document,
                null,
                XPathResult.ORDERED_NODE_ITERATOR_TYPE,
                null
            );
            var node;
            while ((node = xpathResult.iterateNext()) != null) {
                results.push(node);
            }
            return results;
        }

        let num_expanded = 0
        const getLoadMoreLinks = () => [
            // find all the buttons/links to expand collapsed/hidden/lazy-loaded content
            ...document.querySelectorAll('faceplate-partial[loading=action]'), // new reddit
            ...document.querySelectorAll('a[onclick^="return morechildren"]'), // old reddit show more replies
            ...document.querySelectorAll('a[onclick^="return togglecomment"]'), // old reddit show hidden replies
            // ...document.querySelectorAll('a.js-show-link'), // stack overflow comments show more (TODO: make this only work on SO)
            // ...document.querySelectorAll('a.morelink'), // HackerNews profile show more (TODO: make this only work on HN)
            // ...getElementsByXPath("//*[text()~='View \d+ replies']"), // facebook comment expander
            ...getElementsByXPath("//*[text()='Show more replies']"), // twitter infiniscroll expander
            ...getElementsByXPath("//*[text()='Show replies']"), // twitter replies expander
        ]
        const wait = (ms) => new Promise(res => setTimeout(res, ms))

        // keep clicking until no expanders remain, or we hit the count/time budget
        let load_more_links = getLoadMoreLinks()
        while (load_more_links.length) {
            console.log('Expanding comments...', load_more_links.length)
            for (const link of load_more_links) {
                link.scrollIntoView({behavior: 'smooth'})
                if (link.slot == 'children') {
                    continue
                    // patch new reddit "More replies" links that would open in a new window to display inline instead
                    // const comment_id = link.src.split('?')[0].split('/').at(-1)
                    // link.slot = `children-${comment_id}-0`
                    // link.__alwaysShowSlot = false
                }
                // click the "More replies" button
                link.click()
                num_expanded++
                await wait(delay)
                const time_elapsed = num_expanded * delay
                if ((num_expanded > limit) || (time_elapsed > timeout))
                    return num_expanded
            }
            load_more_links = getLoadMoreLinks()
        }
        return num_expanded
    }, {timeout, limit, delay});

    page.off('hashchange')

    if (num_expanded) {
        console.log(`[🗃️] Expanded ${num_expanded} comments...`)

        // scroll to bottom, then back up to top
        const final_height = await page.evaluate('document.body.scrollHeight');
        await page.evaluate((top) => { window.scrollTo({ top, left: 0, behavior: 'smooth' }); }, final_height + 1000);
        await wait(delay);
        await page.evaluate(() => { window.scrollTo({ top: 0, left: 0, behavior: 'smooth' }); });
        await wait(delay);
    }

}
|
|
|
+
|
|
|
/**
 * If the page contains a form with a submit button, click it once, wait for
 * the resulting navigation, then navigate back. Silently does nothing when no
 * submit button appears within 1.5s.
 * @param {Page} page
 * @param {object} _page_state - unused
 * @param {object} [opts]
 * @param {number} [opts.timeout=5000] - ms to wait for the post-submit navigation
 */
async function submitForm(page, _page_state, {timeout=5_000}={}) {
    const submit_selector = 'form button[type=submit]'
    try {
        await page.waitForSelector(submit_selector, {timeout: 1_500});
        console.log('[☑️] Submitting form...')
        await page.click(submit_selector)
        await page.waitForNavigation({timeout});
        await page.goBack();
    } catch (err) {
        // no form found within the wait window: treat as a no-op
    }
}
|
|
|
+
|
|
|
+// TODO: add an evasion to set navigator.connection.rtt = 365 (0 = detectable as headless)
|
|
|
+
|
|
|
+/******************************************************************************/
|
|
|
+/******************************************************************************/
|
|
|
+
|
|
|
+/**************** Extension-Based Archive Output Tasks ************************/
|
|
|
+
|
|
|
/**
 * Save the current page as a self-contained HTML file via the SingleFile
 * browser extension. The extension writes into CHROME_DOWNLOADS_DIR under a
 * name of its own choosing, so we diff the directory listing before/after,
 * match the new file back to this page by the `url: ...` header SingleFile
 * embeds near the top of its output, and move it to SINGLEFILE_PATH(page).
 * @param {Page} page
 * @param {{main_response: HTTPResponse, extensions: object[]}} param1
 * @returns {Promise<string|null>} the final output path, or null if no match appeared
 * @throws {Error} if the singlefile extension is not installed
 */
async function saveSinglefile(page, {main_response, extensions}) {
    const extension = extensions.filter(({name}) => name === 'singlefile')[0]
    // BUGFIX: optional-chain (filter() can come up empty, which used to TypeError
    // before reaching the throw) and throw a real Error instead of a string
    if (!extension?.version) throw new Error('Could not find Singlefile extension ID, is it installed?')

    const url = await page.url() || main_response.url()
    if (URL_SCHEMES_IGNORED.includes(url.split(':')[0])) return null

    // get list of existing past files in downloads/* to ignore
    const files_before = new Set(
        (await fs.promises.readdir(CHROME_DOWNLOADS_DIR))
            .filter(fn => fn.endsWith('.html'))
    );

    const out_path = SINGLEFILE_PATH(page)

    console.log(`[🛠️] Saving Singlefile HTML using extension (${extension.id})...`.padEnd(82+1), prettyPath(CHROME_DOWNLOADS_DIR))
    await page.bringToFront() // action button acts on the foreground tab, so it has to be in front :(
    await extension.dispatchAction()
    let files_new = []

    const check_delay = 3_000
    const max_checks = 10
    // poll the downloads dir up to max_checks times
    // (was a `for...in` over a dummy 10-element array — same count, clearer intent)
    for (let attempt = 0; attempt < max_checks; attempt++) {
        await wait(check_delay)

        const files_after = (await fs.promises.readdir(CHROME_DOWNLOADS_DIR)).filter(fn => fn.endsWith('.html'));
        files_new = files_after.filter(file => !files_before.has(file))

        if (files_new.length == 0) {
            // console.warn(`   ...waiting for Singlefile to write HTML into ${CHROME_DOWNLOADS_DIR}...`)
            continue
        }
        // iterate through new downloads and find a matching .html containing our page's URL in the header
        for (const file of files_new) {
            const dl_path = path.join(CHROME_DOWNLOADS_DIR, file)
            const dl_text = await fs.promises.readFile(dl_path, 'utf-8')
            const dl_header = dl_text.split('meta charset')[0]
            if (dl_header.includes(`url: ${url}`)) {
                /// dont need this check anymore as now all output is versioned:
                // if (fs.existsSync(out_path)) {
                //     const {size: existingSize} = await fs.promises.stat(out_path)
                //     const {size: newFileSize} = await fs.promises.stat(dl_path)
                //     if (newFileSize < existingSize) {
                //         console.log(`[🗑️] Discarding singlefile output (${file}) as it's smaller than existing ${out_path}...`)
                //         await fs.promises.rm(dl_path)
                //         return out_path
                //     }
                // }
                console.log(`[✍️] Moving Singlefile download from ${file}...`.padEnd(82), prettyPath(out_path))
                await fs.promises.rename(dl_path, out_path)
                return out_path
            }
        }
    }

    console.warn(`[❌] Couldn't find matching Singlefile HTML in ${CHROME_DOWNLOADS_DIR} after waiting ${(check_delay*max_checks)/1000}s:`, files_new.join(', '))
    return null
}
|
|
|
+
|
|
|
// Capture the page into an ArchiveWeb.page (webrecorder) archive by driving the
// extension's popup UI: open the popup, click the recording status row, let it
// record for ~8s, then click its Stop button. Brittle by design — the selectors
// and pixel offsets below were recorded from a real session with the Chrome
// recorder; revisit once the extension exposes a programmatic command API
// (see the linked upstream issue). Always returns null (no output file path).
async function saveArchiveWebPage(page, {extensions}, {timeout=30_000}={}) {
    // TODO: waiting on them to expose commands so we can generate .wacz easily
    // https://github.com/webrecorder/archiveweb.page/issues/207
    // ...
    const browser = await page.browser()
    const extension = extensions.filter(({name}) => name === 'archivewebpage')[0]
    await page.bringToFront()
    await extension.dispatchPopup()
    await extension.dispatchAction()
    // wait for the extension popup page to open so we can drive its UI
    const popup = await browser.waitForTarget(
        target => target.url().toString().startsWith(`chrome-extension://${extension.id}/popup.html`),
        {timeout: 5_000},
    )
    await page.bringToFront()

    // await puppeteer.Locator.race([
    //     popup.locator('::-p-aria(Start With Autopilot)'),
    //     popup.locator('wr-popup-viewer >>>> input'),
    //     popup.locator(':scope >>> input')
    // ])
    // .setTimeout(timeout)
    // .click({
    //     offset: {
    //         x: 7.7265625,
    //         y: 7.203125,
    //     },
    // });

    // click the "Recording: ..." status row in the popup to start the capture
    // @ts-ignore
    await puppeteer.Locator.race([
        popup.locator('wr-popup-viewer >>>> div.status-row > p'),
        popup.locator(':scope >>> div.status-row > p'),
        popup.locator('::-p-text(Recording: \n)')
    ]).setTimeout(timeout).click({
        delay: 733.3000000007451,
        offset: {
            x: 293,
            y: 13.5,
        },
    })

    // give the extension time to record the page's traffic
    await wait(8_000)

    // click the popup's "Stop" button to finalize the capture
    // @ts-ignore
    await puppeteer.Locator.race([
        popup.locator('wr-popup-viewer >>>> div:nth-of-type(2) > button > span:nth-of-type(2)'),
        popup.locator(':scope >>> div:nth-of-type(2) > button > span:nth-of-type(2)'),
        popup.locator('::-p-text(Stop)')
    ]).setTimeout(timeout).click({
        offset: {
            x: 7.859375,
            y: 23.203125,
        },
    });

    return null
}
|
|
|
+
|
|
|
/**
 * Save the current tab's URL to the user's Pocket account by triggering the
 * Pocket extension's toolbar action.
 * @param {Page} page
 * @param {{extensions: object[]}} param1
 * @returns {Promise<boolean|undefined>} true when the save appears to have
 *   succeeded (no login window opened within 3s), false when Pocket is signed
 *   out (a getpocket.com window appeared)
 * @throws {Error} if the pocket extension is not installed
 */
async function savePocket(page, {extensions}) {
    const browser = await page.browser()
    const extension = extensions.filter(({name}) => name === 'pocket')[0]
    // BUGFIX: optional-chain (a missing extension entry used to TypeError before
    // reaching the throw) and throw a real Error instead of a string
    if (!extension?.version) throw new Error('Could not find Pocket extension ID, is it installed?')

    console.log(`[🛠️] Saving URL to Pocket API using extension (${extension.id})...`, 'https://getpocket.com/saves')
    await page.bringToFront() // action button acts on the foreground tab, so it has to be in front
    await extension.dispatchAction()
    try {
        const login_window = await browser.waitForTarget(
            target => target.url().toString().startsWith('https://getpocket.com/'),
            {timeout: 3_000},
        )
        // login window will open if pocket is not signed-in
        if (login_window) return false
    } catch(e) {
        // no new window should open if it saves correctly
        return true
    }
}
|
|
|
+
|
|
|
+/***************** Synchronous Archive Output Tasks ***************************/
|
|
|
+
|
|
|
/**
 * Stop the in-progress puppeteer-screen-recorder session (stored by the caller
 * in page_state.recorder), persist the .mp4, symlink it into the legacy media/
 * location, and optionally render a short preview GIF with ffmpeg.
 * @param {Page} page
 * @param {object} page_state - holds .recorder, .start_ts, .snapshot_dir
 * @param {object} [opts]
 * @param {boolean} [opts.save_gif=true] - also render a 10s screenrecording.gif
 * @returns {Promise<string|null>} the mp4 path, or null if nothing was recording
 */
async function saveScreenrecording(page, page_state, {save_gif=true}={}) {
    if (page_state.recorder) {
        const duration = Date.now() - page_state.start_ts
        console.log(`[🎥] Saving screen-recording video (${duration/1000}s)...`.padEnd(82), prettyPath(SCREENRECORDING_PATH(page)))
        const recorder = page_state.recorder
        page_state.recorder = null  // clear first so a re-entrant call can't double-stop
        await recorder.stop()

        // create symlink for legacy path
        const snap_dir = page_state.snapshot_dir
        const legacy_path = path.join(snap_dir, 'media', 'screenrecording.mp4')
        await overwriteSymlink(SCREENRECORDING_PATH(page), legacy_path, {relative: snap_dir, search_limit: snap_dir})

        // // remove duplicate frames (white frames at start while it loads + static image at end)
        // const video_path = SCREENRECORDING_PATH(page)
        // const short_path = video_path.replace('.mp4', '.short.mp4')
        // try {
        //     await exec(
        //         // create a shortened video starting from 0:02s to 0:01s with duplicate frames removed (can look jumpy sometimes)
        //         `ffmpeg -ss 2 -sseof -1 -y -i ${video_path} -vf mpdecimate,setpts=N/FRAME_RATE/TB ${short_path}`
        //     )
        // } catch(err) {
        //     console.log('[❌] Failed to shorten screenrecording.mp4')
        // }

        // convert video to GIF
        if (save_gif) {
            try {
                // BUGFIX: was hardcoded to /Volumes/NVME/Users/squash/bin/ffmpeg which
                // only exists on the original author's machine; prefer $FFMPEG_BIN and
                // fall back to whatever ffmpeg is on $PATH
                const BIN_NAME = process.env.FFMPEG_BIN || 'ffmpeg'
                const child = child_process.spawn(
                    BIN_NAME,
                    [
                        '-hide_banner',
                        '-loglevel', 'error',
                        '-ss', '3',
                        '-t', '10',
                        '-y',
                        '-i', SCREENRECORDING_PATH(page),
                        '-vf', "fps=10,scale=1024:-1:flags=bicubic,split[s0][s1];[s0]palettegen[p];[s1][p]paletteuse",
                        '-loop', '0',
                        SCREENRECORDGIF_PATH(page),
                    ],
                    {
                        cwd: path.dirname(SCREENRECORDING_PATH(page)),
                        timeout: 60_000,
                        // stdio: [null, 'pipe', 'pipe'],
                        stdio: 'ignore',
                        detached: true, // run in background, don't block on response
                    },
                )
                await blockUntilExists(SCREENRECORDGIF_PATH(page), {min_bytes: 100, timeout: 40_000})
                console.log(`[🎥] Saved screen-recording GIF with ffmpeg pid=${child.pid} (${duration/1000}s)...`.padEnd(82), prettyPath(SCREENRECORDGIF_PATH(page)))

                const snap_dir = page_state.snapshot_dir
                const legacy_path = path.join(snap_dir, 'media', 'screenrecording.gif')
                await overwriteSymlink(SCREENRECORDGIF_PATH(page), legacy_path, {relative: snap_dir, search_limit: snap_dir})
            } catch(err) {
                console.log('[❌] Failed to convert video to GIF:', err)
            }
        }

        return SCREENRECORDING_PATH(page)
    }
    return null
}
|
|
|
+
|
|
|
// Take a lossless full-page PNG screenshot plus a compressed fixed-aspect JPG
// of the top of the page, writing both next to the snapshot. The deliberate
// wait() calls and sequencing below matter: concurrent/rushed screenshots
// produce blank white images (see WARNING inline) — do not parallelize.
// Returns the PNG path.
async function saveScreenshot(page, _page_state, {aspect_ratio=SCREENSHOT_ASPECT_RATIO, width=null, height=null, jpg_width=1440, jpg_quality=90, timeout=30_000}={}) {
    // best-effort removal of any stale screenshot from a previous attempt
    try {await fs.promises.unlink(SCREENSHOT_PATH(page))} catch(err) {}

    // setup width and height
    width = width || DEFAULT_VIEWPORT.width
    assert((typeof width === 'number') && width > 200)
    height = height || Math.floor(width/aspect_ratio)
    assert((typeof height === 'number') && height > 200)

    console.log(`[📸] Saving full-page screenshot (${width}x${height}px)...`.padEnd(82), prettyPath(SCREENSHOT_PATH(page)))

    // set width, height, and deviceScale factor: https://github.com/puppeteer/puppeteer/issues/1576
    await page.setViewport({ ...DEFAULT_VIEWPORT, width, height, deviceScaleFactor: 2})
    await page.bringToFront()
    await wait(1_250) // page takes a sec settle after foregrounding and viewport update

    // take lossless fullpage screenshot of 1920x1440+px (4:3+) -> ./screenshot.png
    await page.screenshot({ path: SCREENSHOT_PATH(page), fullPage: true, type: 'png' })

    // wait for the screenshot to be created, then set the viewport to the next size
    await blockUntilExists(SCREENSHOT_PATH(page), {min_bytes: 100, timeout})
    await wait(6_000) // puppeteer takes a while to finish writing png data when fullPage: true

    const jpg_height = Math.floor(jpg_width/aspect_ratio)
    await page.setViewport({ ...DEFAULT_VIEWPORT, width: jpg_width, height: jpg_height, deviceScaleFactor: 2})
    await wait(1_250) // page takes a sec settle after foregrounding and viewport update

    // WARNING: make sure you never try to create two screenshots at the same time (especially not fullpage screenshots)
    // thats why there are all these delays here.
    // screenshot creation messes up the whole viewport while it's running,
    // and it writes bad/white empty screenshots if you try to make more than one concurrently

    // take compressed screenshot of jpg_width*jpg_height (4:3) -> ./screenshot.jpg
    await page.screenshot({
        path: SCREENSHOT_JPG_PATH(page),
        type: 'jpeg',
        quality: jpg_quality,
        clip: {
            x: 0,
            y: 0,
            width: jpg_width,
            height: jpg_height,
        },
        captureBeyondViewport: false,
    });
    await blockUntilExists(SCREENSHOT_JPG_PATH(page), {min_bytes: 100, timeout: timeout/2})
    console.log(`[📸] Saved screenshot as screenshot.jpg (${jpg_width}x${jpg_height}px)...`.padEnd(82), prettyPath(SCREENSHOT_JPG_PATH(page)))

    // reset viewport back to defaults
    await wait(1_250)
    await page.setViewport(DEFAULT_VIEWPORT)

    // ALTERNATIVE METHOD based on cropping fullpage png and converting to jpg manually:
    // import {PNG} from 'pngjs';
    // import jpeg from 'jpeg-js';
    // setTimeout(async () => {
    //     try {
    //         const screenshot_png = SCREENSHOT_PATH(page);
    //         const screenshot_jpg = SCREENSHOT_JPG_PATH(page)
    //         const jpg_max_height = height
    //         const jpg_quality = quality; // Adjust the quality as needed (0-100)

    //         fs.createReadStream(screenshot_png)
    //             .pipe(new PNG())
    //             .on('parsed', function () {
    //                 const width = this.width;
    //                 const height = this.height;

    //                 let cropped_height = height;
    //                 if (height > jpg_max_height) {
    //                     cropped_height = jpg_max_height;
    //                 }

    //                 const cropped_bytes = new Uint8Array(width * cropped_height * 4);
    //                 for (let y = 0; y < cropped_height; y++) {
    //                     for (let x = 0; x < width; x++) {
    //                         const idx = (width * y + x) << 2;
    //                         cropped_bytes[idx] = this.data[idx];
    //                         cropped_bytes[idx + 1] = this.data[idx + 1];
    //                         cropped_bytes[idx + 2] = this.data[idx + 2];
    //                         cropped_bytes[idx + 3] = this.data[idx + 3];
    //                     }
    //                 }

    //                 const jpeg_obj = {
    //                     data: cropped_bytes,
    //                     width: width,
    //                     height: cropped_height,
    //                 };

    //                 const jpeg_bytes = jpeg.encode(jpeg_obj, jpg_quality);
    //                 fs.writeFileSync(screenshot_jpg, jpeg_bytes.data);
    //                 console.log(`[📸] Saved screenshot as screenshot.jpg (${width}x${jpg_max_height}px)...`.padEnd(82), prettyPath(SCREENSHOT_JPG_PATH(page)))
    //             });
    //     } catch(err) {
    //         console.error('[X] Error while generating JPG screenshot', SCREENSHOT_JPG_PATH(page), err)
    //     }
    // }, DELAY_BEFORE_JPG_CONVERSION)

    // ALTERNATIVE METHOD TO WRITE SCREENSHOT JPG:
    // await wait(5_000) // puppeteer takes a while to finish writing png data when fullPage: true
    // if ((await page.evaluate('document.body.scrollHeight')) > max_height) {
    //     // if page exceeds max_height, save additional cropped screenshot as screenshot.top.png
    //     // (needed b.c. uncropped screenshot may have insane 1:20+ aspect ratio that is hard to use elsewhere)
    //     await page.screenshot({ path: SCREENSHOT_JPG_PATH(page), type: 'jpg', quality: 100})
    //     await wait(1_000) // page takes a sec settle after a screenshot
    // }

    return SCREENSHOT_PATH(page)
}
|
|
|
+
|
|
|
/**
 * Export the page as an A4 print-to-PDF document, streamed to disk in chunks
 * (page.createPDFStream handles larger pages than page.pdf() can without
 * crashing the renderer).
 * @param {Page} page
 * @param {object} _page_state - unused
 * @param {object} [opts]
 * @param {number} [opts.timeout=30000] - ms budget for PDF generation
 * @returns {Promise<string|null>} the output path, or null on failure / ignored scheme
 */
async function savePDF(page, _page_state, {timeout=30_000}={}) {
    const url = page.url() || 'about:blank'
    if (URL_SCHEMES_IGNORED.includes(url.split(':')[0])) return null

    const out_path = PDF_PATH(page)
    console.log(`[📓] Saving print-as-PDF export...`.padEnd(82), prettyPath(out_path))
    await page.bringToFront()
    try {await fs.promises.unlink(PDF_PATH(page))} catch(err) {}

    // await page.emulateMediaType('screen') // print as "@media(screen) instead of @media(print)"

    // page.createPDFStream lets us to save larger PDFs than page.pdf() before crashing
    // (streams to disk in chunks instead of all at once)
    const pdf_stream = await page.createPDFStream({
        timeout: timeout,
        printBackground: true,
        outline: true,
        tagged: true,
        format: 'A4',
        displayHeaderFooter: false,
        // margin: { top: '0.5cm', right: '1cm', bottom: '0.8cm', left: '1cm' },
    })
    const reader = pdf_stream.getReader()

    // iterate through reader and append chunks to out_path
    await fs.promises.rm(out_path, {force: true})
    let num_bytes = 0
    let error = '0 bytes written'
    try {
        while (true) {
            const {done, value} = await reader.read()
            if (done) break;
            await fs.promises.appendFile(out_path, value)
            num_bytes += value.length;
        }
    } catch(err) {
        // BUGFIX: the catch parameter used to be named `error`, shadowing the
        // outer `error` variable — the real failure reason was silently lost
        // and the warning below always printed '0 bytes written'
        error = err
        num_bytes = 0
    }

    if (!num_bytes) {
        console.warn('[❌] Failed to save PDF', error instanceof Error ? (error.stack || error.message) : JSON.stringify(error, null, 4))
        await fs.promises.rm(out_path, {force: true})
        return null
    }

    return out_path
}
|
|
|
+
|
|
|
/**
 * Flatten shadow-DOM subtrees into plain inline HTML so that saved snapshots
 * (singlefile, pdf, etc.) include web-component content. Best-effort: any
 * failure is logged and ignored.
 * @param {Page} page
 * @param {object} _page_state - unused
 * @param {object} [opts]
 * @param {number} [opts.limit=100000] - cap on recursive replacement passes
 */
async function inlineShadowDOM(page, _page_state, {limit=100_000}={}) {
    console.log(`[😎] Replacing Shadow DOM elements with inline HTML...`)

    try {
        const num_replaced = await page.evaluate((limit) => {
            let num_replaced = 0

            // Returns HTML of given shadow DOM.
            const getShadowDomHtml = (shadowRoot) => {
                let html = ''
                for (const node of shadowRoot.childNodes) {
                    html += node.nodeValue || node.outerHTML
                }
                return html
            }

            // Recursively replaces shadow DOMs with their HTML.
            const replaceShadowDomsWithHtml = (rootElement) => {
                if (num_replaced > limit) return
                for (const el of rootElement.querySelectorAll('*')) {
                    if (!el.shadowRoot) continue
                    replaceShadowDomsWithHtml(el.shadowRoot)
                    el.innerHTML += getShadowDomHtml(el.shadowRoot)
                }
                num_replaced++
            }

            replaceShadowDomsWithHtml(document.body)

            return num_replaced
        }, limit)
        // console.log(' √ replaced', num_replaced, 'Shadow DOM trees')
    } catch(err) {
        console.log('[⚠️] Inlining Shadow DOM failed', err)
    }
}
|
|
|
+
|
|
|
/**
 * Run the companion ai_qa.py script to have GPT-4o inspect the page screenshot
 * for archiving-quality problems, and persist the JSON verdict to AIQA_PATH.
 * Prefers the cropped screenshot.jpg when present (the full-page PNG can be
 * arbitrarily tall and exceed the model's input limits).
 * @param {Page} page
 * @param {{original_url: string, version: string}} param1 - metadata stamped into the record
 * @returns {Promise<object|null>} the parsed QA result, or null on failure
 */
async function saveAIQualityAssuranceResult(page, {original_url, version}) {
    console.log(`[🧠] Analyzing screenshot with GPT-4o for QA checks...`.padEnd(82), prettyPath(AIQA_PATH(page)))

    let screenshot_path = SCREENSHOT_PATH(page)
    const screenshot_cropped_path = SCREENSHOT_JPG_PATH(page)

    if (fs.existsSync(screenshot_cropped_path)) {
        // screenshot is too tall to pass to openai, send cropped version instead
        screenshot_path = screenshot_cropped_path
    }
    try {
        await blockUntilExists(screenshot_path, {min_bytes: 100, timeout: 7_500})
    } catch (err) {
        // fixed log typo: "GTP-4o" -> "GPT-4o"
        console.warn('[❌] Failed to send screenshot to GPT-4o for analysis, no screenshot.{png,jpg} exists', err)
        return null
    }
    let stdout = ''
    let stderr = ''
    let result = null
    // BUGFIX: __dirname does not exist in ES modules (this file uses `import`
    // syntax), so derive the script directory from import.meta.url instead
    const script_dir = path.dirname(new URL(import.meta.url).pathname)
    const PYTHON_BIN = path.join(script_dir, '.venv/bin/python')
    const SCRIPT_PATH = path.join(script_dir, 'ai_qa.py')
    await blockUntilExists(PYTHON_BIN, {min_bytes: 1, timeout: 250})
    await blockUntilExists(SCRIPT_PATH, {min_bytes: 1, timeout: 250})

    try {
        // NOTE(review): screenshot_path is single-quoted but not shell-escaped;
        // fine for our own generated paths, but would be injectable for
        // arbitrary input — consider child_process.execFile with an args array
        ;({stdout, stderr} = await exec(
            `${PYTHON_BIN} ${SCRIPT_PATH} --attach '${screenshot_path}'`
        ))
        result = JSON.parse(stdout.toString())
        if (!result) throw new Error('Got empty result!')
        result = {
            TYPE: 'aiqa',
            VERSION: version,
            URL: original_url,
            ...result,
        }
    } catch(parse_err) {
        console.warn('[❌] Failed to get OpenAI analysis for screenshot.png', parse_err, stderr)
    }
    if (!(result || stdout)) {
        return null
    }
    // BUGFIX: serialize the result — passing the raw object would write
    // "[object Object]" (the other *_info records in this file are persisted
    // via JSON.stringify too)
    await overwriteFile(
        AIQA_PATH(page),
        result ? JSON.stringify(result, null, 4) : stdout.toString(),
    )

    return result
}
|
|
|
+
|
|
|
+async function saveYTDLP(page, {original_url, version}, {max_size='750m'}={}) {
|
|
|
+ console.log(`[🎥] Saving media with YT-DLP (<=${max_size})...`.padEnd(82), prettyPath(YTDLP_PATH(page)))
|
|
|
+
|
|
|
+ await fs.promises.mkdir(YTDLP_PATH(page), {recursive: true})
|
|
|
+
|
|
|
+ const cwd = YTDLP_PATH(page)
|
|
|
+ const bin_name = 'yt-dlp'
|
|
|
+ const timeout = 300_000 // 5min timeout
|
|
|
+ const args = [
|
|
|
+ '--restrict-filenames',
|
|
|
+ '--trim-filenames', '128',
|
|
|
+ '--write-description',
|
|
|
+ '--write-info-json',
|
|
|
+ '--write-annotations',
|
|
|
+ '--write-thumbnail',
|
|
|
+ '--no-call-home',
|
|
|
+ '--write-sub',
|
|
|
+ '--write-auto-subs',
|
|
|
+ '--convert-subs=srt',
|
|
|
+ '--yes-playlist',
|
|
|
+ '--continue',
|
|
|
+ '--no-abort-on-error',
|
|
|
+ '--ignore-errors',
|
|
|
+ '--geo-bypass',
|
|
|
+ '--add-metadata',
|
|
|
+ `--format=(bv*+ba/b)[filesize<=${max_size}][filesize_approx<=?${max_size}]/(bv*+ba/b)`,
|
|
|
+ '--no-check-certificate',
|
|
|
+ '--no-progress',
|
|
|
+ // `--cookies=${COOKIES_TXT_PATH}`, // using logged in cookies actually makes it fail more often, not sure why
|
|
|
+ original_url,
|
|
|
+ ]
|
|
|
+
|
|
|
+ const {getResult, ...exec_info} = await saveExecResult(bin_name, args, {original_url, version}, {cwd, timeout})
|
|
|
+
|
|
|
+ return {getResult, ...exec_info}
|
|
|
+}
|
|
|
+
|
|
|
+async function saveGALLERYDL(page, {original_url, version}) {
|
|
|
+ console.log(`[🎥] Saving photos with gallery-dl...`.padEnd(82), prettyPath(GALLERYDL_PATH(page)))
|
|
|
+
|
|
|
+ await fs.promises.mkdir(GALLERYDL_PATH(page), {recursive: true})
|
|
|
+
|
|
|
+ const cwd = GALLERYDL_PATH(page)
|
|
|
+ const bin_name = 'gallery-dl'
|
|
|
+ const timeout = 300_000 // 5min timeout
|
|
|
+ const args = [
|
|
|
+ '--verbose',
|
|
|
+ '--write-metadata',
|
|
|
+ '--write-infojson',
|
|
|
+ '--write-tags',
|
|
|
+ '--sleep=1.5-2.5',
|
|
|
+ `--cookies=${COOKIES_TXT_PATH}`,
|
|
|
+ // '--no-check-certificate',
|
|
|
+ // `--directory=media`,
|
|
|
+ original_url,
|
|
|
+ ]
|
|
|
+
|
|
|
+ const {getResult, ...exec_info} = await saveExecResult(bin_name, args, {original_url, version}, {cwd, timeout})
|
|
|
+
|
|
|
+ return {getResult, ...exec_info}
|
|
|
+}
|
|
|
+
|
|
|
+// async function saveWget(page, {original_url, version}) {
|
|
|
+// console.log(`[⎒] Saving wget site clone...`.padEnd(82), prettyPath(WGET_PATH(page)))
|
|
|
+
|
|
|
+// const args = [
|
|
|
+// // ...
|
|
|
+// ]
|
|
|
+
|
|
|
+// spawn(
|
|
|
+// 'wget',
|
|
|
+// [
|
|
|
+// ...args,
|
|
|
+// original_url,
|
|
|
+// ],
|
|
|
+// {
|
|
|
+// cwd: WGET_PATH(page),
|
|
|
+// detached: true, // run in background, don't block on response
|
|
|
+// stdio: 'ignore',
|
|
|
+// timeout: 300_000, // 5min timeout
|
|
|
+// },
|
|
|
+// )
|
|
|
+
|
|
|
+// return {path: WGET_PATH(page)}
|
|
|
+// }
|
|
|
+
|
|
|
+/**************** Asynchronous Archive Output Tasks ***************************/
|
|
|
+
|
|
|
// One potential favicon source to attempt downloading (candidates with a
// falsy url are filtered out by the caller before downloading).
type FaviconCandidate = {
    url: string,                // absolute URL to try fetching
    basename: string,           // output filename without extension
    extension: string,          // output extension, or undefined to auto-detect at download time
    expected_mimetype: string,  // content-type (or prefix, e.g. 'image/') required for the download to count
}
|
|
|
+
|
|
|
+const faviconFromDomain = (url) => {
|
|
|
+ // https://auth:[email protected]:1234/a/bc123 -> https://auth:[email protected]:1234/favicon.ico
|
|
|
+ const url_origin = (new URL(url)).origin
|
|
|
+ return {
|
|
|
+ url: url_origin ? `${url_origin}/favicon.ico` : null,
|
|
|
+ basename: 'favicon',
|
|
|
+ extension: undefined, // auto-detect extension at download time in case it redirects us to a png
|
|
|
+ expected_mimetype: 'image/', // only accept image/* to avoid saving html/txt error reponses as icon
|
|
|
+ } as FaviconCandidate
|
|
|
+}
|
|
|
+
|
|
|
+const faviconFromGoogle = (url, size=256) => {
|
|
|
+ // https://auth:[email protected]:1234/a/bc123 -> https://www.google.com/s2.favicons?domain=t.co
|
|
|
+ const domain = url && (new URL(url)).hostname
|
|
|
+ return {
|
|
|
+ url: domain?.includes('.') ? `https://www.google.com/s2/favicons?sz=${size},domain=${domain}` : null,
|
|
|
+ basename: 'google_favicon',
|
|
|
+ extension: 'png',
|
|
|
+ expected_mimetype: 'image/png', // google always provides PNGs in response
|
|
|
+ } as FaviconCandidate
|
|
|
+}
|
|
|
+
|
|
|
+const faviconFromHtml = async (page) => {
|
|
|
+ // <link rel="icon" src="https://example.com/static/images/favicon.png"/> -> https://example.com/static/images/favicon.png
|
|
|
+ let url
|
|
|
+ try {
|
|
|
+ url = await page.$eval('link[rel*="icon"]', (elem) => elem?.href)
|
|
|
+ if (!url || !url.includes('://'))
|
|
|
+ url = null
|
|
|
+ } catch(err) {
|
|
|
+ url = null
|
|
|
+ // console.warn('Failed to find favicon tag in html', JSON.stringify(err, null, 4))
|
|
|
+ }
|
|
|
+
|
|
|
+ return {
|
|
|
+ url,
|
|
|
+ basename: 'favicon',
|
|
|
+ extension: undefined, // auto-detect extension at download time
|
|
|
+ expected_mimetype: 'image/', // accept any image/* mimetype at download time
|
|
|
+ } as FaviconCandidate
|
|
|
+}
|
|
|
+
|
|
|
// Outcome of one favicon download attempt; num_bytes of 0 means the attempt failed
// and the optional file fields are absent.
type FaviconResult = {
    url: string,        // source URL the favicon was fetched from
    num_bytes: number,  // downloaded size (parsed from content-length or reported by the downloader)
    abspath?: string,   // absolute path of the saved icon file
    dir?: string,       // directory the icon was saved into
    filename?: string,  // basename of the saved icon file
    mimeType?: string,  // content-type reported by the server
}
|
|
|
+
|
|
|
+async function saveFavicon(page, {original_url, main_response, version}) {
|
|
|
+ const dir = path.dirname(FAVICON_PATH(page))
|
|
|
+ const response_url = main_response?.url()
|
|
|
+
|
|
|
+ const favicon_downloads_to_try: {[key: string]: FaviconCandidate} = unique([
|
|
|
+ await faviconFromHtml(page),
|
|
|
+ faviconFromDomain(response_url),
|
|
|
+ faviconFromDomain(original_url),
|
|
|
+ faviconFromGoogle(response_url),
|
|
|
+ faviconFromGoogle(original_url),
|
|
|
+ ].filter(({url}) => url), 'url')
|
|
|
+
|
|
|
+ const browser = await page.browser()
|
|
|
+
|
|
|
+ // let logs = []
|
|
|
+ // let errors = []
|
|
|
+ let output_files: {[key: string]: FaviconResult} = {}
|
|
|
+
|
|
|
+ for (const download_options of Object.values(favicon_downloads_to_try)) {
|
|
|
+ let result: FaviconResult = {num_bytes: 0, url: download_options.url}
|
|
|
+ // {url, num_bytes, abspath, dir, filename, basename, extension, mimeType}
|
|
|
+ try {
|
|
|
+ // try getting it with node-fetch first
|
|
|
+ const response = await fetch(download_options.url) as Response
|
|
|
+ const file_options = await detectFilename({...download_options, response, dir})
|
|
|
+ if (response.headers.get("content-length")) {
|
|
|
+ const favicon_stream = Readable.fromWeb(response.body as any)
|
|
|
+ await overwriteFile(file_options.abspath, favicon_stream)
|
|
|
+ result = {
|
|
|
+ ...file_options,
|
|
|
+ num_bytes: parseInt(response.headers.get("content-length") || '0'),
|
|
|
+ mimeType: response.headers.get("content-type"),
|
|
|
+ }
|
|
|
+ } else {
|
|
|
+ throw 'Failed to download favicon with fetch()'
|
|
|
+ }
|
|
|
+ } catch(err) {
|
|
|
+ // console.warn('[!] Failed to get favicon with node-fetch', err)
|
|
|
+ // fallback to getting it by opening a new browser tab
|
|
|
+ result = await download({...download_options, browser, dir, page})
|
|
|
+ }
|
|
|
+
|
|
|
+ // logs.push(...(result.logs || []))
|
|
|
+ // errors.push(...(result.errors || []))
|
|
|
+
|
|
|
+ if (result.num_bytes) {
|
|
|
+ console.log(`[🌠] Saving page favicon (${result.url.substring(0, 35)}... ${result.mimeType})...`.padEnd(82), prettyPath(result.abspath))
|
|
|
+ output_files[result.filename] = result
|
|
|
+ break // break here stops after the first successful download, comment out to keep going instead
|
|
|
+ }
|
|
|
+ }
|
|
|
+ const output_file = Object.values(output_files).sort(file => file.num_bytes).at(-1)
|
|
|
+ const favicon_info = {
|
|
|
+ TYPE: 'favicon',
|
|
|
+ VERSION: version,
|
|
|
+ URL: original_url,
|
|
|
+ succeeded: !!output_file,
|
|
|
+ // stdout: JSON.stringify(logs),
|
|
|
+ // stderr: JSON.stringify(errors),
|
|
|
+ favicon_url: output_file?.url,
|
|
|
+ favicon_urls: Object.keys(favicon_downloads_to_try),
|
|
|
+ favicon_files: Object.keys(output_files).map(fname => fname.replace(dir, '.')),
|
|
|
+ favicon_filename: output_file?.filename,
|
|
|
+ favicon_num_bytes: output_file?.num_bytes,
|
|
|
+ }
|
|
|
+ await overwriteFile(FAVICON_PATH(page), favicon_info)
|
|
|
+
|
|
|
+ return favicon_info
|
|
|
+}
|
|
|
+
|
|
|
+async function saveTitle(page, {original_url, version}) {
|
|
|
+ const title_from_browser = (await page.title()) || null
|
|
|
+ const title_from_js = await page.evaluate(() => document?.title || null)
|
|
|
+ const title_from_html = await page.evaluate(() => document?.querySelector('title')?.innerText || null)
|
|
|
+ const title_from_og = await page.evaluate(() => document?.querySelector('meta[property="og:title"]')?.getAttribute('content') || null)
|
|
|
+
|
|
|
+ // best guess at best title = longest title
|
|
|
+ const title = ([title_from_html, title_from_og, title_from_js, title_from_browser]
|
|
|
+ .filter(title => title)
|
|
|
+ .sort((a, b) => b.length - a.length)[0] || '')
|
|
|
+ .replaceAll('\n', ' ')
|
|
|
+
|
|
|
+ if (title?.length) {
|
|
|
+ console.log(`[📗] Saving page title (${title.substring(0, 40)})...`.padEnd(82), prettyPath(TITLE_PATH(page)))
|
|
|
+ await overwriteFile(TITLE_PATH(page), title)
|
|
|
+ }
|
|
|
+
|
|
|
+ const title_info = {
|
|
|
+ TYPE: 'title',
|
|
|
+ VERSION: version,
|
|
|
+ URL: original_url,
|
|
|
+ title,
|
|
|
+ title_from_html,
|
|
|
+ title_from_og,
|
|
|
+ title_from_js,
|
|
|
+ title_from_browser,
|
|
|
+ }
|
|
|
+ const title_json_path = TITLE_PATH(page).replace('.txt', '.json')
|
|
|
+ await overwriteFile(title_json_path, title_info)
|
|
|
+
|
|
|
+ return title_info
|
|
|
+}
|
|
|
+
|
|
|
// Persist the main HTTP response body verbatim (the pre-render bytes) into
// RAW_PATH, with the filename/extension derived from the response metadata.
// Returns the absolute path of the saved file.
async function saveRaw(page, {main_response}) {
    const response = main_response
    if (!response) {
        // NOTE(review): this only warns — detectFilename/download below still run
        // with a null response; confirm they tolerate it, or consider returning early.
        console.warn('[⚠️] Failed to save page RAW bytes, main_response is null', response)
    }
    const dir = RAW_PATH(page)
    await fs.promises.mkdir(dir, {recursive: true})

    // derive the output filename + mimetype from the response headers/URL
    const {url, abspath, mimeType} = await detectFilename({page, response, dir})

    console.log(`[🔟] Saving raw response bytes (${mimeType})...`.padEnd(82), prettyPath(abspath))

    await download({page, response, abspath})
    return abspath
}
|
|
|
+
|
|
|
+async function saveSourceMaps(page, {original_url, version}) {
|
|
|
+ console.log(`[🐛] Saving source maps to ./responses/all/*.{js,css}.map...`)
|
|
|
+
|
|
|
+ const response_index_path = path.join(RESPONSES_PATH(page), 'index.jsonl')
|
|
|
+ const response_index = await fs.promises.readFile(response_index_path, 'utf-8')
|
|
|
+
|
|
|
+ const urls_to_download = []
|
|
|
+
|
|
|
+ for (const response of response_index.split('\n')) {
|
|
|
+ try {
|
|
|
+ const {url, extension} = JSON.parse(response)
|
|
|
+ if (['css', 'js'].includes(extension?.toLowerCase())) {
|
|
|
+ urls_to_download.push(url + '.map')
|
|
|
+ }
|
|
|
+ } catch(err) { continue }
|
|
|
+ }
|
|
|
+
|
|
|
+ // TODO: fix this, it needs to both after stopSavingMetadata and before stopSavingMetadata
|
|
|
+ // fix is to use traffic_log to get response url list instead of waiting for index.jsonl to be created
|
|
|
+ await page.evaluate(async (urls_to_download) => {
|
|
|
+ const promises = []
|
|
|
+ for (const sourcemap_url in urls_to_download) {
|
|
|
+ promises.push(fetch(sourcemap_url))
|
|
|
+ }
|
|
|
+ return Promise.allSettled(promises)
|
|
|
+ }, urls_to_download)
|
|
|
+
|
|
|
+ return {
|
|
|
+ TYPE: 'sourcemaps',
|
|
|
+ URL: original_url,
|
|
|
+ VERSION: version,
|
|
|
+ sourcemaps: urls_to_download,
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+async function saveRequests(page, {original_url, version, traffic_log}) {
|
|
|
+ console.log(`[📼] Saving requests log (${Object.keys(traffic_log).length})...`.padEnd(82), prettyPath(REQUESTS_PATH(page)))
|
|
|
+
|
|
|
+ const requests_info = {
|
|
|
+ TYPE: 'requests',
|
|
|
+ VERSION: version,
|
|
|
+ URL: original_url,
|
|
|
+ requests: traffic_log,
|
|
|
+ }
|
|
|
+
|
|
|
+ await overwriteFile(REQUESTS_PATH(page), requests_info)
|
|
|
+
|
|
|
+ return requests_info
|
|
|
+}
|
|
|
+
|
|
|
+async function saveRedirects(page, {original_url, main_response, traffic_log, redirects, version}) {
|
|
|
+ const main_request_id = Object.keys(traffic_log).filter(id => !id.includes('.'))[0]
|
|
|
+ const main_response_traffic = traffic_log[main_request_id] || {}
|
|
|
+
|
|
|
+ const url_from_browser = await page.url() || null
|
|
|
+ const url_from_request = (
|
|
|
+ main_response?.request()?.url()
|
|
|
+ || main_response_traffic['Network.requestWillBeSent']?.request?.url
|
|
|
+ || null)
|
|
|
+ const url_from_response = (
|
|
|
+ main_response?.url()
|
|
|
+ || main_response_traffic['Network.responseReceived']?.main_response?.url
|
|
|
+ || null)
|
|
|
+
|
|
|
+ const http_redirects =
|
|
|
+ Object.values(traffic_log)
|
|
|
+ .filter(event => event['Network.requestWillBeSent']?.redirectResponse)
|
|
|
+ .map(event => event['Network.requestWillBeSent'])
|
|
|
+ .map(requestWillBeSent => ({
|
|
|
+ url: requestWillBeSent.request.url,
|
|
|
+ src: requestWillBeSent.redirectResponse.url,
|
|
|
+ status: requestWillBeSent.redirectResponse.status,
|
|
|
+ loaderId: requestWillBeSent.loaderId,
|
|
|
+ requestId: requestWillBeSent.requestId,
|
|
|
+ wallTime: requestWillBeSent.wallTime,
|
|
|
+ initiator: requestWillBeSent.initiator,
|
|
|
+ isMainFrame: (requestWillBeSent.loaderId == main_request_id),
|
|
|
+ }))
|
|
|
+
|
|
|
+ const url_parsed = new URL(url_from_response || url_from_request || url_from_browser)
|
|
|
+
|
|
|
+ const redirects_info = {
|
|
|
+ TYPE: 'redirects',
|
|
|
+ VERSION: version,
|
|
|
+ URL: original_url,
|
|
|
+ url_parsed,
|
|
|
+ url_from_request,
|
|
|
+ url_from_response,
|
|
|
+ url_from_browser,
|
|
|
+ redirects_from_browser: redirects,
|
|
|
+ redirects_from_http: http_redirects,
|
|
|
+ }
|
|
|
+ console.log(`[🔗] Saving page redirects log (${http_redirects.length})...`.padEnd(82), prettyPath(REDIRECTS_PATH(page)))
|
|
|
+
|
|
|
+ await overwriteFile(REDIRECTS_PATH(page), redirects_info)
|
|
|
+
|
|
|
+ return redirects_info
|
|
|
+}
|
|
|
+
|
|
|
+async function saveHeaders(page, {original_url, version, traffic_log}) {
|
|
|
+ const main_request_id = Object.keys(traffic_log).filter(id => !id.includes('.'))[0]
|
|
|
+ const main_response_traffic = traffic_log[main_request_id] || {}
|
|
|
+
|
|
|
+ // combine base request with browser-added request headers
|
|
|
+ const request = {...main_response_traffic['Network.requestWillBeSent']?.request}
|
|
|
+ const request_extra_headers = main_response_traffic['Network.requestWillBeSentExtraInfo']?.headers || {}
|
|
|
+ request.headers = {...request.headers, ...request_extra_headers}
|
|
|
+
|
|
|
+ // combine base response with browser-added response headers
|
|
|
+ const response = {...main_response_traffic['Network.responseReceived']?.response}
|
|
|
+ const response_extra_headers = main_response_traffic['Network.responseReceivedExtraInfo']?.headers || {}
|
|
|
+ response.headers = {...response.headers, ...response_extra_headers}
|
|
|
+
|
|
|
+ const headers_info = {
|
|
|
+ TYPE: 'headers',
|
|
|
+ VERSION: version,
|
|
|
+ URL: original_url,
|
|
|
+ request,
|
|
|
+ response,
|
|
|
+ }
|
|
|
+
|
|
|
+ const num_headers = Object.keys({...request.headers, ...response.headers}).length
|
|
|
+ if (num_headers) {
|
|
|
+ console.log(`[👾] Saving main request & response headers (${num_headers})...`.padEnd(82), prettyPath(HEADERS_PATH(page)))
|
|
|
+ await overwriteFile(HEADERS_PATH(page), headers_info)
|
|
|
+ }
|
|
|
+
|
|
|
+ return headers_info
|
|
|
+}
|
|
|
+
|
|
|
+async function saveSSL(page, {original_url, version, traffic_log}) {
|
|
|
+ const main_request_id = Object.keys(traffic_log).filter(id => !id.includes('.'))[0]
|
|
|
+ const main_response_traffic = traffic_log[main_request_id] || {}
|
|
|
+
|
|
|
+ const relevant_response_keys = [
|
|
|
+ 'url',
|
|
|
+ 'status',
|
|
|
+ 'mimeType',
|
|
|
+ 'connectionReused',
|
|
|
+ 'remoteIPAddress',
|
|
|
+ 'remotePort',
|
|
|
+ 'fromServiceWorker',
|
|
|
+ 'encodedDataLength',
|
|
|
+ 'protocol',
|
|
|
+ 'alternateProtocolUsage',
|
|
|
+ 'securityState',
|
|
|
+ 'securityDetails',
|
|
|
+ ]
|
|
|
+ let ssl_info = Object.entries(main_response_traffic['Network.responseReceived']?.response || {})
|
|
|
+ .reduce((obj, [key, val]) => {
|
|
|
+ if (relevant_response_keys.includes(key)) {
|
|
|
+ obj[key] = val
|
|
|
+ }
|
|
|
+ return obj
|
|
|
+ }, {}) as any
|
|
|
+
|
|
|
+ // TODO: parse SSL certificate sha256 hash from chrome://system/#chrome_root_store
|
|
|
+ // const ssl_certificate = await client.send('Network.getCertificate', {origin: original_url})
|
|
|
+ // ssl_info.sslCertSha256 = '<unknown>'
|
|
|
+
|
|
|
+ ssl_info = {
|
|
|
+ TYPE: 'ssl',
|
|
|
+ VERSION: version,
|
|
|
+ URL: original_url,
|
|
|
+ ...ssl_info,
|
|
|
+ }
|
|
|
+
|
|
|
+ if (Object.keys(ssl_info).length-3) {
|
|
|
+ console.log(`[🔏] Saving page SSL details (${ssl_info?.securityDetails?.protocol})...`.padEnd(82), prettyPath(SSL_PATH(page)))
|
|
|
+ await overwriteFile(SSL_PATH(page), ssl_info)
|
|
|
+ }
|
|
|
+
|
|
|
+ return ssl_info
|
|
|
+}
|
|
|
+
|
|
|
+
|
|
|
+async function saveDOM(page, {original_url, version}) {
|
|
|
+ const html = await page.content();
|
|
|
+ console.log(`[📖] Saving DOM dump (${html.length})...`.padEnd(82), prettyPath(DOM_PATH(page)))
|
|
|
+ const html_with_header =
|
|
|
+ `<!-- Saved by ArchiveBox TYPE=dom VERSION=${version} URL=${original_url} -->\n${html}`
|
|
|
+ await overwriteFile(DOM_PATH(page), html_with_header)
|
|
|
+ return DOM_PATH(page)
|
|
|
+}
|
|
|
+
|
|
|
+async function saveBodyText(page, _page_state) {
|
|
|
+ const innerText = await page.evaluate(() => document?.body?.innerText);
|
|
|
+
|
|
|
+ if (innerText?.length) {
|
|
|
+ console.log(`[📃] Saving body text (${innerText.length})...`.padEnd(82), prettyPath(BODYTEXT_PATH(page)))
|
|
|
+ await overwriteFile(BODYTEXT_PATH(page), innerText)
|
|
|
+ }
|
|
|
+
|
|
|
+ // // alternative method: emulate Ctrl+A, Ctrl+C (sometimes gets more than body.innerText)
|
|
|
+ // const innerText = await page.$eval('*', (el) => {
|
|
|
+ // const selection = window.getSelection();
|
|
|
+ // const range = document.createRange();
|
|
|
+ // range.selectNode(el);
|
|
|
+ // selection.removeAllRanges();
|
|
|
+ // selection.addRange(range);
|
|
|
+ // return window.getSelection().toString();
|
|
|
+ // });
|
|
|
+
|
|
|
+ return innerText
|
|
|
+}
|
|
|
+
|
|
|
+async function savePandoc(page, { original_url, version }) {
|
|
|
+ console.log(`[📒] Converting DOM HTML to markdown with Pandoc...`.padEnd(82), prettyPath(PANDOC_PATH(page)))
|
|
|
+
|
|
|
+ let dom_paths = [DOM_PATH(page), SINGLEFILE_PATH(page)].filter(fs.existsSync)
|
|
|
+ if (!dom_paths) return null
|
|
|
+ const dom_path = dom_paths[0]
|
|
|
+
|
|
|
+ var stdout: string = ''
|
|
|
+ var stderr: string = ''
|
|
|
+ let result: any = null
|
|
|
+ const BIN_NAME = 'pandoc'
|
|
|
+ // pandoc --from html --to markdown_github --citeproc --wrap=none --highlight-style=kate
|
|
|
+ const args = [
|
|
|
+ BIN_NAME,
|
|
|
+ '--from=html',
|
|
|
+ '--to=markdown_github',
|
|
|
+ '--wrap=none',
|
|
|
+ '--citeproc',
|
|
|
+ '--highlight-style=kate',
|
|
|
+ `--output='${PANDOC_PATH(page)}'`,
|
|
|
+ dom_path,
|
|
|
+ ]
|
|
|
+ try {
|
|
|
+ ;({ stdout, stderr } = await exec(args.join(' ')));
|
|
|
+ stdout = stdout.toString().trim()
|
|
|
+ if (!stdout) throw 'Got empty result!'
|
|
|
+ result = {
|
|
|
+ TYPE: 'pandoc',
|
|
|
+ VERSION: version,
|
|
|
+ URL: original_url,
|
|
|
+ cmd: args,
|
|
|
+ markdown_file: PANDOC_PATH(page),
|
|
|
+ }
|
|
|
+ } catch (parse_err) {
|
|
|
+ console.warn('[❌] Failed to run Pandoc HTML to MD conversion', parse_err, stderr)
|
|
|
+ }
|
|
|
+ if (!stdout) {return null}
|
|
|
+ await overwriteFile(
|
|
|
+ PANDOC_PATH(page),
|
|
|
+ stdout,
|
|
|
+ )
|
|
|
+
|
|
|
+ // pandoc --from markdown_github --to html --citeproc --wrap=none --highlight-style=kate
|
|
|
+ const reverse_conversion_args = [
|
|
|
+ BIN_NAME,
|
|
|
+ '--from=markdown_github',
|
|
|
+ '--to=html',
|
|
|
+ '--wrap=none',
|
|
|
+ '--citeproc',
|
|
|
+ '--highlight-style=kate',
|
|
|
+ `--output='${PANDOC_PATH(page).replace('.md', '.html')}'`,
|
|
|
+ PANDOC_PATH(page),
|
|
|
+ ]
|
|
|
+ try {
|
|
|
+ ; ({ stdout, stderr } = await exec(reverse_conversion_args.join(' ')));
|
|
|
+ stdout = stdout.toString().trim()
|
|
|
+ if (!stdout) throw 'Got empty result!'
|
|
|
+ result = {
|
|
|
+ ...result,
|
|
|
+ html_file: PANDOC_PATH(page).replace('.md', '.html'),
|
|
|
+ }
|
|
|
+ } catch (parse_err) {
|
|
|
+ console.warn('[❌] Failed to run Pandoc MD to HTML conversion', parse_err, stderr)
|
|
|
+ }
|
|
|
+ if (!result) { return null }
|
|
|
+ await overwriteFile(
|
|
|
+ PANDOC_PATH(page).replace('.md', '.html'),
|
|
|
+ result,
|
|
|
+ )
|
|
|
+
|
|
|
+ return result
|
|
|
+}
|
|
|
+
|
|
|
+async function saveReadability(page, {original_url, version}) {
|
|
|
+ const url = await page.url()
|
|
|
+ let html = ''
|
|
|
+ let article = null
|
|
|
+ try {
|
|
|
+ html = await page.content()
|
|
|
+ if (html.length > 14_000_000) {
|
|
|
+ console.warn('[⚠️] Truncating readability article text because html is too long...', html.length)
|
|
|
+ html = html.substring(0, 13_900_000)
|
|
|
+ }
|
|
|
+ const virtualConsole = new VirtualConsole()
|
|
|
+ const dom = new JSDOM(html, {url, virtualConsole})
|
|
|
+ const reader = new Readability(dom.window.document);
|
|
|
+ article = reader.parse()
|
|
|
+ } catch(err) {
|
|
|
+ console.warn(`[❌] Failed to get readability article text`)
|
|
|
+ return null
|
|
|
+ }
|
|
|
+ if (article) {
|
|
|
+ console.log(`[📜] Saving readability article text (${article.textContent?.length})...`.padEnd(82), prettyPath(READABILITY_PATH(page)))
|
|
|
+ const {content, textContent, ...metadata} = article
|
|
|
+ if (content.trim()) {
|
|
|
+ await overwriteFile(READABILITY_PATH(page).replace('.json', '.html'), content);
|
|
|
+ }
|
|
|
+ if (textContent.trim()) {
|
|
|
+ await overwriteFile(READABILITY_PATH(page).replace('.json', '.txt'), textContent);
|
|
|
+ }
|
|
|
+ const readability_info = {
|
|
|
+ TYPE: 'readability',
|
|
|
+ VERSION: version,
|
|
|
+ URL: original_url,
|
|
|
+ ...metadata,
|
|
|
+ }
|
|
|
+ await overwriteFile(READABILITY_PATH(page), readability_info)
|
|
|
+ return readability_info
|
|
|
+ }
|
|
|
+ return null
|
|
|
+}
|
|
|
+
|
|
|
// Save an accessibility report for the page: the puppeteer accessibility tree,
// a list of all (nested) iframe URLs, and a markdown-ish outline of the page's
// key structural elements (headings, landmarks, forms, etc.).
async function saveAccessibility(page, {original_url, version}) {
    // get accessibility tree
    const accessibility_tree = await page.accessibility.snapshot({interestingOnly: true});
    // console.log(accessibility_tree);

    // get iframe tree (one '>' per nesting level)
    const iframes = []
    function dumpFrameTree(frame, indent='>') {
        iframes.push(indent + frame.url());
        for (const child of frame.childFrames()) {
            dumpFrameTree(child, indent + '>');
        }
    }
    dumpFrameTree(page.mainFrame(), '');
    // console.log(iframes)

    // generate simple table-of-contents of all the key html elements (e.g. h1, h2, h3, article, main, etc.)
    const outline = await page.evaluate(() => {
        const headings = []
        for (const elem of [...document.querySelectorAll("h1, h2, h3, h4, h5, h6, a, header, footer, article, main, aside, nav, section, figure, summary, table, form, iframe")] as HTMLElement[]) {

            // skip a tags that aren't named anchors
            if (elem.tagName.toLowerCase() == 'a' && !(elem as HTMLAnchorElement).name) continue

            // best-effort identifier for the element: id, anchor name, aria label, or role
            // e.g. article #main-article
            const elem_id = ((typeof elem.id === 'string' && elem.id) || (elem as HTMLAnchorElement).name || elem.ariaLabel || elem.role || '')
            // first three CSS classes, joined as ".a .b .c"
            const elem_classes = elem.className.trim().split(' ').slice(0, 3).join(' .') || ''
            // last path segment of a form's action URL, if any
            const elem_action = (elem as any).action?.split('/')?.slice(-1)?.join('/')
            // truncated preview of the element's visible text
            const summary = elem.innerText.length > 128
                ? `${elem.innerText?.slice(0, 128)}...`
                : elem.innerText

            let prefix = ''
            let title = (elem_id ? `#${elem_id}` : '')
            if (!title && elem_classes) title = `.${elem_classes}`
            if (elem_action) title = `${title} /${elem_action}`
            if (summary) title = `${title}: ${summary}`

            // if elem is a header, prepend a #### prefix based on its level
            // (Number('h2'.replace('h','')) -> 2; non-headings like 'nav' -> NaN)
            const level = Number(elem.tagName.toLowerCase().replace('h', ''))
            if (!isNaN(level)) {
                prefix = '#'.repeat(level)
                title = elem.innerText || elem_id || elem_classes
            } else {
                // set prefix to element's breadcrumb path (depth only, rendered as '>>>>')
                let node = elem
                const parents = [elem.tagName?.toLowerCase().trim()]
                while (node) {
                    // add each parent element's name to the path
                    // const elem_type = node.tagName?.toLowerCase().trim() || ''
                    // if (elem_type && !['div', 'span', 'p', 'body', 'html'].includes(elem_type)) {
                    //     parents.unshift(elem_type);
                    // }
                    parents.unshift('') // add emptystring to abbreviate path as >>>> istead of main>article>header>div>...
                    node = node.parentNode as HTMLElement
                }
                prefix = parents.join('>')
            }
            // strip all repeated whitespace and newlines
            title = title.replaceAll('\n', ' ').replace(/\s+/g, ' ').trim()

            if (prefix) {
                headings.push(`${prefix} ${title}`)
            }
        }
        // console.log(headings.join('\n'))
        return headings
    })

    console.log(`[🩼] Saving accessibility outline (${Object.keys(accessibility_tree).length})...`.padEnd(82), prettyPath(ACCESIBILITY_PATH(page)))
    // console.log(outline.filter(line => line.startsWith('#')).join('\n'))

    const accessibility_info = {
        TYPE: 'accessibility',
        VERSION: version,
        URL: original_url,
        iframes,
        headings: outline,
        tree: accessibility_tree,
    }

    await overwriteFile(
        ACCESIBILITY_PATH(page),
        accessibility_info,
    )

    return accessibility_info
}
|
|
|
+
|
|
|
+async function saveSEO(page, {original_url, version}) {
|
|
|
+ // collect all <meta name="title" property="og:title" content="Page Title for SEO | Somesite.com"> tags into dict
|
|
|
+ const seo_vars = await page.evaluate(() =>
|
|
|
+ [...document.querySelectorAll('meta')]
|
|
|
+ .map(tag => ({key: tag.getAttribute('name') || tag.getAttribute('property') || '', value: tag.getAttribute('content') || ''}))
|
|
|
+ .filter(obj => obj.key && obj.value)
|
|
|
+ .sort((a, b) => a.value.length - b.value.length)
|
|
|
+ .reduce((acc, node) => {acc[node.key] = node.value; return acc}, {})
|
|
|
+ )
|
|
|
+
|
|
|
+ const seo_info = {
|
|
|
+ TYPE: 'seo',
|
|
|
+ VERSION: version,
|
|
|
+ URL: original_url,
|
|
|
+ ...seo_vars,
|
|
|
+ }
|
|
|
+
|
|
|
+ const num_vars = Object.keys(seo_vars).length
|
|
|
+ if (num_vars) {
|
|
|
+ console.log(`[🔎] Saving page SEO metadata (${num_vars})...`.padEnd(82), prettyPath(SEO_PATH(page)))
|
|
|
+ await overwriteFile(SEO_PATH(page), seo_info)
|
|
|
+ }
|
|
|
+
|
|
|
+ return seo_info
|
|
|
+}
|
|
|
+
|
|
|
// Collect every outgoing URL referenced by the page — regex matches over the raw
// HTML, anchor hrefs, <link> tags, iframes, images (incl. CSS background images),
// stylesheets, and scripts — dedupe them, and save the result to OUTLINKS_PATH.
// The "pierce/" selectors traverse into shadow DOM as well.
async function saveOutlinks(page, {original_url, version}) {
    // TODO: slow to iterate over all elements so many times, perhaps we can collapse everything down into one loop


    // Regular expression that matches syntax for a link (https://stackoverflow.com/a/3809435/117030):
    const LINK_REGEX = /https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)/gi;

    // drop W3C namespace URLs (markup boilerplate, not real outlinks)
    const filterW3Urls = (urls) =>
        urls.filter(url =>
            url && !url.startsWith('http://www.w3.org/'))

    // drop inline data: URIs
    const filterDataUrls = (urls) =>
        urls.filter(url =>
            url && !url.startsWith('data:'))

    const html = await page.content();

    // every URL-looking string anywhere in the raw HTML
    const raw = html?.match(LINK_REGEX) || [];

    const hrefs = await page.$$eval(
        "pierce/a[href]",
        elems => elems
            .map(elem => elem.href)
            .filter(url => url),
    );

    // non-stylesheet <link> tags, deduped by href (keeping the first non-empty rel)
    const links = await page.$$eval(
        "pierce/link[href]",
        elems => elems
            .map(({rel, href}) => ({rel, href}))
            .filter(({rel, href}) => rel !== 'stylesheet')
            .reduce((collection, entry) => {
                const {rel, href} = entry
                const non_empty_rel = collection[href]?.rel || rel
                collection[href] = {rel: non_empty_rel, href}
                return collection
            }, {})
    );

    const iframes = await page.$$eval(
        "pierce/iframe[src]",
        elems => elems.map(iframe => iframe.src).filter(url => url)
    );

    const images = await page.$$eval(
        "pierce/img[src]",
        elems => elems.map(img => img.src).filter(url => url && !url.startsWith('data:'))
    );


    // CSS background-image URLs from computed styles (null for elements without one;
    // nulls are filtered out below by filterDataUrls)
    const css_images = await page.$$eval(
        "pierce/*",
        elems => elems
            .map(elem => {
                const css_url_ptn = /url\(\s*?['"]?\s*?(\S+?)\s*?["']?\s*?\)/i;
                const bg_img = window.getComputedStyle(elem, null).getPropertyValue('background-image')
                const bg_url = css_url_ptn.exec(bg_img)
                return bg_url ? bg_url[1] : null
            })
    )

    const css_stylesheets = await page.$$eval(
        "pierce/link[rel=stylesheet]",
        elems => elems.map(elem => elem.href).filter(url => url)
    );

    const js_scripts = await page.$$eval(
        "pierce/script[src]",
        elems => elems.map(elem => elem.src).filter(url => url)
    );

    // dedupe each category via Set spread before saving
    const outlinks_info = {
        TYPE: 'outlinks',
        VERSION: version,
        URL: original_url,
        raw: [...new Set(filterDataUrls(filterW3Urls(raw)))],
        hrefs: [...new Set(filterDataUrls(hrefs))],
        links: [...Object.values(links)],
        iframes: [...new Set(iframes)],
        images: [...new Set(filterDataUrls(images))],
        css_images: [...new Set(filterDataUrls(css_images))],
        css_stylesheets: [...new Set(filterDataUrls(css_stylesheets))],
        js_scripts: [...new Set(filterDataUrls(js_scripts))],
    }

    if (raw?.length || hrefs?.length || links?.length || iframes?.length) {
        console.log(`[🖇️] Saving page outgoing links (${raw?.length || hrefs?.length})...`.padEnd(82+1), prettyPath(OUTLINKS_PATH(page)))

        await overwriteFile(OUTLINKS_PATH(page), outlinks_info)
    }
    return outlinks_info
}
|
|
|
+
|
|
|
+
|
|
|
// Persist the browser's auth state (cookies + localStorage + sessionStorage) to AUTH_JSON_PATH,
// merged with whatever auth state is already on disk, and also export cookies to cookies.txt.
//   page:         puppeteer Page to read storage from
//   client:       CDP session used for Network.getAllCookies
//   version:      version string recorded in the saved JSON
//   original_url: URL recorded in the saved JSON (falls back to page.url() for the scheme check)
// Returns the saved auth_info object, or null when the URL scheme is ignored or saving is disabled.
async function saveAuthStorage(page, {client, version, original_url}) {
    const url = original_url || await page.url()
    // skip non-web schemes (about:, chrome:, etc.) and bail early when auth saving is disabled
    if (URL_SCHEMES_IGNORED.includes(url.split(':')[0])) return null
    if (!SAVE_AUTH_STORAGE) return null

    // const cookies = JSON.stringify(await page.cookies()); // doesnt include httponly cookies
    // CDP Network.getAllCookies is used instead of page.cookies() so httpOnly cookies are included
    const auth_from_browser = {
        cookies: (await client.send('Network.getAllCookies')).cookies,
        localStorage: {},
        sessionStorage: {},
    }

    // attempt to load localStorage and sessionStorage from browser (may fail in some cases https://github.com/puppeteer/puppeteer/issues/921)
    // storage is stored keyed by origin so state from multiple sites can coexist in one file
    try {
        auth_from_browser.localStorage = (await page.evaluate(() =>
            JSON.parse(JSON.stringify({[window.location.origin]: window.localStorage}))))
    } catch(err) {
        throw `Failed to get page window.localStorage! ${err}`
    }
    try {
        auth_from_browser.sessionStorage = (await page.evaluate(() =>
            JSON.parse(JSON.stringify({[window.location.origin]: window.sessionStorage}))))
    } catch(err) {
        throw `Failed to get page window.sessionStorage! ${err}`
    }

    // WARNING: small TOCTTOU gap between this read-before-write and the write below
    // can possibly overwrite changes made by other processes in this gap
    const auth_on_disk = await loadAuthStorage(page, {client}, {apply: false})

    // union of on-disk + in-browser cookies (dedupeCookies decides which duplicate wins)
    const cookies = dedupeCookies([...auth_on_disk.cookies, ...auth_from_browser.cookies])

    const auth_info = {
        TYPE: 'auth',
        VERSION: version,
        URL: original_url,
        cookies: cookies,
        // deep-merge the per-origin storage objects so other origins' saved state is preserved
        sessionStorage: merge(auth_on_disk.sessionStorage, auth_from_browser.sessionStorage),
        localStorage: merge(auth_on_disk.localStorage, auth_from_browser.localStorage),
    }
    // console.log(`[⛙] Merged ${auth_on_disk.cookies.length} existing + ${auth_from_browser.cookies.length} new -> ${auth_info.cookies.length} cookies`)

    console.log(`[🍪] Saving cookies/localStorage/sessionStorage (${auth_info.cookies.length})...`.padEnd(82), prettyPath(AUTH_JSON_PATH));
    await overwriteFile(AUTH_JSON_PATH, auth_info);

    // Write to cookies.txt file using tough-cookie + @root/file-cookie-store
    await saveCookiesTxt(cookies)

    return auth_info
}
|
|
|
+
|
|
|
+async function saveCookiesTxt(cookies) {
|
|
|
+ const cookies_store = new FileCookieStore(COOKIES_TXT_PATH, {auto_sync: false, lockfile: false})
|
|
|
+ const cookie_jar = new ToughCookie.CookieJar(cookies_store)
|
|
|
+ cookie_jar.setCookieAsync = util.promisify(cookie_jar.setCookie)
|
|
|
+ cookies_store.saveAsync = util.promisify(cookies_store.save)
|
|
|
+ for (const cookie of cookies) {
|
|
|
+ const cookie_for_tough = {
|
|
|
+ domain: cookie.domain,
|
|
|
+ path: cookie.path,
|
|
|
+ key: cookie.name,
|
|
|
+ value: cookie.value,
|
|
|
+ expires: (new Date(cookie.expires * 1000)).toISOString(),
|
|
|
+ hostOnly: cookie.domain.startsWith('.'),
|
|
|
+ secure: cookie.secure,
|
|
|
+ }
|
|
|
+ // console.log('COOKIE_FOR_TOUGH_TXT', cookie_for_tough)
|
|
|
+ const parsed_cookie = ToughCookie.Cookie.fromJSON(cookie_for_tough)
|
|
|
+ // console.log('COOKIE_FOR_TOUGH_TXT_TO_DUMP', parsed_cookie)
|
|
|
+ try {
|
|
|
+ // assemble a fake URL just to satisfy ToughCookieJar's requirement of having a URL at set time
|
|
|
+ let url = cookie.secure ? 'https://' : 'http://'
|
|
|
+ if (cookie.domain.startsWith('.')) {
|
|
|
+ url = url + cookie.domain.slice(1)
|
|
|
+ } else {
|
|
|
+ url = url + cookie.domain
|
|
|
+ }
|
|
|
+ if (cookie.sourcePort && ![80, 443].includes(cookie.sourcePort)) {
|
|
|
+ url = `${url}:${cookie.sourcePort}`
|
|
|
+ }
|
|
|
+ url = `${url}${cookie.path || ''}`
|
|
|
+ await cookie_jar.setCookieAsync(parsed_cookie, url, {ignoreError: true})
|
|
|
+ } catch(err) {
|
|
|
+ console.error('[❌] Failed to dump browser cookie for cookies.txt...', cookie_for_tough, '->', parsed_cookie, err)
|
|
|
+ }
|
|
|
+ }
|
|
|
+ console.log(`[🍪] Saving cookies TXT (${cookies.length})...`.padEnd(82), prettyPath(COOKIES_TXT_PATH));
|
|
|
+ await cookies_store.saveAsync()
|
|
|
+}
|
|
|
+
|
|
|
+async function saveMetrics(page, {original_url, version, start_time, start_ts, traffic_log, redirects}) {
|
|
|
+ const end_time = (new Date()).toISOString()
|
|
|
+ const end_ts = Date.now()
|
|
|
+ const metrics_info = {
|
|
|
+ TYPE: 'metrics',
|
|
|
+ VERSION: version,
|
|
|
+ URL: original_url,
|
|
|
+ ...(await page.metrics()),
|
|
|
+ start_time,
|
|
|
+ start_ts,
|
|
|
+ end_time,
|
|
|
+ end_ts,
|
|
|
+ duration: (end_ts - start_ts),
|
|
|
+ num_requests: traffic_log.length,
|
|
|
+ num_redirects: Object.keys(redirects).length -1,
|
|
|
+ }
|
|
|
+
|
|
|
+ console.log(`[🏎️] Saving final summary + timing metrics...`.padEnd(82+1), prettyPath(METRICS_PATH(page)))
|
|
|
+ await overwriteFile(METRICS_PATH(page), metrics_info)
|
|
|
+
|
|
|
+ return metrics_info
|
|
|
+}
|
|
|
+
|
|
|
+
|
|
|
+/******************************************************************************/
|
|
|
+/******************************************************************************/
|
|
|
+
|
|
|
+/**************************** Utility Helpers *********************************/
|
|
|
+
|
|
|
+
|
|
|
// Deterministic integer hash of a string, java String#hashCode style (h = h*31 + c).
// Suitable only for throwaway nonces / stable pseudo-random identifiers --
// NOT a substitute for a real cryptographic hash like sha256.
function hashCode(str) {
    let acc = 0;
    let idx = 0;
    while (idx < str.length) {
        // (acc << 5) - acc is acc * 31 computed with int32 shift semantics
        acc = str.charCodeAt(idx) + ((acc << 5) - acc);
        idx += 1;
    }
    return Math.abs(acc)
}
|
|
|
+
|
|
|
+function unique(iter, key: string | ((any, number) => string)='id') {
|
|
|
+ // uniqueify an array of objects by a value within them, key can be name of attr or getter function
|
|
|
+ // > iter = [{id: 1}, {id: 2}, {id: 1}]
|
|
|
+ // > Object.entries(iter) = [
|
|
|
+ // [ '0', { id: 1 } ],
|
|
|
+ // [ '1', { id: 2 } ],
|
|
|
+ // [ '2', { id: 1 } ] ]
|
|
|
+ // > unique(iter, 'id') => {1: {id: 1}, 2: {id: 2}}
|
|
|
+
|
|
|
+ // > iter = {a1: {id: 1}, b2: {id: 2}, a3: {id: 1}}
|
|
|
+ // > Object.entries(iter) = [
|
|
|
+ // [ 'a1', { id: 1 } ],
|
|
|
+ // [ 'b2', { id: 2 } ],
|
|
|
+ // [ 'a3', { id: 1 } ]
|
|
|
+ // ]
|
|
|
+ // > unique(iter, 'id') => {1: {id: 1}, 2: {id: 2}}
|
|
|
+
|
|
|
+ const key_type = (typeof key)
|
|
|
+ if (!['function', 'string'].includes(key_type))
|
|
|
+ throw 'key must be either a string lookup key or a function (obj, idx) => return unique_id'
|
|
|
+
|
|
|
+ const key_func = (key_type === 'string')
|
|
|
+ ? (entry_obj, idx) => entry_obj[(key as string)]
|
|
|
+ : (entry_obj, idx) => (key as Function)(entry_obj, idx) // otherwise key is a callback func
|
|
|
+
|
|
|
+ const seen = {}
|
|
|
+ for (const [idx, entry_obj] of Object.entries(iter)) {
|
|
|
+ const unique_id = key_func(entry_obj, idx)
|
|
|
+ if (seen[unique_id] === undefined) {
|
|
|
+ seen[unique_id] = entry_obj
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ return seen
|
|
|
+}
|
|
|
+
|
|
|
+const wait = (ms: number) => new Promise(res => {
|
|
|
+ if (ms > 10_000) {
|
|
|
+ console.debug(`[⏲️] Waiting ${Math.round(ms/1000)}s...`)
|
|
|
+ }
|
|
|
+ setTimeout(res, ms)
|
|
|
+})
|
|
|
+
|
|
|
// Sentinel rejection value used by withTimeout when the deadline elapses first.
const TimeoutError = Symbol()

// Race `promise` against an `ms` deadline: settles with the promise's own outcome if it
// finishes in time, otherwise rejects with TimeoutError. The pending timer is always
// cleared once either side settles, so the process is never held open by it.
const withTimeout = (promise, ms) => {
    let timer_id
    const deadline = new Promise((_resolve, reject) => {
        timer_id = setTimeout(reject, ms, TimeoutError)
    })
    return Promise.race([promise, deadline]).finally(() => clearTimeout(timer_id))
}
|
|
|
+
|
|
|
// Sanity bounds for dates seen in the wild: anything outside [MIN, MAX] is assumed
// to be the result of a parsing bug rather than a legitimate crawl date.
const MAX_VALID_DATE = new Date('2150-01-01T00:00:00.000Z')
const MIN_VALID_DATE = new Date('2010-01-01T00:00:00.000Z')
const UNIX_EPOCH_DATE = new Date(0)

// Assert that `date` is a real, in-range Date and return it unchanged.
// The `singleton` date (unix epoch by default) is exempt from the range check.
const validateDate = (date, {min=MIN_VALID_DATE, max=MAX_VALID_DATE, singleton=UNIX_EPOCH_DATE}={}) => {
    const is_date_instance = date instanceof Date
    assert(is_date_instance, `Got invalid type for Date: ${typeof date} ${date} (expected Date)`)
    const has_valid_value = String(date) !== 'Invalid Date'
    assert(has_valid_value, `Got invalid value for Date: ${typeof date} ${date}`)
    if (Number(date) === Number(singleton)) return date   // the epoch sentinel always passes
    assert(date < max, `Got Date that was higher than MAX_VALID_DATE=${max}`)
    assert(date > min, `Got Date that was lower than MIN_VALID_DATE=${min}`)
    return date
}
|
|
|
+
|
|
|
// Parse a version-date string -> Date.
// Accepts YYYYMMDDhhmmssxxx | YYYYMMDDhhmmss | YYYYMMDDhhmm | YYYYMMDD,
// e.g. '20010131235958' => Date('2001-01-31T23:59:58.000Z').
// Throws (via assert) on any malformed input so parseDate() can fall through to other formats.
const parseVersionDateStr = (yyyymmddtime) => {
    const is_only_numbers = /^\d+$/.test(yyyymmddtime.replace('.', ''))
    assert(is_only_numbers, `Non-numeric characters in YYYYMMDD date are not allowed: ${yyyymmddtime} (while trying YYYYMMDDhhmmssxxx format)`)

    // only the digits before any fractional part decide which variant this is
    const num_digits = String(yyyymmddtime).split('.')[0].length
    assert([17, 14, 12, 8].includes(num_digits), `Got invalid number of digits (${num_digits}) in YYYYMMDD date: ${yyyymmddtime} (while trying YYYYMMDDhhmmssxxx format)`)

    // FIX: guard against a null match (e.g. '20120131.123' passes both checks above but not
    // this regex) instead of crashing with a TypeError on destructuring
    const match = /^(\d{4})(\d{2})(\d{2})(\d{2})?(\d{2})?(\d{2})?(\d{3})?$/.exec(yyyymmddtime)
    assert(match, `Could not find YYYYMMDD`)
    const [_all, yyyy, mm, dd, hr, min, sec, ms] = match
    assert(yyyy && mm && dd, `Could not find YYYYMMDD`)

    // each finer-grained time component requires all coarser ones to be present
    // (FIX: the seconds slot of this message previously displayed ${ms} instead of ${sec})
    const time_error_msg = `Detected YYYYMMDD[hhmm[ss[xxxx]]] but time segment is invalid ${hr}:${min || '__'}:${sec || '__'}.${ms || '___'}`
    if (ms) assert(hr && min && sec, time_error_msg)
    if (sec) assert(hr && min, time_error_msg)
    if (min) assert(hr, time_error_msg)
    if (hr) assert(min, time_error_msg)

    // FIX: pad missing milliseconds with '000' (the previous '.00Z' was tolerated by Date
    // but is not a valid 3-digit ISO-8601 fraction); the parsed value is unchanged (0 ms)
    const iso_str = `${yyyy}-${mm}-${dd}T${hr || '00'}:${min || '00'}:${sec || '00'}.${ms || '000'}Z`
    const parsed_date = new Date(iso_str)

    return validateDate(parsed_date) // 1970-01-01T00:00:00.000Z (ISO format)
}
|
|
|
+
|
|
|
// Parse a unix timestamp (string or number) -> Date.
// Accepts 13-digit millisecond timestamps (1709724291000), 10-digit second timestamps
// (1709724291), either with an optional fractional part, or the literal '0' for the epoch.
// Throws (via assert) on anything else so parseDate() can fall through to other formats.
const parseTimestampDateStr = (timestamp) => {
    timestamp = String(timestamp)
    const is_only_numbers = /^\d+$/.test(timestamp.replace('.', ''))
    // (FIX: corrected 'timstamp' typo in this user-facing error message)
    assert(is_only_numbers, `Got invalid characters in timestamp: ${timestamp} (while trying xxxxxxxxxxxxx format)`)

    // the digits before the decimal point determine the precision variant
    const num_digits = String(timestamp).split('.')[0].length
    assert([13, 10, 1].includes(num_digits), `Got invalid number of digits (${num_digits}) in timestamp: ${timestamp} (while trying xxxxxxxxxxxxx format)`)

    let parsed_date = null

    if (num_digits === 13) {
        parsed_date = new Date(Number(timestamp)) // 1709724291000 (unix timestamp w/ milliseconds)
    } else if (num_digits === 10) {
        parsed_date = new Date(Number(timestamp) * 1000) // 1709724291 (unix timestamp w/ seconds)
    } else if (num_digits === 1) {
        // the only single-digit value allowed is '0', meaning the unix epoch sentinel
        assert(String(timestamp) === '0', `Got invalid single-digit timestamp: ${timestamp} (while trying xxxxxxxxxxxxx format or 0 for UNIX epoch)`)
        parsed_date = UNIX_EPOCH_DATE
    }
    return validateDate(parsed_date)
}
|
|
|
+
|
|
|
// Parse an ISO-8601 datetime string -> Date (e.g. '1970-01-01T00:00:00.000Z').
// Accepted lengths: 24 (with ms), 19 (seconds), 16 (minutes), 10 (date only).
const parseISODateStr = (iso_str) => {
    const ACCEPTED_LENGTHS = [24, 19, 16, 10]
    const num_digits = String(iso_str).length
    assert(ACCEPTED_LENGTHS.includes(num_digits), `Got invalid number of digits (${num_digits}) in ISO date: ${iso_str} (while trying 1970-01-01T00:00:00.000Z format)`)
    return validateDate(new Date(iso_str))
}
|
|
|
+
|
|
|
// Normalize many possible date representations into a single Date object:
//   undefined               -> now
//   null / 0 / '0'          -> unix epoch (1970-01-01T00:00:00.000Z)
//   Date instance           -> validated and returned as-is
//   'YYYYMMDD[hhmm[ss[xxx]]]' -> version date string (e.g. 20010131235958)
//   1234567890[123]         -> unix timestamp in seconds or milliseconds
//   '1970-01-01T...'        -> ISO-8601 datetime
// Throws a combined multi-line error if no parser accepts the input.
const parseDate = (date) => {
    if (date === undefined) {
        return (new Date()) // now, e.g. 2024-05-29T22:02:34.682Z aka timestamp=1717020154682
    }
    // NOTE: the loose == is deliberate so both 0 and '0' map to the epoch sentinel
    if (date === null || date == 0) {
        return UNIX_EPOCH_DATE // unix epoch (1970-01-01T00:00:00.000Z) aka timestamp=0
    }
    if (date instanceof Date) {
        return validateDate(date)
    }

    if ((typeof date) === 'number') {
        date = String(date) // unix timestamp e.g. 1717020154682 -> '1717020154682'
    }
    assert((typeof date) === 'string', `Tried to parse date but got unsupported type ${(typeof date)}: ${date}`)

    // try each string format in turn, remembering every failure for the combined error
    const errors = [`Failed to parse Date from string: ${date}`]
    for (const parser of [parseVersionDateStr, parseTimestampDateStr, parseISODateStr]) {
        try {
            return parser(date)
        } catch(err) {
            errors.push(err)
        }
    }
    throw errors.join('\n')
}
|
|
|
+
|
|
|
// Format a Date (or anything parseDate accepts) as a compact version string:
// YYYYMMDDHHMMSSXXX / YYYYMMDDHHMMSS / YYYYMMDDHHMM / YYYYMMDD depending on options.
// At least one of {withDate, withTime} must be enabled.
const versionStrFromDate = (date, {withDate=true, withTime=true, withSeconds=true, withMilliseconds=false}={}) => {
    const parsed_date = parseDate(date)

    // '2001-01-31T23:59:58.090Z' -> ['2001-01-31', '23:59:58.090Z']
    const [date_part, time_part] = parsed_date.toISOString().split('T')

    const pieces = []
    if (withDate) {
        pieces.push(date_part.replaceAll('-', '')) // '20010131'
    }
    if (withTime) {
        // '23:59:58.090Z' -> ['23', '59', '58', '090']
        const [hr, min, sec, ms] = time_part.replace('Z', '').replace('.', ':').split(':')
        pieces.push(hr, min)
        if (withSeconds) {
            pieces.push(sec)
            if (withMilliseconds) {
                pieces.push(ms)
            }
        }
    }
    assert(pieces.length, 'At least one of {withDate, withTime} must be set.')

    const final_str = pieces.join('') // e.g. '20010131235958'

    assert(parseVersionDateStr(final_str)) // round-trip sanity check that the result parses

    return final_str
}
|
|
|
+
|
|
|
+// test date functions:
|
|
|
+// console.log(parseDate('20120131'))
|
|
|
+// console.log(versionStrFromDate(parseDate('20120131')))
|
|
|
+// console.log(versionStrFromDate(parseDate('0')))
|
|
|
+// console.log(versionStrFromDate(parseDate(0)))
|
|
|
+// console.log(versionStrFromDate(parseDate(null)))
|
|
|
+// console.log(versionStrFromDate())
|
|
|
+// console.log(versionStrFromDate(parseDate('20120131235859090')))
|
|
|
+// console.log(versionStrFromDate(parseDate('1970-01-01T00:00:00.000Z')))
|
|
|
+// console.log(versionStrFromDate(parseDate('2024-12-01T00:00')))
|
|
|
+// console.log(versionStrFromDate(parseDate('2024-12-01'), {withTime: false}))
|
|
|
+
|
|
|
// Shorten a path for display/logging: the absolute DATA_DIR prefix is swapped for './data'
// to keep log lines brief and avoid leaking the host filesystem layout.
const prettyPath = (path) => path.replace(DATA_DIR, './data')
|
|
|
+
|
|
|
// True when the path itself or any directory above it is hidden (dot-prefixed),
// e.g. './some/.dir/abc' or './.DS_Store'; false otherwise.
const pathIsHidden = (relpath) => {
    // normalize to an abspath-like form so '' / '.' / './' edge-cases behave sanely
    let current = relpath.startsWith('./') ? relpath.substring(2) : relpath
    if (!current.startsWith('/')) {
        current = path.join('/', current)
    }

    // walk upwards towards '/', testing each path component on the way
    for (; current !== '/'; current = path.dirname(current)) {
        if (path.basename(current).startsWith('.')) {
            // console.log('PATH IS HIDDEN', relpath)
            return true
        }
    }
    return false
}
|
|
|
+
|
|
|
// Number of directory hops from `relative_to` (default '.') down to `child_path`,
// e.g. pathDepth('a/b/c', 'a') === 2. If one side is absolute and the other relative,
// the relative one is resolved first so both are compared in the same coordinate space.
const pathDepth = (child_path, relative_to='.') => {
    const child_is_abs = child_path.startsWith('/')
    const parent_is_abs = relative_to.startsWith('/')

    if (child_is_abs && !parent_is_abs) {
        // otherwise the depth would be measured all the way from the / root
        relative_to = fs.realpathSync(relative_to)
    } else if (parent_is_abs && !child_is_abs) {
        // same deal in reverse: both sides must be absolute (or both relative)
        child_path = fs.realpathSync(child_path)
    }

    return path.relative(relative_to, child_path).split('/').length
}
|
|
|
+
|
|
|
// fs.Dirent augmented with the extra path fields that getDirEntries() attaches to
// each entry while walking a directory tree.
interface DirentWithExtras extends fs.Dirent {
    relpath: string,   // path relative to the walk's pwd
    abspath: string,   // absolute path of the entry
    reldepth: number,  // directory hops below the walk root (computed via pathDepth)
}
|
|
|
+
|
|
|
// Recursively list sub-paths under dir_path, returned as a sorted array of relpaths
// (relative to pwd).
//   pwd:           base dir relpaths are computed against (defaults to dir_path)
//   recursive:     walk subdirectories (passed straight through to fs.readdir)
//   includeHidden: keep entries with any dot-prefixed path component
//   includeFiles / includeDirs / includeLinks: toggle which entry types are returned
//   filter:        optional ({abspath, relpath, basename, dirent}) => bool predicate
//   maxdepth:      if >= 0, drop entries nested more than maxdepth directories deep
async function getDirEntries(dir_path, {pwd=null, recursive=true, includeHidden=false, includeFiles=true, includeDirs=true, includeLinks=false, filter=null, maxdepth=-1}={}) {
    // get the list of all sub-paths under a given path recursively

    // console.log('GETTING DIRECTORY ENTRIES', {dir_path, pwd, recursive, includeHidden, includeFiles, includeDirs, maxdepth})

    pwd = pwd || dir_path
    let dir_abspath = dir_path

    // NOTE(review): startsWith is a string-prefix check, not true path containment
    // (e.g. pwd='/data' also "contains" '/database') -- confirm callers' inputs make this safe
    if (!dir_abspath.startsWith(pwd)) {
        dir_abspath = path.join(pwd, dir_abspath)
    }

    assert(fs.existsSync(dir_abspath), `Tried to get directory listing for dir that doesn't exist! ${prettyPath(dir_abspath)}`)

    return (await fs.promises.readdir(dir_abspath, { recursive, withFileTypes: true }))
        .map((dirent: DirentWithExtras) => {
            // filter combined with map because relpath is re-used in both operations
            // NOTE(review): dirent.parentPath requires a recent Node (older versions exposed
            // this as dirent.path) -- confirm the minimum supported runtime
            const relpath = path.join(path.relative(pwd, dirent.parentPath), dirent.name)
            // console.log('CALCULATED RELATIVE PATH', relpath)
            // NOTE(review): relpath is relative to pwd but joined onto dir_abspath here;
            // these only agree when pwd === dir_abspath -- confirm for callers that pass pwd
            const abspath = path.join(dir_abspath, relpath)
            const basename = path.basename(dirent.name)
            if (!includeLinks && dirent.isSymbolicLink()) return null
            if (!includeFiles && dirent.isFile()) return null
            if (!includeDirs && dirent.isDirectory()) return null
            if (!includeHidden && pathIsHidden(relpath)) return null

            // attach the computed paths onto the dirent so filter callbacks can use them
            dirent.relpath = relpath
            dirent.abspath = abspath
            dirent.reldepth = pathDepth(relpath)
            // console.log('RELATIVE DEPTH MEASURED', prettyPath(dir_abspath), prettyPath(relpath), dirent.reldepth)

            // top-level entries have reldepth 1, so -1 converts to 0-based before comparing
            if (maxdepth >= 0) {
                if ((dirent.reldepth-1) > maxdepth) return null
            }

            if ((typeof filter) === 'function') {
                const should_keep = filter({abspath, relpath, basename, dirent})
                if (!should_keep) {
                    // console.log('FILTER EXCLUDED RESULT', {abspath, relpath, basename, dirent})
                    return null
                }
            }

            return relpath
        })
        .filter(Boolean)   // drop every entry the map step rejected with null
        .sort() as string[]
}
|
|
|
+
|
|
|
+
|
|
|
// Total size in bytes of a file or directory (directories are summed recursively).
//   pwd:      base dir used to resolve relative paths (defaults to the path's parent dir)
//   _cache:   optional {path: bytes} map consulted before touching the filesystem; this
//             function only READS the cache -- getDirSizes() is what populates it
//   filter:   optional predicate forwarded to getDirEntries when listing dir contents
//   subfiles: optional pre-computed getDirEntries() listing to use instead of reading disk
// Special files (FIFO/socket/device/etc.) count as 0 bytes.
async function getTotalSize(dir_or_file_path, {pwd=null, _cache=null, filter=null, subfiles=null}={}) {
    // get the total size in bytes of a file or directory (recursively adds up file sizes within directory)

    // check _cache first
    if (_cache && (dir_or_file_path in _cache))
        return _cache[dir_or_file_path]

    // make sure dir_or_file_path is under pwd
    pwd = pwd || path.dirname(dir_or_file_path)
    let abspath = dir_or_file_path
    if (!dir_or_file_path.startsWith(pwd)) {
        abspath = path.join(pwd, dir_or_file_path)
    }

    // if it's a file, stat it and return the size
    // console.log('CALCUALTED ABSPATH', {abspath, dir_or_file_path, pwd})
    const dirent = await fs.promises.stat(abspath)
    if (dirent.isFile()) {
        // console.log('CALCULATING FILE SIZE subfile=', prettyPath(abspath))
        return dirent.size
    }

    // if it's not a file and not a directory, give up, dont try to size special files like FIFO/socket/etc.
    if (!dirent.isDirectory()) return 0

    // if it's a directory, size is the sum of all the sizes of files within
    // console.log('CALCULATING SUBDIR SIZE subdir=', prettyPath(abspath))
    let total_bytes = 0
    const files_within = subfiles || await getDirEntries(dir_or_file_path, {
        pwd,
        recursive: true,
        includeDirs: false,
        includeFiles: true,
        filter,
    })
    for (const subpath of files_within) {
        // recurse per-file so each lookup can hit _cache when the caller provided one
        total_bytes += await getTotalSize(subpath, {pwd, _cache, filter})
    }
    return total_bytes
}
|
|
|
+
|
|
|
+
|
|
|
// Return a {relpath: bytes} map of every file and sub-directory under dir_path,
// with the grand total stored under '.' (when withRoot is set).
async function getDirSizes(dir_path, {pwd=null, subfiles=null, withRoot=true, filter=null, maxdepth=-1}={}) {
    // get the size of a directory and all the files within (recursively) as a number of bytes
    // dir_path: path absolute or relative path of the directory you want size info for
    // pwd: path (optional) absolute path of the directory you want to interpret dir_path relative to
    // subfiles: dirent[] (optional) instead of reading disk, you can manually provide a getDirEntries results list to use
    // withRoot: bool include a summary entry for the root dir_path dir in the list as '.'
    // filter: function (optional) provide a filter func for dir entries ({abspath, relpath, basename, dirent}) => true/false
    // maxdepth: number (optional) does not affect actual calculations, but hides entries below a certain depth in the returned output for brevity

    assert((await fs.promises.stat(dir_path)).isDirectory(), `Tried to calculate directory sizes but path is not a directory! ${dir_path}`)
    pwd = pwd || dir_path

    // {'.': 246, 'example.json': 123, 'example2.txt': 123}
    const sizes = {}

    // first collect the list of all sub-files recursively and calculate their sizes individually
    const files_within = subfiles || await getDirEntries(dir_path, {
        pwd,
        recursive: true,
        includeDirs: false,
        includeFiles: true,
        // dont pass maxdepth here, we need the entire file listing to accurately calculate parent dir sizes
        // it never makes sense to ignore subfiles beyond a certain depth for size calculation
        filter, // filter is allowed though, useful to calculcate size of some subset of files that match a pattern
    })
    // sizes doubles as the _cache passed to getTotalSize, so each file is only stat'd once
    for (const subpath of files_within) {
        sizes[subpath] = await getTotalSize(subpath, {pwd, _cache: sizes, filter})
    }

    // then calculate the top-level directory total as the sum of all the file sizes under it
    // (computed BEFORE subdirs are added, so subdir subtotals are not double-counted)
    const total_size = Object.values(sizes).reduce((a: number, b: number) => a + b, 0)

    // then calculate the subtotals of all the sub-directories
    const subdirs_within = await getDirEntries(dir_path, {pwd, recursive: true, includeDirs: true, includeFiles: false, filter, maxdepth})
    for (const subpath of subdirs_within) {
        sizes[subpath] = await getTotalSize(subpath, {pwd, _cache: sizes, filter}) // uses _cache to avoid re-computing
    }

    // if maxdepth is passed, filter results to only include paths shallower than max depth
    if (maxdepth >= 0) {
        for (const subpath of Object.keys(sizes)) {
            if (pathDepth(subpath) > maxdepth) {
                delete sizes[subpath]
            }
        }
    }

    // set total_size last so it appears at the bottom of the object in logs for convenience
    if (withRoot) {
        sizes['.'] = total_size
    }

    return sizes
}
|
|
|
+
|
|
|
+
|
|
|
// Resolve both paths and return whichever occupies more bytes on disk
// (sizes are computed recursively for directories); ties go to path_b.
async function getLargestPath(path_a, path_b) {
    const real_a = await fs.promises.realpath(path_a)
    const real_b = await fs.promises.realpath(path_b)

    const size_a = await getTotalSize(real_a)
    const size_b = await getTotalSize(real_b)
    // console.log('COMPARING', prettyPath(real_a), size_a, ' ', prettyPath(real_b), size_b)

    return (size_a > size_b) ? real_a : real_b
}
|
|
|
+
|
|
|
// Find the deepest directory that both a symlink location and its target live under,
// and pre-compute the relative paths needed to create a relative symlink between them.
//   relative:     true  -> start the search from the symlink's own directory
//                 false -> start from search_limit
//                 string -> start from that directory (or that file's parent dir)
//   search_limit: upper bound for the upward search (defaults to DATA_DIR)
// Returns an object of absolute paths, filenames, and the derived relative paths.
async function findCommonAncestor(target_abspath, symlink_abspath, {relative=true, search_limit=DATA_DIR}: {relative?: boolean | string, search_limit?: string}={}) {
    // given a target path and a symlink path, find the common ancestor path they both share
    // (searches recursively through absolute path parent directories until a common dir is found, up to search_limit)

    search_limit = await fs.promises.realpath(search_limit)

    let relative_dir = search_limit
    if ((typeof relative) === 'boolean') {
        // if start dir is default, set it to symlinks directory path
        if (relative) {
            relative_dir = path.dirname(symlink_abspath)
        } else {
            relative_dir = search_limit
        }
    } else if ((typeof relative) === 'string') {
        // if start dir is a string, get its absolute path
        relative_dir = relative as string
    } else {
        throw `Got invalid type for relative path during common ancestor search: ${relative}`
    }

    if ((await fs.promises.stat(relative_dir)).isFile()) {
        // if start dir is a file, set it to its parent dir path
        relative_dir = path.dirname(relative_dir)
    }
    assert(
        (await fs.promises.stat(relative_dir)).isDirectory(),
        `Tried to find common ancestor starting from invalid search directory:\n 🔗 ${prettyPath(symlink_abspath)}\n -> ${prettyPath(target_abspath)}\n Error: search dir does not exist or is not a directory: ❌ ${prettyPath(relative_dir)}`,
    )

    const symlink_filename = path.basename(symlink_abspath)
    const target_filename = path.basename(target_abspath)
    // resolve the PARENT dirs (not the leaf files) so dangling/missing leaves don't matter
    const symlink_parent_abspath = await fs.promises.realpath(path.dirname(symlink_abspath))
    const target_parent_abspath = await fs.promises.realpath(path.dirname(target_abspath))
    const search_dir_abspath = await fs.promises.realpath(relative_dir)

    let closest_common_ancestor = search_dir_abspath

    // NOTE(review): startsWith is a string-prefix test, so sibling dirs sharing a name
    // prefix (e.g. /data/foo vs /data/foobar) could false-positive -- confirm acceptable
    const isAncestorCommon = (ancestor) => (
        target_parent_abspath.startsWith(ancestor)
        && symlink_parent_abspath.startsWith(ancestor))

    // check if both src and target start with the same ancestor path
    while (closest_common_ancestor !== search_limit) {
        if (isAncestorCommon(closest_common_ancestor)) break
        else {
            // otherwise go up one directory and try again
            // console.log(' ...going up a directory', prettyPath(closest_common_ancestor)+'/..')
            closest_common_ancestor = path.dirname(closest_common_ancestor)
        }
    }

    // the loop stops at search_limit even if no common dir was found, so re-verify here
    assert(
        isAncestorCommon(closest_common_ancestor),
        `Tried to create relative symlink but could not find common ancestor:\n 🔗 ${prettyPath(symlink_abspath)}\n -> ${prettyPath(target_abspath)}\n Error: target path and symlink path are not both under:\n ❌ ${prettyPath(closest_common_ancestor)}`,
    )

    const symlink_to_ancestor_relpath = path.relative(symlink_parent_abspath, closest_common_ancestor) // ../../..
    const target_from_ancestor_relpath = path.join(path.relative(closest_common_ancestor, target_parent_abspath), target_filename) // 'archive/19999999.23423523'
    const symlink_to_target_relpath = path.join(symlink_to_ancestor_relpath, target_from_ancestor_relpath) // '../../../archive/19999999.23423523'

    return {
        closest_common_ancestor,
        search_dir_abspath,

        target_abspath,
        target_filename,
        target_from_ancestor_relpath,

        symlink_abspath,
        symlink_filename,
        symlink_to_ancestor_relpath,
        symlink_to_target_relpath,
    }
}
|
|
|
+
|
|
|
// fs.Stats augmented with path fields attached by blockUntilExists() (and callers).
interface StatsWithExtras extends fs.Stats {
    abspath: string    // resolved absolute path of the stat'd file
    relpath?: string   // optional path relative to some pwd, set by callers where needed
    reldepth?: number  // optional depth below a walk root, set by callers where needed
}
|
|
|
+
|
|
|
+async function blockUntilExists(file_path, {timeout=7_500, min_bytes=0}={}) {
|
|
|
+ // wait up to timeout seconds until file we expect to exist appears on the filesystem
|
|
|
+ // (used to handle eventual consistency in network filesystems where we need a delay after writing before reads show up)
|
|
|
+ const interval = 250
|
|
|
+ const max_tries = timeout / interval
|
|
|
+ let tries = 0
|
|
|
+
|
|
|
+ let abspath = null
|
|
|
+ while (tries < max_tries) {
|
|
|
+ try {
|
|
|
+ const abspath = await fs.promises.realpath(file_path)
|
|
|
+ assert(fs.existsSync(abspath))
|
|
|
+
|
|
|
+ const dirent = await fs.promises.stat(abspath) as StatsWithExtras
|
|
|
+ dirent.abspath = abspath
|
|
|
+
|
|
|
+ if (min_bytes && (dirent.size < min_bytes)) {
|
|
|
+ assert(dirent.size >= 1)
|
|
|
+ // this is a valid warning but unfortunately its too common to bother showing:
|
|
|
+ // console.warn(`[⚠️] Expected file to be >=${Math.round(min_bytes/1000)}kb but was only ${dirent.size/1000}kb:`, prettyPath(file_path))
|
|
|
+ }
|
|
|
+
|
|
|
+ return dirent
|
|
|
+ } catch(err) {
|
|
|
+ const waited = (tries * interval)
|
|
|
+ if (waited === 5_000) {
|
|
|
+ console.warn(`[⚠️] Waited >${waited/1000}s for file to appear (is filesystem or bg task running slow?):`, prettyPath(file_path))
|
|
|
+ }
|
|
|
+ await wait(interval)
|
|
|
+ tries++
|
|
|
+ }
|
|
|
+ }
|
|
|
+ throw `Expected file does not exist after ${timeout/1000}s: ${prettyPath(file_path)}`
|
|
|
+}
|
|
|
+
|
|
|
// Create (or atomically replace) a symlink at symlink_path pointing to target_path.
// Options:
//   relative:     true => create as a relative link (auto-finds the closest common ancestor dir, often DATA_DIR)
//   mkdirs:       true => create the symlink's parent dirs automatically
//   search_limit: upper-bound directory for the common-ancestor search
//   timeout:      max ms to wait for target_path to appear on disk before giving up
// Returns a JSON-serializable summary describing both sides of the link.
// Throws (string, matching this file's convention) if the target never appears or the created link does not resolve.
async function overwriteSymlink(target_path, symlink_path, {relative=true, mkdirs=false, search_limit=DATA_DIR, timeout=5_000}: {relative?: boolean | string, mkdirs?: boolean, search_limit?: string, timeout?: number}={}) {
    // create a symlink from symlink_path -> target_path
    // relative: true => symlink is created as a relative link by default (it will auto-find the closest common ancestor dir, often DATA_DIR)
    // mkdirs: true => optionally creates symlink parent dirs automatically)

    // make sure target file actually exists first (blockUntilExists polls the filesystem up to {timeout} ms)
    let target_dirent
    try {
        target_dirent = await blockUntilExists(target_path, {timeout})
    } catch(err) {
        throw `Tried to create symlink pointing to file that does not exist:\n 🔗 ${prettyPath(symlink_path)}\n -> ❌ ${prettyPath(target_path)}\n ${err}`
    }
    const target_abspath = target_dirent.abspath
    const target_filename = path.basename(target_abspath)
    const target_parent_abspath = path.dirname(target_abspath)

    // make sure target is a valid file or directory and not a special character/block device/other weird file
    const target_is_dir = target_dirent.isDirectory()
    const target_is_file = target_dirent.isFile()
    assert(target_is_dir || target_is_file, `Tried to create symlink to an unsupported file type:\n 🔗 ${prettyPath(symlink_path)}\n -> ❌ ${prettyPath(target_path)} (expected file or directory)`)

    // create symlink file parent directories if needed
    const symlink_filename = path.basename(symlink_path)
    const symlink_parent_dir = path.dirname(symlink_path)
    if (mkdirs) {
        await fs.promises.mkdir(symlink_parent_dir, {recursive: true})
    }
    try {
        assert((await fs.promises.stat(symlink_parent_dir)).isDirectory())
    } catch(err) {
        throw `Tried to create symlink in a directory that doesn't exist:\n 🔗 ${symlink_parent_dir}❌/${symlink_filename}\n -> ${target_path}\n ${err}`
    }
    // resolve the parent dir (not the symlink itself, which may not exist yet) to a real absolute path
    const symlink_parent_abspath = await fs.promises.realpath(symlink_parent_dir)
    const symlink_abspath = path.join(symlink_parent_abspath, symlink_filename)

    // determine nearest common ancestor between symlink dir and target dir
    const {
        closest_common_ancestor,
        symlink_to_ancestor_relpath,
        target_from_ancestor_relpath,
        symlink_to_target_relpath,
    } = await findCommonAncestor(target_abspath, symlink_abspath, {relative, search_limit})

    // set final target path to abspath or relative path depending on {relative} options
    let target_path_final
    if (relative) {
        // make symlink into relative link (based on closest common ancestor dir between symlink_abspath and target_abspath)
        target_path_final = symlink_to_target_relpath
        // console.log(' 🔗', prettyPath(symlink_abspath), '->', prettyPath(target_abspath), `(as relative link: ${target_path_final})`)
    } else {
        // make symlink into an absolute path (verbatim passed target_path)
        target_path_final = target_path
        // console.log(' 🔗', prettyPath(symlink_abspath), '->', prettyPath(target_abspath), '(as absolute path)')
    }

    // remove any existing symlink at destination if there is already one there
    // (the new link is created under a random temp name first, then rename()d over the destination so the swap is atomic)
    const random_nonce = crypto.randomBytes(16).toString('hex').substring(0, 8)
    const symlink_temp_path = `${symlink_abspath}.${random_nonce}.dup`
    try { await fs.promises.unlink(symlink_abspath) } catch(err) {}
    try { await fs.promises.unlink(symlink_temp_path) } catch(err) {}

    // create the symlink and check that it works after creation
    let created_symlink = null  // tracks the path most recently attempted (temp path first, then final path)
    try {
        created_symlink = symlink_temp_path
        await fs.promises.symlink(target_path_final, symlink_temp_path)
        created_symlink = symlink_abspath
        await fs.promises.rename(symlink_temp_path, symlink_abspath)
    } catch(err) {
        if (String(err).includes('EISDIR')) {
            // console.warn('[⚠️] Tried to create symlink on top of existing directory', prettyPath(symlink_abspath))

            // no real recourse in this situation, and its too noisy to log every time this happens
            // it's also not always safe to move the dir out of the way, so better to just fail silently here, leaving:
            // ${symlink_abspath}.${random_nonce}.dup
        } else {
            console.warn('[⚠️] Failed to create symlink', prettyPath(created_symlink), err)
        }
    }

    // verify the created link resolves to something on disk before returning
    let dirent
    try {
        dirent = await blockUntilExists(created_symlink, {timeout, min_bytes: 0})
        // best we can do here is just check that it exists ^, trying to check that it has the exact expected abspath that we set is bad, because its a race condition:
        // assert(dirent.abspath == target_abspath) // its often already overwritten by later activity, so final abspath may already be different
    } catch(err) {
        throw `Symlink created but does not seem to resolve to intended file:\n 🔗 ${symlink_path}\n -> ❌ ${target_path}\n actual=${dirent?.abspath}\n expected=${target_abspath}\n ${err}`
    }

    // summary of both sides of the link (all values JSON-serializable)
    return {
        symlink_path,
        symlink_abspath: created_symlink,
        symlink_filename: path.basename(created_symlink),
        symlink_parent_abspath,
        symlink_to_ancestor_relpath,
        symlink_to_target_relpath,

        target_path,
        target_abspath,
        target_filename,
        target_parent_abspath,
        target_from_ancestor_relpath,
        target_path_final,
        target_is_dir,
        target_is_file,
        target_is_relative: Boolean(relative),

        closest_common_ancestor,
    }
}
|
|
|
+
|
|
|
+// test symlink and common ancestor finding
|
|
|
+// console.log(await findCommonAncestor('/Volumes/NVME/Users/squash/Local/Code/archiveboxes/archivebox-spreadsheet-bot/data/archive/1709724410.19269/seo.json', '/Volumes/NVME/Users/squash/Local/Code/archiveboxes/archivebox-spreadsheet-bot/data/archive/1709724410.19269/seo2.json'))
|
|
|
+// console.log(await findCommonAncestor('/Volumes/NVME/Users/squash/Local/Code/archiveboxes/archivebox-spreadsheet-bot/data/archive/1709724410.19269', '/Volumes/NVME/Users/squash/Local/Code/archiveboxes/archivebox-spreadsheet-bot/data/index/snapshots_by_domain/twitter.com/1709724410.19269', {relative: true, search_limit: '/Volumes/NVME/Users/squash/Local/Code/archiveboxes/archivebox-spreadsheet-bot/data/'}))
|
|
|
+// console.log(await overwriteSymlink('/Volumes/NVME/Users/squash/Local/Code/archiveboxes/archivebox-spreadsheet-bot/data/archive/1709724410.19269', '/Volumes/NVME/Users/squash/Local/Code/archiveboxes/archivebox-spreadsheet-bot/data/index/snapshots_by_domain/twitter.com/1709724410.19269'))
|
|
|
+// console.log(await overwriteSymlink('/Volumes/NVME/Users/squash/Local/Code/archiveboxes/archivebox-spreadsheet-bot/data/index/snapshots_by_domain/twitter.com/1709724410.19269', '/Volumes/NVME/Users/squash/Local/Code/archiveboxes/archivebox-spreadsheet-bot/data/index/favorite_snapshots/1709724410.19269', {relative: false, mkdirs: true, search_limit: '/Volumes/NVME/Users/squash/Local/Code/archiveboxes/archivebox-spreadsheet-bot/data/'}))
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+async function overwriteDir(path) {
|
|
|
+ // delete any existing folder at the destination path (important otherwise we may create a folder inside an existing folder/symlink)
|
|
|
+ try {
|
|
|
+ await fs.promises.rm(path, { recursive: true, force: true });
|
|
|
+ } catch(err) {}
|
|
|
+
|
|
|
+ await fs.promises.mkdir(path, {recursive: true})
|
|
|
+
|
|
|
+ return path
|
|
|
+}
|
|
|
+
|
|
|
+async function overwriteFile(path, contents, options={encoding: 'utf8', flag: 'w', flush: false, block: true}) {
|
|
|
+ // write any JS value to a fresh file (e.g. String, Buffer, WritableStream, etc. anything JSON-serializable)
|
|
|
+
|
|
|
+ const block_until_created = options.block || true
|
|
|
+ delete options.block
|
|
|
+
|
|
|
+ try {
|
|
|
+ // delete any existing symlink/file present at the destination path
|
|
|
+ // (important otherwise we may write into an existing symlink by accident)
|
|
|
+ await fs.promises.unlink(path)
|
|
|
+ } catch(err) {}
|
|
|
+
|
|
|
+ try {
|
|
|
+ let nonce = 1
|
|
|
+ while ((await fs.promises.stat(path)).isDirectory()) {
|
|
|
+ // if we try to write a file to a path that already has a directory in that location
|
|
|
+ // (common when trying to write response JSON e.g. http://www.instagram.com/api/graphql returns json and www.instagram.com/api/graphql/abc returns json)
|
|
|
+ path = path.replace(`.${nonce-1}`, '') + `.${nonce}`
|
|
|
+ nonce++;
|
|
|
+ if (nonce > 20) throw `Too many conflicting files while trying to write to ${prettyPath(path)}`
|
|
|
+ }
|
|
|
+ } catch(err) {
|
|
|
+ if (!String(err).includes('no such file or directory')) {
|
|
|
+ console.warn('[⚠️] Warning: Problem with conflicting directory at while trying to write file', err)
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ // refuse writing undefined/null/function because its likely an error and not intended
|
|
|
+ const content_is_null = (contents === null) || (contents === undefined)
|
|
|
+ const content_is_func = (typeof contents === 'function')
|
|
|
+ if (content_is_null || content_is_func) {
|
|
|
+ throw `Cannot write ${typeof contents} ${contents} to file: ${path}`
|
|
|
+ }
|
|
|
+
|
|
|
+ // Numbers, BigInts, and Booleans can be cast to strings, then wrt
|
|
|
+ const content_is_primitive = ['number', 'bigint', 'boolean'].includes(typeof contents)
|
|
|
+ if (content_is_primitive) {
|
|
|
+ contents = String(contents)
|
|
|
+ await fs.promises.writeFile(path, contents, options as any)
|
|
|
+ if (block_until_created) await blockUntilExists(path, {min_bytes: Buffer.byteLength(contents)})
|
|
|
+ return path
|
|
|
+ }
|
|
|
+
|
|
|
+ // Strings and Buffers can be written directly to file
|
|
|
+ const content_is_string = (typeof contents === 'string' || contents instanceof String)
|
|
|
+ const content_is_buffer = Buffer.isBuffer(contents)
|
|
|
+ if (content_is_string || content_is_buffer) {
|
|
|
+ await fs.promises.writeFile(path, contents, options as any)
|
|
|
+ if (block_until_created) await blockUntilExists(path, {min_bytes: Buffer.byteLength(contents)})
|
|
|
+ return path
|
|
|
+ }
|
|
|
+
|
|
|
+ // WritableStream objects can be piped into file
|
|
|
+ const content_is_stream = (contents?.pipe)
|
|
|
+ if (content_is_stream) {
|
|
|
+ const stream_byte_length = contents.writableLength
|
|
|
+ const dest_file = fs.createWriteStream(path);
|
|
|
+ await finished(contents.pipe(dest_file))
|
|
|
+ if (block_until_created) await blockUntilExists(path, {min_bytes: stream_byte_length})
|
|
|
+ return path
|
|
|
+ }
|
|
|
+
|
|
|
+ // Objects and Arrays can be JSON-stringified then written into file
|
|
|
+ const content_is_obj = (Array.isArray(contents) || typeof contents === 'object')
|
|
|
+ if (content_is_obj) {
|
|
|
+ contents = JSON.stringify(contents, null, 4)
|
|
|
+ await fs.promises.writeFile(path, contents, options as any)
|
|
|
+ if (block_until_created) await blockUntilExists(path, {min_bytes: Buffer.byteLength(contents)})
|
|
|
+ return path
|
|
|
+ }
|
|
|
+ throw `Cannot write contents of type ${typeof contents} to file: ${path} < ${contents}`
|
|
|
+}
|
|
|
+
|
|
|
+
|
|
|
// Spawn `bin` with `args` inside `cwd`, teeing stdout/stderr into ./stdout.log and
// ./stderr.log, and recording a replayable ./cmd.sh plus ./index.json (metadata +
// output-file listing) in cwd. Returns immediately (process still running) with the
// exec_info record and a getResult() helper that resolves once the child closes.
// NOTE(review): the `timeout` parameter (default 60_000) is accepted but never used —
// TIMEOUT is hard-coded to 300_000 below. Confirm which value is intended before changing.
async function saveExecResult(bin, args=null, {original_url, version}, {cwd='.', timeout=60_000, ...spawn_options}={}) {
    assert(bin)
    assert(original_url && original_url.includes('://'))
    assert(version)

    const BIN_NAME = bin // 'yt-dlp'
    const ARGS = args || [] // ['--some-arg', '--some-other-arg']
    const CWD = cwd || process.cwd() // '.'
    const TIMEOUT = 300_000 // 5min timeout
    const PATH = process.env.PATH

    // NOTE(review): this uses raw `cwd` while spawn below uses the CWD fallback — only
    // differs if cwd is passed as '' / null; verify that is never the case for callers.
    await fs.promises.mkdir(cwd, {recursive: true})

    // quick-n-dirty dump of cmd to bash script, but this might be better: https://github.com/nodejs/node/issues/34840#issuecomment-677402567
    const cmd_log_str = `#!/usr/bin/env bash
TYPE="${BIN_NAME}"
URL="${original_url}"
VERSION="${version}"

TIMEOUT=${TIMEOUT}
CWD="${CWD}"
PATH="${PATH}:$PATH"

${BIN_NAME} ${ARGS.map(arg => JSON.stringify(arg)).join(' ')}
`
    const cmd_log = path.join(cwd, 'cmd.sh')
    await overwriteFile(cmd_log, cmd_log_str)

    // persistent log files the child's output gets piped into
    const stdout_log = fs.createWriteStream(path.join(cwd, 'stdout.log'))
    const stderr_log = fs.createWriteStream(path.join(cwd, 'stderr.log'))

    const start_date = new Date()
    const start_ts = Number(start_date)
    const start_time = start_date.toISOString()

    const child = child_process.spawn(
        BIN_NAME,
        ARGS,
        {
            cwd: CWD,
            timeout: TIMEOUT, // 5min timeout
            stdio: [null, 'pipe', 'pipe'], // </dev/null >./stdout.log 2>./stderr.log
            // detached: true, // run in background, don't block on response
            ...(spawn_options || {}),
        },
    )
    child.stdout.setEncoding('utf8')
    child.stdout.pipe(stdout_log)
    child.stderr.setEncoding('utf8')
    child.stderr.pipe(stderr_log)

    // JSON-serializable record of this run; the end_* fields stay null until the
    // 'close' handler below fills them in
    const exec_info = {
        TYPE: BIN_NAME,
        URL: original_url,
        VERSION: version,
        bin_name: BIN_NAME,
        args: ARGS,
        timeout: TIMEOUT,
        hostname: os.hostname(),
        bin_paths: PATH,
        ppid: process.pid,
        pid: child.pid,
        start_ts,
        start_time,
        end_time: null,
        end_ts: null,
        duration: null,
        returncode: null,
        log_files: {},
        output_files: {},
    }

    // promise that resolves when the command is finished executing
    // (polls exec_info.end_time every 100ms rather than listening for 'close' directly)
    // TODO: refactor to use withTimeout
    const getResult = (timeout=TIMEOUT) =>
        new Promise((resolve, reject) => {
            const loop = setInterval(() => {
                if (exec_info.end_time) {
                    clearInterval(loop)
                    clearTimeout(timer)
                    resolve(exec_info)
                }
            }, 100)

            const timer = setTimeout(() => {
                clearInterval(loop)
                if (!exec_info.end_time) {
                    reject(new Error(`Process ${BIN_NAME} did not finish within TIMEOUT=${TIMEOUT}`))
                }
            }, timeout);
        })

    // cmd.sh/stdout.log/stderr.log count as "log files"; anything else the child
    // produced in CWD counts as "output files"
    const logFilesFilter = ({relpath}) =>
        ['cmd.sh', 'stdout.log', 'stderr.log'].includes(relpath)

    const outputFilesFilter = ({relpath}) =>
        !['cmd.sh', 'stdout.log', 'stderr.log', 'index.json'].includes(relpath)

    const getOutputFiles = async (filter=outputFilesFilter) => {
        return await getDirInfo(CWD, {filter, withHelpers: false, withRoot: false, maxdepth: 6})
    }

    child.on('close', async (returncode) => {
        const end_date = new Date()
        exec_info.returncode = returncode
        exec_info.pid = child.pid
        exec_info.end_ts = Number(end_date)
        exec_info.end_time = end_date.toISOString()
        exec_info.duration = exec_info.end_ts - exec_info.start_ts
        exec_info.log_files = await getOutputFiles(logFilesFilter)
        exec_info.output_files = await getOutputFiles(outputFilesFilter)

        // append run-result metadata as bash comments to the end of cmd.sh
        const end_metadata = `
# END_TIME="${exec_info.end_time}"
# DURATION=${exec_info.duration}
# RETURNCODE=${exec_info.returncode }
`
        await fs.promises.appendFile(cmd_log, end_metadata)

        // write exec_info json (which includes file list) to CWD/index.json
        await overwriteFile(path.join(CWD, 'index.json'), exec_info)
    })
    // child.unref() // dont wait for child process to close

    const start_metadata = `
#################### LAST RUN LOG ####################
# HOSTNAME="${exec_info.hostname}"
# PPID=${exec_info.ppid}
# PID=${exec_info.pid}
# START_TIME="${exec_info.start_time}"
`
    await fs.promises.appendFile(cmd_log, start_metadata)

    return {
        ...exec_info,
        getResult,
    }
}
|
|
|
+
|
|
|
+const HASH_CACHE = {}
|
|
|
+
|
|
|
+async function sha256File(file_path: string, {pwd=null}: {pwd?: string}={}) {
|
|
|
+ return new Promise((resolve, reject) => {
|
|
|
+ pwd = pwd || path.dirname(file_path);
|
|
|
+ if (!file_path.startsWith(pwd)) {
|
|
|
+ file_path = path.join(pwd, file_path);
|
|
|
+ }
|
|
|
+
|
|
|
+ const dirent = fs.statSync(file_path);
|
|
|
+ const abspath = fs.realpathSync(file_path);
|
|
|
+ const cache_key = `${abspath}:${dirent.size}:${dirent.mtimeMs}`; // PATH:SIZE:LAST_MODIFIED_TIME
|
|
|
+ if (cache_key in HASH_CACHE) {
|
|
|
+ resolve(HASH_CACHE[cache_key]);
|
|
|
+ }
|
|
|
+
|
|
|
+ const hash = crypto.createHash('sha256');
|
|
|
+ const rs = fs.createReadStream(abspath);
|
|
|
+ rs.on('error', reject);
|
|
|
+ rs.on('data', chunk => hash.update(chunk));
|
|
|
+ rs.on('end', () => {
|
|
|
+ const final_hash = hash.digest('hex');
|
|
|
+ HASH_CACHE[cache_key] = final_hash;
|
|
|
+ resolve(final_hash);
|
|
|
+ });
|
|
|
+ }) as Promise<string>
|
|
|
+}
|
|
|
+
|
|
|
// Compute a merkle-style sha256 listing for a directory tree, returning
// {relpath: sha256hex, ...} plus (when withRoot) a combined root hash under '.'.
async function getDirSha256(dir_path, {pwd=null, withRoot=true, filter=null, maxdepth=-1, subfiles=null}={}) {
    // console.log('CALCULATING SHA256 OF FILES IN DIR', dir_path, {withRoot, filter, maxdepth})
    // dir_path: path absolute or relative path of the directory you want the merkle sha256 for
    // pwd: path (optional) absolute path of the directory you want to interpret dir_path relative to
    // withRoot: bool include a summary entry for the root dir_path dir in the list as '.'
    // filter: function (optional) provide a filter func for dir entries ({abspath, relpath, basename, dirent}) => true/false
    // maxdepth: number (optional) does not affect actual calculations, but hides entries below a certain depth in the returned output for brevity
    // subfiles: dirent[] (optional) instead of reading disk, you can manually provide a getDirEntries results list to use

    pwd = pwd || dir_path
    if (!dir_path.startsWith(pwd)) {
        dir_path = path.join(pwd, dir_path)
    }

    const dirent = await fs.promises.stat(dir_path)
    assert(dirent.isDirectory(), `Tried to compute sha256 of path but missing or not a directory! ${dir_path}`)
    assert((maxdepth >= -1), `maxdepth must be -1, 0, or 1, 2, 3, etc... (got ${maxdepth})`)

    // assert(!(filter && withRoot), `Cannot generate root hash (consistently) when a custom filter is provided!`)

    // get the sha256 of every file in a directory recursively (excluding hidden files and symlinks)
    // EQUIVALENT TO: find . -type f -not -path '*/.*' -print0 | sort -z | xargs -0 sha256sum
    const all_subfiles = (subfiles as string[]) || await getDirEntries(dir_path, {
        pwd,
        recursive: true,
        includeFiles: true,
        includeDirs: false,

        // ~~maxdepth,~~ // dont pass maxdepth here, we need the entire file listing to accurately calculate parent dir hashes.
        //               it never makes sense to ignore subfiles beyond a certain depth for hash calculation. Hashes are
        //               only useful IDs if they are consistent+repeatable, hashing to an arbitrary depth will produce
        //               many different hashes for the same directory, which is not something we need/want polluting the hash space.

        filter, // we do however allow passing a manual filter funcs which does actually affect the hash
                // this is useful to allow quick checks to see whether a certain subset of files has changed or not
    })
    const hashes: {[key: string]: string} = {}
    let hashable_summary_str = ''
    for (const subfile of all_subfiles) {
        // {'versions/20240413144307/screen recording.mp4': '1df4d9c3aca8b36f1f73e327d56038f80a35db407a298edb16c72576d7dd894e', ...}
        hashes[subfile] = await sha256File(subfile, {pwd})
        // build the `sha256sum`-style summary line using the symlink-resolved path relative to dir_path
        const relpath = path.relative(await fs.promises.realpath(dir_path), await fs.promises.realpath(path.join(pwd, subfile)))
        hashable_summary_str += `${hashes[subfile]} ./${relpath}\n`
    }
    // console.log('CALCULATED HASHES FOR ALL SUBFILES IN DIR', dir_path, Object.keys(hashes).length)

    // get list of subdirectories and recursively hash every subdirectory
    // EQUIVALENT TO: find . -type d -not -path '*/.*' -maxdepth ${maxdepth} -print | sort
    const subdirs = await getDirEntries(dir_path, {pwd, recursive: true, includeHidden: false, includeDirs: true, includeFiles: false, filter, maxdepth})

    // for each subdirectory, get its hash recursively and store it in the hash list
    for (const subdir of subdirs) {
        // console.log('GETTING SUBDIR HASH', subdir)
        // a directory's hash is defined as the hash of all the *files* within (excluding dirs/symlinks/hidden)
        const subdir_hashes = await getDirSha256(
            subdir,
            {pwd, withRoot: true, filter, maxdepth: 0},
        )
        hashes[subdir] = subdir_hashes['.']
    }
    // console.log('CALCULATED HASHES FOR ALL SUBDIRS IN DIR', dir_path, subdirs.length)

    // filter results if maxdepth is provided
    // (display-only trimming: deeper entries were still included in the hash calculations above)
    if (maxdepth >= 0) {
        for (const subpath of Object.keys(hashes)) {
            if (pathDepth(subpath) > maxdepth) {
                delete hashes[subpath]
            }
        }
    }
    // console.log('LIMITED OUTPUT DUE TO MAXDEPTH', maxdepth, Object.keys(hashes).length)

    // calculate the hash of the root '.' folder by hashing all of hashes of its contents
    // EQUIVALENT TO: find . -type f -not -path '*/.*' -print0 | sort -z | xargs -0 sha256sum | sha256sum
    if (withRoot) {
        // pass the first command's output containing the file list + hashes into another sha256
        // to get the final hash of the whole directory combined
        // console.log('CALCULATING FINAL ROOT HASH for ', dir_path)
        // console.log(hashable_summary_str)
        hashes['.'] = crypto.createHash('sha256').update(hashable_summary_str).digest('hex') as string
        // console.log('--->', hashes['.'])
    }

    return hashes
}
|
|
|
+
|
|
|
+
|
|
|
async function getDirInfo(dir_path, {pwd=null, withRoot=true, withHelpers=true, filter=null, maxdepth=-1, subfiles=null}={}) {
    // get a detailed JSON/dumpable index of a directory's contents, w/ merkle sha256's, sizes, and mimeTypes
    // dir_path: path absolute or relative path of the directory you want size info for
    // pwd: path (optional) absolute path of the directory you want to interpret dir_path relative to
    // withRoot: bool include a summary entry for the root dir_path dir in the list as '.'
    // withHelpers: bool attach many extra helper attrs/funcs to results (beyond JSON-serializable core data)
    // filter: function (optional) provide a filter func for dir entries ({abspath, relpath, basename, dirent}) => true/false
    // maxdepth: number (optional) does not affect actual calculations, but hides entries below a certain depth in the returned output for brevity
    // subfiles: dirent[] (optional) instead of reading disk, you can manually provide a getDirEntries results list to use

    // Example return value:
    // {
    //     ...
    //     'example.txt': { ... },
    //     'foobar/example.mp3': { ... },
    //     '.': {   // this is the fully agumented result when withHelpers=true
    //         is_file: false,
    //         is_dir: true,
    //         filename: '.',
    //         basename: '1709039915.378868',
    //         mimeType: 'inode/directory'
    //         extension: undefined,
    //         num_bytes: 11540961,
    //         num_subpaths: 15,
    //         sha256: '9fc58b3ed887e7139338062ebd49bd6795373759e8acb73d2f7a40f1413789da',
    //         reldepth: 1,
    //         relpath: './',
    //         cwd: '/opt/archivebox/data/archive/1709039915.378868/',
    //         dirname: '/opt/archivebox/data/archive',
    //         abspath: '/opt/archivebox/data/archive/1709039915.378868',
    //         dirent: Stats {
    //             dev: 16777240,
    //             mode: 16895,
    //             uid: 501,
    //             ...
    //             mtimeMs: 1717160622956.1357,
    //             ctimeMs: 1717160622956.1357,
    //         },
    //         created: '2024-05-31T13:03:42.956Z',
    //         modified: '2024-05-31T13:03:42.956Z',
    //         summary: './data/archive/1709039915.378868 (inode/directory 11541kb 9fc58b3e)',
    //         helptext: 'Verify these hashes by running:\n' +
    //             '    cd /opt/archivebox/data/archive/1709039915.378868 \n' +
    //             "    find . -type f -not -path '*/.*' -print0 | sort -z | xargs -0 sha256sum | sha256sum",
    //     },
    // }

    pwd = pwd || dir_path
    if (!dir_path.startsWith(pwd)) {
        dir_path = path.join(pwd, dir_path)
    }

    // calculate hashes and sizes recursively
    const hashes = await getDirSha256(dir_path, {pwd, withRoot, filter, maxdepth, subfiles})
    const sizes = await getDirSizes(dir_path, {pwd, withRoot, filter, maxdepth, subfiles})

    // count of all entries excluding the '.' root summary itself
    const num_total_subpaths = Object.keys(hashes).filter(name => name !== '.').length

    const details = {}
    for (const [filename, sha256] of Object.entries(hashes)) {
        if (filename === '.' && !withRoot) continue

        const abspath = await fs.promises.realpath(path.join(dir_path, filename))
        const dirent = await fs.promises.stat(abspath)
        // nested-children count for this entry (relpath string-prefix match)
        const num_subpaths = Object.keys(hashes).filter(subpath => subpath.startsWith(filename + '/')).length
        const is_file = dirent.isFile()
        const is_dir = dirent.isDirectory()

        // bare-bones info suitable for JSON dumps/exports
        const basic_info = {
            sha256,
            num_bytes: sizes[filename],
            created: (new Date(dirent.ctimeMs)).toISOString(),
            mimeType: undefined,
            extension: undefined,
            num_subpaths: undefined,
        }
        if (is_dir) {
            basic_info.mimeType = 'inode/directory'
            basic_info.extension = undefined
            basic_info.num_subpaths = (filename === '.') ? num_total_subpaths : num_subpaths
        }
        if (is_file) {
            basic_info.mimeType = mime.lookup(abspath) || null
            basic_info.extension = path.extname(filename)
            basic_info.num_subpaths = undefined
        }

        // extra helpers suitable for usage in other areas of the codebase
        const info_with_helpers = {
            ...basic_info,
            filename,
            basename: path.basename(abspath),
            dirname: path.dirname(abspath),
            cwd: dir_path,
            relpath: is_dir ? (filename + '/') : filename,
            reldepth: pathDepth(filename),
            abspath,
            is_file,
            is_dir,
            dirent,
            modified: (new Date(dirent.mtimeMs)).toISOString(),
            summary: `${prettyPath(abspath)} (${basic_info.mimeType} ${Math.round(basic_info.num_bytes/1000)}kb ${sha256.substring(0, 8)})`,
            helptext: undefined,
        }
        if (filename === '.') {
            // only the root entry carries the shell one-liner for independently verifying the hashes
            info_with_helpers.helptext = `Verify these hashes by running:\n    cd ${prettyPath(abspath)} \n    find . -type f -not -path '*/.*' -print0 | sort -z | xargs -0 sha256sum | sha256sum`
        }

        // a custom filter gets a final chance to exclude entries from the output
        if ((typeof filter) === 'function') {
            if (!filter(info_with_helpers)) continue
        }

        details[filename] = withHelpers ? info_with_helpers : basic_info
    }
    return details
}
|
|
|
+
|
|
|
+// console.log(await getDirSha256(
|
|
|
+// '/opt/archivebox/data/archive/1709039915.378868/',
|
|
|
+// {
|
|
|
+// withRoot: true,
|
|
|
+// maxdepth: -1,
|
|
|
+// filter: ({relpath}) => relpath.startsWith('versions'),
|
|
|
+// },
|
|
|
+// ))
|
|
|
+// console.log(await getDirSizes(
|
|
|
+// '/opt/archivebox/data/archive/1709039915.378868/',
|
|
|
+// {
|
|
|
+// withRoot: false,
|
|
|
+// maxdepth: 2,
|
|
|
+// filter: ({relpath}) => !relpath.startsWith('versions'),
|
|
|
+// },
|
|
|
+// ))
|
|
|
+// console.log(await getDirInfo(
|
|
|
+// '/opt/archivebox/data/archive/1709039915.378868/',
|
|
|
+// {
|
|
|
+// withRoot: true,
|
|
|
+// withHelpers: true,
|
|
|
+// maxdepth: 1,
|
|
|
+// // filter: ({relpath}) => relpath.startsWith('versions'),
|
|
|
+// },
|
|
|
+// ))
|
|
|
+
|
|
|
// Options accepted by detectFilename() — all optional; any provided value is treated
// as an override that takes precedence over the corresponding auto-detected value.
type DetectFilenameOptions = {
    url?: string,
    response?: HTTPResponse | Response,   // puppeteer HTTPResponse, or a fetch Response (adapted internally)
    page?: Page,                          // used as a fallback source for the url when no response is given
    dir?: string,                         // destination directory (defaults to process.cwd())
    abspath?: string,                     // mutually exclusive with dir/filename/basename/extension
    filename?: string,                    // full filename incl. extension, e.g. 'favicon.ico'
    basename?: string,                    // filename without extension
    extension?: string,                   // extension without the leading dot
    mimeType?: string,
    resourceType?: string,                // puppeteer request resourceType (see list in detectFilename)
}
|
|
|
+
|
|
|
async function detectFilename({ url, response, page, dir, abspath, filename, basename, extension, mimeType, resourceType }: DetectFilenameOptions) {
    // this function takes a url (and/or response/page), and detects the abspath,dir,filename,basename,extention,mimeType
    // from the URL (+ any enforced path components passed in via args)
    // example: detectFilename({url: 'https://example.com/favicon.png', extension: 'ico'}) outputs 'favicon.ico'
    //
    // it has some quirks that are specific to archiving and may not behave as you expect
    // e.g. if visiting the url https://example.com/error.zip returns a 500 text/html error page
    // this may still save it as a .zip with mimeType=application/x-zip and ignore the response mimeType the url ends in .zip
    // however, if the url has no extension, e.g. https://example.com/error it will
    // auto-detect the mimeType based on the response and append an extension, saving as error.html
    //
    // ⚠️ SECURITY WARNING: think carefully about the permissions, shell injection, and RCE implications of any changes made here ⚠️
    // this function writes untrusted web content to the filesystem using auto-detected mimetype to co-erce the extension,
    // which can be dangerous (e.g. what if one of these downloads is a malicious ransomware .exe, do we really want to give it .exe?
    // if we do, how do we make sure it never gets executed? (without damaging the integrity of the copy)

    if (!(response || page)) throw 'Either a page or a response must be provided in order to detect mimeType & URL'

    // adapt a fetch/node-fetch Response (plain properties) into the puppeteer HTTPResponse shape (methods)
    if (response && (typeof response.headers !== 'function')) {
        const node_fetch_response: Response = response as Response
        response = {
            url: () => node_fetch_response.url,
            headers: () => node_fetch_response.headers,
        } as unknown as HTTPResponse
    }
    response = response as HTTPResponse

    url = url || response?.url() || (await page.url())
    if (!url) throw 'URL was not provided and could not be detected from {response, page}'

    // Document, Stylesheet, Image, Media, Font, Script, TextTrack, XHR, Fetch, Prefetch, EventSource, WebSocket, Manifest, SignedExchange, Ping, CSPViolationReport, Preflight, Other
    try {
        resourceType = resourceType || response?.request()?.resourceType()
    } catch(err) {
        // ignore, sometimes response is null/not available
    }
    // NOTE(review): puppeteer's request.resourceType() documents lowercase return values
    // ('document', 'stylesheet', ...); these Capitalized keys (and 'Website', which is not a
    // puppeteer resourceType) may never match unless callers pass resourceType manually — confirm.
    const resourceTypeToMimeType = {
        'Stylesheet': 'text/css',
        'Script': 'application/x-javascript',
        'WebSocket': 'application/json',
        'Website': 'text/html',
    }

    mimeType = mimeType || resourceTypeToMimeType[resourceType] // guess extension based on request resourceType
    extension = extension || (mimeType ? mime.extension(mimeType) : null)

    // handle special url cases (e.g. schemes in URL_SCHEMES_IGNORED)
    if (url.startsWith('about:blank')) {
        filename = 'about_blank'
        mimeType = 'text/html'
    }
    else if (url.startsWith('data:')) {
        // data: URIs have no path; derive a stable filename from a hash of the whole URI
        filename = `data__${hashCode(url)}`
    }

    // console.log('detectFilename>', {url, dir, abspath, filename, basename, extension, mimeType, resourceType})

    if (abspath) {
        if (dir || filename || basename || extension)
            throw '{abspath} should not be passed with other options (e.g. dir, filename, basename, extension)'
        // `var` (not const) on purpose: this destructuring assigns into the outer parameter bindings
        var {dir, base: filename, ext: extension, name: basename} = path.parse(abspath)
        // path.parse('/home/user/dir/file.txt') returns:
        // { root: '/',
        //   dir: '/home/user/dir',
        //   base: 'file.txt',
        //   ext: '.txt',
        //   name: 'file' }
    } else {
        dir = dir || path.resolve(process.cwd())

        filename = filename                              // https://example.com/a.1.zip?e.pdf=2#g.h=3 => a.1.zip
            || (new URL(url)).pathname.split('/').at(-1) // https://example.com/file124.rss => file124.rss prefers last component of path with no query/hash, falls back to domain name if no path
            || 'index'                                   // https://example.com/abc/def/ => index.html
            //|| (new URL(url)).hostname.replaceAll('.', '_') // https://example.com => example_com (but if disabled, this would be index.html)
    }
    if (!filename) throw 'filename/abspath were not passed and could not be detected from url'

    const path_extname = path.extname(filename)
    // best-available response mimeType: explicit .mimeType property, Content-Type header,
    // resourceType-based guess, then the generic binary fallback
    const resp_mimetype = response && (
        (response as any).mimeType
        || response.headers()['content-type']?.split(';')[0]
        || resourceTypeToMimeType[resourceType]
        || 'application/octet-stream'
    )

    mimeType = mimeType                                  // https://example.com/a.1.zip?e.pdf=2#g.h=3 => application/x-zip prefers mimetype based on extension in path, falls back to response mimeType
        || (path_extname && mime.lookup(path_extname))   // https://example.com/file124.rss => application/rss+xml
        || resp_mimetype                                 // https://example.com/get?type=png => image/png

    extension = extension
        || (path_extname && path_extname.replace('.', ''))   // https://example.com/a.1.zip?e.pdf=2#g.h=3 => zip prefers extension in path, falls back to response mimeType's suggested extension
        || (resp_mimetype && mime.extension(resp_mimetype))  // https://example.com => html
        || ''                                                // https://example.com/websocket.1 =>
    if (extension.startsWith('.'))
        extension = extension.slice(1)

    basename = basename                 // https://example.com/a.1.zip?e.pdf=2#g.h=3 => a.1 prefers to filename in path (without extension), falls back to domain name
        || (path.parse(filename).name)  // https://mp4dl.example.com => mp4dl_example_com

    basename = basename.slice(0, 120)                              // truncate at 120 characters (leaving 8 chars for .ext)
    basename = basename.replace(/[^a-zA-Z0-9%+?&=@;_ \.-]/g, '')   // strip characters not allowed in filenames

    filename = basename + '.' + extension

    // avoid a dangling dot when no extension could be determined
    if (filename.endsWith('.'))
        filename = filename.slice(0, -1)

    abspath = abspath || path.join(dir, filename)

    // console.log('detectFilename<', {url, dir, abspath, filename, basename, extension, mimeType, resourceType})

    return {
        url,
        dir,
        abspath,
        filename,
        basename,
        extension,
        mimeType,
        resourceType,
        resp_mimetype,
    }
}
|
|
|
+
|
|
|
// Options accepted by download(); extends DetectFilenameOptions with download-specific knobs.
// NOTE(review): "Dowload" is a typo for "Download" — kept as-is because renaming would
// require touching every usage site; fix in a coordinated rename.
interface DowloadOptions extends DetectFilenameOptions {
    browser?: Browser            // browser used to open a temporary tab when no {response}/{page} can supply one
    expected_mimetype?: string   // content-type prefix the response must match ('' accepts anything); mismatch marks the download failed
    timeout?: number             // navigation timeout in ms when fetching (download() defaults this to 120_000)
}
|
|
|
+
|
|
|
+async function download({ url, browser, page, response, dir, abspath, filename, basename, extension, expected_mimetype, timeout }: DowloadOptions) {
|
|
|
+ url = url || (response as HTTPResponse)?.url() || (await page?.url())
|
|
|
+ ALREADY_ARCHIVED.add(url.slice(0, 4096)) // prevent running whole archive task on tabs we create for just for downloading
|
|
|
+
|
|
|
+ browser = browser || (page && (await page.browser()))
|
|
|
+ timeout = timeout || 120_000
|
|
|
+ expected_mimetype = expected_mimetype || ''
|
|
|
+ let newPage = null
|
|
|
+ let errors = []
|
|
|
+ let num_bytes = 0
|
|
|
+ let bytesBuffer = null
|
|
|
+
|
|
|
+
|
|
|
+ // if we need to fetch the url (i.e. it's not already been requested)
|
|
|
+ if (!response) {
|
|
|
+ if (!browser) throw 'No {browser} or {page} was provided to download with'
|
|
|
+ newPage = await browser.newPage()
|
|
|
+ if (page) await page.bringToFront() // if origin page is provided, make sure it stays in foreground
|
|
|
+ response = await newPage.goto(url, {timeout: timeout, waitUntil: 'networkidle0'})
|
|
|
+ if (page) await page.bringToFront() // if origin page is provided, make sure it stays in foreground
|
|
|
+ }
|
|
|
+ url = url || (response as HTTPResponse)?.url() || (await newPage?.url()) || (await page?.url());
|
|
|
+ const response_mimetype = (response as HTTPResponse).headers()['content-type']?.split(';')[0] || 'text/html'
|
|
|
+
|
|
|
+ // detect the filename we should write to based on provided url/response/page/filename/extension suggestions
|
|
|
+ var {
|
|
|
+ dir,
|
|
|
+ abspath,
|
|
|
+ filename,
|
|
|
+ basename,
|
|
|
+ extension,
|
|
|
+ mimeType,
|
|
|
+ } = await detectFilename({url, page, response, dir, abspath, filename, basename, extension, mimeType})
|
|
|
+
|
|
|
+ // if mimeType is passed, make sure response matches expected mimetype, otherwise consider download a failure
|
|
|
+ if (!response_mimetype.startsWith(expected_mimetype)) {
|
|
|
+ errors.push(`Expected ${expected_mimetype} but got ${response_mimetype}`)
|
|
|
+ } else {
|
|
|
+
|
|
|
+ // download the file using puppeteer's response.buffer()
|
|
|
+ try {
|
|
|
+ // write the response bytes into the output file
|
|
|
+ bytesBuffer = await (response as HTTPResponse).buffer()
|
|
|
+ await overwriteFile(abspath, bytesBuffer)
|
|
|
+ num_bytes = bytesBuffer.length
|
|
|
+ } catch(err) {
|
|
|
+ errors.push(err)
|
|
|
+ }
|
|
|
+
|
|
|
+ // security check to make sure downloaded file is not executable (random binaries downloaded off the internet = dangerous)
|
|
|
+ fs.access(abspath, fs.constants.X_OK, (err) => {
|
|
|
+ if (!err) console.warn(
|
|
|
+ '[⚠️] SECURITY WARNING: Downloaded file appears to be executable:', prettyPath(abspath),
|
|
|
+ '\n (be careful running untrusted programs downloaded from the internet!)'
|
|
|
+ )
|
|
|
+ })
|
|
|
+ }
|
|
|
+
|
|
|
+ // if we opened a dedicated page for downloading, close it now
|
|
|
+ if (newPage) {
|
|
|
+ newPage.close()
|
|
|
+ }
|
|
|
+
|
|
|
+ if (errors.length) {
|
|
|
+ // console.warn(`[❌] Downloading ${url} (${mimeType}) to ${abspath} failed:`, JSON.stringify(errors, null, 4))
|
|
|
+ } else {
|
|
|
+ console.log(`[💾] Downloaded ${url.substring(0, 40)} (${num_bytes} ${mimeType})...`.padEnd(82), prettyPath(abspath))
|
|
|
+ }
|
|
|
+
|
|
|
+ return {
|
|
|
+ url, response, errors,
|
|
|
+ dir, abspath, filename, basename, extension, mimeType,
|
|
|
+ bytesBuffer, num_bytes,
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+
|
|
|
+/************************** Puppeteer Launching *******************************/
|
|
|
+
|
|
|
+
|
|
|
+async function startCluster(puppeteer, args=CHROME_ARGS_DEFAULT) {
|
|
|
+ console.log(`[🎭] Launching ${CHROME_CLUSTER_WORKERS}x Chromium browsers with puppeteer-cluster:`.padEnd(82), prettyPath(CHROME_PROFILE_PATH))
|
|
|
+ const cluster = await Cluster.launch({
|
|
|
+ puppeteer,
|
|
|
+ monitor: true,
|
|
|
+ maxConcurrency: CHROME_CLUSTER_WORKERS,
|
|
|
+ sameDomainDelay: 2550,
|
|
|
+ workerCreationDelay: 250,
|
|
|
+ timeout: 300_000, // total ms timeout for an entire task (1000ms * 60s * 5m)
|
|
|
+ concurrency: Cluster.CONCURRENCY_PAGE, // share cookies between all tabs in a given browser
|
|
|
+ puppeteerOptions: {
|
|
|
+ args, // all the chrome launch CLI args
|
|
|
+ ignoreDefaultArgs: true, // trust me, we have enough args already...
|
|
|
+ // dumpio: true, // full debug log output, super noisy
|
|
|
+ }
|
|
|
+ })
|
|
|
+ console.log('*************************************************************************')
|
|
|
+ return cluster
|
|
|
+}
|
|
|
+
|
|
|
+async function remoteBrowser(puppeteer, {browserURL, browserWSEndpoint}) {
|
|
|
+ console.log('[🎭] Connecting Puppeteer to existing Chromium browser via:', browserURL || browserWSEndpoint)
|
|
|
+ let completed_initial_connection = false
|
|
|
+ const browser = await puppeteer.connect({browserURL, browserWSEndpoint, defaultViewport: null, targetFilter: () => completed_initial_connection})
|
|
|
+ completed_initial_connection = true
|
|
|
+ console.log('*************************************************************************')
|
|
|
+ return browser
|
|
|
+}
|
|
|
+
|
|
|
// Launch a single Chromium instance via puppeteer, wire up debugging helpers, prime
// the extension cache, schedule cleanup of the initial startup tabs, and park a
// placeholder tab so the window stays open between archiving tasks.
// Returns the connected puppeteer Browser.
async function startBrowser(puppeteer, args=CHROME_ARGS_DEFAULT) {
    console.log('[🎭] Launching Puppeteer Chromium browser...'.padEnd(82+1), prettyPath(CHROME_PROFILE_PATH))

    const browser = await puppeteer.launch({ignoreDefaultArgs: true, args, dumpio: true})
    globalThis.browser = browser  // exposed globally for interactive debugging sessions
    console.log('*************************************************************************')

    // store all active tabs on global var by url for easier vscode interactive debugging
    const storeTabForDebugger = async (target) => {
        try {
            globalThis.tabs = globalThis.tabs || {}
            const url = target.url()
            const page = await target.page()
            if (!page || page?.isClosed()) {
                // target has no page (e.g. service worker) or the tab is gone: drop the stale entry
                delete globalThis.tabs[url]
            } else {
                globalThis.tab = page       // most-recently-seen tab
                globalThis.tabs[url] = page // all open tabs keyed by url
            }
        } catch(err) {console.warn(err)}
    }
    // keep the debug map fresh across the full tab lifecycle
    browser.on('targetcreated', storeTabForDebugger)
    browser.on('targetchanged', storeTabForDebugger)
    browser.on('targetdestroyed', storeTabForDebugger)

    // wait for initial extension background.js/service worker targets to load
    await wait(3_000)

    // prime the extensions cache
    const extensions = await getChromeExtensionsFromCache({browser})
    globalThis.extensions = extensions // for easier debugging only

    // give the user 2min to check any issues with the initial startup pages (bot profile pages),
    // solve captchas, re-login, etc. then close them after that to save resources
    const startup_pages = (await browser.pages())
    const startup_page_close_delay = 120_000
    setTimeout(async () => {
        for (const page of startup_pages) {
            try { await page.close() } catch(err) { /* page may already be closed by now, which is fine */ }
        }

    }, startup_page_close_delay)

    // setup any extensions that need final runtime configuration using their options pages
    // await setup2CaptchaExtension({browser, extensions})

    // open a placeholder page so browser window stays open when there are no active archiving pages
    // (it's annoying to have the entire window open/close/open/close/etc every time an archive task runs)
    const empty_page = await browser.newPage()
    await wait(250)
    await empty_page.goto('chrome://version')
    await wait(500)
    console.log('*************************************************************************')

    return browser
}
|
|
|
+
|
|
|
+async function startAPIServer(port=API_SERVER_PORT, host=API_SERVER_HOST, taskCallback=null) {
|
|
|
+ // taskCallback should be an async function that takes ({url}) => and does something with it
|
|
|
+ assert(taskCallback && (typeof taskCallback === 'function'))
|
|
|
+
|
|
|
+ const server = createServer(async (req, res) => {
|
|
|
+ if (req.method === 'POST') {
|
|
|
+ console.log(`[API][POST] ${req.url}`)
|
|
|
+ let body = '';
|
|
|
+
|
|
|
+ req.on('data', (chunk) => {
|
|
|
+ body += chunk;
|
|
|
+ });
|
|
|
+
|
|
|
+ req.on('end', () => {
|
|
|
+ try {
|
|
|
+ const jsonData = JSON.parse(body);
|
|
|
+ // Process the JSON data
|
|
|
+ console.log(jsonData);
|
|
|
+
|
|
|
+ res.writeHead(200, { 'Content-Type': 'application/json' });
|
|
|
+ res.end(JSON.stringify({ message: 'JSON data received' }));
|
|
|
+ } catch (error) {
|
|
|
+ res.writeHead(400, { 'Content-Type': 'application/json' });
|
|
|
+ res.end(JSON.stringify({ error: 'Invalid JSON data' }));
|
|
|
+ }
|
|
|
+ });
|
|
|
+ } else if (req.method === 'GET') {
|
|
|
+ console.log(`[API][GET] ${req.url}`)
|
|
|
+ const parsedUrl = new URL(`http://${host}:${port}${req.url}`)
|
|
|
+ const query = new URLSearchParams(parsedUrl.search);
|
|
|
+ const url = query.get('url');
|
|
|
+ if (url && url.includes('://')) {
|
|
|
+ res.writeHead(200, { 'Content-Type': 'text/plain' });
|
|
|
+ try {
|
|
|
+ await taskCallback({url})
|
|
|
+ res.end(`${url}\n${TASK_PATH(url)}`);
|
|
|
+ } catch(err) {
|
|
|
+ res.end(`${url}\n${TASK_PATH(url)}\n${err}`);
|
|
|
+ }
|
|
|
+ } else {
|
|
|
+ res.writeHead(500, { 'Content-Type': 'text/plain' });
|
|
|
+ res.end(`Bad URL: ${url}\n\nExpected: /?url=https://example.com/url/to/archive`);
|
|
|
+ }
|
|
|
+ } else {
|
|
|
+ res.writeHead(405, { 'Content-Type': 'application/json' });
|
|
|
+ res.end(JSON.stringify({ error: 'Method not allowed' }));
|
|
|
+ }
|
|
|
+ })
|
|
|
+
|
|
|
+ server.listen(port, host, () => {
|
|
|
+ console.log(`[🎰] API Server listening for requests on http://${host}:${port}/?url=...`);
|
|
|
+ })
|
|
|
+ console.log('*************************************************************************')
|
|
|
+
|
|
|
+ return server
|
|
|
+}
|
|
|
+
|
|
|
// Top-level entrypoint: configure puppeteer-extra (stealth + preferences), launch
// Chromium either as a multi-worker cluster or a single browser, start the HTTP task
// API, then archive each URL in {urls} (skipping ones with existing results on disk).
// In single-browser mode this function never returns normally — it ends in an
// infinite wait loop (see NOTE below) and finally calls process.exit(0).
async function main(urls, cluster=CHROME_CLUSTER) {
    process.chdir(DATA_DIR)

    // assemble launch config: extensions -> chrome CLI args -> profile preferences
    const extensions = await getChromeExtensionsFromPersona({CHROME_EXTENSIONS, CHROME_EXTENSIONS_DIR})
    const args = getChromeArgs({...CHROME_LAUNCH_OPTIONS, CHROME_EXTENSIONS: extensions})
    const preferences = getChromePreferences({CHROME_PREFERENCES_DEFAULT, CHROME_PREFERENCES_EXTRA, CHROME_DOWNLOADS_DIR, CHROME_EXTENSIONS: extensions})
    const Puppeteer = applyChromePreferences(PupeteerExtra, CHROME_PREFERENCES_PATH, preferences)

    Puppeteer.use(StealthPlugin());
    // Puppeteer.use(ReplPlugin());
    // handled by uBlock Origin & ReCaptcha browser extensions, probably not needed here anymore:
    // Puppeteer.use(RecaptchaPlugin({
    //     provider: {id: '2captcha', token: API_KEY_2CAPTCHA},
    //     visualFeedback: true,
    // }))
    // const AdblockerPlugin = require('puppeteer-extra-plugin-adblocker')
    // puppeteer.use(AdblockerPlugin({ blockTrackers: true }))

    if (cluster) {
        // launch browser with multiple tabs w/ puppeteer
        // (this local `cluster` shadows the boolean parameter of the same name)
        const cluster = await startCluster(Puppeteer, args)

        const handleTask = async ({url}) => cluster.queue(url, botArchiveTask)
        const server = await startAPIServer(API_SERVER_PORT, API_SERVER_HOST, handleTask)

        console.log('[📋] Running tasks in parallel with puppeteer cluster...')
        for (const url of urls) {
            // skip URLs that already have a parseable aiqa.json result on disk
            if (fs.existsSync(path.join(TASK_PATH(url), 'aiqa.json'))) {
                try {
                    JSON.parse((await fs.promises.readFile(path.join(TASK_PATH(url), 'aiqa.json'))).toString())
                    console.log(' skipping (already present):', TASK_PATH(url), url)
                    continue
                } catch(err) {
                    // pass: unreadable/corrupt result file -> re-run the task
                }
            }
            cluster.queue(url, botArchiveTask)
            await wait(3_000)
        }

        await cluster.idle();
        await cluster.close();
    } else {
        // launch single new browser w/ puppeteer / connect to remote CDP browser w/ puppeteer
        const browser = await startBrowser(Puppeteer, args)
        // const browser = await remoteBrowser(Puppeteer, {browserURL, browserWSEndpoint})

        // run speedtest in the background
        // NOTE(review): intentionally not awaited (floating promise) — confirm it handles its own errors
        speedtest({browser})

        const handleTask = async ({url}) => await botArchiveTask({page: (await browser.newPage()), data: url})
        const server = await startAPIServer(API_SERVER_PORT, API_SERVER_HOST, handleTask)

        // wait for any pre-run setup tasks or server requests
        await wait(5_000)

        let num_succeeded = 0
        let num_failed = 0

        console.log(`[📋] Running ${urls.length} tasks sequentially with puppeteer browser...`)
        for (const url of urls) {
            // `|| 1` avoids division by zero in the failure-rate math on the first iteration
            const run_count = (num_succeeded + num_failed) || 1

            // check if task should be run or skipped based on existing snapshot data present in directory
            const metrics_path = path.join(TASK_PATH(url), 'metrics.json')
            const screenshot_path = path.join(TASK_PATH(url), 'screenrecording.gif')
            const aiqa_path = path.join(TASK_PATH(url), 'aiqa.json')
            const versions_path = path.join(TASK_PATH(url), 'versions')
            if (fs.existsSync(metrics_path) && fs.existsSync(screenshot_path) && fs.existsSync(aiqa_path) && fs.existsSync(versions_path)) {
                try {
                    const ai_qa_result = JSON.parse(await fs.promises.readFile(aiqa_path, 'utf-8'))
                    console.log(prettyPath(TASK_PATH(url)), `${ai_qa_result.pct_visible}%`, ai_qa_result.website_brand_name, url.substring(0, 80))
                    assert(ai_qa_result.website_brand_name)
                    continue
                } catch(err) {
                    // pass: incomplete/corrupt snapshot -> re-run the task
                }
            }
            let delay = 0

            // create a new browser page and run the archiving task
            const page = (await browser.newPage())
            try {
                console.log(ANSI.black + `◤==============================================================================[${String(run_count).padStart(3)}]/[${urls.length}]◥` + ANSI.reset)
                await botArchiveTask({page, data: url})
                delay = 1_000
                num_succeeded += 1
            } catch(err) {
                console.error('[❌] Archiving task failed!', url)
                console.error(err)
                num_failed += 1
                delay = 15_000 // extra delay if there are errors
            }
            console.log(ANSI.black + `◣==============================================================================[☑ ${num_succeeded}][🆇 ${num_failed}]◢` + ANSI.reset)

            // check for abnormally high failure rates and exit early if needed
            const failure_pct = Math.round((num_failed/run_count) * 100)
            if (failure_pct > 50) {
                if (run_count > 5) {
                    console.warn(`[⚠️] ${failure_pct}% Task failure rate is very high! Will self-cancel after 10 URLs if >50% continue to fail...`)
                }
                if (run_count > 10) {
                    throw `Too many tasks failed in a row! Quitting early after ${run_count}/${urls.length} tasks.`
                }
            }

            // increase the delay between tasks based on the ratio of how many are failing:succeeding
            delay = Math.pow(4, (num_failed/(num_succeeded + 3))) * delay
            // e.g. 0:1 failure ratio == 1 * delay == 1 ~ 15s
            // 1:1 failure ratio == 5 * delay == 5 ~ 1m ... 5^(failed:succeeded) exponential increase
            // 2:1 failure ratio == 25 * delay == 25s ~ 6m
            // 3:1 failure ratio == 125 * delay == 2m ~ 31m
            // etc...
            // up to 1hr+
            delay = Math.min(delay, 3_600_000) // 1hr maximum delay between tasks
            delay = Math.max(delay, 1_000) // 1s minimum delay between tasks
            if (delay > 2_500) {
                console.log('... waiting', Math.round(delay/1000), 'seconds (self rate-limit)...')
            }
            await wait(delay) // base ratelimit
            console.log()
        }


        if (PASSIVE_ARCHIVING) {
            // passive mode: archive whatever URLs a human opens in the driver browser
            // replace these as-needed:
            const browserURL = 'http://localhost:9222/'
            const browserWSEndpoint = 'ws://localhost:9222/devtools/browser'

            const driver_browser = browser || await remoteBrowser(Puppeteer, {browserURL, browserWSEndpoint})
            // NOTE(review): archiver_browser is a placeholder {} and unused — leftover from the disabled two-browser setup
            const archiver_browser = {} //await startBrowser(Puppeteer, args)

            // NOTE(review): this local `extensions` shadows the outer one and appears unused below — confirm
            const extensions = await getChromeExtensionsFromCache({browser: driver_browser})

            // close both browsers if either one is closed
            let browser_is_open = true
            driver_browser.on('disconnected', async () => {browser_is_open = false}) // await archiver_browser.close()
            // archiver_browser.on('disconnected', async () => {browser_is_open = false; await driver_browser.close()})

            // handle any tab navigation to a new URL in the driver browser
            const handleUserNavigation = async (target) => {
                const url = target.url()
                const page = await target.page()
                // const client = await target.createCDPSession()

                // only archive real page targets (skips service workers, devtools, etc.)
                if (target.type() == 'page' && page && url) {
                    console.log(ANSI.black + '==============================================================================' + ANSI.reset)
                    console.warn('[➕] DRIVER BROWSER NAVIGATED:', ANSI.blue, url, ANSI.reset)

                    try {
                        await passiveArchiveTask({browser: driver_browser, page, url})
                        await wait(3_000)
                    } catch(err) {
                        console.error('[❌] Archiving task failed!', url)
                        console.error(err)
                        await wait(10_000) // base ratelimit
                    }
                    console.log(ANSI.black + '==============================================================================' + ANSI.reset)
                    // await client.send('Page.enable')
                    // await client.send('Page.setWebLifecycleState', {state: 'active'})
                }
                // await client.send('Runtime.runIfWaitingForDebugger')
            }

            // setup handler to archive new page whenever one is opened
            driver_browser.on('targetcreated', handleUserNavigation)
            driver_browser.on('targetchanged', handleUserNavigation)

            console.log('------------------------------------------------------')
            console.log('[👀] Waiting for browser tabs to be opened by human...')
            // poll until the 'disconnected' handler above flips the flag
            while (browser_is_open) {
                await wait(2_000)
            }
        } else {
            // NOTE(review): this loop never exits, so the browser.close() and the final
            // log below are unreachable in this branch — confirm that keeping the
            // process alive for API-server requests is the intent.
            while (true) {
                await wait(2_000)
            }
        }

        await browser.close()
    }
    console.log('[✅] Finished all tasks and stopped browsers.')
    process.exit(0);
}
|
|
|
+
|
|
|
+
|
|
|
+/******************************************************************************/
|
|
|
// Entrypoint guard: only run main() when this file is executed directly.
// NOTE(review): `import.meta.main` is defined in Bun/Deno and only very recent Node
// versions — on older Node it is undefined and main() silently never runs. Confirm
// the target runtime, or compare import.meta.url against pathToFileURL(process.argv[1])
// for portability.
if (import.meta.main) {
    main(URLS).catch(console.error);
}
|
|
|
+
|
|
|
+/******************************************************************************/
|
|
|
+
|
|
|
+// if we want to handle CLI args in the future, minimist is great:
|
|
|
+// var argv = require('minimist')(process.argv.slice(2));
|
|
|
+// console.log(argv); // --url=https://example.com --binpath=/browsers/chromium-1047/bin/chromium --datadir=/Chromium
|
|
|
+// const {url, binpath, datadir} = argv;
|
|
|
+
|
|
|
+
|
|
|
+// OLD CODE, may be useful in the future if we need audio in screenrecordings:
|
|
|
+// async function setupScreenrecordingWithAudio(page, wss) {
|
|
|
+// console.log('[🎬] Setting up screen-recording plugin...');
|
|
|
+// const stream_port = (await wss).options.port;
|
|
|
+// // streamPage = await (page.browser()).newPage()
|
|
|
+// await page.goto(`chrome-extension://jjndjgheafjngoipoacpjgeicjeomjli/options.html#${stream_port}`)
|
|
|
+//
|
|
|
+// // puppeteer-stream recording start
|
|
|
+// streamFile = fs.createWriteStream(SCREENRECORDING_PATH(page))
|
|
|
+// stream = await getStream(page, {
|
|
|
+// audio: true,
|
|
|
+// video: true,
|
|
|
+// bitsPerSecond: 8000000, // 1080p video
|
|
|
+// });
|
|
|
+// stream.pipe(streamFile);
|
|
|
+// return {stream, streamFile}
|
|
|
+//
|
|
|
+// // puppeteer-stream recording stop & cleanup
|
|
|
+// if (stream && streamFile) {
|
|
|
+// await stream?.destroy();
|
|
|
+// streamFile?.close();
|
|
|
+// // await streamPage.close();
|
|
|
+// }
|
|
|
+// }
|
|
|
+
|