Browse Source

logging and admin ui improvements

Nick Sweeting 2 months ago
parent
commit
866f993f26
60 changed files with 2943 additions and 508 deletions
  1. 3 0
      archivebox/ArchiveBox.conf
  2. 38 1
      archivebox/api/admin.py
  3. 3 5
      archivebox/cli/archivebox_add.py
  4. 4 4
      archivebox/cli/archivebox_extract.py
  5. 28 3
      archivebox/cli/archivebox_install.py
  6. 40 7
      archivebox/cli/archivebox_server.py
  7. 3 2
      archivebox/cli/archivebox_version.py
  8. 94 125
      archivebox/config/django.py
  9. 174 4
      archivebox/core/admin_archiveresults.py
  10. 43 4
      archivebox/core/admin_snapshots.py
  11. 15 1
      archivebox/core/admin_tags.py
  12. 0 40
      archivebox/core/apps.py
  13. 22 0
      archivebox/core/migrations/0025_allow_duplicate_urls_per_crawl.py
  14. 62 6
      archivebox/core/models.py
  15. 25 34
      archivebox/core/statemachines.py
  16. 48 15
      archivebox/core/views.py
  17. 220 28
      archivebox/crawls/admin.py
  18. 55 1
      archivebox/crawls/models.py
  19. 114 12
      archivebox/crawls/statemachines.py
  20. 8 0
      archivebox/hooks.py
  21. 2 0
      archivebox/logs/errors.log
      ^ NOTE(review): this is a runtime log file — it should not be committed. Add logs/ to .gitignore and drop this file from the commit.
  22. 92 4
      archivebox/machine/admin.py
  23. 16 8
      archivebox/misc/logging_util.py
  24. 3 6
      archivebox/plugins/accessibility/on_Snapshot__39_accessibility.js
  25. 1 1
      archivebox/plugins/archive_org/on_Snapshot__13_archive_org.py
  26. 1 1
      archivebox/plugins/chrome_cleanup/on_Snapshot__45_chrome_cleanup.py
  27. 1 1
      archivebox/plugins/chrome_navigate/on_Snapshot__30_chrome_navigate.js
  28. 3 6
      archivebox/plugins/consolelog/on_Snapshot__21_consolelog.js
  29. 4 7
      archivebox/plugins/dom/on_Snapshot__36_dom.js
  30. 1 1
      archivebox/plugins/favicon/on_Snapshot__11_favicon.py
  31. 1 1
      archivebox/plugins/git/on_Snapshot__12_git.py
  32. 3 6
      archivebox/plugins/headers/on_Snapshot__33_headers.js
  33. 2 3
      archivebox/plugins/htmltotext/on_Snapshot__54_htmltotext.py
  34. 3 4
      archivebox/plugins/media/on_Snapshot__51_media.py
  35. 2 3
      archivebox/plugins/mercury/on_Snapshot__53_mercury.py
  36. 3 6
      archivebox/plugins/parse_dom_outlinks/on_Snapshot__40_parse_dom_outlinks.js
  37. 4 7
      archivebox/plugins/pdf/on_Snapshot__35_pdf.js
  38. 2 3
      archivebox/plugins/readability/on_Snapshot__52_readability.py
  39. 3 6
      archivebox/plugins/redirects/on_Snapshot__22_redirects.js
  40. 4 6
      archivebox/plugins/responses/on_Snapshot__24_responses.js
  41. 4 7
      archivebox/plugins/screenshot/on_Snapshot__34_screenshot.js
  42. 131 0
      archivebox/plugins/search_backend_ripgrep/on_Crawl__00_validate_ripgrep.py
  43. 0 0
      archivebox/plugins/search_backend_ripgrep/tests/__init__.py
  44. 306 0
      archivebox/plugins/search_backend_ripgrep/tests/test_ripgrep_detection.py
  45. 1 1
      archivebox/plugins/search_backend_sonic/on_Snapshot__91_index_sonic.py
  46. 1 1
      archivebox/plugins/search_backend_sqlite/on_Snapshot__90_index_sqlite.py
  47. 3 6
      archivebox/plugins/seo/on_Snapshot__38_seo.js
  48. 3 5
      archivebox/plugins/singlefile/on_Snapshot__04_singlefile.js
  49. 5 6
      archivebox/plugins/singlefile/on_Snapshot__37_singlefile.py
  50. 3 6
      archivebox/plugins/ssl/on_Snapshot__23_ssl.js
  51. 3 4
      archivebox/plugins/staticfile/on_Snapshot__31_staticfile.py
  52. 3 6
      archivebox/plugins/title/on_Snapshot__32_title.js
  53. 2 2
      archivebox/plugins/wget/on_Snapshot__50_wget.py
  54. 1025 0
      archivebox/templates/admin/base.html
  55. 187 50
      archivebox/templates/admin/progress_monitor.html
  56. 8 3
      archivebox/workers/management/commands/orchestrator.py
  57. 22 13
      archivebox/workers/orchestrator.py
  58. 81 4
      archivebox/workers/supervisord_util.py
  59. 3 20
      archivebox/workers/tasks.py
  60. 2 2
      archivebox/workers/worker.py

+ 3 - 0
archivebox/ArchiveBox.conf

@@ -0,0 +1,3 @@
+[SERVER_CONFIG]
+SECRET_KEY = amuxg7v5e2l_6jrktp_f3kszlpx4ieqk4rtwda5q6nfiavits4
+
NOTE(review): this hunk checks a live Django SECRET_KEY into version control. Once pushed, the key must be considered compromised: remove it from the repo, rotate it immediately, and generate it per-installation at first run (or load it from an untracked local config / environment variable) instead of committing it.

+ 38 - 1
archivebox/api/admin.py

@@ -13,7 +13,21 @@ class APITokenAdmin(BaseModelAdmin):
     sort_fields = ('id', 'created_at', 'created_by', 'expires')
     readonly_fields = ('created_at', 'modified_at')
     search_fields = ('id', 'created_by__username', 'token')
-    fields = ('created_by', 'token', 'expires', *readonly_fields)
+
+    fieldsets = (
+        ('Token', {
+            'fields': ('token', 'expires'),
+            'classes': ('card',),
+        }),
+        ('Owner', {
+            'fields': ('created_by',),
+            'classes': ('card',),
+        }),
+        ('Timestamps', {
+            'fields': ('created_at', 'modified_at'),
+            'classes': ('card',),
+        }),
+    )
 
     list_filter = ('created_by',)
     ordering = ['-created_at']
@@ -25,6 +39,29 @@ class CustomWebhookAdmin(WebhookAdmin, BaseModelAdmin):
     sort_fields = ('created_at', 'created_by', 'id', 'referenced_model', 'endpoint', 'last_success', 'last_error')
     readonly_fields = ('created_at', 'modified_at', *WebhookAdmin.readonly_fields)
 
+    fieldsets = (
+        ('Webhook', {
+            'fields': ('name', 'signal', 'referenced_model', 'endpoint'),
+            'classes': ('card', 'wide'),
+        }),
+        ('Authentication', {
+            'fields': ('auth_token',),
+            'classes': ('card',),
+        }),
+        ('Status', {
+            'fields': ('enabled', 'last_success', 'last_error'),
+            'classes': ('card',),
+        }),
+        ('Owner', {
+            'fields': ('created_by',),
+            'classes': ('card',),
+        }),
+        ('Timestamps', {
+            'fields': ('created_at', 'modified_at'),
+            'classes': ('card',),
+        }),
+    )
+
 
 def register_admin(admin_site):
     admin_site.register(APIToken, APITokenAdmin)

+ 3 - 5
archivebox/cli/archivebox_add.py

@@ -115,12 +115,10 @@ def add(urls: str | list[str],
     #    - Repeat until max_depth reached
 
     if bg:
-        # Background mode: start orchestrator and return immediately
-        print('[yellow]\\[*] Running in background mode - starting orchestrator...[/yellow]')
-        orchestrator = Orchestrator(exit_on_idle=True)
-        orchestrator.start()  # Fork to background
+        # Background mode: just queue work and return (orchestrator via server will pick it up)
+        print('[yellow]\\[*] URLs queued. Orchestrator will process them (run `archivebox server` if not already running).[/yellow]')
     else:
-        # Foreground mode: run orchestrator until all work is done
+        # Foreground mode: run orchestrator inline until all work is done
         print(f'[green]\\[*] Starting orchestrator to process crawl...[/green]')
         orchestrator = Orchestrator(exit_on_idle=True)
         orchestrator.runloop()  # Block until complete

+ 4 - 4
archivebox/cli/archivebox_extract.py

@@ -117,11 +117,11 @@ def run_plugins(
             if snapshot_id:
                 snapshot_ids.add(snapshot_id)
             elif record.get('url'):
-                # Look up by URL
-                try:
-                    snap = Snapshot.objects.get(url=record['url'])
+                # Look up by URL (get most recent if multiple exist)
+                snap = Snapshot.objects.filter(url=record['url']).order_by('-created_at').first()
+                if snap:
                     snapshot_ids.add(str(snap.id))
-                except Snapshot.DoesNotExist:
+                else:
                     rprint(f'[yellow]Snapshot not found for URL: {record["url"]}[/yellow]', file=sys.stderr)
 
         elif record_type == TYPE_ARCHIVERESULT:

+ 28 - 3
archivebox/cli/archivebox_install.py

@@ -49,20 +49,45 @@ def install(dry_run: bool=False) -> None:
     # Using a minimal crawl that will trigger on_Crawl hooks
     created_by_id = get_or_create_system_user_pk()
 
-    seed = Seed.objects.create(
+    seed, _created = Seed.objects.get_or_create(
         uri='archivebox://install',
         label='Dependency detection',
         created_by_id=created_by_id,
+        defaults={
+            'extractor': 'auto',
+        }
     )
 
-    crawl = Crawl.objects.create(
+    crawl, created = Crawl.objects.get_or_create(
         seed=seed,
         max_depth=0,
         created_by_id=created_by_id,
-        status='queued',
+        defaults={
+            'status': 'queued',
+        }
     )
 
+    # If crawl already existed, reset it to queued state so it can be processed again
+    if not created:
+        crawl.status = 'queued'
+        crawl.retry_at = timezone.now()
+        crawl.save()
+
     print(f'[+] Created dependency detection crawl: {crawl.id}')
+    print(f'[+] Crawl status: {crawl.status}, retry_at: {crawl.retry_at}')
+
+    # Verify the crawl is in the queue
+    from crawls.models import Crawl as CrawlModel
+    queued_crawls = CrawlModel.objects.filter(
+        retry_at__lte=timezone.now()
+    ).exclude(
+        status__in=CrawlModel.FINAL_STATES
+    )
+    print(f'[+] Crawls in queue: {queued_crawls.count()}')
+    if queued_crawls.exists():
+        for c in queued_crawls:
+            print(f'    - Crawl {c.id}: status={c.status}, retry_at={c.retry_at}')
+
     print('[+] Running crawl to detect binaries via on_Crawl hooks...')
     print()
 

+ 40 - 7
archivebox/cli/archivebox_server.py

@@ -56,20 +56,53 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,),
     except IndexError:
         pass
 
-    print('[green][+] Starting ArchiveBox webserver...[/green]')
-    print(f'    [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
-    print(f'    [green]>[/green] Log in to ArchiveBox Admin UI on [deep_sky_blue3][link=http://{host}:{port}/admin]http://{host}:{port}/admin[/link][/deep_sky_blue3]')
-    print('    > Writing ArchiveBox error log to ./logs/errors.log')
-
     if SHELL_CONFIG.DEBUG:
+        print('[green][+] Starting ArchiveBox webserver in DEBUG mode...[/green]')
+        print(f'    [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
+        print(f'    [green]>[/green] Log in to ArchiveBox Admin UI on [deep_sky_blue3][link=http://{host}:{port}/admin]http://{host}:{port}/admin[/link][/deep_sky_blue3]')
+        print('    > Writing ArchiveBox error log to ./logs/errors.log')
         if not reload:
             runserver_args.append('--noreload')  # '--insecure'
         if nothreading:
             runserver_args.append('--nothreading')
         call_command("runserver", *runserver_args)
     else:
-        from workers.supervisord_util import start_server_workers
-
+        from workers.supervisord_util import (
+            get_existing_supervisord_process,
+            get_worker,
+            start_server_workers,
+            tail_multiple_worker_logs,
+        )
+
+        # Check if supervisord is already running
+        supervisor = get_existing_supervisord_process()
+        if supervisor:
+            daphne_proc = get_worker(supervisor, 'worker_daphne')
+
+            # If daphne is already running, just tail logs
+            if daphne_proc and daphne_proc.get('statename') == 'RUNNING':
+                orchestrator_proc = get_worker(supervisor, 'worker_orchestrator')
+                print('[yellow][!] ArchiveBox server is already running[/yellow]')
+                print(f'    [green]√[/green] Web server (worker_daphne) is RUNNING on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
+                if orchestrator_proc and orchestrator_proc.get('statename') == 'RUNNING':
+                    print(f'    [green]√[/green] Background worker (worker_orchestrator) is RUNNING')
+                print()
+                print('[blue][i] Tailing worker logs (Ctrl+C to stop watching)...[/i][/blue]')
+                print()
+
+                # Tail logs for both workers
+                tail_multiple_worker_logs(
+                    log_files=['logs/worker_daphne.log', 'logs/worker_orchestrator.log'],
+                    follow=True,
+                )
+                return
+            # Otherwise, daphne is not running - fall through to start it
+
+        # No existing workers found - start new ones
+        print('[green][+] Starting ArchiveBox webserver...[/green]')
+        print(f'    [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
+        print(f'    [green]>[/green] Log in to ArchiveBox Admin UI on [deep_sky_blue3][link=http://{host}:{port}/admin]http://{host}:{port}/admin[/link][/deep_sky_blue3]')
+        print('    > Writing ArchiveBox error log to ./logs/errors.log')
         print()
         start_server_workers(host=host, port=port, daemonize=daemonize)
         print("\n[i][green][🟩] ArchiveBox server shut down gracefully.[/green][/i]")

+ 3 - 2
archivebox/cli/archivebox_version.py

@@ -119,12 +119,13 @@ def version(quiet: bool=False,
     else:
         for key in sorted(set(binary_config_keys)):
             # Get the actual binary name/path from config value
-            bin_value = config.get(key, '').strip()
+            # Prioritize Machine.config overrides over base config
+            bin_value = machine.config.get(key) or config.get(key, '').strip()
             if not bin_value:
                 continue
 
             # Check if it's a path (has slashes) or just a name
-            is_path = '/' in bin_value
+            is_path = '/' in str(bin_value)
 
             if is_path:
                 # It's a full path - match against abspath

+ 94 - 125
archivebox/config/django.py

@@ -5,7 +5,6 @@ import sys
 
 from datetime import datetime, timezone
 
-from rich.progress import Progress
 from rich.console import Console
 
 import django
@@ -27,16 +26,6 @@ STDERR = Console(stderr=True)
 logging.CONSOLE = CONSOLE
 
 
-INITIAL_STARTUP_PROGRESS = None
-INITIAL_STARTUP_PROGRESS_TASK = 0
-
-def bump_startup_progress_bar(advance=1):
-    global INITIAL_STARTUP_PROGRESS
-    global INITIAL_STARTUP_PROGRESS_TASK
-    if INITIAL_STARTUP_PROGRESS:
-        INITIAL_STARTUP_PROGRESS.update(INITIAL_STARTUP_PROGRESS_TASK, advance=advance)   # type: ignore
-
-
 def setup_django_minimal():
     # sys.path.append(str(CONSTANTS.PACKAGE_DIR))
     # os.environ.setdefault('ARCHIVEBOX_DATA_DIR', str(CONSTANTS.DATA_DIR))
@@ -49,9 +38,7 @@ DJANGO_SET_UP = False
 
 def setup_django(check_db=False, in_memory_db=False) -> None:
     from rich.panel import Panel
-    
-    global INITIAL_STARTUP_PROGRESS
-    global INITIAL_STARTUP_PROGRESS_TASK
+
     global DJANGO_SET_UP
 
     if DJANGO_SET_UP:
@@ -59,118 +46,100 @@ def setup_django(check_db=False, in_memory_db=False) -> None:
         # TODO: figure out why CLI entrypoints with init_pending are running this twice sometimes
         return
 
-    with Progress(transient=True, expand=True, console=STDERR) as INITIAL_STARTUP_PROGRESS:
-        INITIAL_STARTUP_PROGRESS_TASK = INITIAL_STARTUP_PROGRESS.add_task("[green]Loading modules...", total=25, visible=True)
-        
-        from archivebox.config.permissions import IS_ROOT, ARCHIVEBOX_USER, ARCHIVEBOX_GROUP, SudoPermission
-    
-        # if running as root, chown the data dir to the archivebox user to make sure it's accessible to the archivebox user
-        if IS_ROOT and ARCHIVEBOX_USER != 0:
-            with SudoPermission(uid=0):
-                # running as root is a special case where it's ok to be a bit slower
-                # make sure data dir is always owned by the correct user
-                os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{CONSTANTS.DATA_DIR}" 2>/dev/null')
-                os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{CONSTANTS.DATA_DIR}"/* 2>/dev/null')
-
-        bump_startup_progress_bar()
-        try:
-            from django.core.management import call_command
-                
-            bump_startup_progress_bar()
-
-            if in_memory_db:
-                raise Exception('dont use this anymore')
-            
-                # some commands (e.g. oneshot) dont store a long-lived sqlite3 db file on disk.
-                # in those cases we create a temporary in-memory db and run the migrations
-                # immediately to get a usable in-memory-database at startup
-                os.environ.setdefault("ARCHIVEBOX_DATABASE_NAME", ":memory:")
+    from archivebox.config.permissions import IS_ROOT, ARCHIVEBOX_USER, ARCHIVEBOX_GROUP, SudoPermission
+
+    # if running as root, chown the data dir to the archivebox user to make sure it's accessible to the archivebox user
+    if IS_ROOT and ARCHIVEBOX_USER != 0:
+        with SudoPermission(uid=0):
+            # running as root is a special case where it's ok to be a bit slower
+            # make sure data dir is always owned by the correct user
+            os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{CONSTANTS.DATA_DIR}" 2>/dev/null')
+            os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{CONSTANTS.DATA_DIR}"/* 2>/dev/null')
+
+    try:
+        from django.core.management import call_command
+
+        if in_memory_db:
+            raise Exception('dont use this anymore')
+
+            # some commands (e.g. oneshot) dont store a long-lived sqlite3 db file on disk.
+            # in those cases we create a temporary in-memory db and run the migrations
+            # immediately to get a usable in-memory-database at startup
+            os.environ.setdefault("ARCHIVEBOX_DATABASE_NAME", ":memory:")
+            django.setup()
+
+            call_command("migrate", interactive=False, verbosity=0)
+        else:
+            # Otherwise use default sqlite3 file-based database and initialize django
+            # without running migrations automatically (user runs them manually by calling init)
+            try:
                 django.setup()
-                
-                bump_startup_progress_bar()
-                call_command("migrate", interactive=False, verbosity=0)
-            else:
-                # Otherwise use default sqlite3 file-based database and initialize django
-                # without running migrations automatically (user runs them manually by calling init)
-                try:
-                    django.setup()
-                except Exception as e:
-                    bump_startup_progress_bar(advance=1000)
-                    
-                    is_using_meta_cmd = any(ignored_subcommand in sys.argv for ignored_subcommand in ('help', 'version', '--help', '--version'))
-                    if not is_using_meta_cmd:
-                        # show error message to user only if they're not running a meta command / just trying to get help
-                        STDERR.print()
-                        STDERR.print(Panel(
-                            f'\n[red]{e.__class__.__name__}[/red]: [yellow]{e}[/yellow]\nPlease check your config and [blue]DATA_DIR[/blue] permissions.\n',
-                            title='\n\n[red][X] Error while trying to load database![/red]',
-                            subtitle='[grey53]NO WRITES CAN BE PERFORMED[/grey53]',
-                            expand=False,
-                            style='bold red',
-                        ))
-                        STDERR.print()
-                        STDERR.print_exception(show_locals=False)
-                    return
-            
-            bump_startup_progress_bar()
-
-            from django.conf import settings
-            
-            # log startup message to the error log
-            with open(settings.ERROR_LOG, "a", encoding='utf-8') as f:
-                command = ' '.join(sys.argv)
-                ts = datetime.now(timezone.utc).strftime('%Y-%m-%d__%H:%M:%S')
-                f.write(f"\n> {command}; TS={ts} VERSION={CONSTANTS.VERSION} IN_DOCKER={SHELL_CONFIG.IN_DOCKER} IS_TTY={SHELL_CONFIG.IS_TTY}\n")
-
-            if check_db:
-                # make sure the data dir is owned by a non-root user
-                if CONSTANTS.DATA_DIR.stat().st_uid == 0:
-                    STDERR.print('[red][X] Error: ArchiveBox DATA_DIR cannot be owned by root![/red]')
-                    STDERR.print(f'    {CONSTANTS.DATA_DIR}')
+            except Exception as e:
+                is_using_meta_cmd = any(ignored_subcommand in sys.argv for ignored_subcommand in ('help', 'version', '--help', '--version'))
+                if not is_using_meta_cmd:
+                    # show error message to user only if they're not running a meta command / just trying to get help
                     STDERR.print()
-                    STDERR.print('[violet]Hint:[/violet] Are you running archivebox in the right folder? (and as a non-root user?)')
-                    STDERR.print('    cd path/to/your/archive/data')
-                    STDERR.print('    archivebox [command]')
+                    STDERR.print(Panel(
+                        f'\n[red]{e.__class__.__name__}[/red]: [yellow]{e}[/yellow]\nPlease check your config and [blue]DATA_DIR[/blue] permissions.\n',
+                        title='\n\n[red][X] Error while trying to load database![/red]',
+                        subtitle='[grey53]NO WRITES CAN BE PERFORMED[/grey53]',
+                        expand=False,
+                        style='bold red',
+                    ))
                     STDERR.print()
-                    raise SystemExit(9)
-                
-                # Create cache table in DB if needed
-                try:
-                    from django.core.cache import cache
-                    cache.get('test', None)
-                except django.db.utils.OperationalError:
-                    call_command("createcachetable", verbosity=0)
-
-                bump_startup_progress_bar()
-
-                # if archivebox gets imported multiple times, we have to close
-                # the sqlite3 whenever we init from scratch to avoid multiple threads
-                # sharing the same connection by accident
-                from django.db import connections
-                for conn in connections.all():
-                    conn.close_if_unusable_or_obsolete()
-
-                sql_index_path = CONSTANTS.DATABASE_FILE
-                assert os.access(sql_index_path, os.F_OK), (
-                    f'No database file {sql_index_path} found in: {CONSTANTS.DATA_DIR} (Are you in an ArchiveBox collection directory?)')
-
-                bump_startup_progress_bar()
-
-                # https://docs.pydantic.dev/logfire/integrations/django/ Logfire Debugging
-                # if settings.DEBUG_LOGFIRE:
-                #     from opentelemetry.instrumentation.sqlite3 import SQLite3Instrumentor
-                #     SQLite3Instrumentor().instrument()
-
-                #     import logfire
-
-                #     logfire.configure()
-                #     logfire.instrument_django(is_sql_commentor_enabled=True)
-                #     logfire.info(f'Started ArchiveBox v{CONSTANTS.VERSION}', argv=sys.argv)
-
-        except KeyboardInterrupt:
-            raise SystemExit(2)
-        
-    DJANGO_SET_UP = True
+                    STDERR.print_exception(show_locals=False)
+                return
+
+        from django.conf import settings
+
+        # log startup message to the error log
+        with open(settings.ERROR_LOG, "a", encoding='utf-8') as f:
+            command = ' '.join(sys.argv)
+            ts = datetime.now(timezone.utc).strftime('%Y-%m-%d__%H:%M:%S')
+            f.write(f"\n> {command}; TS={ts} VERSION={CONSTANTS.VERSION} IN_DOCKER={SHELL_CONFIG.IN_DOCKER} IS_TTY={SHELL_CONFIG.IS_TTY}\n")
+
+        if check_db:
+            # make sure the data dir is owned by a non-root user
+            if CONSTANTS.DATA_DIR.stat().st_uid == 0:
+                STDERR.print('[red][X] Error: ArchiveBox DATA_DIR cannot be owned by root![/red]')
+                STDERR.print(f'    {CONSTANTS.DATA_DIR}')
+                STDERR.print()
+                STDERR.print('[violet]Hint:[/violet] Are you running archivebox in the right folder? (and as a non-root user?)')
+                STDERR.print('    cd path/to/your/archive/data')
+                STDERR.print('    archivebox [command]')
+                STDERR.print()
+                raise SystemExit(9)
+
+            # Create cache table in DB if needed
+            try:
+                from django.core.cache import cache
+                cache.get('test', None)
+            except django.db.utils.OperationalError:
+                call_command("createcachetable", verbosity=0)
+
+            # if archivebox gets imported multiple times, we have to close
+            # the sqlite3 whenever we init from scratch to avoid multiple threads
+            # sharing the same connection by accident
+            from django.db import connections
+            for conn in connections.all():
+                conn.close_if_unusable_or_obsolete()
+
+            sql_index_path = CONSTANTS.DATABASE_FILE
+            assert os.access(sql_index_path, os.F_OK), (
+                f'No database file {sql_index_path} found in: {CONSTANTS.DATA_DIR} (Are you in an ArchiveBox collection directory?)')
+
+            # https://docs.pydantic.dev/logfire/integrations/django/ Logfire Debugging
+            # if settings.DEBUG_LOGFIRE:
+            #     from opentelemetry.instrumentation.sqlite3 import SQLite3Instrumentor
+            #     SQLite3Instrumentor().instrument()
+
+            #     import logfire
+
+            #     logfire.configure()
+            #     logfire.instrument_django(is_sql_commentor_enabled=True)
+            #     logfire.info(f'Started ArchiveBox v{CONSTANTS.VERSION}', argv=sys.argv)
+
+    except KeyboardInterrupt:
+        raise SystemExit(2)
 
-    INITIAL_STARTUP_PROGRESS = None
-    INITIAL_STARTUP_PROGRESS_TASK = None
+    DJANGO_SET_UP = True

+ 174 - 4
archivebox/core/admin_archiveresults.py

@@ -19,6 +19,150 @@ from archivebox.hooks import get_extractor_icon
 from core.models import ArchiveResult, Snapshot
 
 
+def render_archiveresults_list(archiveresults_qs, limit=50):
+    """Render a nice inline list view of archive results with status, extractor, output, and actions."""
+
+    results = list(archiveresults_qs.order_by('-end_ts').select_related('snapshot')[:limit])
+
+    if not results:
+        return mark_safe('<div style="color: #64748b; font-style: italic; padding: 16px 0;">No Archive Results yet...</div>')
+
+    # Status colors
+    status_colors = {
+        'succeeded': ('#166534', '#dcfce7'),   # green
+        'failed': ('#991b1b', '#fee2e2'),       # red
+        'queued': ('#6b7280', '#f3f4f6'),       # gray
+        'started': ('#92400e', '#fef3c7'),      # amber
+    }
+
+    rows = []
+    for idx, result in enumerate(results):
+        status = result.status or 'queued'
+        color, bg = status_colors.get(status, ('#6b7280', '#f3f4f6'))
+
+        # Get extractor icon
+        icon = get_extractor_icon(result.extractor)
+
+        # Format timestamp
+        end_time = result.end_ts.strftime('%Y-%m-%d %H:%M:%S') if result.end_ts else '-'
+
+        # Truncate output for display
+        full_output = result.output or '-'
+        output_display = full_output[:60]
+        if len(full_output) > 60:
+            output_display += '...'
+
+        # Get full command as tooltip
+        cmd_str = ' '.join(result.cmd) if isinstance(result.cmd, list) else str(result.cmd or '-')
+
+        # Build output link
+        output_link = f'/archive/{result.snapshot.timestamp}/{result.output}' if result.output and result.status == 'succeeded' else f'/archive/{result.snapshot.timestamp}/'
+
+        # Get version - try cmd_version field
+        version = result.cmd_version if result.cmd_version else '-'
+
+        # Unique ID for this row's expandable output
+        row_id = f'output_{idx}_{str(result.id)[:8]}'
+
+        rows.append(f'''
+            <tr style="border-bottom: 1px solid #f1f5f9; transition: background 0.15s;" onmouseover="this.style.background='#f8fafc'" onmouseout="this.style.background='transparent'">
+                <td style="padding: 10px 12px; white-space: nowrap;">
+                    <span style="display: inline-block; padding: 3px 10px; border-radius: 12px;
+                                 font-size: 11px; font-weight: 600; text-transform: uppercase;
+                                 color: {color}; background: {bg};">{status}</span>
+                </td>
+                <td style="padding: 10px 12px; white-space: nowrap; font-size: 20px;" title="{result.extractor}">
+                    {icon}
+                </td>
+                <td style="padding: 10px 12px; font-weight: 500; color: #334155;">
+                    {result.extractor}
+                </td>
+                <td style="padding: 10px 12px; max-width: 280px;">
+                    <span onclick="document.getElementById('{row_id}').open = !document.getElementById('{row_id}').open"
+                          style="color: #2563eb; text-decoration: none; font-family: ui-monospace, monospace; font-size: 12px; cursor: pointer;"
+                          title="Click to expand full output">
+                        {output_display}
+                    </span>
+                </td>
+                <td style="padding: 10px 12px; white-space: nowrap; color: #64748b; font-size: 12px;">
+                    {end_time}
+                </td>
+                <td style="padding: 10px 12px; white-space: nowrap; font-family: ui-monospace, monospace; font-size: 11px; color: #64748b;">
+                    {version}
+                </td>
+                <td style="padding: 10px 8px; white-space: nowrap;">
+                    <div style="display: flex; gap: 4px;">
+                        <a href="{output_link}" target="_blank"
+                           style="padding: 4px 8px; background: #f1f5f9; border-radius: 4px; color: #475569; text-decoration: none; font-size: 11px;"
+                           title="View output">📄</a>
+                        <a href="{reverse('admin:core_archiveresult_change', args=[result.id])}"
+                           style="padding: 4px 8px; background: #f1f5f9; border-radius: 4px; color: #475569; text-decoration: none; font-size: 11px;"
+                           title="Edit">✏️</a>
+                    </div>
+                </td>
+            </tr>
+            <tr style="border-bottom: 1px solid #e2e8f0;">
+                <td colspan="7" style="padding: 0 12px 10px 12px;">
+                    <details id="{row_id}" style="margin: 0;">
+                        <summary style="cursor: pointer; font-size: 11px; color: #94a3b8; user-select: none;">
+                            Details &amp; Output
+                        </summary>
+                        <div style="margin-top: 8px; padding: 10px; background: #f8fafc; border: 1px solid #e2e8f0; border-radius: 6px; max-height: 200px; overflow: auto;">
+                            <div style="font-size: 11px; color: #64748b; margin-bottom: 8px;">
+                                <span style="margin-right: 16px;"><b>ID:</b> <code>{str(result.id)[:8]}...</code></span>
+                                <span style="margin-right: 16px;"><b>Version:</b> <code>{version}</code></span>
+                                <span style="margin-right: 16px;"><b>PWD:</b> <code>{result.pwd or '-'}</code></span>
+                            </div>
+                            <div style="font-size: 11px; color: #64748b; margin-bottom: 8px;">
+                                <b>Output:</b>
+                            </div>
+                            <pre style="margin: 0; padding: 8px; background: #1e293b; border-radius: 4px; color: #e2e8f0; font-size: 12px; white-space: pre-wrap; word-break: break-all; max-height: 120px; overflow: auto;">{full_output}</pre>
+                            <div style="font-size: 11px; color: #64748b; margin-top: 8px;">
+                                <b>Command:</b>
+                            </div>
+                            <pre style="margin: 0; padding: 8px; background: #1e293b; border-radius: 4px; color: #e2e8f0; font-size: 11px; white-space: pre-wrap; word-break: break-all;">{cmd_str}</pre>
+                        </div>
+                    </details>
+                </td>
+            </tr>
+        ''')
+
+    total_count = archiveresults_qs.count()
+    footer = ''
+    if total_count > limit:
+        footer = f'''
+            <tr>
+                <td colspan="7" style="padding: 12px; text-align: center; color: #64748b; font-size: 13px; background: #f8fafc;">
+                    Showing {limit} of {total_count} results &nbsp;
+                    <a href="/admin/core/archiveresult/?snapshot__id__exact={results[0].snapshot_id if results else ''}"
+                       style="color: #2563eb;">View all →</a>
+                </td>
+            </tr>
+        '''
+
+    return mark_safe(f'''
+        <div style="border: 1px solid #e2e8f0; border-radius: 8px; overflow: hidden; background: #fff; width: 100%;">
+            <table style="width: 100%; border-collapse: collapse; font-size: 14px;">
+                <thead>
+                    <tr style="background: #f8fafc; border-bottom: 2px solid #e2e8f0;">
+                        <th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Status</th>
+                        <th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; width: 32px;"></th>
+                        <th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Extractor</th>
+                        <th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Output</th>
+                        <th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Completed</th>
+                        <th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Version</th>
+                        <th style="padding: 10px 8px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Actions</th>
+                    </tr>
+                </thead>
+                <tbody>
+                    {''.join(rows)}
+                    {footer}
+                </tbody>
+            </table>
+        </div>
+    ''')
+
+
 
 class ArchiveResultInline(admin.TabularInline):
     name = 'Archive Results Log'
@@ -97,18 +241,44 @@ class ArchiveResultAdmin(BaseModelAdmin):
     sort_fields = ('id', 'created_by', 'created_at', 'extractor', 'status')
     readonly_fields = ('cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'extractor_with_icon')
     search_fields = ('id', 'snapshot__url', 'extractor', 'output', 'cmd_version', 'cmd', 'snapshot__timestamp')
-    fields = ('snapshot', 'extractor', 'status', 'retry_at', 'start_ts', 'end_ts', 'created_by', 'pwd', 'cmd_version', 'cmd', 'output', *readonly_fields)
     autocomplete_fields = ['snapshot']
 
+    fieldsets = (
+        ('Snapshot', {
+            'fields': ('snapshot', 'snapshot_info', 'tags_str'),
+            'classes': ('card', 'wide'),
+        }),
+        ('Extractor', {
+            'fields': ('extractor', 'extractor_with_icon', 'status', 'retry_at'),
+            'classes': ('card',),
+        }),
+        ('Timing', {
+            'fields': ('start_ts', 'end_ts', 'created_at', 'modified_at'),
+            'classes': ('card',),
+        }),
+        ('Command', {
+            'fields': ('cmd', 'cmd_str', 'cmd_version', 'pwd'),
+            'classes': ('card',),
+        }),
+        ('Output', {
+            'fields': ('output', 'output_summary'),
+            'classes': ('card', 'wide'),
+        }),
+        ('Metadata', {
+            'fields': ('created_by',),
+            'classes': ('card',),
+        }),
+    )
+
     list_filter = ('status', 'extractor', 'start_ts', 'cmd_version')
     ordering = ['-start_ts']
     list_per_page = SERVER_CONFIG.SNAPSHOTS_PER_PAGE
-    
+
     paginator = AccelleratedPaginator
     save_on_top = True
-    
+
     actions = ['delete_selected']
-    
+
     class Meta:
         verbose_name = 'Archive Result'
         verbose_name_plural = 'Archive Results'

+ 43 - 4
archivebox/core/admin_snapshots.py

@@ -25,7 +25,7 @@ from archivebox.workers.tasks import bg_archive_snapshots, bg_add
 
 from core.models import Tag
 from core.admin_tags import TagInline
-from core.admin_archiveresults import ArchiveResultInline
+from core.admin_archiveresults import ArchiveResultInline, render_archiveresults_list
 
 
 # GLOBAL_CONTEXT = {'VERSION': VERSION, 'VERSIONS_AVAILABLE': [], 'CAN_UPGRADE': False}
@@ -54,13 +54,48 @@ class SnapshotActionForm(ActionForm):
 class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
     list_display = ('created_at', 'title_str', 'status', 'files', 'size', 'url_str')
     sort_fields = ('title_str', 'url_str', 'created_at', 'status', 'crawl')
-    readonly_fields = ('admin_actions', 'status_info', 'tags_str', 'imported_timestamp', 'created_at', 'modified_at', 'downloaded_at', 'output_dir')
+    readonly_fields = ('admin_actions', 'status_info', 'tags_str', 'imported_timestamp', 'created_at', 'modified_at', 'downloaded_at', 'output_dir', 'archiveresults_list')
     search_fields = ('id', 'url', 'timestamp', 'title', 'tags__name')
     list_filter = ('created_at', 'downloaded_at', 'archiveresult__status', 'created_by', 'tags__name')
-    fields = ('url', 'title', 'created_by', 'bookmarked_at', 'status', 'retry_at', 'crawl', 'config', *readonly_fields)
+
+    fieldsets = (
+        ('URL', {
+            'fields': ('url', 'title'),
+            'classes': ('card', 'wide'),
+        }),
+        ('Status', {
+            'fields': ('status', 'retry_at', 'status_info'),
+            'classes': ('card',),
+        }),
+        ('Timestamps', {
+            'fields': ('bookmarked_at', 'created_at', 'modified_at', 'downloaded_at'),
+            'classes': ('card',),
+        }),
+        ('Relations', {
+            'fields': ('crawl', 'created_by', 'tags_str'),
+            'classes': ('card',),
+        }),
+        ('Config', {
+            'fields': ('config',),
+            'classes': ('card',),
+        }),
+        ('Files', {
+            'fields': ('output_dir',),
+            'classes': ('card',),
+        }),
+        ('Actions', {
+            'fields': ('admin_actions',),
+            'classes': ('card', 'wide'),
+        }),
+        ('Archive Results', {
+            'fields': ('archiveresults_list',),
+            'classes': ('card', 'wide'),
+        }),
+    )
+
     ordering = ['-created_at']
     actions = ['add_tags', 'remove_tags', 'update_titles', 'update_snapshots', 'resnapshot_snapshot', 'overwrite_snapshots', 'delete_snapshots']
-    inlines = [TagInline, ArchiveResultInline]
+    inlines = [TagInline]  # Removed ArchiveResultInline, using custom renderer instead
     list_per_page = min(max(5, SERVER_CONFIG.SNAPSHOTS_PER_PAGE), 5000)
 
     action_form = SnapshotActionForm
@@ -155,6 +190,10 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
             obj.extension or '-',
         )
 
+    @admin.display(description='Archive Results')
+    def archiveresults_list(self, obj):
+        return render_archiveresults_list(obj.archiveresult_set.all())
+
     @admin.display(
         description='Title',
         ordering='title',

+ 15 - 1
archivebox/core/admin_tags.py

@@ -51,11 +51,25 @@ class TagAdmin(BaseModelAdmin):
     sort_fields = ('name', 'slug', 'id', 'created_by', 'created_at')
     readonly_fields = ('slug', 'id', 'created_at', 'modified_at', 'snapshots')
     search_fields = ('id', 'name', 'slug')
-    fields = ('name', 'created_by', *readonly_fields)
     actions = ['delete_selected', 'merge_tags']
     ordering = ['-created_at']
     # inlines = [TaggedItemInline]
 
+    fieldsets = (
+        ('Tag Info', {
+            'fields': ('name', 'slug'),
+            'classes': ('card',),
+        }),
+        ('Metadata', {
+            'fields': ('id', 'created_by', 'created_at', 'modified_at'),
+            'classes': ('card',),
+        }),
+        ('Snapshots', {
+            'fields': ('snapshots',),
+            'classes': ('card', 'wide'),
+        }),
+    )
+
     paginator = AccelleratedPaginator
 
 

+ 0 - 40
archivebox/core/apps.py

@@ -1,7 +1,5 @@
 __package__ = 'archivebox.core'
 
-import sys
-
 from django.apps import AppConfig
 
 
@@ -12,41 +10,3 @@ class CoreConfig(AppConfig):
         """Register the archivebox.core.admin_site as the main django admin site"""
         from core.admin_site import register_admin_site
         register_admin_site()
-
-        # Auto-start the orchestrator when running the web server
-        self._maybe_start_orchestrator()
-
-    def _maybe_start_orchestrator(self):
-        """Start the orchestrator if we're running a web server."""
-        import os
-
-        # Don't start orchestrator during migrations, shell, tests, etc.
-        # Only start when running: runserver, daphne, gunicorn, uwsgi
-        if not self._is_web_server():
-            return
-
-        # Don't start if RUN_ORCHESTRATOR env var is explicitly set to false
-        if os.environ.get('RUN_ORCHESTRATOR', '').lower() in ('false', '0', 'no'):
-            return
-
-        # Don't start in autoreload child process (avoid double-start)
-        if os.environ.get('RUN_MAIN') != 'true' and 'runserver' in sys.argv:
-            return
-
-        try:
-            from workers.orchestrator import Orchestrator
-
-            if not Orchestrator.is_running():
-                # Start orchestrator as daemon (won't exit on idle when started by server)
-                orchestrator = Orchestrator(exit_on_idle=False)
-                orchestrator.start()
-        except Exception as e:
-            # Don't crash the server if orchestrator fails to start
-            import logging
-            logging.getLogger('archivebox').warning(f'Failed to auto-start orchestrator: {e}')
-
-    def _is_web_server(self) -> bool:
-        """Check if we're running a web server command."""
-        # Check for common web server indicators
-        server_commands = ('runserver', 'daphne', 'gunicorn', 'uwsgi', 'server')
-        return any(cmd in ' '.join(sys.argv).lower() for cmd in server_commands)

+ 22 - 0
archivebox/core/migrations/0025_allow_duplicate_urls_per_crawl.py

@@ -0,0 +1,22 @@
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('core', '0024_snapshot_crawl'),
+    ]
+
+    operations = [
+        # Remove the unique constraint on url
+        migrations.AlterField(
+            model_name='snapshot',
+            name='url',
+            field=models.URLField(db_index=True, unique=False),
+        ),
+        # Add unique constraint on (url, crawl) combination
+        migrations.AddConstraint(
+            model_name='snapshot',
+            constraint=models.UniqueConstraint(fields=['url', 'crawl'], name='unique_url_per_crawl'),
+        ),
+    ]

+ 62 - 6
archivebox/core/models.py

@@ -60,7 +60,8 @@ class Tag(ModelWithSerializers):
         return self.name
 
     def save(self, *args, **kwargs):
-        if self._state.adding:
+        is_new = self._state.adding
+        if is_new:
             self.slug = slugify(self.name)
             existing = set(Tag.objects.filter(slug__startswith=self.slug).values_list("slug", flat=True))
             i = None
@@ -72,6 +73,19 @@ class Tag(ModelWithSerializers):
                 i = (i or 0) + 1
         super().save(*args, **kwargs)
 
+        if is_new:
+            from archivebox.misc.logging_util import log_worker_event
+            log_worker_event(
+                worker_type='DB',
+                event='Created Tag',
+                indent_level=0,
+                metadata={
+                    'id': self.id,
+                    'name': self.name,
+                    'slug': self.slug,
+                },
+            )
+
     @property
     def api_url(self) -> str:
         return reverse_lazy('api-1:get_tag', args=[self.id])
@@ -241,12 +255,13 @@ class SnapshotManager(models.Manager.from_queryset(SnapshotQuerySet)):
                 if tag.strip()
             ))
 
-        try:
-            snapshot = self.get(url=url)
+        # Get most recent snapshot with this URL (URLs can exist in multiple crawls)
+        snapshot = self.filter(url=url).order_by('-created_at').first()
+        if snapshot:
             if title and (not snapshot.title or len(title) > len(snapshot.title or '')):
                 snapshot.title = title
                 snapshot.save(update_fields=['title', 'modified_at'])
-        except self.model.DoesNotExist:
+        else:
             if timestamp:
                 while self.filter(timestamp=timestamp).exists():
                     timestamp = str(float(timestamp) + 1.0)
@@ -284,7 +299,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
     created_at = models.DateTimeField(default=timezone.now, db_index=True)
     modified_at = models.DateTimeField(auto_now=True)
 
-    url = models.URLField(unique=True, db_index=True)
+    url = models.URLField(unique=False, db_index=True)  # URLs can appear in multiple crawls
     timestamp = models.CharField(max_length=32, unique=True, db_index=True, editable=False)
     bookmarked_at = models.DateTimeField(default=timezone.now, db_index=True)
     crawl: Crawl = models.ForeignKey(Crawl, on_delete=models.CASCADE, default=None, null=True, blank=True, related_name='snapshot_set', db_index=True)  # type: ignore
@@ -313,11 +328,16 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
     class Meta(TypedModelMeta):
         verbose_name = "Snapshot"
         verbose_name_plural = "Snapshots"
+        constraints = [
+            # Allow same URL in different crawls, but not duplicates within same crawl
+            models.UniqueConstraint(fields=['url', 'crawl'], name='unique_url_per_crawl'),
+        ]
 
     def __str__(self):
         return f'[{self.id}] {self.url[:64]}'
 
     def save(self, *args, **kwargs):
+        is_new = self._state.adding
         if not self.bookmarked_at:
             self.bookmarked_at = self.created_at or timezone.now()
         if not self.timestamp:
@@ -327,6 +347,21 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
             self.crawl.urls += f'\n{self.url}'
             self.crawl.save()
 
+        if is_new:
+            from archivebox.misc.logging_util import log_worker_event
+            log_worker_event(
+                worker_type='DB',
+                event='Created Snapshot',
+                indent_level=2,
+                url=self.url,
+                metadata={
+                    'id': str(self.id),
+                    'crawl_id': str(self.crawl_id) if self.crawl_id else None,
+                    'depth': self.depth,
+                    'status': self.status,
+                },
+            )
+
     def output_dir_parent(self) -> str:
         return 'archive'
 
@@ -807,6 +842,24 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
     def __str__(self):
         return f'[{self.id}] {self.snapshot.url[:64]} -> {self.extractor}'
 
+    def save(self, *args, **kwargs):
+        is_new = self._state.adding
+        super().save(*args, **kwargs)
+        if is_new:
+            from archivebox.misc.logging_util import log_worker_event
+            log_worker_event(
+                worker_type='DB',
+                event='Created ArchiveResult',
+                indent_level=3,
+                extractor=self.extractor,
+                metadata={
+                    'id': str(self.id),
+                    'snapshot_id': str(self.snapshot_id),
+                    'snapshot_url': str(self.snapshot.url)[:64],
+                    'status': self.status,
+                },
+            )
+
     @cached_property
     def snapshot_dir(self):
         return Path(self.snapshot.output_dir)
@@ -879,7 +932,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
         from django.utils import timezone
         from archivebox.hooks import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR, run_hook
 
-        extractor_dir = Path(self.snapshot.output_dir) / self.extractor
         config_objects = [self.snapshot.crawl, self.snapshot] if self.snapshot.crawl else [self.snapshot]
 
         # Find hook for this extractor
@@ -899,6 +951,10 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
             self.save()
             return
 
+        # Use plugin directory name instead of extractor name (removes numeric prefix)
+        plugin_name = hook.parent.name
+        extractor_dir = Path(self.snapshot.output_dir) / plugin_name
+
         # Run the hook
         start_ts = timezone.now()
         result = run_hook(

+ 25 - 34
archivebox/core/statemachines.py

@@ -45,15 +45,14 @@ class SnapshotMachine(StateMachine, strict_states=True):
         super().__init__(snapshot, *args, **kwargs)
         
     def __repr__(self) -> str:
-        return f'[grey53]Snapshot\\[{self.snapshot.id}] 🏃‍♂️ Worker\\[pid={os.getpid()}].tick()[/grey53] [blue]{self.snapshot.status.upper()}[/blue] ⚙️ [grey37]Machine[/grey37]'
-    
+        return f'Snapshot[{self.snapshot.id}]'
+
     def __str__(self) -> str:
         return self.__repr__()
-        
+
     def can_start(self) -> bool:
         can_start = bool(self.snapshot.url)
-        if not can_start:
-            print(f'{self}.can_start() [blue]QUEUED[/blue] ➡️❌ [blue]STARTED[/blue] cant start yet +{timezone.now() - self.snapshot.retry_at}s')
+        # Suppressed: queue waiting logs
         return can_start
         
     def is_finished(self) -> bool:
@@ -73,15 +72,15 @@ class SnapshotMachine(StateMachine, strict_states=True):
         
     @queued.enter
     def enter_queued(self):
-        print(f'{self}.on_queued() ↳ snapshot.retry_at = now()')
+        # Suppressed: state transition logs
         self.snapshot.update_for_workers(
             retry_at=timezone.now(),
             status=Snapshot.StatusChoices.QUEUED,
         )
-        
+
     @started.enter
     def enter_started(self):
-        print(f'{self}.on_started() ↳ snapshot.run()')
+        # Suppressed: state transition logs
         # lock the snapshot while we create the pending archiveresults
         self.snapshot.update_for_workers(
             retry_at=timezone.now() + timedelta(seconds=30),  # if failed, wait 30s before retrying
@@ -95,10 +94,10 @@ class SnapshotMachine(StateMachine, strict_states=True):
             retry_at=timezone.now() + timedelta(seconds=5),  # wait 5s before checking it again
             status=Snapshot.StatusChoices.STARTED,
         )
-        
+
     @sealed.enter
     def enter_sealed(self):
-        print(f'{self}.on_sealed() ↳ snapshot.retry_at=None')
+        # Suppressed: state transition logs
         self.snapshot.update_for_workers(
             retry_at=None,
             status=Snapshot.StatusChoices.SEALED,
@@ -161,15 +160,14 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
         super().__init__(archiveresult, *args, **kwargs)
     
     def __repr__(self) -> str:
-        return f'[grey53]ArchiveResult\\[{self.archiveresult.id}] 🏃‍♂️ Worker\\[pid={os.getpid()}].tick()[/grey53] [blue]{self.archiveresult.status.upper()}[/blue] ⚙️ [grey37]Machine[/grey37]'
-    
+        return f'ArchiveResult[{self.archiveresult.id}]'
+
     def __str__(self) -> str:
         return self.__repr__()
-        
+
     def can_start(self) -> bool:
         can_start = bool(self.archiveresult.snapshot.url)
-        if not can_start:
-            print(f'{self}.can_start() [blue]QUEUED[/blue] ➡️❌ [blue]STARTED[/blue]: cant start yet +{timezone.now() - self.archiveresult.retry_at}s')
+        # Suppressed: queue waiting logs
         return can_start
     
     def is_succeeded(self) -> bool:
@@ -202,41 +200,34 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
 
     @queued.enter
     def enter_queued(self):
-        print(f'{self}.on_queued() ↳ archiveresult.retry_at = now()')
+        # Suppressed: state transition logs
         self.archiveresult.update_for_workers(
             retry_at=timezone.now(),
             status=ArchiveResult.StatusChoices.QUEUED,
             start_ts=None,
         )  # bump the snapshot's retry_at so they pickup any new changes
-        
+
     @started.enter
     def enter_started(self):
-        print(f'{self}.on_started() ↳ archiveresult.start_ts + run_extractor()')
-        
+        # Suppressed: state transition logs
         # Lock the object and mark start time
         self.archiveresult.update_for_workers(
             retry_at=timezone.now() + timedelta(seconds=120),  # 2 min timeout for extractor
             status=ArchiveResult.StatusChoices.STARTED,
             start_ts=timezone.now(),
         )
-        
+
         # Run the extractor - this updates status, output, timestamps, etc.
         self.archiveresult.run()
-        
+
         # Save the updated result
         self.archiveresult.save()
-        
-        # Log the result
-        if self.archiveresult.status == ArchiveResult.StatusChoices.SUCCEEDED:
-            print(f'{self} ✅ extractor succeeded: {self.archiveresult.output[:50] if self.archiveresult.output else ""}...')
-        elif self.archiveresult.status == ArchiveResult.StatusChoices.FAILED:
-            print(f'{self} ❌ extractor failed: {self.archiveresult.output[:100] if self.archiveresult.output else ""}...')
-        elif self.archiveresult.status == ArchiveResult.StatusChoices.SKIPPED:
-            print(f'{self} ⏭️ extractor skipped: {self.archiveresult.output[:50] if self.archiveresult.output else ""}')
+
+        # Suppressed: extractor result logs (already logged by worker)
 
     @backoff.enter
     def enter_backoff(self):
-        print(f'{self}.on_backoff() ↳ archiveresult.retries += 1, archiveresult.bump_retry_at(+60s), archiveresult.end_ts = None')
+        # Suppressed: state transition logs
         self.archiveresult.update_for_workers(
             retry_at=timezone.now() + timedelta(seconds=60),
             status=ArchiveResult.StatusChoices.BACKOFF,
@@ -244,10 +235,10 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
             # retries=F('retries') + 1,               # F() equivalent to getattr(self.archiveresult, 'retries', 0) + 1,
         )
         self.archiveresult.save(write_indexes=True)
-        
+
     @succeeded.enter
     def enter_succeeded(self):
-        print(f'{self}.on_succeeded() ↳ archiveresult.retry_at = None, archiveresult.end_ts = now()')
+        # Suppressed: state transition logs
         self.archiveresult.update_for_workers(
             retry_at=None,
             status=ArchiveResult.StatusChoices.SUCCEEDED,
@@ -270,7 +261,7 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
 
     @failed.enter
     def enter_failed(self):
-        print(f'{self}.on_failed() ↳ archiveresult.retry_at = None, archiveresult.end_ts = now()')
+        # Suppressed: state transition logs
         self.archiveresult.update_for_workers(
             retry_at=None,
             status=ArchiveResult.StatusChoices.FAILED,
@@ -291,7 +282,7 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
 
     @skipped.enter
     def enter_skipped(self):
-        print(f'{self}.on_skipped() ↳ archiveresult.retry_at = None, archiveresult.end_ts = now()')
+        # Suppressed: state transition logs
         self.archiveresult.update_for_workers(
             retry_at=None,
             status=ArchiveResult.StatusChoices.SKIPPED,

+ 48 - 15
archivebox/core/views.py

@@ -503,15 +503,7 @@ class AddView(UserPassesTestMixin, FormView):
             mark_safe(f"Adding {rough_url_count} URLs in the background. (refresh in a minute start seeing results) {crawl.admin_change_url}"),
         )
 
-        # Start orchestrator in background to process the queued crawl
-        try:
-            from archivebox.workers.tasks import ensure_orchestrator_running
-            ensure_orchestrator_running()
-        except Exception as e:
-            # Orchestrator may already be running via supervisord, or fail to start
-            # This is not fatal - the crawl will be processed when orchestrator runs
-            print(f'[!] Failed to start orchestrator: {e}')
-
+        # Orchestrator (managed by supervisord) will pick up the queued crawl
         return redirect(crawl.admin_change_url)
 
 
@@ -539,6 +531,7 @@ def live_progress_view(request):
         from workers.orchestrator import Orchestrator
         from crawls.models import Crawl
         from core.models import Snapshot, ArchiveResult
+        from django.db.models import Case, When, Value, IntegerField
 
         # Get orchestrator status
         orchestrator_running = Orchestrator.is_running()
@@ -570,8 +563,26 @@ def live_progress_view(request):
             crawl_snapshots = Snapshot.objects.filter(crawl=crawl)
             total_snapshots = crawl_snapshots.count()
             completed_snapshots = crawl_snapshots.filter(status=Snapshot.StatusChoices.SEALED).count()
+            started_snapshots = crawl_snapshots.filter(status=Snapshot.StatusChoices.STARTED).count()
             pending_snapshots = crawl_snapshots.filter(status=Snapshot.StatusChoices.QUEUED).count()
 
+            # Count URLs in the crawl (for when snapshots haven't been created yet)
+            urls_count = 0
+            if crawl.urls:
+                urls_count = len([u for u in crawl.urls.split('\n') if u.strip()])
+            elif crawl.seed and crawl.seed.uri:
+                # Try to get URL count from seed
+                if crawl.seed.uri.startswith('file:///'):
+                    try:
+                        from pathlib import Path
+                        seed_file = Path(crawl.seed.uri.replace('file://', ''))
+                        if seed_file.exists():
+                            urls_count = len([line for line in seed_file.read_text().split('\n') if line.strip() and not line.startswith('#')])
+                    except OSError:
+                        pass
+                else:
+                    urls_count = 1  # Single URL seed
+
             # Calculate crawl progress
             crawl_progress = int((completed_snapshots / total_snapshots) * 100) if total_snapshots > 0 else 0
 
@@ -590,16 +601,24 @@ def live_progress_view(request):
                 # Calculate snapshot progress
                 snapshot_progress = int(((completed_extractors + failed_extractors) / total_extractors) * 100) if total_extractors > 0 else 0
 
-                # Get active extractors for this snapshot
-                active_extractors = [
+                # Get all extractors for this snapshot
+                # Order: started first, then queued, then completed
+                all_extractors = [
                     {
                         'id': str(ar.id),
                         'extractor': ar.extractor,
                         'status': ar.status,
-                        'started': ar.start_ts.isoformat() if ar.start_ts else None,
-                        'progress': 50,
                     }
-                    for ar in snapshot_results.filter(status=ArchiveResult.StatusChoices.STARTED).order_by('-start_ts')[:5]
+                    for ar in snapshot_results.annotate(
+                        status_order=Case(
+                            When(status=ArchiveResult.StatusChoices.STARTED, then=Value(0)),
+                            When(status=ArchiveResult.StatusChoices.QUEUED, then=Value(1)),
+                            When(status=ArchiveResult.StatusChoices.SUCCEEDED, then=Value(2)),
+                            When(status=ArchiveResult.StatusChoices.FAILED, then=Value(3)),
+                            default=Value(4),
+                            output_field=IntegerField(),
+                        )
+                    ).order_by('status_order', 'extractor')
                 ]
 
                 active_snapshots_for_crawl.append({
@@ -612,9 +631,17 @@ def live_progress_view(request):
                     'completed_extractors': completed_extractors,
                     'failed_extractors': failed_extractors,
                     'pending_extractors': pending_extractors,
-                    'active_extractors': active_extractors,
+                    'all_extractors': all_extractors,
                 })
 
+            # Check if crawl can start (for debugging stuck crawls)
+            can_start = bool(crawl.seed and crawl.seed.uri)
+            seed_uri = crawl.seed.uri[:60] if crawl.seed and crawl.seed.uri else None
+
+            # Check if retry_at is in the future (would prevent worker from claiming)
+            retry_at_future = crawl.retry_at > timezone.now() if crawl.retry_at else False
+            seconds_until_retry = int((crawl.retry_at - timezone.now()).total_seconds()) if crawl.retry_at and retry_at_future else 0
+
             active_crawls.append({
                 'id': str(crawl.id),
                 'label': str(crawl)[:60],
@@ -622,11 +649,17 @@ def live_progress_view(request):
                 'started': crawl.modified_at.isoformat() if crawl.modified_at else None,
                 'progress': crawl_progress,
                 'max_depth': crawl.max_depth,
+                'urls_count': urls_count,
                 'total_snapshots': total_snapshots,
                 'completed_snapshots': completed_snapshots,
+                'started_snapshots': started_snapshots,
                 'failed_snapshots': 0,
                 'pending_snapshots': pending_snapshots,
                 'active_snapshots': active_snapshots_for_crawl,
+                'can_start': can_start,
+                'seed_uri': seed_uri,
+                'retry_at_future': retry_at_future,
+                'seconds_until_retry': seconds_until_retry,
             })
 
         return JsonResponse({

+ 220 - 28
archivebox/crawls/admin.py

@@ -8,6 +8,7 @@ from django.contrib import admin, messages
 from django.urls import path
 from django.http import JsonResponse
 from django.views.decorators.http import require_POST
+from django.db.models import Count, Q
 
 from archivebox import DATA_DIR
 
@@ -19,13 +20,155 @@ from core.models import Snapshot
 from crawls.models import Seed, Crawl, CrawlSchedule
 
 
+def render_snapshots_list(snapshots_qs, limit=20):
+    """Render a nice inline list view of snapshots with status, title, URL, and progress."""
+
+    snapshots = snapshots_qs.order_by('-created_at')[:limit].annotate(
+        total_results=Count('archiveresult'),
+        succeeded_results=Count('archiveresult', filter=Q(archiveresult__status='succeeded')),
+        failed_results=Count('archiveresult', filter=Q(archiveresult__status='failed')),
+    )
+
+    if not snapshots:
+        return mark_safe('<div style="color: #666; font-style: italic; padding: 8px 0;">No Snapshots yet...</div>')
+
+    # Status colors matching Django admin and progress monitor
+    status_colors = {
+        'queued': ('#6c757d', '#f8f9fa'),      # gray
+        'started': ('#856404', '#fff3cd'),     # amber
+        'sealed': ('#155724', '#d4edda'),      # green
+        'failed': ('#721c24', '#f8d7da'),      # red
+    }
+
+    rows = []
+    for snapshot in snapshots:
+        status = snapshot.status or 'queued'
+        color, bg = status_colors.get(status, ('#6c757d', '#f8f9fa'))
+
+        # Calculate progress
+        total = snapshot.total_results
+        done = snapshot.succeeded_results + snapshot.failed_results
+        progress_pct = int((done / total) * 100) if total > 0 else 0
+        progress_text = f'{done}/{total}' if total > 0 else '-'
+
+        # Truncate title and URL
+        title = (snapshot.title or 'Untitled')[:60]
+        if len(snapshot.title or '') > 60:
+            title += '...'
+        url_display = snapshot.url[:50]
+        if len(snapshot.url) > 50:
+            url_display += '...'
+
+        # Format date
+        date_str = snapshot.created_at.strftime('%Y-%m-%d %H:%M') if snapshot.created_at else '-'
+
+        rows.append(f'''
+            <tr style="border-bottom: 1px solid #eee;">
+                <td style="padding: 6px 8px; white-space: nowrap;">
+                    <span style="display: inline-block; padding: 2px 8px; border-radius: 10px;
+                                 font-size: 11px; font-weight: 500; text-transform: uppercase;
+                                 color: {color}; background: {bg};">{status}</span>
+                </td>
+                <td style="padding: 6px 8px; white-space: nowrap;">
+                    <a href="/archive/{snapshot.timestamp}/" style="text-decoration: none;">
+                        <img src="/archive/{snapshot.timestamp}/favicon.ico"
+                             style="width: 16px; height: 16px; vertical-align: middle; margin-right: 4px;"
+                             onerror="this.style.display='none'"/>
+                    </a>
+                </td>
+                <td style="padding: 6px 8px; max-width: 300px;">
+                    <a href="{snapshot.admin_change_url}" style="color: #417690; text-decoration: none; font-weight: 500;"
+                       title="{snapshot.title or 'Untitled'}">{title}</a>
+                </td>
+                <td style="padding: 6px 8px; max-width: 250px;">
+                    <a href="{snapshot.url}" target="_blank"
+                       style="color: #666; text-decoration: none; font-family: monospace; font-size: 11px;"
+                       title="{snapshot.url}">{url_display}</a>
+                </td>
+                <td style="padding: 6px 8px; white-space: nowrap; text-align: center;">
+                    <div style="display: inline-flex; align-items: center; gap: 6px;">
+                        <div style="width: 60px; height: 6px; background: #eee; border-radius: 3px; overflow: hidden;">
+                            <div style="width: {progress_pct}%; height: 100%;
+                                        background: {'#28a745' if snapshot.failed_results == 0 else '#ffc107' if snapshot.succeeded_results > 0 else '#dc3545'};
+                                        transition: width 0.3s;"></div>
+                        </div>
+                        <a href="/admin/core/archiveresult/?snapshot__id__exact={snapshot.id}"
+                           style="font-size: 11px; color: #417690; min-width: 35px; text-decoration: none;"
+                           title="View archive results">{progress_text}</a>
+                    </div>
+                </td>
+                <td style="padding: 6px 8px; white-space: nowrap; color: #888; font-size: 11px;">
+                    {date_str}
+                </td>
+            </tr>
+        ''')
+
+    total_count = snapshots_qs.count()
+    footer = ''
+    if total_count > limit:
+        footer = f'''
+            <tr>
+                <td colspan="6" style="padding: 8px; text-align: center; color: #666; font-size: 12px; background: #f8f9fa;">
+                    Showing {limit} of {total_count} snapshots
+                </td>
+            </tr>
+        '''
+
+    return mark_safe(f'''
+        <div style="border: 1px solid #ddd; border-radius: 6px; overflow: hidden; max-width: 100%;">
+            <table style="width: 100%; border-collapse: collapse; font-size: 13px;">
+                <thead>
+                    <tr style="background: #f5f5f5; border-bottom: 2px solid #ddd;">
+                        <th style="padding: 8px; text-align: left; font-weight: 600; color: #333;">Status</th>
+                        <th style="padding: 8px; text-align: left; font-weight: 600; color: #333; width: 24px;"></th>
+                        <th style="padding: 8px; text-align: left; font-weight: 600; color: #333;">Title</th>
+                        <th style="padding: 8px; text-align: left; font-weight: 600; color: #333;">URL</th>
+                        <th style="padding: 8px; text-align: center; font-weight: 600; color: #333;">Progress</th>
+                        <th style="padding: 8px; text-align: left; font-weight: 600; color: #333;">Created</th>
+                    </tr>
+                </thead>
+                <tbody>
+                    {''.join(rows)}
+                    {footer}
+                </tbody>
+            </table>
+        </div>
+    ''')
+
+
 class SeedAdmin(ConfigEditorMixin, BaseModelAdmin):
     list_display = ('id', 'created_at', 'created_by', 'label', 'notes', 'uri', 'extractor', 'tags_str', 'crawls', 'num_crawls', 'num_snapshots')
     sort_fields = ('id', 'created_at', 'created_by', 'label', 'notes', 'uri', 'extractor', 'tags_str')
     search_fields = ('id', 'created_by__username', 'label', 'notes', 'uri', 'extractor', 'tags_str')
 
     readonly_fields = ('created_at', 'modified_at', 'scheduled_crawls', 'crawls', 'snapshots', 'contents')
-    fields = ('label', 'notes', 'uri', 'extractor', 'tags_str', 'config', 'created_by', *readonly_fields)
+
+    fieldsets = (
+        ('Source', {
+            'fields': ('uri', 'contents'),
+            'classes': ('card', 'wide'),
+        }),
+        ('Info', {
+            'fields': ('label', 'notes', 'tags_str'),
+            'classes': ('card',),
+        }),
+        ('Settings', {
+            'fields': ('extractor', 'config'),
+            'classes': ('card',),
+        }),
+        ('Metadata', {
+            'fields': ('created_by', 'created_at', 'modified_at'),
+            'classes': ('card',),
+        }),
+        ('Crawls', {
+            'fields': ('scheduled_crawls', 'crawls'),
+            'classes': ('card',),
+        }),
+        ('Snapshots', {
+            'fields': ('snapshots',),
+            'classes': ('card',),
+        }),
+    )
 
     list_filter = ('extractor', 'created_by')
     ordering = ['-created_at']
@@ -51,22 +194,19 @@ class SeedAdmin(ConfigEditorMixin, BaseModelAdmin):
         )) or mark_safe('<i>No Crawls yet...</i>')
 
     def snapshots(self, obj):
-        return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
-            (snapshot.admin_change_url, snapshot)
-            for snapshot in obj.snapshot_set.all().order_by('-created_at')[:20]
-        )) or mark_safe('<i>No Snapshots yet...</i>')
+        return render_snapshots_list(obj.snapshot_set.all())
 
     def contents(self, obj):
-        if obj.uri.startswith('file:///data/'):
-            source_file = DATA_DIR / obj.uri.replace('file:///data/', '', 1)
+        source_file = obj.get_file_path()
+        if source_file:
             contents = ""
             try:
                 contents = source_file.read_text().strip()[:14_000]
             except Exception as e:
                 contents = f'Error reading {source_file}: {e}'
-                
+
             return format_html('<b><code>{}</code>:</b><br/><pre>{}</pre>', source_file, contents)
-        
+
         return format_html('See URLs here: <a href="{}">{}</a>', obj.uri, obj.uri)
 
 
@@ -78,7 +218,37 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
     search_fields = ('id', 'created_by__username', 'max_depth', 'label', 'notes', 'seed_id', 'schedule_id', 'status', 'seed__uri')
 
     readonly_fields = ('created_at', 'modified_at', 'snapshots', 'seed_urls_editor')
-    fields = ('label', 'notes', 'seed_urls_editor', 'config', 'status', 'retry_at', 'max_depth', 'seed', 'schedule', 'created_by', 'created_at', 'modified_at', 'snapshots')
+
+    fieldsets = (
+        ('URLs', {
+            'fields': ('seed_urls_editor',),
+            'classes': ('card', 'wide'),
+        }),
+        ('Info', {
+            'fields': ('label', 'notes'),
+            'classes': ('card',),
+        }),
+        ('Settings', {
+            'fields': ('max_depth', 'config'),
+            'classes': ('card',),
+        }),
+        ('Status', {
+            'fields': ('status', 'retry_at'),
+            'classes': ('card',),
+        }),
+        ('Relations', {
+            'fields': ('seed', 'schedule', 'created_by'),
+            'classes': ('card',),
+        }),
+        ('Timestamps', {
+            'fields': ('created_at', 'modified_at'),
+            'classes': ('card',),
+        }),
+        ('Snapshots', {
+            'fields': ('snapshots',),
+            'classes': ('card', 'wide'),
+        }),
+    )
 
     list_filter = ('max_depth', 'seed', 'schedule', 'created_by', 'status', 'retry_at')
     ordering = ['-created_at', '-retry_at']
@@ -90,6 +260,16 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
     def recrawl(self, request, obj):
         """Duplicate this crawl as a new crawl with the same seed and settings."""
         from django.utils import timezone
+        from django.shortcuts import redirect
+
+        # Validate seed has a URI (required for crawl to start)
+        if not obj.seed:
+            messages.error(request, 'Cannot recrawl: original crawl has no seed.')
+            return redirect('admin:crawls_crawl_change', obj.id)
+
+        if not obj.seed.uri:
+            messages.error(request, 'Cannot recrawl: seed has no URI.')
+            return redirect('admin:crawls_crawl_change', obj.id)
 
         new_crawl = Crawl.objects.create(
             seed=obj.seed,
@@ -110,8 +290,6 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
             f'It will start processing shortly.'
         )
 
-        # Redirect to the new crawl's change page
-        from django.shortcuts import redirect
         return redirect('admin:crawls_crawl_change', new_crawl.id)
 
     def get_urls(self):
@@ -133,7 +311,8 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
         except Crawl.DoesNotExist:
             return JsonResponse({'success': False, 'error': 'Crawl not found'}, status=404)
 
-        if not (crawl.seed and crawl.seed.uri and crawl.seed.uri.startswith('file:///data/')):
+        source_file = crawl.seed.get_file_path() if crawl.seed else None
+        if not source_file:
             return JsonResponse({'success': False, 'error': 'Seed is not a local file'}, status=400)
 
         try:
@@ -142,8 +321,6 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
         except json.JSONDecodeError:
             return JsonResponse({'success': False, 'error': 'Invalid JSON'}, status=400)
 
-        source_file = DATA_DIR / crawl.seed.uri.replace('file:///data/', '', 1)
-
         try:
             # Ensure parent directory exists
             source_file.parent.mkdir(parents=True, exist_ok=True)
@@ -156,10 +333,7 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
         return obj.snapshot_set.count()
 
     def snapshots(self, obj):
-        return format_html_join('<br/>', '<a href="{}">{}</a>', (
-            (snapshot.admin_change_url, snapshot)
-            for snapshot in obj.snapshot_set.all().order_by('-created_at')[:20]
-        )) or mark_safe('<i>No Snapshots yet...</i>')
+        return render_snapshots_list(obj.snapshot_set.all())
 
     @admin.display(description='Schedule', ordering='schedule')
     def schedule_str(self, obj):
@@ -186,13 +360,12 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
             seed_uri = obj.urls
 
         # Check if it's a local file we can edit
-        is_file = seed_uri.startswith('file:///data/')
+        source_file = obj.seed.get_file_path() if obj.seed else None
+        is_file = source_file is not None
         contents = ""
         error = None
-        source_file = None
 
-        if is_file:
-            source_file = DATA_DIR / seed_uri.replace('file:///data/', '', 1)
+        if is_file and source_file:
             try:
                 contents = source_file.read_text().strip()
             except Exception as e:
@@ -337,7 +510,29 @@ class CrawlScheduleAdmin(BaseModelAdmin):
     search_fields = ('id', 'created_by__username', 'label', 'notes', 'schedule_id', 'template_id', 'template__seed__uri')
 
     readonly_fields = ('created_at', 'modified_at', 'crawls', 'snapshots')
-    fields = ('label', 'notes', 'schedule', 'template', 'created_by', *readonly_fields)
+
+    fieldsets = (
+        ('Schedule Info', {
+            'fields': ('label', 'notes'),
+            'classes': ('card',),
+        }),
+        ('Configuration', {
+            'fields': ('schedule', 'template'),
+            'classes': ('card',),
+        }),
+        ('Metadata', {
+            'fields': ('created_by', 'created_at', 'modified_at'),
+            'classes': ('card',),
+        }),
+        ('Crawls', {
+            'fields': ('crawls',),
+            'classes': ('card', 'wide'),
+        }),
+        ('Snapshots', {
+            'fields': ('snapshots',),
+            'classes': ('card', 'wide'),
+        }),
+    )
 
     list_filter = ('created_by',)
     ordering = ['-created_at']
@@ -362,10 +557,7 @@ class CrawlScheduleAdmin(BaseModelAdmin):
     
     def snapshots(self, obj):
         crawl_ids = obj.crawl_set.values_list('pk', flat=True)
-        return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
-            (snapshot.admin_change_url, snapshot)
-            for snapshot in Snapshot.objects.filter(crawl_id__in=crawl_ids).order_by('-created_at')[:20]
-        )) or mark_safe('<i>No Snapshots yet...</i>')
+        return render_snapshots_list(Snapshot.objects.filter(crawl_id__in=crawl_ids))
 
 
 def register_admin(admin_site):

+ 55 - 1
archivebox/crawls/models.py

@@ -44,9 +44,27 @@ class Seed(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthS
     def __str__(self):
         return f'[{self.id}] {self.uri[:64]}'
 
+    def save(self, *args, **kwargs):
+        is_new = self._state.adding
+        super().save(*args, **kwargs)
+        if is_new:
+            from archivebox.misc.logging_util import log_worker_event
+            log_worker_event(
+                worker_type='DB',
+                event='Created Seed',
+                indent_level=0,
+                metadata={
+                    'id': str(self.id),
+                    'uri': str(self.uri)[:64],
+                    'extractor': self.extractor,
+                    'label': self.label or None,
+                },
+            )
+
     @classmethod
     def from_file(cls, source_file: Path, label: str = '', parser: str = 'auto', tag: str = '', created_by=None, config=None):
-        source_path = str(source_file.resolve()).replace(str(CONSTANTS.DATA_DIR), '/data')
+        # Use absolute path for file:// URLs so extractors can find the files
+        source_path = str(source_file.resolve())
         seed, _ = cls.objects.get_or_create(
             label=label or source_file.name, uri=f'file://{source_path}',
             created_by_id=getattr(created_by, 'pk', created_by) or get_or_create_system_user_pk(),
@@ -62,6 +80,25 @@ class Seed(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthS
     def api_url(self) -> str:
         return reverse_lazy('api-1:get_seed', args=[self.id])
 
+    def get_file_path(self) -> Path | None:
+        """
+        Get the filesystem path for file:// URIs.
+        Handles both old format (file:///data/...) and new format (file:///absolute/path).
+        Returns None if URI is not a file:// URI.
+        """
+        if not self.uri.startswith('file://'):
+            return None
+
+        # Remove file:// prefix
+        path_str = self.uri.replace('file://', '', 1)
+
+        # Handle old format: file:///data/... -> DATA_DIR/...
+        if path_str.startswith('/data/'):
+            return CONSTANTS.DATA_DIR / path_str.replace('/data/', '', 1)
+
+        # Handle new format: file:///absolute/path
+        return Path(path_str)
+
     @property
     def snapshot_set(self) -> QuerySet['Snapshot']:
         from core.models import Snapshot
@@ -136,6 +173,23 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
     def __str__(self):
         return f'[{self.id}] {self.seed.uri[:64] if self.seed else ""}'
 
+    def save(self, *args, **kwargs):
+        is_new = self._state.adding
+        super().save(*args, **kwargs)
+        if is_new:
+            from archivebox.misc.logging_util import log_worker_event
+            log_worker_event(
+                worker_type='DB',
+                event='Created Crawl',
+                indent_level=1,
+                metadata={
+                    'id': str(self.id),
+                    'seed_uri': str(self.seed.uri)[:64] if self.seed else None,
+                    'max_depth': self.max_depth,
+                    'status': self.status,
+                },
+            )
+
     @classmethod
     def from_seed(cls, seed: Seed, max_depth: int = 0, persona: str = 'Default', tags_str: str = '', config=None, created_by=None):
         crawl, _ = cls.objects.get_or_create(

+ 114 - 12
archivebox/crawls/statemachines.py

@@ -36,13 +36,19 @@ class CrawlMachine(StateMachine, strict_states=True):
         super().__init__(crawl, *args, **kwargs)
     
     def __repr__(self) -> str:
-        return f'[grey53]Crawl\\[{self.crawl.id}] 🏃‍♂️ Worker\\[pid={os.getpid()}].tick()[/grey53] [blue]{self.crawl.status.upper()}[/blue] ⚙️ [grey37]Machine[/grey37]'
-    
+        return f'Crawl[{self.crawl.id}]'
+
     def __str__(self) -> str:
         return self.__repr__()
         
     def can_start(self) -> bool:
-        return bool(self.crawl.seed and self.crawl.seed.uri)
+        if not self.crawl.seed:
+            print(f'[red]⚠️ Crawl {self.crawl.id} cannot start: no seed[/red]')
+            return False
+        if not self.crawl.seed.uri:
+            print(f'[red]⚠️ Crawl {self.crawl.id} cannot start: seed has no URI[/red]')
+            return False
+        return True
         
     def is_finished(self) -> bool:
         from core.models import Snapshot, ArchiveResult
@@ -73,25 +79,121 @@ class CrawlMachine(StateMachine, strict_states=True):
 
     @started.enter
     def enter_started(self):
-        print(f'{self}.on_started(): [blue]↳ STARTED[/blue] crawl.run()')
+        # Suppressed: state transition logs
         # lock the crawl object while we create snapshots
         self.crawl.update_for_workers(
             retry_at=timezone.now() + timedelta(seconds=5),
             status=Crawl.StatusChoices.QUEUED,
         )
 
-        # Run the crawl - creates root snapshot and processes queued URLs
-        self.crawl.run()
+        try:
+            # Run on_Crawl hooks to validate/install dependencies
+            self._run_crawl_hooks()
 
-        # only update status to STARTED once snapshots are created
-        self.crawl.update_for_workers(
-            retry_at=timezone.now() + timedelta(seconds=5),
-            status=Crawl.StatusChoices.STARTED,
+            # Run the crawl - creates root snapshot and processes queued URLs
+            self.crawl.run()
+
+            # only update status to STARTED once snapshots are created
+            self.crawl.update_for_workers(
+                retry_at=timezone.now() + timedelta(seconds=5),
+                status=Crawl.StatusChoices.STARTED,
+            )
+        except Exception as e:
+            print(f'[red]⚠️ Crawl {self.crawl.id} failed to start: {e}[/red]')
+            import traceback
+            traceback.print_exc()
+            # Re-raise so the worker knows it failed
+            raise
+
+    def _run_crawl_hooks(self):
+        """Run on_Crawl hooks to validate/install dependencies."""
+        from pathlib import Path
+        from archivebox.hooks import run_hooks, discover_hooks
+        from archivebox.config import CONSTANTS
+
+        # Discover and run all on_Crawl hooks
+        hooks = discover_hooks('Crawl')
+        if not hooks:
+            return
+
+        # Create a temporary output directory for hook results
+        output_dir = Path(CONSTANTS.DATA_DIR) / 'tmp' / f'crawl_{self.crawl.id}'
+        output_dir.mkdir(parents=True, exist_ok=True)
+
+        # Run all on_Crawl hooks
+        results = run_hooks(
+            event_name='Crawl',
+            output_dir=output_dir,
+            timeout=60,
+            config_objects=[self.crawl, self.crawl.seed] if self.crawl.seed else [self.crawl],
+            crawl_id=str(self.crawl.id),
+            seed_uri=self.crawl.seed.uri if self.crawl.seed else '',
         )
 
-    @sealed.enter        
+        # Process hook results - parse JSONL output and create DB objects
+        self._process_hook_results(results)
+
+    def _process_hook_results(self, results: list):
+        """Process JSONL output from hooks to create InstalledBinary and update Machine config."""
+        import json
+        from machine.models import Machine, InstalledBinary
+
+        machine = Machine.current()
+
+        for result in results:
+            if result['returncode'] != 0:
+                # Hook failed - might indicate missing dependency
+                continue
+
+            # Parse JSONL output
+            for line in result['stdout'].strip().split('\n'):
+                if not line.strip():
+                    continue
+
+                try:
+                    obj = json.loads(line)
+                    obj_type = obj.get('type')
+
+                    if obj_type == 'InstalledBinary':
+                        # Create or update InstalledBinary record
+                        # Skip if essential fields are missing
+                        if not obj.get('name') or not obj.get('abspath') or not obj.get('version'):
+                            continue
+
+                        InstalledBinary.objects.update_or_create(
+                            machine=machine,
+                            name=obj['name'],
+                            defaults={
+                                'abspath': obj['abspath'],
+                                'version': obj['version'],
+                                'sha256': obj.get('sha256') or '',
+                                'binprovider': obj.get('binprovider') or 'env',
+                            }
+                        )
+
+                    elif obj_type == 'Machine':
+                        # Update Machine config
+                        method = obj.get('_method', 'update')
+                        if method == 'update':
+                            key = obj.get('key', '')
+                            value = obj.get('value')
+                            if key.startswith('config/'):
+                                config_key = key[7:]  # Remove 'config/' prefix
+                                machine.config[config_key] = value
+                                machine.save(update_fields=['config'])
+
+                    elif obj_type == 'Dependency':
+                        # Dependency request - could trigger installation
+                        # For now just log it (installation hooks would be separate)
+                        print(f'[yellow]Dependency requested: {obj.get("bin_name")}[/yellow]')
+
+                except json.JSONDecodeError:
+                    # Not JSON, skip
+                    continue
+
+    @sealed.enter
     def enter_sealed(self):
-        print(f'{self}.on_sealed(): [blue]↳ SEALED[/blue] crawl.retry_at=None')
+        # Suppressed: state transition logs
         self.crawl.update_for_workers(
             retry_at=None,
             status=Crawl.StatusChoices.SEALED,

+ 8 - 0
archivebox/hooks.py

@@ -245,6 +245,14 @@ def run_hook(
     env.setdefault('USER_AGENT', str(getattr(config, 'USER_AGENT', '')))
     env.setdefault('RESOLUTION', str(getattr(config, 'RESOLUTION', '')))
 
+    # Pass SEARCH_BACKEND_ENGINE from new-style config
+    try:
+        from archivebox.config.configset import get_config
+        search_config = get_config()
+        env.setdefault('SEARCH_BACKEND_ENGINE', str(search_config.get('SEARCH_BACKEND_ENGINE', 'ripgrep')))
+    except Exception:
+        env.setdefault('SEARCH_BACKEND_ENGINE', 'ripgrep')
+
     # Create output directory if needed
     output_dir.mkdir(parents=True, exist_ok=True)
 

+ 2 - 0
archivebox/logs/errors.log

@@ -0,0 +1,2 @@
+
+> /Users/squash/Local/Code/archiveboxes/archivebox-nue/archivebox/cli/archivebox_init.py --force; TS=2025-12-25__08:03:12 VERSION=0.9.0rc1 IN_DOCKER=False IS_TTY=False

+ 92 - 4
archivebox/machine/admin.py

@@ -12,7 +12,33 @@ class MachineAdmin(ConfigEditorMixin, BaseModelAdmin):
     sort_fields = ('id', 'created_at', 'hostname', 'ips', 'os_platform', 'hw_in_docker', 'hw_in_vm', 'hw_manufacturer', 'hw_product', 'os_arch', 'os_family', 'os_release', 'hw_uuid')
 
     readonly_fields = ('guid', 'created_at', 'modified_at', 'ips')
-    fields = (*readonly_fields, 'hostname', 'hw_in_docker', 'hw_in_vm', 'hw_manufacturer', 'hw_product', 'hw_uuid', 'os_arch', 'os_family', 'os_platform', 'os_kernel', 'os_release', 'stats', 'config', 'num_uses_succeeded', 'num_uses_failed')
+
+    fieldsets = (
+        ('Identity', {
+            'fields': ('hostname', 'guid', 'ips'),
+            'classes': ('card',),
+        }),
+        ('Hardware', {
+            'fields': ('hw_manufacturer', 'hw_product', 'hw_uuid', 'hw_in_docker', 'hw_in_vm'),
+            'classes': ('card',),
+        }),
+        ('Operating System', {
+            'fields': ('os_platform', 'os_family', 'os_arch', 'os_kernel', 'os_release'),
+            'classes': ('card',),
+        }),
+        ('Statistics', {
+            'fields': ('stats', 'num_uses_succeeded', 'num_uses_failed'),
+            'classes': ('card',),
+        }),
+        ('Configuration', {
+            'fields': ('config',),
+            'classes': ('card', 'wide'),
+        }),
+        ('Timestamps', {
+            'fields': ('created_at', 'modified_at'),
+            'classes': ('card',),
+        }),
+    )
 
     list_filter = ('hw_in_docker', 'hw_in_vm', 'os_arch', 'os_family', 'os_platform')
     ordering = ['-created_at']
@@ -33,7 +59,29 @@ class NetworkInterfaceAdmin(BaseModelAdmin):
     search_fields = ('id', 'machine__id', 'iface', 'ip_public', 'ip_local', 'mac_address', 'dns_server', 'hostname', 'isp', 'city', 'region', 'country')
 
     readonly_fields = ('machine', 'created_at', 'modified_at', 'mac_address', 'ip_public', 'ip_local', 'dns_server')
-    fields = (*readonly_fields, 'iface', 'hostname', 'isp', 'city', 'region', 'country', 'num_uses_succeeded', 'num_uses_failed')
+
+    fieldsets = (
+        ('Machine', {
+            'fields': ('machine',),
+            'classes': ('card',),
+        }),
+        ('Network', {
+            'fields': ('iface', 'ip_public', 'ip_local', 'mac_address', 'dns_server'),
+            'classes': ('card',),
+        }),
+        ('Location', {
+            'fields': ('hostname', 'isp', 'city', 'region', 'country'),
+            'classes': ('card',),
+        }),
+        ('Usage', {
+            'fields': ('num_uses_succeeded', 'num_uses_failed'),
+            'classes': ('card',),
+        }),
+        ('Timestamps', {
+            'fields': ('created_at', 'modified_at'),
+            'classes': ('card',),
+        }),
+    )
 
     list_filter = ('isp', 'country', 'region')
     ordering = ['-created_at']
@@ -54,7 +102,25 @@ class DependencyAdmin(ConfigEditorMixin, BaseModelAdmin):
     search_fields = ('id', 'bin_name', 'bin_providers')
 
     readonly_fields = ('id', 'created_at', 'modified_at', 'is_installed', 'installed_count')
-    fields = ('bin_name', 'bin_providers', 'custom_cmds', 'config', *readonly_fields)
+
+    fieldsets = (
+        ('Binary', {
+            'fields': ('bin_name', 'bin_providers', 'is_installed', 'installed_count'),
+            'classes': ('card',),
+        }),
+        ('Commands', {
+            'fields': ('custom_cmds',),
+            'classes': ('card',),
+        }),
+        ('Configuration', {
+            'fields': ('config',),
+            'classes': ('card', 'wide'),
+        }),
+        ('Timestamps', {
+            'fields': ('id', 'created_at', 'modified_at'),
+            'classes': ('card',),
+        }),
+    )
 
     list_filter = ('bin_providers', 'created_at')
     ordering = ['-created_at']
@@ -82,7 +148,29 @@ class InstalledBinaryAdmin(BaseModelAdmin):
     search_fields = ('id', 'machine__id', 'name', 'binprovider', 'version', 'abspath', 'sha256', 'dependency__bin_name')
 
     readonly_fields = ('created_at', 'modified_at')
-    fields = ('machine', 'dependency', 'name', 'binprovider', 'abspath', 'version', 'sha256', *readonly_fields, 'num_uses_succeeded', 'num_uses_failed')
+
+    fieldsets = (
+        ('Binary Info', {
+            'fields': ('name', 'dependency', 'binprovider'),
+            'classes': ('card',),
+        }),
+        ('Location', {
+            'fields': ('machine', 'abspath'),
+            'classes': ('card',),
+        }),
+        ('Version', {
+            'fields': ('version', 'sha256'),
+            'classes': ('card',),
+        }),
+        ('Usage', {
+            'fields': ('num_uses_succeeded', 'num_uses_failed'),
+            'classes': ('card',),
+        }),
+        ('Timestamps', {
+            'fields': ('created_at', 'modified_at'),
+            'classes': ('card',),
+        }),
+    )
 
     list_filter = ('name', 'binprovider', 'machine_id', 'dependency')
     ordering = ['-created_at']

+ 16 - 8
archivebox/misc/logging_util.py

@@ -544,16 +544,21 @@ def log_worker_event(
 
     # Build worker identifier
     worker_parts = [worker_type]
-    if pid:
+    # Don't add pid/worker_id for DB operations (they happen in whatever process is running)
+    if pid and worker_type != 'DB':
         worker_parts.append(f'pid={pid}')
-    if worker_id and worker_type in ('CrawlWorker', 'Orchestrator'):
+    if worker_id and worker_type in ('CrawlWorker', 'Orchestrator') and worker_type != 'DB':
         worker_parts.append(f'id={worker_id}')
-    if url and worker_type == 'SnapshotWorker':
+    if url and worker_type in ('SnapshotWorker', 'DB'):
         worker_parts.append(f'url={truncate_url(url)}')
-    if extractor and worker_type == 'ArchiveResultWorker':
+    if extractor and worker_type in ('ArchiveResultWorker', 'DB'):
         worker_parts.append(f'extractor={extractor}')
 
-    worker_label = f'{worker_parts[0]}[{", ".join(worker_parts[1:])}]'
+    # Format worker label - only add brackets if there are additional identifiers
+    if len(worker_parts) > 1:
+        worker_label = f'{worker_parts[0]}[{", ".join(worker_parts[1:])}]'
+    else:
+        worker_label = worker_parts[0]
 
     # Build metadata string
     metadata_str = ''
@@ -579,12 +584,14 @@ def log_worker_event(
                 meta_parts.append(f'{k}: {len(v)}')
             else:
                 meta_parts.append(f'{k}: {v}')
-        metadata_str = ' {' + ', '.join(meta_parts) + '}'
+        metadata_str = ' | '.join(meta_parts)
 
     # Determine color based on event
     color = 'white'
     if event in ('Starting...', 'Started', 'STARTED', 'Started in background'):
         color = 'green'
+    elif event.startswith('Created'):
+        color = 'cyan'  # DB creation events
     elif event in ('Processing...', 'PROCESSING'):
         color = 'blue'
     elif event in ('Completed', 'COMPLETED', 'All work complete'):
@@ -606,8 +613,9 @@ def log_worker_event(
     text.append(indent)  # Indentation
     # Append worker label and event with color
     text.append(f'{worker_label} {event}{error_str}', style=color)
-    # Append metadata without color
-    text.append(metadata_str)
+    # Append metadata without color (add separator if metadata exists)
+    if metadata_str:
+        text.append(f' | {metadata_str}')
 
     CONSOLE.print(text)
 

+ 3 - 6
archivebox/plugins/accessibility/on_Snapshot__39_accessibility.js

@@ -21,9 +21,9 @@ const puppeteer = require('puppeteer-core');
 
 // Extractor metadata
 const EXTRACTOR_NAME = 'accessibility';
-const OUTPUT_DIR = 'accessibility';
+const OUTPUT_DIR = '.';
 const OUTPUT_FILE = 'accessibility.json';
-const CHROME_SESSION_DIR = 'chrome_session';
+const CHROME_SESSION_DIR = '../chrome_session';
 
 // Parse command line arguments
 function parseArgs() {
@@ -60,10 +60,7 @@ function getCdpUrl() {
 
 // Extract accessibility info
 async function extractAccessibility(url) {
-    // Create output directory
-    if (!fs.existsSync(OUTPUT_DIR)) {
-        fs.mkdirSync(OUTPUT_DIR, { recursive: true });
-    }
+    // Output directory is current directory (hook already runs in output dir)
     const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
 
     let browser = null;

+ 1 - 1
archivebox/plugins/archive_org/on_Snapshot__13_archive_org.py

@@ -24,7 +24,7 @@ import rich_click as click
 
 # Extractor metadata
 EXTRACTOR_NAME = 'archive_org'
-OUTPUT_DIR = 'archive_org'
+OUTPUT_DIR = '.'
 OUTPUT_FILE = 'archive.org.txt'
 
 

+ 1 - 1
archivebox/plugins/chrome_cleanup/on_Snapshot__45_chrome_cleanup.py

@@ -26,7 +26,7 @@ import rich_click as click
 
 # Extractor metadata
 EXTRACTOR_NAME = 'chrome_cleanup'
-CHROME_SESSION_DIR = 'chrome_session'
+CHROME_SESSION_DIR = '../chrome_session'
 
 
 def get_env(name: str, default: str = '') -> str:

+ 1 - 1
archivebox/plugins/chrome_navigate/on_Snapshot__30_chrome_navigate.js

@@ -31,7 +31,7 @@ const puppeteer = require('puppeteer-core');
 
 // Extractor metadata
 const EXTRACTOR_NAME = 'chrome_navigate';
-const CHROME_SESSION_DIR = 'chrome_session';
+const CHROME_SESSION_DIR = '../chrome_session';
 
 // Parse command line arguments
 function parseArgs() {

+ 3 - 6
archivebox/plugins/consolelog/on_Snapshot__21_consolelog.js

@@ -21,9 +21,9 @@ const puppeteer = require('puppeteer-core');
 
 // Extractor metadata
 const EXTRACTOR_NAME = 'consolelog';
-const OUTPUT_DIR = 'consolelog';
+const OUTPUT_DIR = '.';
 const OUTPUT_FILE = 'console.jsonl';
-const CHROME_SESSION_DIR = 'chrome_session';
+const CHROME_SESSION_DIR = '../chrome_session';
 
 // Parse command line arguments
 function parseArgs() {
@@ -86,10 +86,7 @@ async function serializeArgs(args) {
 async function captureConsoleLogs(url) {
     const captureTimeout = (getEnvInt('CONSOLELOG_TIMEOUT') || 5) * 1000;
 
-    // Create output directory
-    if (!fs.existsSync(OUTPUT_DIR)) {
-        fs.mkdirSync(OUTPUT_DIR, { recursive: true });
-    }
+    // Output directory is current directory (hook already runs in output dir)
     const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
 
     // Clear existing file

+ 4 - 7
archivebox/plugins/dom/on_Snapshot__36_dom.js

@@ -24,9 +24,9 @@ const puppeteer = require('puppeteer-core');
 
 // Extractor metadata
 const EXTRACTOR_NAME = 'dom';
-const OUTPUT_DIR = 'dom';
+const OUTPUT_DIR = '.';
 const OUTPUT_FILE = 'output.html';
-const CHROME_SESSION_DIR = 'chrome_session';
+const CHROME_SESSION_DIR = '../chrome_session';
 
 // Parse command line arguments
 function parseArgs() {
@@ -58,7 +58,7 @@ function getEnvInt(name, defaultValue = 0) {
 }
 
 // Check if staticfile extractor already downloaded this URL
-const STATICFILE_DIR = 'staticfile';
+const STATICFILE_DIR = '../staticfile';
 function hasStaticFileOutput() {
     return fs.existsSync(STATICFILE_DIR) && fs.readdirSync(STATICFILE_DIR).length > 0;
 }
@@ -114,10 +114,7 @@ async function dumpDom(url) {
 
     const { width, height } = parseResolution(resolution);
 
-    // Create output directory
-    if (!fs.existsSync(OUTPUT_DIR)) {
-        fs.mkdirSync(OUTPUT_DIR, { recursive: true });
-    }
+    // Output directory is current directory (hook already runs in output dir)
     const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
 
     let browser = null;

+ 1 - 1
archivebox/plugins/favicon/on_Snapshot__11_favicon.py

@@ -26,7 +26,7 @@ import rich_click as click
 
 # Extractor metadata
 EXTRACTOR_NAME = 'favicon'
-OUTPUT_DIR = 'favicon'
+OUTPUT_DIR = '.'
 OUTPUT_FILE = 'favicon.ico'
 
 

+ 1 - 1
archivebox/plugins/git/on_Snapshot__12_git.py

@@ -26,7 +26,7 @@ import rich_click as click
 EXTRACTOR_NAME = 'git'
 BIN_NAME = 'git'
 BIN_PROVIDERS = 'apt,brew,env'
-OUTPUT_DIR = 'repo'
+OUTPUT_DIR = '.'
 
 
 def get_env(name: str, default: str = '') -> str:

+ 3 - 6
archivebox/plugins/headers/on_Snapshot__33_headers.js

@@ -22,9 +22,9 @@ const http = require('http');
 
 // Extractor metadata
 const EXTRACTOR_NAME = 'headers';
-const OUTPUT_DIR = 'headers';
+const OUTPUT_DIR = '.';
 const OUTPUT_FILE = 'headers.json';
-const CHROME_SESSION_DIR = 'chrome_session';
+const CHROME_SESSION_DIR = '../chrome_session';
 const CHROME_HEADERS_FILE = 'response_headers.json';
 
 // Parse command line arguments
@@ -110,10 +110,7 @@ function fetchHeaders(url) {
 }
 
 async function extractHeaders(url) {
-    // Create output directory
-    if (!fs.existsSync(OUTPUT_DIR)) {
-        fs.mkdirSync(OUTPUT_DIR, { recursive: true });
-    }
+    // Output directory is current directory (hook already runs in output dir)
     const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
 
     // Try Chrome session first

+ 2 - 3
archivebox/plugins/htmltotext/on_Snapshot__54_htmltotext.py

@@ -28,7 +28,7 @@ import rich_click as click
 
 # Extractor metadata
 EXTRACTOR_NAME = 'htmltotext'
-OUTPUT_DIR = 'htmltotext'
+OUTPUT_DIR = '.'
 OUTPUT_FILE = 'htmltotext.txt'
 
 
@@ -114,9 +114,8 @@ def extract_htmltotext(url: str) -> tuple[bool, str | None, str]:
     if not text or len(text) < 10:
         return False, None, 'No meaningful text extracted from HTML'
 
-    # Create output directory and write output
+    # Output directory is current directory (hook already runs in output dir)
     output_dir = Path(OUTPUT_DIR)
-    output_dir.mkdir(exist_ok=True)
     output_path = output_dir / OUTPUT_FILE
     output_path.write_text(text, encoding='utf-8')
 

+ 3 - 4
archivebox/plugins/media/on_Snapshot__51_media.py

@@ -39,7 +39,7 @@ import rich_click as click
 EXTRACTOR_NAME = 'media'
 BIN_NAME = 'yt-dlp'
 BIN_PROVIDERS = 'pip,apt,brew,env'
-OUTPUT_DIR = 'media'
+OUTPUT_DIR = '.'
 
 
 def get_env(name: str, default: str = '') -> str:
@@ -62,7 +62,7 @@ def get_env_int(name: str, default: int = 0) -> int:
         return default
 
 
-STATICFILE_DIR = 'staticfile'
+STATICFILE_DIR = '../staticfile'
 
 def has_staticfile_output() -> bool:
     """Check if staticfile extractor already downloaded this URL."""
@@ -129,9 +129,8 @@ def save_media(url: str, binary: str) -> tuple[bool, str | None, str]:
     extra_args = get_env('YTDLP_EXTRA_ARGS') or get_env('YOUTUBEDL_EXTRA_ARGS', '')
     media_max_size = get_env('MEDIA_MAX_SIZE', '750m')
 
-    # Create output directory
+    # Output directory is current directory (hook already runs in output dir)
     output_dir = Path(OUTPUT_DIR)
-    output_dir.mkdir(exist_ok=True)
 
     # Build command (later options take precedence)
     cmd = [

+ 2 - 3
archivebox/plugins/mercury/on_Snapshot__53_mercury.py

@@ -27,7 +27,7 @@ import rich_click as click
 EXTRACTOR_NAME = 'mercury'
 BIN_NAME = 'postlight-parser'
 BIN_PROVIDERS = 'npm,env'
-OUTPUT_DIR = 'mercury'
+OUTPUT_DIR = '.'
 
 
 def get_env(name: str, default: str = '') -> str:
@@ -72,9 +72,8 @@ def extract_mercury(url: str, binary: str) -> tuple[bool, str | None, str]:
     """
     timeout = get_env_int('TIMEOUT', 60)
 
-    # Create output directory
+    # Output directory is current directory (hook already runs in output dir)
     output_dir = Path(OUTPUT_DIR)
-    output_dir.mkdir(exist_ok=True)
 
     try:
         # Get text version

+ 3 - 6
archivebox/plugins/parse_dom_outlinks/on_Snapshot__40_parse_dom_outlinks.js

@@ -24,10 +24,10 @@ const puppeteer = require('puppeteer-core');
 
 // Extractor metadata
 const EXTRACTOR_NAME = 'parse_dom_outlinks';
-const OUTPUT_DIR = 'parse_dom_outlinks';
+const OUTPUT_DIR = '.';
 const OUTPUT_FILE = 'outlinks.json';
 const URLS_FILE = 'urls.jsonl';  // For crawl system
-const CHROME_SESSION_DIR = 'chrome_session';
+const CHROME_SESSION_DIR = '../chrome_session';
 
 // Parse command line arguments
 function parseArgs() {
@@ -64,10 +64,7 @@ function getCdpUrl() {
 
 // Extract outlinks
 async function extractOutlinks(url) {
-    // Create output directory
-    if (!fs.existsSync(OUTPUT_DIR)) {
-        fs.mkdirSync(OUTPUT_DIR, { recursive: true });
-    }
+    // Output directory is current directory (hook already runs in output dir)
     const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
 
     let browser = null;

+ 4 - 7
archivebox/plugins/pdf/on_Snapshot__35_pdf.js

@@ -23,9 +23,9 @@ const puppeteer = require('puppeteer-core');
 
 // Extractor metadata
 const EXTRACTOR_NAME = 'pdf';
-const OUTPUT_DIR = 'pdf';
+const OUTPUT_DIR = '.';
 const OUTPUT_FILE = 'output.pdf';
-const CHROME_SESSION_DIR = 'chrome_session';
+const CHROME_SESSION_DIR = '../chrome_session';
 
 // Parse command line arguments
 function parseArgs() {
@@ -57,7 +57,7 @@ function getEnvInt(name, defaultValue = 0) {
 }
 
 // Check if staticfile extractor already downloaded this URL
-const STATICFILE_DIR = 'staticfile';
+const STATICFILE_DIR = '../staticfile';
 function hasStaticFileOutput() {
     return fs.existsSync(STATICFILE_DIR) && fs.readdirSync(STATICFILE_DIR).length > 0;
 }
@@ -113,10 +113,7 @@ async function printToPdf(url) {
 
     const { width, height } = parseResolution(resolution);
 
-    // Create output directory
-    if (!fs.existsSync(OUTPUT_DIR)) {
-        fs.mkdirSync(OUTPUT_DIR, { recursive: true });
-    }
+    // Output directory is current directory (hook already runs in output dir)
     const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
 
     let browser = null;

+ 2 - 3
archivebox/plugins/readability/on_Snapshot__52_readability.py

@@ -29,7 +29,7 @@ import rich_click as click
 EXTRACTOR_NAME = 'readability'
 BIN_NAME = 'readability-extractor'
 BIN_PROVIDERS = 'npm,env'
-OUTPUT_DIR = 'readability'
+OUTPUT_DIR = '.'
 
 
 def get_env(name: str, default: str = '') -> str:
@@ -101,9 +101,8 @@ def extract_readability(url: str, binary: str) -> tuple[bool, str | None, str]:
     if not html_source:
         return False, None, 'No HTML source found (run singlefile, dom, or wget first)'
 
-    # Create output directory
+    # Output directory is current directory (hook already runs in output dir)
     output_dir = Path(OUTPUT_DIR)
-    output_dir.mkdir(exist_ok=True)
 
     try:
         # Run readability-extractor (outputs JSON by default)

+ 3 - 6
archivebox/plugins/redirects/on_Snapshot__22_redirects.js

@@ -21,9 +21,9 @@ const puppeteer = require('puppeteer-core');
 
 // Extractor metadata
 const EXTRACTOR_NAME = 'redirects';
-const OUTPUT_DIR = 'redirects';
+const OUTPUT_DIR = '.';
 const OUTPUT_FILE = 'redirects.json';
-const CHROME_SESSION_DIR = 'chrome_session';
+const CHROME_SESSION_DIR = '../chrome_session';
 
 // Parse command line arguments
 function parseArgs() {
@@ -60,10 +60,7 @@ function getCdpUrl() {
 
 // Track redirect chain
 async function trackRedirects(url) {
-    // Create output directory
-    if (!fs.existsSync(OUTPUT_DIR)) {
-        fs.mkdirSync(OUTPUT_DIR, { recursive: true });
-    }
+    // Output directory is current directory (hook already runs in output dir)
     const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
 
     let browser = null;

+ 4 - 6
archivebox/plugins/responses/on_Snapshot__24_responses.js

@@ -26,8 +26,8 @@ const puppeteer = require('puppeteer-core');
 
 // Extractor metadata
 const EXTRACTOR_NAME = 'responses';
-const OUTPUT_DIR = 'responses';
-const CHROME_SESSION_DIR = 'chrome_session';
+const OUTPUT_DIR = '.';
+const CHROME_SESSION_DIR = '../chrome_session';
 
 // Resource types to capture (by default, capture everything)
 const DEFAULT_TYPES = ['script', 'stylesheet', 'font', 'image', 'media', 'xhr', 'websocket'];
@@ -149,10 +149,8 @@ async function archiveResponses(originalUrl) {
     const typesStr = getEnv('RESPONSES_TYPES', DEFAULT_TYPES.join(','));
     const typesToSave = typesStr.split(',').map(t => t.trim().toLowerCase());
 
-    // Create output directories
-    if (!fs.existsSync(OUTPUT_DIR)) {
-        fs.mkdirSync(OUTPUT_DIR, { recursive: true });
-    }
+    // Output directory is current directory (hook already runs in output dir)
+    // Create subdirectories for organizing responses
     const allDir = path.join(OUTPUT_DIR, 'all');
     if (!fs.existsSync(allDir)) {
         fs.mkdirSync(allDir, { recursive: true });

+ 4 - 7
archivebox/plugins/screenshot/on_Snapshot__34_screenshot.js

@@ -23,9 +23,9 @@ const puppeteer = require('puppeteer-core');
 
 // Extractor metadata
 const EXTRACTOR_NAME = 'screenshot';
-const OUTPUT_DIR = 'screenshot';
+const OUTPUT_DIR = '.';
 const OUTPUT_FILE = 'screenshot.png';
-const CHROME_SESSION_DIR = 'chrome_session';
+const CHROME_SESSION_DIR = '../chrome_session';
 
 // Parse command line arguments
 function parseArgs() {
@@ -57,7 +57,7 @@ function getEnvInt(name, defaultValue = 0) {
 }
 
 // Check if staticfile extractor already downloaded this URL
-const STATICFILE_DIR = 'staticfile';
+const STATICFILE_DIR = '../staticfile';
 function hasStaticFileOutput() {
     return fs.existsSync(STATICFILE_DIR) && fs.readdirSync(STATICFILE_DIR).length > 0;
 }
@@ -116,10 +116,7 @@ async function takeScreenshot(url) {
 
     const { width, height } = parseResolution(resolution);
 
-    // Create output directory
-    if (!fs.existsSync(OUTPUT_DIR)) {
-        fs.mkdirSync(OUTPUT_DIR, { recursive: true });
-    }
+    // Output directory is current directory (hook already runs in output dir)
     const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
 
     let browser = null;

+ 131 - 0
archivebox/plugins/search_backend_ripgrep/on_Crawl__00_validate_ripgrep.py

@@ -0,0 +1,131 @@
+#!/usr/bin/env python3
+"""
+Validation hook for ripgrep binary.
+
+Only runs if SEARCH_BACKEND_ENGINE is set to 'ripgrep'.
+Outputs JSONL for InstalledBinary and Machine config updates.
+"""
+
+import os
+import sys
+import json
+import shutil
+import hashlib
+import subprocess
+from pathlib import Path
+
+
+def get_binary_version(abspath: str) -> str | None:
+    """Get version string from ripgrep binary."""
+    try:
+        result = subprocess.run(
+            [abspath, '--version'],
+            capture_output=True,
+            text=True,
+            timeout=5,
+        )
+        if result.returncode == 0 and result.stdout:
+            # ripgrep version string: "ripgrep 14.1.0"
+            first_line = result.stdout.strip().split('\n')[0]
+            parts = first_line.split()
+            for i, part in enumerate(parts):
+                if part.lower() == 'ripgrep' and i + 1 < len(parts):
+                    return parts[i + 1]
+            # Try to find version number pattern
+            for part in parts:
+                if part[0].isdigit() and '.' in part:
+                    return part
+            return first_line[:32]
+    except Exception:
+        pass
+    return None
+
+
+def get_binary_hash(abspath: str) -> str | None:
+    """Get SHA256 hash of binary."""
+    try:
+        with open(abspath, 'rb') as f:
+            return hashlib.sha256(f.read()).hexdigest()
+    except Exception:
+        return None
+
+
def find_ripgrep() -> dict | None:
    """Locate the ripgrep binary and describe it.

    Resolution order:
      1. $RIPGREP_BINARY interpreted as a path (contains '/' and is a file)
      2. $RIPGREP_BINARY interpreted as a bare name, resolved via shutil.which()
      3. plain 'rg' on $PATH

    Returns a dict with name/abspath/version/sha256/binprovider keys,
    or None when no usable binary is found.
    """
    configured = os.environ.get('RIPGREP_BINARY', '')

    abspath: str | None
    if configured and '/' in configured and Path(configured).is_file():
        # An explicit path was provided via the environment - use it directly.
        abspath = configured
    else:
        # Treat the env value (if any) as a binary name, then fall back to 'rg'.
        abspath = (shutil.which(configured) if configured else None) or shutil.which('rg')

    if not (abspath and Path(abspath).is_file()):
        return None

    return {
        'name': 'rg',
        'abspath': abspath,
        'version': get_binary_version(abspath),
        'sha256': get_binary_hash(abspath),
        'binprovider': 'env',
    }
+
+
def main():
    """Validate the ripgrep binary and emit JSONL records for the crawl system.

    No-op (exit 0) unless SEARCH_BACKEND_ENGINE=ripgrep. When the binary is
    found, prints an InstalledBinary record plus Machine config updates and
    exits 0; otherwise prints a Dependency request and exits 1.
    """

    # Only validate ripgrep when it is the active search backend.
    search_backend = os.environ.get('SEARCH_BACKEND_ENGINE', '').lower()

    if search_backend != 'ripgrep':
        # No-op: ripgrep is not the active search backend
        sys.exit(0)

    result = find_ripgrep()

    if result and result.get('abspath'):
        # Output InstalledBinary record describing the resolved binary
        print(json.dumps({
            'type': 'InstalledBinary',
            'name': result['name'],
            'abspath': result['abspath'],
            'version': result['version'],
            'sha256': result['sha256'],
            'binprovider': result['binprovider'],
        }))

        # Persist the resolved absolute path so later reads use the full path
        print(json.dumps({
            'type': 'Machine',
            '_method': 'update',
            'key': 'config/RIPGREP_BINARY',
            'value': result['abspath'],
        }))

        # Version may be None if `rg --version` could not be parsed; only store it when known
        if result['version']:
            print(json.dumps({
                'type': 'Machine',
                '_method': 'update',
                'key': 'config/RIPGREP_VERSION',
                'value': result['version'],
            }))

        sys.exit(0)
    else:
        # Output Dependency request so the install system can provide ripgrep
        print(json.dumps({
            'type': 'Dependency',
            'bin_name': 'rg',
            'bin_providers': 'apt,brew,cargo,env',
        }))

        # Exit non-zero to indicate binary not found
        # (fix: was an f-string with no placeholders, ruff F541)
        print("ripgrep binary not found", file=sys.stderr)
        sys.exit(1)

+ 0 - 0
archivebox/plugins/search_backend_ripgrep/tests/__init__.py


+ 306 - 0
archivebox/plugins/search_backend_ripgrep/tests/test_ripgrep_detection.py

@@ -0,0 +1,306 @@
+#!/usr/bin/env python3
+"""
+Tests for ripgrep binary detection and archivebox install functionality.
+
+Guards against regressions in:
+1. Machine.config overrides not being used in version command
+2. Ripgrep hook not resolving binary names via shutil.which()
+3. SEARCH_BACKEND_ENGINE not being passed to hook environment
+"""
+
+import os
+import sys
+import json
+import shutil
+import tempfile
+import subprocess
+from pathlib import Path
+from unittest.mock import patch, MagicMock
+
+import pytest
+
+
def test_ripgrep_hook_detects_binary_from_path():
    """Test that ripgrep hook finds binary using shutil.which() when env var is just a name."""
    # Hook under test lives two directories above this tests/ package.
    hook_path = Path(__file__).parent.parent / 'on_Crawl__00_validate_ripgrep.py'

    # Skip if rg is not installed
    if not shutil.which('rg'):
        pytest.skip("ripgrep (rg) not installed")

    # Set SEARCH_BACKEND_ENGINE to enable the hook
    env = os.environ.copy()
    env['SEARCH_BACKEND_ENGINE'] = 'ripgrep'
    env['RIPGREP_BINARY'] = 'rg'  # Just the name, not the full path (this was the bug)

    # Run the hook as a subprocess, exactly as the crawl system would.
    result = subprocess.run(
        [sys.executable, str(hook_path)],
        capture_output=True,
        text=True,
        env=env,
        timeout=10,
    )

    assert result.returncode == 0, f"Hook failed: {result.stderr}"

    # Parse JSONL output
    lines = [line for line in result.stdout.strip().split('\n') if line.strip()]
    assert len(lines) >= 2, "Expected at least 2 JSONL lines (InstalledBinary + Machine config)"

    # First record: the InstalledBinary row describing the resolved binary.
    installed_binary = json.loads(lines[0])
    assert installed_binary['type'] == 'InstalledBinary'
    assert installed_binary['name'] == 'rg'
    assert '/' in installed_binary['abspath'], "Expected full path, not just binary name"
    assert Path(installed_binary['abspath']).is_file(), "Binary path should exist"
    assert installed_binary['version'], "Version should be detected"

    # Second record: the Machine config update storing the resolved path.
    machine_config = json.loads(lines[1])
    assert machine_config['type'] == 'Machine'
    assert machine_config['key'] == 'config/RIPGREP_BINARY'
    assert '/' in machine_config['value'], "Machine config should store full path"
+
+
def test_ripgrep_hook_skips_when_backend_not_ripgrep():
    """Test that ripgrep hook exits silently when search backend is not ripgrep."""
    hook_script = Path(__file__).parent.parent / 'on_Crawl__00_validate_ripgrep.py'

    # Pretend a different search backend is configured.
    hook_env = dict(os.environ, SEARCH_BACKEND_ENGINE='sqlite')

    proc = subprocess.run(
        [sys.executable, str(hook_script)],
        env=hook_env,
        capture_output=True,
        text=True,
        timeout=10,
    )

    # The hook must be a silent no-op: success exit code, no JSONL emitted.
    assert proc.returncode == 0, "Hook should exit successfully when backend is not ripgrep"
    assert proc.stdout.strip() == '', "Hook should produce no output when backend is not ripgrep"
+
+
def test_ripgrep_hook_handles_absolute_path():
    """Test that ripgrep hook works when RIPGREP_BINARY is an absolute path."""
    # Hook under test lives two directories above this tests/ package.
    hook_path = Path(__file__).parent.parent / 'on_Crawl__00_validate_ripgrep.py'

    rg_path = shutil.which('rg')
    if not rg_path:
        pytest.skip("ripgrep (rg) not installed")

    # Enable the hook and hand it the already-resolved absolute path.
    env = os.environ.copy()
    env['SEARCH_BACKEND_ENGINE'] = 'ripgrep'
    env['RIPGREP_BINARY'] = rg_path  # Full absolute path

    result = subprocess.run(
        [sys.executable, str(hook_path)],
        capture_output=True,
        text=True,
        env=env,
        timeout=10,
    )

    assert result.returncode == 0, f"Hook failed: {result.stderr}"
    assert result.stdout.strip(), "Hook should produce output"

    # The InstalledBinary record must echo back the exact path we supplied.
    installed_binary = json.loads(result.stdout.strip().split('\n')[0])
    assert installed_binary['abspath'] == rg_path
+
+
+@pytest.mark.django_db
def test_machine_config_overrides_base_config():
    """
    Test that Machine.config overrides take precedence over base config.

    Guards against regression where archivebox version was showing binaries
    as "not installed" even though they were detected and stored in Machine.config.
    """
    # Imported lazily so Django app loading happens inside the test.
    from machine.models import Machine, InstalledBinary

    machine = Machine.current()

    # Simulate a hook detecting chrome and storing it with a different path than base config
    detected_chrome_path = '/custom/path/to/chrome'
    machine.config['CHROME_BINARY'] = detected_chrome_path
    machine.config['CHROME_VERSION'] = '143.0.7499.170'
    machine.save()

    # Create InstalledBinary record
    InstalledBinary.objects.create(
        machine=machine,
        name='chrome',
        abspath=detected_chrome_path,
        version='143.0.7499.170',
        binprovider='env',
    )

    # Verify Machine.config takes precedence
    from archivebox.config.configset import get_config
    config = get_config()

    # Machine.config should override the base config value
    assert machine.config.get('CHROME_BINARY') == detected_chrome_path

    # The version command should use Machine.config, not base config
    # (Base config might have 'chromium' while Machine.config has the full path)
    bin_value = machine.config.get('CHROME_BINARY') or config.get('CHROME_BINARY', '')
    assert bin_value == detected_chrome_path, \
        "Machine.config override should take precedence over base config"
+
+
+@pytest.mark.django_db
def test_search_backend_engine_passed_to_hooks():
    """
    Test that SEARCH_BACKEND_ENGINE is passed to hook environment.

    Guards against regression where hooks couldn't determine which search backend was active.
    """
    # (fix: removed unused local `from pathlib import Path` import)
    from archivebox.hooks import build_hook_environment
    from archivebox.config.configset import get_config

    # Whatever backend the current config reports must be what hooks see.
    config = get_config()
    search_backend = config.get('SEARCH_BACKEND_ENGINE', 'ripgrep')

    env = build_hook_environment(overrides=None)

    assert 'SEARCH_BACKEND_ENGINE' in env, \
        "SEARCH_BACKEND_ENGINE must be in hook environment"
    assert env['SEARCH_BACKEND_ENGINE'] == search_backend, \
        f"Expected SEARCH_BACKEND_ENGINE={search_backend}, got {env.get('SEARCH_BACKEND_ENGINE')}"
+
+
+@pytest.mark.django_db
def test_install_creates_installedbinary_records():
    """
    Test that archivebox install creates InstalledBinary records for detected binaries.

    This is an integration test that verifies the full install flow.
    """
    # Imported lazily so Django app loading happens inside the test.
    from machine.models import Machine, InstalledBinary
    from crawls.models import Seed, Crawl
    from crawls.statemachines import CrawlMachine
    from archivebox.base_models.models import get_or_create_system_user_pk

    machine = Machine.current()
    # Baseline count so pre-existing records don't skew the assertion below.
    initial_binary_count = InstalledBinary.objects.filter(machine=machine).count()

    # Create an install crawl (like archivebox install does)
    created_by_id = get_or_create_system_user_pk()
    seed, _ = Seed.objects.get_or_create(
        uri='archivebox://test-install',
        label='Test dependency detection',
        created_by_id=created_by_id,
        defaults={'extractor': 'auto'},
    )

    crawl = Crawl.objects.create(
        seed=seed,
        max_depth=0,
        created_by_id=created_by_id,
        status='queued',
    )

    # Run the crawl state machine (this triggers hooks)
    sm = CrawlMachine(crawl)
    sm.send('tick')  # queued -> started (runs hooks)

    # Verify InstalledBinary records were created
    final_binary_count = InstalledBinary.objects.filter(machine=machine).count()
    assert final_binary_count > initial_binary_count, \
        "archivebox install should create InstalledBinary records"

    # Verify at least some common binaries were detected
    common_binaries = ['git', 'wget', 'node']
    detected = []
    for bin_name in common_binaries:
        if InstalledBinary.objects.filter(machine=machine, name=bin_name).exists():
            detected.append(bin_name)

    assert detected, f"At least one of {common_binaries} should be detected"

    # Verify detected binaries have valid paths and versions
    for binary in InstalledBinary.objects.filter(machine=machine):
        if binary.abspath:  # Only check non-empty paths
            assert '/' in binary.abspath, \
                f"{binary.name} should have full path, not just name: {binary.abspath}"
            # Version might be empty for some binaries, that's ok
+
+
+@pytest.mark.django_db
def test_ripgrep_only_detected_when_backend_enabled():
    """
    Test that ripgrep is only detected when SEARCH_BACKEND_ENGINE='ripgrep'.

    Guards against ripgrep being installed/detected when not needed.
    """
    # Imported lazily so Django app loading happens inside the test.
    from machine.models import Machine, InstalledBinary
    from crawls.models import Seed, Crawl
    from crawls.statemachines import CrawlMachine
    from archivebox.base_models.models import get_or_create_system_user_pk
    from django.conf import settings

    if not shutil.which('rg'):
        pytest.skip("ripgrep (rg) not installed")

    machine = Machine.current()

    # Clear any existing ripgrep records
    InstalledBinary.objects.filter(machine=machine, name='rg').delete()

    # Test 1: With ripgrep backend - should be detected
    # NOTE(review): patching get_config only affects in-process readers; if the
    # hook runs as a subprocess it sees the real environment - confirm the
    # state machine propagates this config to hook env vars.
    with patch('archivebox.config.configset.get_config') as mock_config:
        mock_config.return_value = {'SEARCH_BACKEND_ENGINE': 'ripgrep', 'RIPGREP_BINARY': 'rg'}

        created_by_id = get_or_create_system_user_pk()
        seed = Seed.objects.create(
            uri='archivebox://test-rg-enabled',
            label='Test ripgrep detection enabled',
            created_by_id=created_by_id,
            extractor='auto',
        )

        crawl = Crawl.objects.create(
            seed=seed,
            max_depth=0,
            created_by_id=created_by_id,
            status='queued',
        )

        # Running the crawl state machine triggers the on_Crawl hooks.
        sm = CrawlMachine(crawl)
        sm.send('tick')

        # Ripgrep should be detected
        rg_detected = InstalledBinary.objects.filter(machine=machine, name='rg').exists()
        assert rg_detected, "Ripgrep should be detected when SEARCH_BACKEND_ENGINE='ripgrep'"

    # Clear records again
    InstalledBinary.objects.filter(machine=machine, name='rg').delete()

    # Test 2: With different backend - should NOT be detected
    with patch('archivebox.config.configset.get_config') as mock_config:
        mock_config.return_value = {'SEARCH_BACKEND_ENGINE': 'sqlite', 'RIPGREP_BINARY': 'rg'}

        seed2 = Seed.objects.create(
            uri='archivebox://test-rg-disabled',
            label='Test ripgrep detection disabled',
            created_by_id=created_by_id,
            extractor='auto',
        )

        crawl2 = Crawl.objects.create(
            seed=seed2,
            max_depth=0,
            created_by_id=created_by_id,
            status='queued',
        )

        sm2 = CrawlMachine(crawl2)
        sm2.send('tick')

        # Ripgrep should NOT be detected
        rg_detected = InstalledBinary.objects.filter(machine=machine, name='rg').exists()
        assert not rg_detected, "Ripgrep should NOT be detected when SEARCH_BACKEND_ENGINE!='ripgrep'"
+
+
# Allow running this test module directly without invoking pytest from the CLI.
if __name__ == '__main__':
    pytest.main([__file__, '-v'])

+ 1 - 1
archivebox/plugins/search_backend_sonic/on_Snapshot__91_index_sonic.py

@@ -29,7 +29,7 @@ import rich_click as click
 
 # Extractor metadata
 EXTRACTOR_NAME = 'index_sonic'
-OUTPUT_DIR = 'search_index'
+OUTPUT_DIR = '.'
 
 # Text file patterns to index
 INDEXABLE_FILES = [

+ 1 - 1
archivebox/plugins/search_backend_sqlite/on_Snapshot__90_index_sqlite.py

@@ -27,7 +27,7 @@ import rich_click as click
 
 # Extractor metadata
 EXTRACTOR_NAME = 'index_sqlite'
-OUTPUT_DIR = 'search_index'
+OUTPUT_DIR = '.'
 
 # Text file patterns to index, in priority order
 INDEXABLE_FILES = [

+ 3 - 6
archivebox/plugins/seo/on_Snapshot__38_seo.js

@@ -21,9 +21,9 @@ const puppeteer = require('puppeteer-core');
 
 // Extractor metadata
 const EXTRACTOR_NAME = 'seo';
-const OUTPUT_DIR = 'seo';
+const OUTPUT_DIR = '.';
 const OUTPUT_FILE = 'seo.json';
-const CHROME_SESSION_DIR = 'chrome_session';
+const CHROME_SESSION_DIR = '../chrome_session';
 
 // Parse command line arguments
 function parseArgs() {
@@ -60,10 +60,7 @@ function getCdpUrl() {
 
 // Extract SEO metadata
 async function extractSeo(url) {
-    // Create output directory
-    if (!fs.existsSync(OUTPUT_DIR)) {
-        fs.mkdirSync(OUTPUT_DIR, { recursive: true });
-    }
+    // Output directory is current directory (hook already runs in output dir)
     const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
 
     let browser = null;

+ 3 - 5
archivebox/plugins/singlefile/on_Snapshot__04_singlefile.js

@@ -40,7 +40,7 @@ const EXTENSIONS_DIR = process.env.CHROME_EXTENSIONS_DIR ||
 const CHROME_DOWNLOADS_DIR = process.env.CHROME_DOWNLOADS_DIR ||
     path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_downloads');
 
-const OUTPUT_DIR = 'singlefile';
+const OUTPUT_DIR = '.';
 const OUTPUT_FILE = 'singlefile.html';
 
 /**
@@ -102,8 +102,7 @@ async function saveSinglefileWithExtension(page, extension, options = {}) {
             .filter(fn => fn.endsWith('.html'))
     );
 
-    // Ensure output directory exists
-    await fs.promises.mkdir(OUTPUT_DIR, { recursive: true });
+    // Output directory is current directory (hook already runs in output dir)
     const out_path = path.join(OUTPUT_DIR, OUTPUT_FILE);
 
     console.log(`[🛠️] Saving SingleFile HTML using extension (${extension.id})...`);
@@ -170,8 +169,7 @@ async function saveSinglefileWithCLI(url, options = {}) {
         return null;
     }
 
-    // Ensure output directory exists
-    await fs.promises.mkdir(OUTPUT_DIR, { recursive: true });
+    // Output directory is current directory (hook already runs in output dir)
     const out_path = path.join(OUTPUT_DIR, OUTPUT_FILE);
 
     // Build command

+ 5 - 6
archivebox/plugins/singlefile/on_Snapshot__37_singlefile.py

@@ -41,7 +41,7 @@ import rich_click as click
 EXTRACTOR_NAME = 'singlefile'
 BIN_NAME = 'single-file'
 BIN_PROVIDERS = 'npm,env'
-OUTPUT_DIR = 'singlefile'
+OUTPUT_DIR = '.'
 OUTPUT_FILE = 'singlefile.html'
 
 
@@ -65,7 +65,7 @@ def get_env_int(name: str, default: int = 0) -> int:
         return default
 
 
-STATICFILE_DIR = 'staticfile'
+STATICFILE_DIR = '../staticfile'
 
 def has_staticfile_output() -> bool:
     """Check if staticfile extractor already downloaded this URL."""
@@ -135,7 +135,7 @@ def get_version(binary: str) -> str:
         return ''
 
 
-CHROME_SESSION_DIR = 'chrome_session'
+CHROME_SESSION_DIR = '../chrome_session'
 
 
 def get_cdp_url() -> str | None:
@@ -203,9 +203,8 @@ def save_singlefile(url: str, binary: str) -> tuple[bool, str | None, str]:
     if extra_args:
         cmd.extend(extra_args.split())
 
-    # Create output directory
+    # Output directory is current directory (hook already runs in output dir)
     output_dir = Path(OUTPUT_DIR)
-    output_dir.mkdir(exist_ok=True)
     output_path = output_dir / OUTPUT_FILE
 
     cmd.extend([url, str(output_path)])
@@ -274,7 +273,7 @@ def main(url: str, snapshot_id: str):
             sys.exit(1)
 
         version = get_version(binary)
-        cmd_str = f'{binary} {url} {OUTPUT_DIR}/{OUTPUT_FILE}'
+        cmd_str = f'{binary} {url} {OUTPUT_FILE}'
 
         # Run extraction
         success, output, error = save_singlefile(url, binary)

+ 3 - 6
archivebox/plugins/ssl/on_Snapshot__23_ssl.js

@@ -21,9 +21,9 @@ const puppeteer = require('puppeteer-core');
 
 // Extractor metadata
 const EXTRACTOR_NAME = 'ssl';
-const OUTPUT_DIR = 'ssl';
+const OUTPUT_DIR = '.';
 const OUTPUT_FILE = 'ssl.json';
-const CHROME_SESSION_DIR = 'chrome_session';
+const CHROME_SESSION_DIR = '../chrome_session';
 
 // Parse command line arguments
 function parseArgs() {
@@ -60,10 +60,7 @@ function getCdpUrl() {
 
 // Extract SSL details
 async function extractSsl(url) {
-    // Create output directory
-    if (!fs.existsSync(OUTPUT_DIR)) {
-        fs.mkdirSync(OUTPUT_DIR, { recursive: true });
-    }
+    // Output directory is current directory (hook already runs in output dir)
     const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
 
     // Only extract SSL for HTTPS URLs

+ 3 - 4
archivebox/plugins/staticfile/on_Snapshot__31_staticfile.py

@@ -31,8 +31,8 @@ import rich_click as click
 
 # Extractor metadata
 EXTRACTOR_NAME = 'staticfile'
-OUTPUT_DIR = 'staticfile'
-CHROME_SESSION_DIR = 'chrome_session'
+OUTPUT_DIR = '.'
+CHROME_SESSION_DIR = '../chrome_session'
 
 # Content-Types that indicate static files
 # These can't be meaningfully processed by Chrome-based extractors
@@ -214,9 +214,8 @@ def download_file(url: str) -> tuple[bool, str | None, str]:
         if content_length and int(content_length) > max_size:
             return False, None, f'File too large: {int(content_length)} bytes > {max_size} max'
 
-        # Create output directory
+        # Output directory is current directory (hook already runs in output dir)
         output_dir = Path(OUTPUT_DIR)
-        output_dir.mkdir(exist_ok=True)
 
         # Determine filename
         filename = get_filename_from_url(url)

+ 3 - 6
archivebox/plugins/title/on_Snapshot__32_title.js

@@ -21,9 +21,9 @@ const http = require('http');
 
 // Extractor metadata
 const EXTRACTOR_NAME = 'title';
-const OUTPUT_DIR = 'title';
+const OUTPUT_DIR = '.';
 const OUTPUT_FILE = 'title.txt';
-const CHROME_SESSION_DIR = 'chrome_session';
+const CHROME_SESSION_DIR = '../chrome_session';
 
 // Parse command line arguments
 function parseArgs() {
@@ -162,10 +162,7 @@ async function getTitleFromCdp(cdpUrl) {
 }
 
 async function extractTitle(url) {
-    // Create output directory
-    if (!fs.existsSync(OUTPUT_DIR)) {
-        fs.mkdirSync(OUTPUT_DIR, { recursive: true });
-    }
+    // Output directory is current directory (hook already runs in output dir)
     const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
 
     // Try Chrome session first

+ 2 - 2
archivebox/plugins/wget/on_Snapshot__50_wget.py

@@ -43,7 +43,7 @@ import rich_click as click
 EXTRACTOR_NAME = 'wget'
 BIN_NAME = 'wget'
 BIN_PROVIDERS = 'apt,brew,env'
-OUTPUT_DIR = 'wget'
+OUTPUT_DIR = '.'
 
 
 def get_env(name: str, default: str = '') -> str:
@@ -66,7 +66,7 @@ def get_env_int(name: str, default: int = 0) -> int:
         return default
 
 
-STATICFILE_DIR = 'staticfile'
+STATICFILE_DIR = '../staticfile'
 
 def has_staticfile_output() -> bool:
     """Check if staticfile extractor already downloaded this URL."""

+ 1025 - 0
archivebox/templates/admin/base.html

@@ -30,6 +30,1031 @@
                 color: white;
                 cursor: pointer;
             }
+
+            /* ============================================
+               Modern card-based admin UI (shadcn-inspired)
+               ============================================ */
+
+            /* Base font improvements */
+            body, html {
+                font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif;
+                -webkit-font-smoothing: antialiased;
+                -moz-osx-font-smoothing: grayscale;
+                font-size: 15px;
+                line-height: 1.6;
+                color: #0f172a;
+                background: #f8fafc;
+            }
+
+            #container {
+                background: #f8fafc;
+            }
+
+            #content {
+                padding: 24px;
+            }
+
+            /* Main form container - flexbox grid */
+            #content-main form > div,
+            #content form > div {
+                display: flex;
+                flex-wrap: wrap;
+                gap: 20px;
+                align-items: stretch;
+            }
+
+            /* Each fieldset becomes a card */
+            #content-main form fieldset,
+            #content form fieldset,
+            #content-main form .module:not(.inline-group),
+            #content form .module:not(.inline-group) {
+                background: #fff !important;
+                border: 1px solid #e2e8f0 !important;
+                border-top: 1px solid #e2e8f0 !important;
+                border-left: 1px solid #e2e8f0 !important;
+                border-right: 1px solid #e2e8f0 !important;
+                border-bottom: 1px solid #e2e8f0 !important;
+                border-radius: 12px !important;
+                padding: 0 !important;
+                margin: 0 !important;
+                box-shadow: 0 1px 3px rgba(0,0,0,0.04), 0 1px 2px rgba(0,0,0,0.06);
+                flex: 1 1 340px;
+                min-width: 320px;
+                max-width: calc(33.33% - 14px);
+                box-sizing: border-box;
+                display: flex;
+                flex-direction: column;
+                transition: box-shadow 0.2s ease, border-color 0.2s ease;
+                overflow: hidden;
+            }
+
+            /* Wide fieldsets MUST override card max-width - placed after card rules for specificity */
+            #content-main form fieldset.wide,
+            #content form fieldset.wide,
+            #content-main form fieldset:has(.field-archiveresults_list),
+            #content form fieldset:has(.field-archiveresults_list),
+            #content-main form fieldset:has(.field-snapshots),
+            #content form fieldset:has(.field-snapshots) {
+                flex: 1 1 100% !important;
+                max-width: 100% !important;
+                min-width: 100% !important;
+                width: 100% !important;
+                flex-basis: 100% !important;
+            }
+
+            /* Inline groups should NOT have card constraints */
+            #content-main form .inline-group,
+            #content form .inline-group,
+            .inline-group fieldset,
+            .inline-group .module {
+                flex: 1 1 100% !important;
+                max-width: 100% !important;
+                min-width: 100% !important;
+                width: 100% !important;
+            }
+
+            #content-main form fieldset:hover,
+            #content form fieldset:hover {
+                box-shadow: 0 4px 6px rgba(0,0,0,0.05), 0 2px 4px rgba(0,0,0,0.06);
+                border-color: #cbd5e1;
+            }
+
+            /* Archive results list content should take full width */
+            .field-archiveresults_list,
+            .field-archiveresults_list .readonly,
+            .field-snapshots,
+            .field-snapshots .readonly {
+                width: 100% !important;
+                max-width: 100% !important;
+                background: transparent !important;
+                border: none !important;
+                padding: 0 !important;
+            }
+
+            /* Card headers - no borders, just background */
+            #content-main form fieldset h2,
+            #content form fieldset h2,
+            #content-main form .module h2,
+            #content form .module h2 {
+                margin: 0 !important;
+                padding: 8px 16px !important;
+                background: #f1f5f9 !important;
+                color: #334155 !important;
+                font-size: 12px !important;
+                font-weight: 600 !important;
+                font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif !important;
+                border: none !important;
+                border-top: none !important;
+                border-left: none !important;
+                border-right: none !important;
+                border-bottom: none !important;
+                border-radius: 0 !important;
+                text-transform: uppercase;
+                letter-spacing: 0.5px;
+                flex-shrink: 0;
+                -webkit-font-smoothing: antialiased;
+                box-shadow: none !important;
+                outline: none !important;
+            }
+
+            /* Collapse toggle styling */
+            #content-main form fieldset h2 a.collapse-toggle,
+            #content form fieldset h2 a.collapse-toggle {
+                color: #64748b;
+            }
+
+            /* Card content area */
+            #content-main form fieldset > div,
+            #content form fieldset > div {
+                padding: 20px;
+                flex: 1;
+                overflow-x: hidden;
+                overflow-y: visible;
+                min-width: 0;
+            }
+
+            /* Form rows inside cards */
+            #content-main form fieldset .form-row,
+            #content form fieldset .form-row {
+                padding: 8px 0;
+                border-bottom: 1px solid #f1f5f9;
+                min-width: 0;
+                min-height: auto;
+            }
+
+            #content-main form fieldset .form-row:first-child,
+            #content form fieldset .form-row:first-child {
+                padding-top: 0;
+            }
+
+            #content-main form fieldset .form-row:last-child,
+            #content form fieldset .form-row:last-child {
+                border-bottom: none;
+                padding-bottom: 0;
+            }
+
+            /* Remove borders from nested fieldsets and flex-containers inside cards */
+            #content-main form fieldset fieldset,
+            #content form fieldset fieldset,
+            #content-main form fieldset .flex-container,
+            #content form fieldset .flex-container,
+            #content-main form .module fieldset,
+            #content form .module fieldset {
+                background: transparent !important;
+                border: none !important;
+                border-radius: 0 !important;
+                box-shadow: none !important;
+                padding: 0 !important;
+                margin: 0 !important;
+                min-width: 0 !important;
+                max-width: 94% !important;
+                flex: none !important;
+                display: block !important;
+            }
+
+            /* Nested fieldset headers should be invisible */
+            #content-main form fieldset fieldset h2,
+            #content form fieldset fieldset h2,
+            #content-main form fieldset .flex-container legend,
+            #content form fieldset .flex-container legend {
+                background: transparent !important;
+                padding: 0 0 4px 0 !important;
+                font-size: 13px !important;
+                color: #374151 !important;
+                text-transform: none !important;
+                letter-spacing: normal !important;
+            }
+
+            /* Ensure form elements inside cards don't overflow */
+            #content-main form fieldset input,
+            #content-main form fieldset select,
+            #content-main form fieldset textarea,
+            #content form fieldset input,
+            #content form fieldset select,
+            #content form fieldset textarea {
+                max-width: 100%;
+                box-sizing: border-box;
+            }
+
+            /* Related widget wrapper should fit within card */
+            #content-main form fieldset .related-widget-wrapper,
+            #content form fieldset .related-widget-wrapper {
+                max-width: 100%;
+            }
+
+            #content-main form fieldset .related-widget-wrapper select,
+            #content form fieldset .related-widget-wrapper select {
+                min-width: 0;
+                flex: 1;
+            }
+
+            /* Labels inside cards */
+            #content-main form fieldset .form-row > label,
+            #content form fieldset .form-row > label,
+            #content-main form fieldset .form-row > .flex-container > label,
+            #content form fieldset .form-row > .flex-container > label,
+            #content-main form label,
+            #content form label,
+            .aligned label,
+            legend {
+                font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif;
+                font-weight: 500;
+                color: #374151;
+                display: block;
+                margin-bottom: 8px;
+                float: none !important;
+                width: auto !important;
+                padding: 0 !important;
+                font-size: 13px;
+                letter-spacing: -0.01em;
+                -webkit-font-smoothing: antialiased;
+                -moz-osx-font-smoothing: grayscale;
+            }
+
+            /* Readonly fields styling */
+            #content-main form fieldset .readonly,
+            #content form fieldset .readonly {
+                background: #f8fafc;
+                padding: 12px 14px;
+                border-radius: 8px;
+                font-family: ui-monospace, SFMono-Regular, "SF Mono", Menlo, Monaco, Consolas, monospace;
+                font-size: 13px;
+                word-break: break-word;
+                line-height: 1.6;
+                border: 1px solid #e2e8f0;
+                color: #475569;
+            }
+
+            /* Long content in readonly */
+            #content-main form fieldset .readonly pre,
+            #content form fieldset .readonly pre {
+                margin: 0;
+                white-space: pre-wrap;
+                word-break: break-word;
+                font-family: inherit;
+            }
+
+            /* Input styling */
+            #content-main form input[type="text"],
+            #content-main form input[type="number"],
+            #content-main form input[type="url"],
+            #content-main form input[type="email"],
+            #content-main form input[type="password"],
+            #content form input[type="text"],
+            #content form input[type="number"],
+            #content form input[type="url"],
+            #content form input[type="email"],
+            #content form input[type="password"] {
+                width: 100%;
+                padding: 10px 14px;
+                border: 1px solid #d1d5db;
+                border-radius: 8px;
+                font-size: 14px;
+                font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif;
+                box-sizing: border-box;
+                background: #fff;
+                color: #1e293b;
+                transition: border-color 0.15s ease, box-shadow 0.15s ease;
+                -webkit-font-smoothing: antialiased;
+            }
+
+            #content-main form select,
+            #content form select {
+                width: 100%;
+                border: 1px solid #d1d5db;
+                border-radius: 8px;
+                font-size: 14px;
+                font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif;
+                box-sizing: border-box;
+                background: #fff;
+                color: #1e293b;
+                transition: border-color 0.15s ease, box-shadow 0.15s ease;
+                -webkit-font-smoothing: antialiased;
+            }
+
+            #content-main form input::placeholder,
+            #content form input::placeholder {
+                color: #94a3b8;
+            }
+
+            /* Focus states */
+            #content-main form input:focus,
+            #content-main form select:focus,
+            #content-main form textarea:focus,
+            #content form input:focus,
+            #content form select:focus,
+            #content form textarea:focus {
+                border-color: #3b82f6;
+                outline: none;
+                box-shadow: 0 0 0 3px rgba(59, 130, 246, 0.15);
+            }
+
+            /* Textarea styling */
+            #content-main form textarea,
+            #content form textarea {
+                width: 100%;
+                box-sizing: border-box;
+                border: 1px solid #d1d5db;
+                border-radius: 8px;
+                padding: 12px 14px;
+                font-size: 14px;
+                font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif;
+                line-height: 1.6;
+                resize: vertical;
+                min-height: 80px;
+                color: #1e293b;
+                transition: border-color 0.15s ease, box-shadow 0.15s ease;
+                -webkit-font-smoothing: antialiased;
+            }
+
+            /* Fix vTextField width */
+            .vTextField {
+                width: 100% !important;
+            }
+
+            /* ============================================
+               Button styling (shadcn-inspired)
+               ============================================ */
+
+            /* Base button styles */
+            input[type="submit"],
+            button,
+            .button,
+            .btn,
+            a.button,
+            .submit-row input,
+            .submit-row a.button {
+                display: inline-flex;
+                align-items: center;
+                justify-content: center;
+                gap: 8px;
+                padding: 10px 18px;
+                font-size: 14px;
+                font-weight: 500;
+                font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif;
+                line-height: 1.4;
+                border-radius: 8px;
+                border: 1px solid transparent;
+                cursor: pointer;
+                transition: all 0.15s ease;
+                text-decoration: none;
+                white-space: nowrap;
+                -webkit-font-smoothing: antialiased;
+            }
+
+            /* Primary button (default) */
+            input[type="submit"],
+            button[type="submit"],
+            .button.default,
+            .submit-row input[type="submit"] {
+                background: #0f172a;
+                color: #fff;
+                border-color: #0f172a;
+            }
+
+            input[type="submit"]:hover,
+            button[type="submit"]:hover,
+            .button.default:hover,
+            .submit-row input[type="submit"]:hover {
+                background: #1e293b;
+                border-color: #1e293b;
+            }
+
+            input[type="submit"]:active,
+            button[type="submit"]:active {
+                background: #334155;
+                transform: translateY(1px);
+            }
+
+            /* Secondary/outline buttons */
+            button:not([type="submit"]),
+            .button:not(.default),
+            a.button {
+                background: #fff;
+                color: #374151;
+                border-color: #d1d5db;
+            }
+
+            button:not([type="submit"]):hover,
+            .button:not(.default):hover,
+            a.button:hover {
+                background: #f9fafb;
+                border-color: #9ca3af;
+                color: #1f2937;
+            }
+
+            /* Danger button */
+            .deletelink,
+            a.deletelink,
+            button.deletelink,
+            input[name="delete"],
+            .button.delete {
+                background: #fff;
+                color: #dc2626;
+                border-color: #fecaca;
+            }
+
+            .deletelink:hover,
+            a.deletelink:hover,
+            button.deletelink:hover,
+            input[name="delete"]:hover,
+            .button.delete:hover {
+                background: #fef2f2;
+                border-color: #f87171;
+                color: #b91c1c;
+            }
+
+            /* Small buttons */
+            .btn-sm,
+            .object-tools a,
+            .datetimeshortcuts a {
+                padding: 6px 12px;
+                font-size: 13px;
+                border-radius: 6px;
+            }
+
+            /* Object tools (top action buttons) */
+            .object-tools {
+                margin-bottom: 20px;
+            }
+
+            .object-tools li {
+                margin-left: 10px;
+            }
+
+            .object-tools a {
+                background: #fff;
+                color: #374151;
+                border: 1px solid #d1d5db;
+                text-decoration: none;
+                display: inline-flex;
+                align-items: center;
+            }
+
+            .object-tools a:hover {
+                background: #f9fafb;
+                border-color: #9ca3af;
+            }
+
+            /* Submit row styling */
+            .submit-row {
+                margin-top: 24px;
+                padding: 20px;
+                background: #fff;
+                border-radius: 12px;
+                border: 1px solid #e2e8f0;
+                box-shadow: 0 1px 3px rgba(0,0,0,0.04);
+                clear: both;
+                flex: 1 1 100%;
+                display: flex;
+                gap: 12px;
+                flex-wrap: wrap;
+                align-items: center;
+            }
+
+            .submit-row p {
+                margin: 0;
+            }
+
+            .submit-row .deletelink-box {
+                margin-left: auto;
+            }
+
+            /* Responsive: 2 columns on medium screens */
+            @media (max-width: 1400px) {
+                #content-main form fieldset,
+                #content form fieldset {
+                    max-width: calc(50% - 10px);
+                    flex: 1 1 320px;
+                }
+            }
+
+            /* Responsive: stack on smaller screens */
+            @media (max-width: 900px) {
+                #content-main form fieldset,
+                #content form fieldset {
+                    flex: 1 1 100%;
+                    max-width: 100%;
+                    min-width: auto;
+                }
+
+                #content {
+                    padding: 16px;
+                }
+            }
+
+            /* Module content padding */
+            #content-main form .module > div,
+            #content form .module > div {
+                padding: 12px;
+            }
+
+            /* Fix for JSON/config editor */
+            .field-config .readonly,
+            .field-config textarea {
+                width: 100%;
+                min-height: 120px;
+                max-height: none;
+            }
+
+            /* Related widget styling */
+            .related-widget-wrapper {
+                display: flex;
+                align-items: center;
+                gap: 8px;
+                flex-wrap: wrap;
+            }
+
+            .related-widget-wrapper select {
+                flex: 1;
+                min-width: 150px;
+            }
+
+            .related-widget-wrapper a {
+                flex-shrink: 0;
+                padding: 8px;
+                border-radius: 6px;
+                color: #64748b;
+                transition: color 0.15s ease, background 0.15s ease;
+            }
+
+            .related-widget-wrapper a:hover {
+                color: #1e293b;
+                background: #f1f5f9;
+            }
+
+            /* Help text styling */
+            .help {
+                font-size: 13px;
+                color: #64748b;
+                margin-top: 6px;
+                line-height: 1.5;
+            }
+
+            /* Error styling */
+            .errorlist {
+                color: #dc2626;
+                font-size: 13px;
+                margin: 6px 0;
+                padding: 0;
+                list-style: none;
+            }
+
+            .errorlist li {
+                background: #fef2f2;
+                padding: 8px 12px;
+                border-radius: 6px;
+                border: 1px solid #fecaca;
+            }
+
+            /* Inline related objects - force full width */
+            .inline-group,
+            #archiveresult_set-group,
+            #content-main form .inline-group,
+            #content-main form > div > .inline-group,
+            #content form > div > .inline-group,
+            .change-form .inline-group,
+            div.inline-group {
+                flex: 1 1 100% !important;
+                max-width: 100% !important;
+                min-width: 100% !important;
+                width: 100% !important;
+                margin-top: 20px;
+                flex-basis: 100% !important;
+            }
+
+            /* Ensure inline-group breaks out of card grid */
+            #content-main form > div,
+            #content form > div {
+                flex-wrap: wrap;
+            }
+
+            /* TabularInline table full width */
+            .inline-group .tabular,
+            .inline-group table {
+                width: 100% !important;
+            }
+
+            .inline-related {
+                margin: 12px 0;
+                padding: 16px;
+                background: #fff;
+                border-radius: 10px;
+                border: 1px solid #e2e8f0;
+            }
+
+            .inline-related h3 {
+                margin: -16px -16px 16px -16px;
+                padding: 12px 16px;
+                background: #f8fafc;
+                border-radius: 9px 9px 0 0;
+                border-bottom: 1px solid #e2e8f0;
+                font-size: 13px;
+                font-weight: 600;
+                color: #374151;
+            }
+
+            /* Tabular inline styling */
+            .tabular {
+                border-radius: 8px;
+                overflow: hidden;
+                border: 1px solid #e2e8f0;
+            }
+
+            .tabular td, .tabular th {
+                padding: 12px 14px;
+                font-size: 13px;
+                border-bottom: 1px solid #f1f5f9;
+            }
+
+            .tabular th {
+                background: #f8fafc;
+                font-weight: 600;
+                color: #374151;
+                text-align: left;
+            }
+
+            .tabular tr:last-child td {
+                border-bottom: none;
+            }
+
+            /* Delete checkbox */
+            .inline-deletelink {
+                color: #dc2626;
+                font-size: 13px;
+            }
+
+            /* Datetime widgets */
+            .datetimeshortcuts {
+                margin-left: 10px;
+            }
+
+            .datetimeshortcuts a {
+                background: #f1f5f9;
+                color: #475569;
+                border: none;
+                padding: 4px 10px;
+            }
+
+            .datetimeshortcuts a:hover {
+                background: #e2e8f0;
+                color: #1e293b;
+            }
+
+            /* Aligned forms - fix label positioning */
+            .aligned .form-row > div {
+                margin-left: 0 !important;
+            }
+
+            /* Checkbox styling */
+            input[type="checkbox"] {
+                width: 18px;
+                height: 18px;
+                border-radius: 4px;
+                border: 1px solid #d1d5db;
+                cursor: pointer;
+                accent-color: #3b82f6;
+            }
+
+            /* Links styling */
+            a {
+                color: #2563eb;
+                text-decoration: none;
+                transition: color 0.15s ease;
+            }
+
+            a:hover {
+                color: #1d4ed8;
+            }
+
+            /* Messages/alerts */
+            .messagelist {
+                padding: 0;
+                margin: 0 0 20px 0;
+            }
+
+            .messagelist li {
+                padding: 14px 18px;
+                border-radius: 10px;
+                font-size: 14px;
+                margin-bottom: 10px;
+                display: flex;
+                align-items: center;
+                gap: 10px;
+            }
+
+            ul.messagelist li.success {
+                background: #f0fdf4 !important;
+                background-image: none !important;
+                border: 1px solid #bbf7d0;
+                color: #166534;
+            }
+
+            .messagelist li.warning {
+                background: #fffbeb;
+                border: 1px solid #fde68a;
+                color: #92400e;
+            }
+
+            .messagelist li.error {
+                background: #fef2f2;
+                border: 1px solid #fecaca;
+                color: #991b1b;
+            }
+
+            /* Breadcrumbs */
+            .breadcrumbs {
+                background: transparent;
+                padding: 12px 24px;
+                font-size: 13px;
+                color: #64748b;
+            }
+
+            .breadcrumbs a {
+                color: #64748b;
+            }
+
+            .breadcrumbs a:hover {
+                color: #1e293b;
+            }
+
+            /* Action buttons in cards */
+            .card .btn,
+            .card button {
+                margin-top: 10px;
+            }
+
+            /* Select2 overrides */
+            .select2-container--default .select2-selection--single,
+            .select2-container--default .select2-selection--multiple {
+                border: 1px solid #d1d5db;
+                border-radius: 8px;
+                min-height: 42px;
+            }
+
+            .select2-container--default .select2-selection--single:focus,
+            .select2-container--default .select2-selection--multiple:focus {
+                border-color: #3b82f6;
+                box-shadow: 0 0 0 3px rgba(59, 130, 246, 0.15);
+            }
+
+            /* ============================================
+               Admin List/Changelist Page Styling
+               ============================================ */
+
+            /* Results table container */
+            #changelist {
+                background: #fff;
+                border-radius: 12px;
+                border: 1px solid #e2e8f0;
+                box-shadow: 0 1px 3px rgba(0,0,0,0.04);
+                overflow: hidden;
+            }
+
+            /* Table styling */
+            #result_list {
+                width: 100%;
+                border-collapse: collapse;
+                font-size: 14px;
+            }
+
+            #result_list thead th {
+                background: #f8fafc;
+                border-bottom: 2px solid #e2e8f0;
+                padding: 12px 16px;
+                font-weight: 600;
+                font-size: 13px;
+                color: #475569;
+                text-align: left;
+                text-transform: uppercase;
+                letter-spacing: 0.025em;
+                white-space: nowrap;
+            }
+
+            #result_list thead th a {
+                color: #475569;
+                text-decoration: none;
+            }
+
+            #result_list thead th a:hover {
+                color: #1e293b;
+            }
+
+            #result_list thead th.sorted {
+                background: #f1f5f9;
+            }
+
+            #result_list thead th .text span {
+                padding-right: 5px;
+            }
+
+            #result_list tbody tr {
+                border-bottom: 1px solid #f1f5f9;
+                transition: background-color 0.15s ease;
+            }
+
+            #result_list tbody tr:hover {
+                background-color: #f8fafc;
+            }
+
+            #result_list tbody tr.selected {
+                background-color: #eff6ff;
+            }
+
+            #result_list tbody td {
+                padding: 12px 16px;
+                color: #334155;
+                vertical-align: middle;
+            }
+
+            #result_list tbody td a {
+                color: #2563eb;
+                font-weight: 500;
+            }
+
+            #result_list tbody td a:hover {
+                color: #1d4ed8;
+                text-decoration: underline;
+            }
+
+            /* Checkbox column */
+            #result_list .action-checkbox,
+            #result_list th.action-checkbox-column {
+                width: 40px;
+                text-align: center;
+                padding: 12px 8px;
+            }
+
+            /* Pagination */
+            .paginator {
+                background: #f8fafc;
+                padding: 12px 16px;
+                border-top: 1px solid #e2e8f0;
+                font-size: 14px;
+                color: #64748b;
+            }
+
+            .paginator a {
+                color: #2563eb;
+                padding: 6px 12px;
+                border-radius: 6px;
+                margin: 0 2px;
+                text-decoration: none;
+            }
+
+            .paginator a:hover {
+                background: #e2e8f0;
+            }
+
+            /* Toolbar / search bar */
+            #toolbar {
+                padding: 16px;
+                background: #fff;
+                border-bottom: 1px solid #e2e8f0;
+                display: flex;
+                align-items: center;
+                gap: 12px;
+            }
+
+            #toolbar form {
+                display: flex;
+                align-items: center;
+                gap: 8px;
+                flex: 1;
+            }
+
+            #searchbar {
+                flex: 1;
+                max-width: 400px;
+                padding: 10px 14px;
+                border: 1px solid #d1d5db;
+                border-radius: 8px;
+                font-size: 14px;
+            }
+
+            #searchbar:focus {
+                border-color: #3b82f6;
+                outline: none;
+                box-shadow: 0 0 0 3px rgba(59, 130, 246, 0.15);
+            }
+
+            /* Filter sidebar */
+            #changelist-filter {
+                background: #fff;
+                border: 1px solid #e2e8f0;
+                border-radius: 12px;
+                box-shadow: 0 1px 3px rgba(0,0,0,0.04);
+                overflow: hidden;
+            }
+
+            #changelist-filter h2 {
+                background: #f8fafc;
+                padding: 12px 16px;
+                font-size: 13px;
+                font-weight: 600;
+                color: #475569;
+                text-transform: uppercase;
+                letter-spacing: 0.025em;
+                margin: 0;
+                border-bottom: 1px solid #e2e8f0;
+            }
+
+            #changelist-filter h3 {
+                padding: 12px 16px 8px;
+                font-size: 12px;
+                font-weight: 600;
+                color: #64748b;
+                text-transform: uppercase;
+                letter-spacing: 0.05em;
+                margin: 0;
+            }
+
+            #changelist-filter ul {
+                padding: 0 8px 12px;
+                margin: 0;
+                list-style: none;
+            }
+
+            #changelist-filter li {
+                margin: 0;
+            }
+
+            #changelist-filter li a {
+                display: block;
+                padding: 8px 12px;
+                color: #475569;
+                text-decoration: none;
+                border-radius: 6px;
+                font-size: 14px;
+                transition: background-color 0.15s ease;
+            }
+
+            #changelist-filter li a:hover {
+                background: #f1f5f9;
+                color: #1e293b;
+            }
+
+            #changelist-filter li.selected a {
+                background: #eff6ff;
+                color: #2563eb;
+                font-weight: 500;
+            }
+
+            /* Actions bar */
+            .actions {
+                padding: 12px 16px;
+                background: #f8fafc;
+                border-bottom: 1px solid #e2e8f0;
+                display: flex;
+                align-items: center;
+                gap: 12px;
+                flex-wrap: wrap;
+            }
+
+            .actions label {
+                font-size: 14px;
+                color: #475569;
+            }
+
+            .actions select {
+                padding: 8px 12px;
+                border: 1px solid #d1d5db;
+                border-radius: 6px;
+                font-size: 14px;
+                background: #fff;
+            }
+
+            .actions .button {
+                padding: 8px 16px;
+                font-size: 14px;
+            }
+
+            /* Object count */
+            .actions .action-counter {
+                color: #64748b;
+                font-size: 14px;
+            }
+
+            /* Empty results */
+            #changelist-form .results + p,
+            .paginator + p {
+                padding: 40px;
+                text-align: center;
+                color: #64748b;
+                font-size: 15px;
+            }
+
+            /* Date hierarchy */
+            .xfull {
+                padding: 12px 16px;
+                background: #f8fafc;
+                border-bottom: 1px solid #e2e8f0;
+            }
+
+            .xfull a {
+                color: #2563eb;
+                margin-right: 8px;
+            }
         </style>
         {% endblock %}
         

+ 187 - 50
archivebox/templates/admin/progress_monitor.html

@@ -57,13 +57,24 @@
         box-shadow: 0 0 8px #3fb950;
         animation: pulse 2s infinite;
     }
+    #progress-monitor .status-dot.idle {
+        background: #d29922;
+        box-shadow: 0 0 4px #d29922;
+    }
     #progress-monitor .status-dot.stopped {
-        background: #f85149;
+        background: #6e7681;
+    }
+    #progress-monitor .status-dot.flash {
+        animation: flash 0.3s ease-out;
     }
     @keyframes pulse {
         0%, 100% { opacity: 1; box-shadow: 0 0 8px #3fb950; }
         50% { opacity: 0.6; box-shadow: 0 0 4px #3fb950; }
     }
+    @keyframes flash {
+        0% { transform: scale(1.5); }
+        100% { transform: scale(1); }
+    }
 
     /* Stats */
     #progress-monitor .stats {
@@ -89,6 +100,19 @@
     #progress-monitor .stat-value.error { color: #f85149; }
     #progress-monitor .stat-value.warning { color: #d29922; }
     #progress-monitor .stat-value.info { color: #58a6ff; }
+    #progress-monitor .stat.clickable {
+        cursor: pointer;
+        padding: 2px 6px;
+        margin: -2px -6px;
+        border-radius: 4px;
+        transition: background 0.2s;
+    }
+    #progress-monitor .stat.clickable:hover {
+        background: rgba(255,255,255,0.1);
+    }
+    #progress-monitor .stat.clickable:active {
+        background: rgba(255,255,255,0.2);
+    }
 
     /* Toggle Button */
     #progress-monitor .toggle-btn {
@@ -259,48 +283,86 @@
         padding: 0 12px 8px;
     }
 
-    /* Extractor List */
+    /* Extractor List - Compact Badge Layout */
     #progress-monitor .extractor-list {
         padding: 8px 12px;
         background: rgba(0,0,0,0.2);
         border-top: 1px solid #21262d;
+        display: flex;
+        flex-wrap: wrap;
+        gap: 4px;
+    }
+    #progress-monitor .extractor-badge {
+        position: relative;
+        display: inline-flex;
+        align-items: center;
+        gap: 4px;
+        padding: 3px 8px;
+        border-radius: 4px;
+        font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
+        font-size: 10px;
+        background: #21262d;
+        overflow: hidden;
+        white-space: nowrap;
     }
-    #progress-monitor .extractor-item {
+    #progress-monitor .extractor-badge .progress-fill {
+        position: absolute;
+        top: 0;
+        left: 0;
+        bottom: 0;
+        z-index: 0;
+        transition: width 0.3s ease-out;
+    }
+    #progress-monitor .extractor-badge .badge-content {
+        position: relative;
+        z-index: 1;
         display: flex;
         align-items: center;
-        gap: 8px;
-        padding: 4px 0;
+        gap: 4px;
     }
-    #progress-monitor .extractor-icon {
-        font-size: 12px;
-        width: 16px;
-        text-align: center;
+    #progress-monitor .extractor-badge.queued {
+        color: #8b949e;
     }
-    #progress-monitor .extractor-icon.running {
+    #progress-monitor .extractor-badge.queued .progress-fill {
+        background: rgba(110, 118, 129, 0.2);
+        width: 0%;
+    }
+    #progress-monitor .extractor-badge.started {
         color: #d29922;
-        animation: spin 1s linear infinite;
     }
-    #progress-monitor .extractor-icon.success {
+    #progress-monitor .extractor-badge.started .progress-fill {
+        background: rgba(210, 153, 34, 0.3);
+        width: 50%;
+        animation: progress-pulse 1.5s ease-in-out infinite;
+    }
+    @keyframes progress-pulse {
+        0%, 100% { opacity: 0.5; }
+        50% { opacity: 1; }
+    }
+    #progress-monitor .extractor-badge.succeeded {
         color: #3fb950;
     }
-    #progress-monitor .extractor-icon.failed {
+    #progress-monitor .extractor-badge.succeeded .progress-fill {
+        background: rgba(63, 185, 80, 0.25);
+        width: 100%;
+    }
+    #progress-monitor .extractor-badge.failed {
         color: #f85149;
     }
-    #progress-monitor .extractor-icon.pending {
-        color: #8b949e;
+    #progress-monitor .extractor-badge.failed .progress-fill {
+        background: rgba(248, 81, 73, 0.25);
+        width: 100%;
+    }
+    #progress-monitor .extractor-badge .badge-icon {
+        font-size: 10px;
+    }
+    #progress-monitor .extractor-badge.started .badge-icon {
+        animation: spin 1s linear infinite;
     }
     @keyframes spin {
         from { transform: rotate(0deg); }
         to { transform: rotate(360deg); }
     }
-    #progress-monitor .extractor-name {
-        flex: 1;
-        font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
-        font-size: 11px;
-    }
-    #progress-monitor .extractor-progress {
-        width: 60px;
-    }
 
     /* Status Badge */
     #progress-monitor .status-badge {
@@ -356,11 +418,11 @@
                     <span class="stat-label">Queued</span>
                     <span class="stat-value warning" id="total-queued">0</span>
                 </div>
-                <div class="stat">
+                <div class="stat clickable" id="stat-succeeded" title="Click to reset counter">
                     <span class="stat-label">Done</span>
                     <span class="stat-value success" id="total-succeeded">0</span>
                 </div>
-                <div class="stat">
+                <div class="stat clickable" id="stat-failed" title="Click to reset counter">
                     <span class="stat-label">Failed</span>
                     <span class="stat-value error" id="total-failed">0</span>
                 </div>
@@ -390,6 +452,24 @@
     let expandedCrawls = new Set(JSON.parse(localStorage.getItem('progress-monitor-expanded-crawls') || '[]'));
     let expandedSnapshots = new Set(JSON.parse(localStorage.getItem('progress-monitor-expanded-snapshots') || '[]'));
 
+    // Baselines for resettable counters
+    let succeededBaseline = parseInt(localStorage.getItem('progress-succeeded-baseline') || '0');
+    let failedBaseline = parseInt(localStorage.getItem('progress-failed-baseline') || '0');
+    let lastSucceeded = 0;
+    let lastFailed = 0;
+
+    // Click handlers for resetting counters
+    document.getElementById('stat-succeeded').addEventListener('click', function() {
+        succeededBaseline = lastSucceeded;
+        localStorage.setItem('progress-succeeded-baseline', succeededBaseline);
+        document.getElementById('total-succeeded').textContent = '0';
+    });
+    document.getElementById('stat-failed').addEventListener('click', function() {
+        failedBaseline = lastFailed;
+        localStorage.setItem('progress-failed-baseline', failedBaseline);
+        document.getElementById('total-failed').textContent = '0';
+    });
+
     function formatUrl(url) {
         try {
             const u = new URL(url);
@@ -400,24 +480,18 @@
     }
 
     function renderExtractor(extractor) {
-        const iconClass = extractor.status === 'started' ? 'running' :
-                         extractor.status === 'succeeded' ? 'success' :
-                         extractor.status === 'failed' ? 'failed' : 'pending';
         const icon = extractor.status === 'started' ? '&#8635;' :
                     extractor.status === 'succeeded' ? '&#10003;' :
                     extractor.status === 'failed' ? '&#10007;' : '&#9675;';
 
         return `
-            <div class="extractor-item">
-                <span class="extractor-icon ${iconClass}">${icon}</span>
-                <span class="extractor-name">${extractor.extractor}</span>
-                <div class="extractor-progress">
-                    <div class="progress-bar-container">
-                        <div class="progress-bar extractor ${extractor.status === 'started' ? 'indeterminate' : ''}"
-                             style="width: ${extractor.status === 'succeeded' ? '100' : extractor.status === 'failed' ? '100' : extractor.progress}%"></div>
-                    </div>
-                </div>
-            </div>
+            <span class="extractor-badge ${extractor.status}">
+                <span class="progress-fill"></span>
+                <span class="badge-content">
+                    <span class="badge-icon">${icon}</span>
+                    <span>${extractor.extractor}</span>
+                </span>
+            </span>
         `;
     }
 
@@ -427,10 +501,14 @@
         const statusIcon = snapshot.status === 'started' ? '&#8635;' : '&#128196;';
 
         let extractorHtml = '';
-        if (snapshot.active_extractors && snapshot.active_extractors.length > 0) {
+        if (snapshot.all_extractors && snapshot.all_extractors.length > 0) {
+            // Sort extractors alphabetically by name to prevent reordering on updates
+            const sortedExtractors = [...snapshot.all_extractors].sort((a, b) =>
+                a.extractor.localeCompare(b.extractor)
+            );
             extractorHtml = `
                 <div class="extractor-list" style="${isExpanded ? '' : 'display:none'}">
-                    ${snapshot.active_extractors.map(e => renderExtractor(e)).join('')}
+                    ${sortedExtractors.map(e => renderExtractor(e)).join('')}
                 </div>
             `;
         }
@@ -438,7 +516,7 @@
         return `
             <div class="snapshot-item" data-snapshot-key="${snapshotKey}">
                 <div class="snapshot-header" onclick="window.toggleSnapshot('${snapshotKey}')">
-                    <span class="expand-icon ${isExpanded ? 'expanded' : ''}">${snapshot.active_extractors?.length ? '&#9654;' : ''}</span>
+                    <span class="expand-icon ${isExpanded ? 'expanded' : ''}">${snapshot.all_extractors?.length ? '&#9654;' : ''}</span>
                     <span class="snapshot-icon">${statusIcon}</span>
                     <div class="snapshot-info">
                         <div class="snapshot-url">${formatUrl(snapshot.url)}</div>
@@ -469,6 +547,40 @@
             snapshotsHtml = crawl.active_snapshots.map(s => renderSnapshot(s, crawl.id)).join('');
         }
 
+        // Show warning if crawl is stuck (queued but can't start)
+        let warningHtml = '';
+        if (crawl.status === 'queued' && !crawl.can_start) {
+            warningHtml = `
+                <div style="padding: 8px 14px; background: rgba(248, 81, 73, 0.1); border-top: 1px solid #f85149; color: #f85149; font-size: 11px;">
+                    ⚠️ Crawl cannot start: ${crawl.seed_uri ? 'unknown error' : 'no seed URI'}
+                </div>
+            `;
+        } else if (crawl.status === 'queued' && crawl.retry_at_future) {
+            // Queued but retry_at is in future (was claimed by worker, will retry)
+            warningHtml = `
+                <div style="padding: 8px 14px; background: rgba(88, 166, 255, 0.1); border-top: 1px solid #58a6ff; color: #58a6ff; font-size: 11px;">
+                    🔄 Retrying in ${crawl.seconds_until_retry}s...${crawl.seed_uri ? ` (${crawl.seed_uri})` : ''}
+                </div>
+            `;
+        } else if (crawl.status === 'queued' && crawl.total_snapshots === 0) {
+            // Queued and waiting to be picked up by worker
+            warningHtml = `
+                <div style="padding: 8px 14px; background: rgba(210, 153, 34, 0.1); border-top: 1px solid #d29922; color: #d29922; font-size: 11px;">
+                    ⏳ Waiting for worker to pick up...${crawl.seed_uri ? ` (${crawl.seed_uri})` : ''}
+                </div>
+            `;
+        }
+
+        // Show snapshot info or URL count if no snapshots yet
+        let metaText = `depth: ${crawl.max_depth}`;
+        if (crawl.total_snapshots > 0) {
+            metaText += ` | ${crawl.total_snapshots} snapshots`;
+        } else if (crawl.urls_count > 0) {
+            metaText += ` | ${crawl.urls_count} URLs`;
+        } else if (crawl.seed_uri) {
+            metaText += ` | ${crawl.seed_uri.substring(0, 40)}${crawl.seed_uri.length > 40 ? '...' : ''}`;
+        }
+
         return `
             <div class="crawl-item" data-crawl-id="${crawl.id}">
                 <div class="crawl-header" onclick="window.toggleCrawl('${crawl.id}')">
@@ -476,10 +588,11 @@
                     <span class="crawl-icon">${statusIcon}</span>
                     <div class="crawl-info">
                         <div class="crawl-label">${crawl.label}</div>
-                        <div class="crawl-meta">depth: ${crawl.max_depth} | ${crawl.total_snapshots} snapshots</div>
+                        <div class="crawl-meta">${metaText}</div>
                     </div>
                     <div class="crawl-stats">
                         <span style="color:#3fb950">${crawl.completed_snapshots} done</span>
+                        <span style="color:#d29922">${crawl.started_snapshots || 0} active</span>
                         <span style="color:#8b949e">${crawl.pending_snapshots} pending</span>
                     </div>
                     <span class="status-badge ${crawl.status}">${crawl.status}</span>
@@ -490,6 +603,7 @@
                              style="width: ${crawl.progress}%"></div>
                     </div>
                 </div>
+                ${warningHtml}
                 <div class="crawl-body" style="${isExpanded ? '' : 'display:none'}">
                     <div class="snapshot-list">
                         ${snapshotsHtml}
@@ -542,25 +656,48 @@
                            data.snapshots_pending > 0 || data.snapshots_started > 0 ||
                            data.archiveresults_pending > 0 || data.archiveresults_started > 0;
 
-        // Update orchestrator status
+        // Update orchestrator status - show "Running" only when there's actual activity
+        // Don't distinguish between "Stopped" and "Idle" since orchestrator starts/stops frequently
         const dot = document.getElementById('orchestrator-dot');
         const text = document.getElementById('orchestrator-text');
-        if (data.orchestrator_running) {
-            dot.classList.remove('stopped');
+        const hasWorkers = data.total_workers > 0;
+
+        if (hasWorkers || hasActivity) {
+            dot.classList.remove('stopped', 'idle');
             dot.classList.add('running');
             text.textContent = 'Running';
         } else {
-            dot.classList.remove('running');
-            dot.classList.add('stopped');
-            text.textContent = 'Stopped';
+            // No activity - show as idle (whether orchestrator process exists or not)
+            dot.classList.remove('stopped', 'running');
+            dot.classList.add('idle');
+            text.textContent = 'Idle';
         }
 
+        // Briefly flash the dot (via the .flash animation) to show we got fresh data
+        dot.classList.add('flash');
+        setTimeout(() => dot.classList.remove('flash'), 300);
+
         // Update stats
         document.getElementById('worker-count').textContent = data.total_workers;
         document.getElementById('total-queued').textContent =
             data.crawls_pending + data.snapshots_pending + data.archiveresults_pending;
-        document.getElementById('total-succeeded').textContent = data.archiveresults_succeeded;
-        document.getElementById('total-failed').textContent = data.archiveresults_failed;
+
+        // Store raw values and display relative to baseline
+        lastSucceeded = data.archiveresults_succeeded;
+        lastFailed = data.archiveresults_failed;
+
+        // If baseline is higher than current (e.g. after DB reset), reset baseline
+        if (succeededBaseline > lastSucceeded) {
+            succeededBaseline = 0;
+            localStorage.setItem('progress-succeeded-baseline', '0');
+        }
+        if (failedBaseline > lastFailed) {
+            failedBaseline = 0;
+            localStorage.setItem('progress-failed-baseline', '0');
+        }
+
+        document.getElementById('total-succeeded').textContent = lastSucceeded - succeededBaseline;
+        document.getElementById('total-failed').textContent = lastFailed - failedBaseline;
 
         // Render crawl tree
         if (data.active_crawls.length > 0) {

+ 8 - 3
archivebox/workers/management/commands/orchestrator.py

@@ -7,9 +7,14 @@ class Command(BaseCommand):
     help = 'Run the archivebox orchestrator'
 
     def add_arguments(self, parser):
-        parser.add_argument('--daemon', '-d', action='store_true', help="Run forever (don't exit on idle)")
+        parser.add_argument(
+            '--exit-on-idle',
+            action='store_true',
+            default=False,
+            help="Exit when all work is complete (default: run forever)"
+        )
 
     def handle(self, *args, **kwargs):
-        daemon = kwargs.get('daemon', False)
-        orchestrator = Orchestrator(exit_on_idle=not daemon)
+        exit_on_idle = kwargs.get('exit_on_idle', False)
+        orchestrator = Orchestrator(exit_on_idle=exit_on_idle)
         orchestrator.runloop()

+ 22 - 13
archivebox/workers/orchestrator.py

@@ -12,16 +12,17 @@ Architecture:
         └── Each worker spawns task subprocesses via CLI
 
 Usage:
-    # Embedded in other commands (exits when done)
+    # Default: runs forever (for use as subprocess of server)
+    orchestrator = Orchestrator(exit_on_idle=False)
+    orchestrator.runloop()
+
+    # Exit when done (for embedded use in other commands)
     orchestrator = Orchestrator(exit_on_idle=True)
     orchestrator.runloop()
-    
-    # Daemon mode (runs forever)
-    orchestrator = Orchestrator(exit_on_idle=False)
-    orchestrator.start()  # fork and return
-    
+
     # Or run via CLI
-    archivebox orchestrator [--daemon]
+    archivebox manage orchestrator              # runs forever
+    archivebox manage orchestrator --exit-on-idle  # exits when done
 """
 
 __package__ = 'archivebox.workers'
@@ -45,6 +46,14 @@ from .pid_utils import (
 )
 
 
+def _run_orchestrator_process(exit_on_idle: bool) -> None:
+    """Top-level function for multiprocessing (must be picklable)."""
+    from archivebox.config.django import setup_django
+    setup_django()
+    orchestrator = Orchestrator(exit_on_idle=exit_on_idle)
+    orchestrator.runloop()
+
+
 class Orchestrator:
     """
     Manages worker processes by polling queues and spawning workers as needed.
@@ -277,12 +286,12 @@ class Orchestrator:
         Fork orchestrator as a background process.
         Returns the PID of the new process.
         """
-        def run_orchestrator():
-            from archivebox.config.django import setup_django
-            setup_django()
-            self.runloop()
-        
-        proc = Process(target=run_orchestrator, name='orchestrator')
+        # Use module-level function to avoid pickle errors with local functions
+        proc = Process(
+            target=_run_orchestrator_process,
+            args=(self.exit_on_idle,),
+            name='orchestrator'
+        )
         proc.start()
 
         assert proc.pid is not None

+ 81 - 4
archivebox/workers/supervisord_util.py

@@ -28,7 +28,7 @@ WORKERS_DIR_NAME = "workers"
 
 ORCHESTRATOR_WORKER = {
     "name": "worker_orchestrator",
-    "command": "archivebox manage orchestrator",
+    "command": "archivebox manage orchestrator",  # runs forever by default
     "autostart": "true",
     "autorestart": "true",
     "stdout_logfile": "logs/worker_orchestrator.log",
@@ -332,14 +332,14 @@ def stop_worker(supervisor, daemon_name):
 
 def tail_worker_logs(log_path: str):
     get_or_create_supervisord_process(daemonize=False)
-    
+
     from rich.live import Live
     from rich.table import Table
-    
+
     table = Table()
     table.add_column("TS")
     table.add_column("URL")
-    
+
     try:
         with Live(table, refresh_per_second=1) as live:  # refresh once per second to keep the table readable
             with open(log_path, 'r') as f:
@@ -352,6 +352,83 @@ def tail_worker_logs(log_path: str):
     except SystemExit:
         pass
 
+
+def tail_multiple_worker_logs(log_files: list[str], follow=True):
+    """Tail multiple log files simultaneously, interleaving their output."""
+    import select
+    from pathlib import Path
+
+    # Convert relative paths to absolute paths
+    log_paths = []
+    for log_file in log_files:
+        log_path = Path(log_file)
+        if not log_path.is_absolute():
+            log_path = CONSTANTS.DATA_DIR / log_path
+
+        # Create log file if it doesn't exist
+        if not log_path.exists():
+            log_path.parent.mkdir(parents=True, exist_ok=True)
+            log_path.touch()
+
+        log_paths.append(log_path)
+
+    # Open all log files
+    file_handles = []
+    for log_path in log_paths:
+        try:
+            f = open(log_path, 'r')
+            # Seek to end of file if following
+            if follow:
+                f.seek(0, 2)  # Seek to end
+            file_handles.append((log_path.name, f))
+        except Exception as e:
+            print(f"[yellow]Warning: Could not open {log_path}: {e}[/yellow]")
+
+    if not file_handles:
+        print("[red]No log files could be opened[/red]")
+        return
+
+    # Print which logs we're tailing
+    log_names = [name for name, _ in file_handles]
+    print(f"[dim]Tailing: {', '.join(log_names)}[/dim]")
+    print()
+
+    try:
+        while follow:
+            # Read available lines from all files
+            for log_name, f in file_handles:
+                line = f.readline()
+                if line:
+                    # Colorize based on log source
+                    if 'orchestrator' in log_name.lower():
+                        color = 'cyan'
+                    elif 'daphne' in log_name.lower():
+                        color = 'green'
+                    else:
+                        color = 'white'
+
+                    # Strip ANSI codes if present (supervisord does this but just in case)
+                    import re
+                    line_clean = re.sub(r'\x1b\[[0-9;]*m', '', line.rstrip())
+
+                    if line_clean:
+                        print(f'[{color}][{log_name}][/{color}] {line_clean}')
+
+            # Small sleep to avoid busy-waiting
+            time.sleep(0.1)
+
+    except (KeyboardInterrupt, BrokenPipeError, IOError):
+        print("\n[yellow][i] Stopped tailing logs[/i][/yellow]")
+    except SystemExit:
+        pass
+    finally:
+        # Close all file handles
+        for _, f in file_handles:
+            try:
+                f.close()
+            except Exception:
+                pass
+
 def watch_worker(supervisor, daemon_name, interval=5):
     """loop continuously and monitor worker's health"""
     while True:

+ 3 - 20
archivebox/workers/tasks.py

@@ -3,6 +3,9 @@ Background task functions for queuing work to the orchestrator.
 
 These functions queue Snapshots/Crawls for processing by setting their status
 to QUEUED, which the orchestrator workers will pick up and process.
+
+NOTE: These functions do NOT start the orchestrator - they assume it's already
+running via `archivebox server` (supervisord) or will be run inline by the CLI.
 """
 
 __package__ = 'archivebox.workers'
@@ -10,16 +13,6 @@ __package__ = 'archivebox.workers'
 from django.utils import timezone
 
 
-def ensure_orchestrator_running():
-    """Ensure the orchestrator is running to process queued items."""
-    from .orchestrator import Orchestrator
-
-    if not Orchestrator.is_running():
-        # Start orchestrator in background
-        orchestrator = Orchestrator(exit_on_idle=True)
-        orchestrator.start()
-
-
 def bg_add(add_kwargs: dict) -> int:
     """
     Add URLs and queue them for archiving.
@@ -36,9 +29,6 @@ def bg_add(add_kwargs: dict) -> int:
 
     result = add(**add_kwargs)
 
-    # Ensure orchestrator is running to process the new snapshots
-    ensure_orchestrator_running()
-
     return len(result) if result else 0
 
 
@@ -66,10 +56,6 @@ def bg_archive_snapshots(snapshots, kwargs: dict | None = None) -> int:
             )
             queued_count += 1
 
-    # Ensure orchestrator is running to process the queued snapshots
-    if queued_count > 0:
-        ensure_orchestrator_running()
-
     return queued_count
 
 
@@ -90,9 +76,6 @@ def bg_archive_snapshot(snapshot, overwrite: bool = False, methods: list | None
             status=Snapshot.StatusChoices.QUEUED,
             retry_at=timezone.now(),
         )
-
-        # Ensure orchestrator is running to process the queued snapshot
-        ensure_orchestrator_running()
         return 1
 
     return 0

+ 2 - 2
archivebox/workers/worker.py

@@ -67,8 +67,8 @@ class Worker:
     # Configuration (can be overridden by subclasses)
     MAX_TICK_TIME: ClassVar[int] = 60
     MAX_CONCURRENT_TASKS: ClassVar[int] = 1
-    POLL_INTERVAL: ClassVar[float] = 0.5
-    IDLE_TIMEOUT: ClassVar[int] = 3  # Exit after N idle iterations (set to 0 to never exit)
+    POLL_INTERVAL: ClassVar[float] = 1.0
+    IDLE_TIMEOUT: ClassVar[int] = 10  # Exit after N idle iterations (10 sec at 1.0 poll interval)
 
     def __init__(self, worker_id: int = 0, daemon: bool = False, **kwargs: Any):
         self.worker_id = worker_id