浏览代码

Speed up startup time, add a rich startup progress bar, split logging and checks into misc, and fix the search-index import backend bug

Nick Sweeting 1 年之前
父节点
当前提交
64c7100cf9

+ 4 - 0
archivebox/__init__.py

@@ -1,5 +1,7 @@
 __package__ = 'archivebox'
 
+# print('INSTALLING MONKEY PATCHES')
+
 from .monkey_patches import *
 
 import os
@@ -28,3 +30,5 @@ def _detect_installed_version():
 
 
 __version__ = _detect_installed_version()
+
+# print('DONE INSTALLING MONKEY PATCHES')

+ 106 - 56
archivebox/cli/__init__.py

@@ -1,16 +1,20 @@
 __package__ = 'archivebox.cli'
 __command__ = 'archivebox'
 
-import os
 import sys
 import argparse
 import threading
+import archivebox
+
 from time import sleep
+from collections.abc import Mapping
 
-from typing import Optional, Dict, List, IO, Union, Iterable
+from typing import Optional, List, IO, Union, Iterable
 from pathlib import Path
 
-from ..config import OUTPUT_DIR, check_data_folder, check_migrations, stderr
+
+from ..misc.checks import check_data_folder, check_migrations
+from ..misc.logging import stderr
 
 from importlib import import_module
 
@@ -18,13 +22,46 @@ BUILTIN_LIST = list
 
 CLI_DIR = Path(__file__).resolve().parent
 
-# these common commands will appear sorted before any others for ease-of-use
-meta_cmds = ('help', 'version')                               # dont require valid data folder at all
-main_cmds = ('init', 'config', 'setup')                       # dont require existing db present
-archive_cmds = ('add', 'remove', 'update', 'list', 'status')  # require existing db present
-fake_db = ("oneshot",)                                        # use fake in-memory db
 
-display_first = (*meta_cmds, *main_cmds, *archive_cmds)
+# def list_subcommands() -> Dict[str, str]:
+#     """find and import all valid archivebox_<subcommand>.py files in CLI_DIR"""
+#     COMMANDS = []
+#     for filename in os.listdir(CLI_DIR):
+#         if is_cli_module(filename):
+#             subcommand = filename.replace('archivebox_', '').replace('.py', '')
+#             module = import_module('.archivebox_{}'.format(subcommand), __package__)
+#             assert is_valid_cli_module(module, subcommand)
+#             COMMANDS.append((subcommand, module.main.__doc__))
+#             globals()[subcommand] = module.main
+#     display_order = lambda cmd: (
+#         display_first.index(cmd[0])
+#         if cmd[0] in display_first else
+#         100 + len(cmd[0])
+#     )
+#     return dict(sorted(COMMANDS, key=display_order))
+
+# just define it statically, it's much faster:
+SUBCOMMAND_MODULES = {
+    'help': 'archivebox_help',
+    'version': 'archivebox_version',
+    
+    'init': 'archivebox_init',
+    'config': 'archivebox_config',
+    'setup': 'archivebox_setup',
+    
+    'add': 'archivebox_add',
+    'remove': 'archivebox_remove',
+    'update': 'archivebox_update',
+    'list': 'archivebox_list',
+    'status': 'archivebox_status',
+    
+    'schedule': 'archivebox_schedule',
+    'server': 'archivebox_server',
+    'shell': 'archivebox_shell',
+    'manage': 'archivebox_manage',
+
+    'oneshot': 'archivebox_oneshot',
+}
 
 # every imported command module must have these properties in order to be valid
 required_attrs = ('__package__', '__command__', 'main')
@@ -36,6 +73,38 @@ is_valid_cli_module = lambda module, subcommand: (
     and module.__command__.split(' ')[-1] == subcommand
 )
 
+class LazySubcommands(Mapping):
+    def keys(self):
+        return SUBCOMMAND_MODULES.keys()
+    
+    def values(self):
+        return [self[key] for key in self.keys()]
+    
+    def items(self):
+        return [(key, self[key]) for key in self.keys()]
+    
+    def __getitem__(self, key):
+        module = import_module(f'.{SUBCOMMAND_MODULES[key]}', __package__)
+        assert is_valid_cli_module(module, key)
+        return module.main
+    
+    def __iter__(self):
+        return iter(SUBCOMMAND_MODULES.keys())
+    
+    def __len__(self):
+        return len(SUBCOMMAND_MODULES)
+
+CLI_SUBCOMMANDS = LazySubcommands()
+
+
+# these common commands will appear sorted before any others for ease-of-use
+meta_cmds = ('help', 'version')                               # dont require valid data folder at all
+main_cmds = ('init', 'config', 'setup')                       # dont require existing db present
+archive_cmds = ('add', 'remove', 'update', 'list', 'status')  # require existing db present
+fake_db = ("oneshot",)                                        # use fake in-memory db
+
+display_first = (*meta_cmds, *main_cmds, *archive_cmds)
+
 
 IGNORED_BG_THREADS = ('MainThread', 'ThreadPoolExecutor', 'IPythonHistorySavingThread', 'Scheduler')  # threads we dont have to wait for before exiting
 
@@ -71,29 +140,9 @@ def wait_for_bg_threads_to_exit(thread_names: Iterable[str]=(), ignore_names: It
     raise Exception(f'Background threads failed to exit after {tries}s: {threads_summary}')
 
 
-def list_subcommands() -> Dict[str, str]:
-    """find and import all valid archivebox_<subcommand>.py files in CLI_DIR"""
-
-    COMMANDS = []
-    for filename in os.listdir(CLI_DIR):
-        if is_cli_module(filename):
-            subcommand = filename.replace('archivebox_', '').replace('.py', '')
-            module = import_module('.archivebox_{}'.format(subcommand), __package__)
-            assert is_valid_cli_module(module, subcommand)
-            COMMANDS.append((subcommand, module.main.__doc__))
-            globals()[subcommand] = module.main
-
-    display_order = lambda cmd: (
-        display_first.index(cmd[0])
-        if cmd[0] in display_first else
-        100 + len(cmd[0])
-    )
-
-    return dict(sorted(COMMANDS, key=display_order))
-
 
 def run_subcommand(subcommand: str,
-                   subcommand_args: List[str]=None,
+                   subcommand_args: List[str] | None = None,
                    stdin: Optional[IO]=None,
                    pwd: Union[Path, str, None]=None) -> None:
     """Run a given ArchiveBox subcommand with the given list of args"""
@@ -101,18 +150,18 @@ def run_subcommand(subcommand: str,
     subcommand_args = subcommand_args or []
 
     if subcommand not in meta_cmds:
-        from ..config import setup_django
+        from ..config import setup_django, CONFIG
 
         cmd_requires_db = subcommand in archive_cmds
         init_pending = '--init' in subcommand_args or '--quick-init' in subcommand_args
 
         if cmd_requires_db:
-            check_data_folder(pwd)
+            check_data_folder(CONFIG)
 
         setup_django(in_memory_db=subcommand in fake_db, check_db=cmd_requires_db and not init_pending)
 
         if cmd_requires_db:
-            check_migrations()
+            check_migrations(CONFIG)
 
     module = import_module('.archivebox_{}'.format(subcommand), __package__)
     module.main(args=subcommand_args, stdin=stdin, pwd=pwd)    # type: ignore
@@ -121,17 +170,28 @@ def run_subcommand(subcommand: str,
     wait_for_bg_threads_to_exit(timeout=60)
 
 
-SUBCOMMANDS = list_subcommands()
+
+
 
 class NotProvided:
-    pass
+    def __len__(self):
+        return 0
+    def __bool__(self):
+        return False
+    def __repr__(self):
+        return '<not provided>'
+
+Omitted = Union[None, NotProvided]
 
+OMITTED = NotProvided()
 
-def main(args: Optional[List[str]]=NotProvided, stdin: Optional[IO]=NotProvided, pwd: Optional[str]=None) -> None:
-    args = sys.argv[1:] if args is NotProvided else args
-    stdin = sys.stdin if stdin is NotProvided else stdin
 
-    subcommands = list_subcommands()
+def main(args: List[str] | Omitted=OMITTED, stdin: IO | Omitted=OMITTED, pwd: str | None=None) -> None:
+    # print('STARTING CLI MAIN ENTRYPOINT')
+    
+    args = sys.argv[1:] if args is OMITTED else args
+    stdin = sys.stdin if stdin is OMITTED else stdin
+
     parser = argparse.ArgumentParser(
         prog=__command__,
         description='ArchiveBox: The self-hosted internet archive',
@@ -141,19 +201,19 @@ def main(args: Optional[List[str]]=NotProvided, stdin: Optional[IO]=NotProvided,
     group.add_argument(
         '--help', '-h',
         action='store_true',
-        help=subcommands['help'],
+        help=CLI_SUBCOMMANDS['help'].__doc__,
     )
     group.add_argument(
         '--version',
         action='store_true',
-        help=subcommands['version'],
+        help=CLI_SUBCOMMANDS['version'].__doc__,
     )
     group.add_argument(
         "subcommand",
         type=str,
         help= "The name of the subcommand to run",
         nargs='?',
-        choices=subcommands.keys(),
+        choices=CLI_SUBCOMMANDS.keys(),
         default=None,
     )
     parser.add_argument(
@@ -174,23 +234,13 @@ def main(args: Optional[List[str]]=NotProvided, stdin: Optional[IO]=NotProvided,
         log_cli_command(
             subcommand=command.subcommand,
             subcommand_args=command.subcommand_args,
-            stdin=stdin,
-            pwd=pwd or OUTPUT_DIR
+            stdin=stdin or None,
+            pwd=pwd or archivebox.DATA_DIR,
         )
 
     run_subcommand(
         subcommand=command.subcommand,
         subcommand_args=command.subcommand_args,
-        stdin=stdin,
-        pwd=pwd or OUTPUT_DIR,
+        stdin=stdin or None,
+        pwd=pwd or archivebox.DATA_DIR,
     )
-
-
-__all__ = (
-    'SUBCOMMANDS',
-    'list_subcommands',
-    'run_subcommand',
-    *SUBCOMMANDS.keys(),
-)
-
-

+ 125 - 288
archivebox/config.py

@@ -28,21 +28,19 @@ import sys
 import json
 import inspect
 import getpass
-import platform
 import shutil
 import requests
 
 from hashlib import md5
 from pathlib import Path
-from benedict import benedict
 from datetime import datetime, timezone
-from typing import Optional, Type, Tuple, Dict, Union, List
+from typing import Optional, Type, Tuple, Dict
 from subprocess import run, PIPE, DEVNULL, STDOUT, TimeoutExpired
 from configparser import ConfigParser
-from collections import defaultdict
 import importlib.metadata
 
 from pydantic_pkgr import SemVer
+from rich.progress import Progress
 
 import django
 from django.db.backends.sqlite3.base import Database as sqlite3
@@ -56,6 +54,17 @@ from .config_stubs import (
     ConfigDefaultDict,
 )
 
+from .misc.logging import (
+    CONSOLE,
+    SHOW_PROGRESS,
+    DEFAULT_CLI_COLORS,
+    ANSI,
+    COLOR_DICT,
+    stderr,
+    hint,
+)
+from .misc.checks import check_system_config
+
 # print('STARTING CONFIG LOADING')
 
 # load fallback libraries from vendor dir
@@ -70,7 +79,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
     'SHELL_CONFIG': {
         'IS_TTY':                   {'type': bool,  'default': lambda _: sys.stdout.isatty()},
         'USE_COLOR':                {'type': bool,  'default': lambda c: c['IS_TTY']},
-        'SHOW_PROGRESS':            {'type': bool,  'default': lambda c: (c['IS_TTY'] and platform.system() != 'Darwin')},  # progress bars are buggy on mac, disable for now
+        'SHOW_PROGRESS':            {'type': bool,  'default': lambda c: c['IS_TTY']},  # progress bars now enabled whenever stdout is a TTY (previous macOS-only disable removed)
         'IN_DOCKER':                {'type': bool,  'default': False},
         'IN_QEMU':                  {'type': bool,  'default': False},
         'PUID':                     {'type': int,   'default': os.getuid()},
@@ -306,32 +315,7 @@ ROBOTS_TXT_FILENAME = 'robots.txt'
 FAVICON_FILENAME = 'favicon.ico'
 CONFIG_FILENAME = 'ArchiveBox.conf'
 
-DEFAULT_CLI_COLORS = benedict(
-    {
-        "reset": "\033[00;00m",
-        "lightblue": "\033[01;30m",
-        "lightyellow": "\033[01;33m",
-        "lightred": "\033[01;35m",
-        "red": "\033[01;31m",
-        "green": "\033[01;32m",
-        "blue": "\033[01;34m",
-        "white": "\033[01;37m",
-        "black": "\033[01;30m",
-    }
-)
-ANSI = AttrDict({k: '' for k in DEFAULT_CLI_COLORS.keys()})
-
-COLOR_DICT = defaultdict(lambda: [(0, 0, 0), (0, 0, 0)], {
-    '00': [(0, 0, 0), (0, 0, 0)],
-    '30': [(0, 0, 0), (0, 0, 0)],
-    '31': [(255, 0, 0), (128, 0, 0)],
-    '32': [(0, 200, 0), (0, 128, 0)],
-    '33': [(255, 255, 0), (128, 128, 0)],
-    '34': [(0, 0, 255), (0, 0, 128)],
-    '35': [(255, 0, 255), (128, 0, 128)],
-    '36': [(0, 255, 255), (0, 128, 128)],
-    '37': [(255, 255, 255), (255, 255, 255)],
-})
+
 
 STATICFILE_EXTENSIONS = {
     # 99.999% of the time, URLs ending in these extensions are static files
@@ -880,37 +864,6 @@ def parse_version_string(version: str) -> Tuple[int, int, int]:
     return tuple(int(part) for part in base.split('.'))[:3]
 
 
-# Logging Helpers
-def stdout(*args, color: Optional[str]=None, prefix: str='', config: Optional[ConfigDict]=None) -> None:
-    ansi = DEFAULT_CLI_COLORS if (config or {}).get('USE_COLOR') else ANSI
-
-    if color:
-        strs = [ansi[color], ' '.join(str(a) for a in args), ansi['reset'], '\n']
-    else:
-        strs = [' '.join(str(a) for a in args), '\n']
-
-    sys.stdout.write(prefix + ''.join(strs))
-
-def stderr(*args, color: Optional[str]=None, prefix: str='', config: Optional[ConfigDict]=None) -> None:
-    ansi = DEFAULT_CLI_COLORS if (config or {}).get('USE_COLOR') else ANSI
-
-    if color:
-        strs = [ansi[color], ' '.join(str(a) for a in args), ansi['reset'], '\n']
-    else:
-        strs = [' '.join(str(a) for a in args), '\n']
-
-    sys.stderr.write(prefix + ''.join(strs))
-
-def hint(text: Union[Tuple[str, ...], List[str], str], prefix='    ', config: Optional[ConfigDict]=None) -> None:
-    ansi = DEFAULT_CLI_COLORS if (config or {}).get('USE_COLOR') else ANSI
-
-    if isinstance(text, str):
-        stderr('{}{lightred}Hint:{reset} {}'.format(prefix, text, **ansi))
-    else:
-        stderr('{}{lightred}Hint:{reset} {}'.format(prefix, text[0], **ansi))
-        for line in text[1:]:
-            stderr('{}      {}'.format(prefix, line))
-
 
 # Dependency Metadata Helpers
 def bin_version(binary: Optional[str], cmd: Optional[str]=None, timeout: int=3) -> Optional[str]:
@@ -919,6 +872,10 @@ def bin_version(binary: Optional[str], cmd: Optional[str]=None, timeout: int=3)
     abspath = bin_path(binary)
     if not binary or not abspath:
         return None
+    
+    return '999.999.999'
+
+    # Now handled by new BinProvider plugin system, no longer needed:
 
     try:
         bin_env = os.environ | {'LANG': 'C'}
@@ -960,6 +917,9 @@ def bin_path(binary: Optional[str]) -> Optional[str]:
     return shutil.which(str(Path(binary).expanduser())) or shutil.which(str(binary)) or binary
 
 def bin_hash(binary: Optional[str]) -> Optional[str]:
+    return 'UNUSED'
+    # DEPRECATED: now handled by new BinProvider plugin system, no longer needed:
+
     if binary is None:
         return None
     abs_path = bin_path(binary)
@@ -1329,246 +1289,123 @@ if not CONFIG['CHECK_SSL_VALIDITY']:
 
 ########################### Config Validity Checkers ###########################
 
+INITIAL_STARTUP_PROGRESS = None
+INITIAL_STARTUP_PROGRESS_TASK = 0
 
-def check_system_config(config: ConfigDict=CONFIG) -> None:
-    ### Check system environment
-    if config['USER'] == 'root' or str(config['PUID']) == "0":
-        stderr('[!] ArchiveBox should never be run as root!', color='red')
-        stderr('    For more information, see the security overview documentation:')
-        stderr('        https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#do-not-run-as-root')
-        
-        if config['IN_DOCKER']:
-            attempted_command = ' '.join(sys.argv[:3])
-            stderr('')
-            stderr('    {lightred}Hint{reset}: When using Docker, you must run commands with {green}docker run{reset} instead of {lightyellow}docker exec{reset}, e.g.:'.format(**config['ANSI']))
-            stderr(f'        docker compose run archivebox {attempted_command}')
-            stderr(f'        docker run -it -v $PWD/data:/data archivebox/archivebox {attempted_command}')
-            stderr('        or:')
-            stderr(f'        docker compose exec --user=archivebox archivebox /bin/bash -c "archivebox {attempted_command}"')
-            stderr(f'        docker exec -it --user=archivebox <container id> /bin/bash -c "archivebox {attempted_command}"')
-        
-        raise SystemExit(2)
-
-    ### Check Python environment
-    if sys.version_info[:3] < (3, 7, 0):
-        stderr(f'[X] Python version is not new enough: {config["PYTHON_VERSION"]} (>3.6 is required)', color='red')
-        stderr('    See https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting#python for help upgrading your Python installation.')
-        raise SystemExit(2)
-
-    if int(CONFIG['DJANGO_VERSION'].split('.')[0]) < 3:
-        stderr(f'[X] Django version is not new enough: {config["DJANGO_VERSION"]} (>3.0 is required)', color='red')
-        stderr('    Upgrade django using pip or your system package manager: pip3 install --upgrade django')
-        raise SystemExit(2)
-
-    if config['PYTHON_ENCODING'] not in ('UTF-8', 'UTF8'):
-        stderr(f'[X] Your system is running python3 scripts with a bad locale setting: {config["PYTHON_ENCODING"]} (it should be UTF-8).', color='red')
-        stderr('    To fix it, add the line "export PYTHONIOENCODING=UTF-8" to your ~/.bashrc file (without quotes)')
-        stderr('    Or if you\'re using ubuntu/debian, run "dpkg-reconfigure locales"')
-        stderr('')
-        stderr('    Confirm that it\'s fixed by opening a new shell and running:')
-        stderr('        python3 -c "import sys; print(sys.stdout.encoding)"   # should output UTF-8')
-        raise SystemExit(2)
-
-    # stderr('[i] Using Chrome binary: {}'.format(shutil.which(CHROME_BINARY) or CHROME_BINARY))
-    # stderr('[i] Using Chrome data dir: {}'.format(os.path.abspath(CHROME_USER_DATA_DIR)))
-    if config['CHROME_USER_DATA_DIR'] is not None and Path(config['CHROME_USER_DATA_DIR']).exists():
-        if not (Path(config['CHROME_USER_DATA_DIR']) / 'Default').exists():
-            stderr('[X] Could not find profile "Default" in CHROME_USER_DATA_DIR.', color='red')
-            stderr(f'    {config["CHROME_USER_DATA_DIR"]}')
-            stderr('    Make sure you set it to a Chrome user data directory containing a Default profile folder.')
-            stderr('    For more info see:')
-            stderr('        https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#CHROME_USER_DATA_DIR')
-            if '/Default' in str(config['CHROME_USER_DATA_DIR']):
-                stderr()
-                stderr('    Try removing /Default from the end e.g.:')
-                stderr('        CHROME_USER_DATA_DIR="{}"'.format(str(config['CHROME_USER_DATA_DIR']).split('/Default')[0]))
-            
-            # hard error is too annoying here, instead just set it to nothing
-            # raise SystemExit(2)
-            config['CHROME_USER_DATA_DIR'] = None
-    else:
-        config['CHROME_USER_DATA_DIR'] = None
-
-
-def check_dependencies(config: ConfigDict=CONFIG, show_help: bool=True) -> None:
-    invalid_dependencies = [
-        (name, info) for name, info in config['DEPENDENCIES'].items()
-        if info['enabled'] and not info['is_valid']
-    ]
-    if invalid_dependencies and show_help:
-        stderr(f'[!] Warning: Missing {len(invalid_dependencies)} recommended dependencies', color='lightyellow')
-        for dependency, info in invalid_dependencies:
-            stderr(
-                '    ! {}: {} ({})'.format(
-                    dependency,
-                    info['path'] or 'unable to find binary',
-                    info['version'] or 'unable to detect version',
-                )
-            )
-            if dependency in ('YOUTUBEDL_BINARY', 'CHROME_BINARY', 'SINGLEFILE_BINARY', 'READABILITY_BINARY', 'MERCURY_BINARY'):
-                hint(('To install all packages automatically run: archivebox setup',
-                    f'or to disable it and silence this warning: archivebox config --set SAVE_{dependency.rsplit("_", 1)[0]}=False',
-                    ''), prefix='      ')
-        stderr('')
-
-    if config['TIMEOUT'] < 5:
-        stderr(f'[!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={config["TIMEOUT"]} seconds)', color='red')
-        stderr('    You must allow *at least* 5 seconds for indexing and archive methods to run succesfully.')
-        stderr('    (Setting it to somewhere between 30 and 3000 seconds is recommended)')
-        stderr()
-        stderr('    If you want to make ArchiveBox run faster, disable specific archive methods instead:')
-        stderr('        https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles')
-        stderr()
-
-    elif config['USE_CHROME'] and config['TIMEOUT'] < 15:
-        stderr(f'[!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={config["TIMEOUT"]} seconds)', color='red')
-        stderr('    Chrome will fail to archive all sites if set to less than ~15 seconds.')
-        stderr('    (Setting it to somewhere between 30 and 300 seconds is recommended)')
-        stderr()
-        stderr('    If you want to make ArchiveBox run faster, disable specific archive methods instead:')
-        stderr('        https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles')
-        stderr()
-
-    if config['USE_YOUTUBEDL'] and config['MEDIA_TIMEOUT'] < 20:
-        stderr(f'[!] Warning: MEDIA_TIMEOUT is set too low! (currently set to MEDIA_TIMEOUT={config["MEDIA_TIMEOUT"]} seconds)', color='red')
-        stderr('    youtube-dl/yt-dlp will fail to archive any media if set to less than ~20 seconds.')
-        stderr('    (Setting it somewhere over 60 seconds is recommended)')
-        stderr()
-        stderr('    If you want to disable media archiving entirely, set SAVE_MEDIA=False instead:')
-        stderr('        https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#save_media')
-        stderr()
+def bump_startup_progress_bar():
+    global INITIAL_STARTUP_PROGRESS
+    global INITIAL_STARTUP_PROGRESS_TASK
+    if INITIAL_STARTUP_PROGRESS:
+        INITIAL_STARTUP_PROGRESS.update(INITIAL_STARTUP_PROGRESS_TASK, advance=1)   # type: ignore
 
-        
-def check_data_folder(out_dir: Union[str, Path, None]=None, config: ConfigDict=CONFIG) -> None:
-    output_dir = out_dir or config['OUTPUT_DIR']
-    assert isinstance(output_dir, (str, Path))
-
-    archive_dir_exists = (Path(output_dir) / ARCHIVE_DIR_NAME).exists()
-    if not archive_dir_exists:
-        stderr('[X] No archivebox index found in the current directory.', color='red')
-        stderr(f'    {output_dir}', color='lightyellow')
-        stderr()
-        stderr('    {lightred}Hint{reset}: Are you running archivebox in the right folder?'.format(**config['ANSI']))
-        stderr('        cd path/to/your/archive/folder')
-        stderr('        archivebox [command]')
-        stderr()
-        stderr('    {lightred}Hint{reset}: To create a new archive collection or import existing data in this folder, run:'.format(**config['ANSI']))
-        stderr('        archivebox init')
-        raise SystemExit(2)
-
-
-def check_migrations(out_dir: Union[str, Path, None]=None, config: ConfigDict=CONFIG):
-    output_dir = out_dir or config['OUTPUT_DIR']
-    from .index.sql import list_migrations
-
-    pending_migrations = [name for status, name in list_migrations() if not status]
-
-    if pending_migrations:
-        stderr('[X] This collection was created with an older version of ArchiveBox and must be upgraded first.', color='lightyellow')
-        stderr(f'    {output_dir}')
-        stderr()
-        stderr(f'    To upgrade it to the latest version and apply the {len(pending_migrations)} pending migrations, run:')
-        stderr('        archivebox init')
-        raise SystemExit(3)
-
-    (Path(output_dir) / SOURCES_DIR_NAME).mkdir(exist_ok=True)
-    (Path(output_dir) / LOGS_DIR_NAME).mkdir(exist_ok=True)
-    (Path(output_dir) / CACHE_DIR_NAME).mkdir(exist_ok=True)
-    (Path(output_dir) / LIB_DIR_NAME / 'bin').mkdir(exist_ok=True, parents=True)
-    (Path(output_dir) / PERSONAS_DIR_NAME / 'Default').mkdir(exist_ok=True, parents=True)
+def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG, in_memory_db=False) -> None:
+    global INITIAL_STARTUP_PROGRESS
+    global INITIAL_STARTUP_PROGRESS_TASK
+    
+    with Progress(transient=True, expand=True, console=CONSOLE) as INITIAL_STARTUP_PROGRESS:
+        INITIAL_STARTUP_PROGRESS_TASK = INITIAL_STARTUP_PROGRESS.add_task("[green]Loading modules...", total=25)
+        check_system_config(config)
 
+        output_dir = out_dir or Path(config['OUTPUT_DIR'])
 
+        assert isinstance(output_dir, Path) and isinstance(config['PACKAGE_DIR'], Path)
 
-def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG, in_memory_db=False) -> None:
-    check_system_config()
+        bump_startup_progress_bar()
+        try:
+            from django.core.management import call_command
 
-    output_dir = out_dir or Path(config['OUTPUT_DIR'])
+            sys.path.append(str(config['PACKAGE_DIR']))
+            os.environ.setdefault('OUTPUT_DIR', str(output_dir))
+            assert (config['PACKAGE_DIR'] / 'core' / 'settings.py').exists(), 'settings.py was not found at archivebox/core/settings.py'
+            os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings')
 
-    assert isinstance(output_dir, Path) and isinstance(config['PACKAGE_DIR'], Path)
+            # Check to make sure JSON extension is available in our Sqlite3 instance
+            try:
+                cursor = sqlite3.connect(':memory:').cursor()
+                cursor.execute('SELECT JSON(\'{"a": "b"}\')')
+            except sqlite3.OperationalError as exc:
+                stderr(f'[X] Your SQLite3 version is missing the required JSON1 extension: {exc}', color='red')
+                hint([
+                    'Upgrade your Python version or install the extension manually:',
+                    'https://code.djangoproject.com/wiki/JSON1Extension'
+                ])
+                
+            bump_startup_progress_bar()
+
+            if in_memory_db:
+                # some commands (e.g. oneshot) dont store a long-lived sqlite3 db file on disk.
+                # in those cases we create a temporary in-memory db and run the migrations
+                # immediately to get a usable in-memory-database at startup
+                os.environ.setdefault("ARCHIVEBOX_DATABASE_NAME", ":memory:")
+                django.setup()
+                
+                bump_startup_progress_bar()
+                call_command("migrate", interactive=False, verbosity=0)
+            else:
+                # Otherwise use default sqlite3 file-based database and initialize django
+                # without running migrations automatically (user runs them manually by calling init)
+                django.setup()
+            
+            bump_startup_progress_bar()
 
-    try:
-        from django.core.management import call_command
+            from django.conf import settings
 
-        sys.path.append(str(config['PACKAGE_DIR']))
-        os.environ.setdefault('OUTPUT_DIR', str(output_dir))
-        assert (config['PACKAGE_DIR'] / 'core' / 'settings.py').exists(), 'settings.py was not found at archivebox/core/settings.py'
-        os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings')
+            # log startup message to the error log
+            with open(settings.ERROR_LOG, "a", encoding='utf-8') as f:
+                command = ' '.join(sys.argv)
+                ts = datetime.now(timezone.utc).strftime('%Y-%m-%d__%H:%M:%S')
+                f.write(f"\n> {command}; TS={ts} VERSION={config['VERSION']} IN_DOCKER={config['IN_DOCKER']} IS_TTY={config['IS_TTY']}\n")
 
-        # Check to make sure JSON extension is available in our Sqlite3 instance
-        try:
-            cursor = sqlite3.connect(':memory:').cursor()
-            cursor.execute('SELECT JSON(\'{"a": "b"}\')')
-        except sqlite3.OperationalError as exc:
-            stderr(f'[X] Your SQLite3 version is missing the required JSON1 extension: {exc}', color='red')
-            hint([
-                'Upgrade your Python version or install the extension manually:',
-                'https://code.djangoproject.com/wiki/JSON1Extension'
-            ])
-
-        if in_memory_db:
-            # some commands (e.g. oneshot) dont store a long-lived sqlite3 db file on disk.
-            # in those cases we create a temporary in-memory db and run the migrations
-            # immediately to get a usable in-memory-database at startup
-            os.environ.setdefault("ARCHIVEBOX_DATABASE_NAME", ":memory:")
-            django.setup()
-            call_command("migrate", interactive=False, verbosity=0)
-        else:
-            # Otherwise use default sqlite3 file-based database and initialize django
-            # without running migrations automatically (user runs them manually by calling init)
-            django.setup()
-
-        from django.conf import settings
-
-        # log startup message to the error log
-        with open(settings.ERROR_LOG, "a", encoding='utf-8') as f:
-            command = ' '.join(sys.argv)
-            ts = datetime.now(timezone.utc).strftime('%Y-%m-%d__%H:%M:%S')
-            f.write(f"\n> {command}; TS={ts} VERSION={config['VERSION']} IN_DOCKER={config['IN_DOCKER']} IS_TTY={config['IS_TTY']}\n")
-
-        if check_db:
-            # Enable WAL mode in sqlite3
-            from django.db import connection
-            with connection.cursor() as cursor:
-
-                # Set Journal mode to WAL to allow for multiple writers
-                current_mode = cursor.execute("PRAGMA journal_mode")
-                if current_mode != 'wal':
-                    cursor.execute("PRAGMA journal_mode=wal;")
-
-                # Set max blocking delay for concurrent writes and write sync mode
-                # https://litestream.io/tips/#busy-timeout
-                cursor.execute("PRAGMA busy_timeout = 5000;")
-                cursor.execute("PRAGMA synchronous = NORMAL;")
-
-            # Create cache table in DB if needed
-            try:
-                from django.core.cache import cache
-                cache.get('test', None)
-            except django.db.utils.OperationalError:
-                call_command("createcachetable", verbosity=0)
+            if check_db:
+                # Enable WAL mode in sqlite3
+                from django.db import connection
+                with connection.cursor() as cursor:
+
+                    # Set Journal mode to WAL to allow for multiple writers
+                    current_mode = cursor.execute("PRAGMA journal_mode")
+                    if current_mode != 'wal':
+                        cursor.execute("PRAGMA journal_mode=wal;")
+
+                    # Set max blocking delay for concurrent writes and write sync mode
+                    # https://litestream.io/tips/#busy-timeout
+                    cursor.execute("PRAGMA busy_timeout = 5000;")
+                    cursor.execute("PRAGMA synchronous = NORMAL;")
+
+                # Create cache table in DB if needed
+                try:
+                    from django.core.cache import cache
+                    cache.get('test', None)
+                except django.db.utils.OperationalError:
+                    call_command("createcachetable", verbosity=0)
 
-            # if archivebox gets imported multiple times, we have to close
-            # the sqlite3 whenever we init from scratch to avoid multiple threads
-            # sharing the same connection by accident
-            from django.db import connections
-            for conn in connections.all():
-                conn.close_if_unusable_or_obsolete()
+                bump_startup_progress_bar()
 
-            sql_index_path = Path(output_dir) / SQL_INDEX_FILENAME
-            assert sql_index_path.exists(), (
-                f'No database file {SQL_INDEX_FILENAME} found in: {config["OUTPUT_DIR"]} (Are you in an ArchiveBox collection directory?)')
+                # if archivebox gets imported multiple times, we have to close
+                # the sqlite3 whenever we init from scratch to avoid multiple threads
+                # sharing the same connection by accident
+                from django.db import connections
+                for conn in connections.all():
+                    conn.close_if_unusable_or_obsolete()
 
+                sql_index_path = Path(output_dir) / SQL_INDEX_FILENAME
+                assert sql_index_path.exists(), (
+                    f'No database file {SQL_INDEX_FILENAME} found in: {config["OUTPUT_DIR"]} (Are you in an ArchiveBox collection directory?)')
 
-            # https://docs.pydantic.dev/logfire/integrations/django/ Logfire Debugging
-            if settings.DEBUG_LOGFIRE:
-                from opentelemetry.instrumentation.sqlite3 import SQLite3Instrumentor
-                SQLite3Instrumentor().instrument()
+                bump_startup_progress_bar()
 
-                import logfire
+                # https://docs.pydantic.dev/logfire/integrations/django/ Logfire Debugging
+                if settings.DEBUG_LOGFIRE:
+                    from opentelemetry.instrumentation.sqlite3 import SQLite3Instrumentor
+                    SQLite3Instrumentor().instrument()
 
-                logfire.configure()
-                logfire.instrument_django(is_sql_commentor_enabled=True)
-                logfire.info(f'Started ArchiveBox v{CONFIG.VERSION}', argv=sys.argv)
+                    import logfire
+
+                    logfire.configure()
+                    logfire.instrument_django(is_sql_commentor_enabled=True)
+                    logfire.info(f'Started ArchiveBox v{CONFIG.VERSION}', argv=sys.argv)
+
+        except KeyboardInterrupt:
+            raise SystemExit(2)
 
-    except KeyboardInterrupt:
-        raise SystemExit(2)
+    INITIAL_STARTUP_PROGRESS = None
+    INITIAL_STARTUP_PROGRESS_TASK = None

+ 2 - 0
archivebox/core/settings.py

@@ -170,6 +170,7 @@ STATICFILES_DIRS = [
     *[
         str(plugin_dir / 'static')
         for plugin_dir in PLUGIN_DIRS.values()
+        if (plugin_dir / 'static').is_dir()
     ],
     str(PACKAGE_DIR / CONFIG.TEMPLATES_DIR_NAME / 'static'),
 ]
@@ -179,6 +180,7 @@ TEMPLATE_DIRS = [
     *[
         str(plugin_dir / 'templates')
         for plugin_dir in PLUGIN_DIRS.values()
+        if (plugin_dir / 'templates').is_dir()
     ],
     str(PACKAGE_DIR / CONFIG.TEMPLATES_DIR_NAME / 'core'),
     str(PACKAGE_DIR / CONFIG.TEMPLATES_DIR_NAME / 'admin'),

+ 5 - 0
archivebox/core/settings_logging.py

@@ -141,18 +141,22 @@ SETTINGS_LOGGING = {
         "api": {
             "handlers": ["default", "logfile"],
             "level": "DEBUG",
+            "propagate": False,
         },
         "checks": {
             "handlers": ["default", "logfile"],
             "level": "DEBUG",
+            "propagate": False,
         },
         "core": {
             "handlers": ["default", "logfile"],
             "level": "DEBUG",
+            "propagate": False,
         },
         "plugins_extractor": {
             "handlers": ["default", "logfile"],
             "level": "DEBUG",
+            "propagate": False,
         },
         "httpx": {
             "handlers": ["outbound_webhooks"],
@@ -164,6 +168,7 @@ SETTINGS_LOGGING = {
             "handlers": ["default", "logfile"],
             "level": "INFO",
             "filters": ["noisyrequestsfilter"],
+            "propagate": False,
         },
         "django.utils.autoreload": {
             "propagate": False,

+ 3 - 3
archivebox/logging_util.py

@@ -230,7 +230,7 @@ def progress_bar(seconds: int, prefix: str='') -> None:
         print()
 
 
-def log_cli_command(subcommand: str, subcommand_args: List[str], stdin: Optional[str], pwd: str):
+def log_cli_command(subcommand: str, subcommand_args: List[str], stdin: Optional[str | IO], pwd: str):
     cmd = ' '.join(('archivebox', subcommand, *subcommand_args))
     stderr('{black}[i] [{now}] ArchiveBox v{VERSION}: {cmd}{reset}'.format(
         now=datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S'),
@@ -526,11 +526,11 @@ def log_removal_finished(all_links: int, to_remove: int):
 
 
 def log_shell_welcome_msg():
-    from .cli import list_subcommands
+    from .cli import CLI_SUBCOMMANDS
 
     print('{green}# ArchiveBox Imports{reset}'.format(**ANSI))
     print('{green}from core.models import Snapshot, ArchiveResult, Tag, User{reset}'.format(**ANSI))
-    print('{green}from cli import *\n    {}{reset}'.format("\n    ".join(list_subcommands().keys()), **ANSI))
+    print('{green}from cli import *\n    {}{reset}'.format("\n    ".join(CLI_SUBCOMMANDS.keys()), **ANSI))
     print()
     print('[i] Welcome to the ArchiveBox Shell!')
     print('    https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#Shell-Usage')

+ 19 - 21
archivebox/main.py

@@ -16,7 +16,7 @@ from django.db.models import QuerySet
 from django.utils import timezone
 
 from .cli import (
-    list_subcommands,
+    CLI_SUBCOMMANDS,
     run_subcommand,
     display_first,
     meta_cmds,
@@ -66,9 +66,9 @@ from .index.html import (
 )
 from .index.csv import links_to_csv
 from .extractors import archive_links, archive_link, ignore_methods
+from .misc.logging import stderr, hint
+from .misc.checks import check_data_folder, check_dependencies
 from .config import (
-    stderr,
-    hint,
     ConfigDict,
     ANSI,
     IS_TTY,
@@ -98,8 +98,6 @@ from .config import (
     SEARCH_BACKEND_ENGINE,
     LDAP,
     get_version,
-    check_dependencies,
-    check_data_folder,
     write_config_file,
     VERSION,
     VERSIONS_AVAILABLE,
@@ -146,7 +144,7 @@ from .logging_util import (
 def help(out_dir: Path=OUTPUT_DIR) -> None:
     """Print the ArchiveBox help message and usage"""
 
-    all_subcommands = list_subcommands()
+    all_subcommands = CLI_SUBCOMMANDS
     COMMANDS_HELP_TEXT = '\n    '.join(
         f'{cmd.ljust(20)} {summary}'
         for cmd, summary in all_subcommands.items()
@@ -281,7 +279,7 @@ def version(quiet: bool=False,
             print('{white}[i] Data locations:{reset} (not in a data directory)'.format(**ANSI))
 
         print()
-        check_dependencies()
+        check_dependencies(CONFIG)
 
 
 @enforce_types
@@ -469,7 +467,7 @@ def init(force: bool=False, quick: bool=False, setup: bool=False, out_dir: Path=
 def status(out_dir: Path=OUTPUT_DIR) -> None:
     """Print out some info and statistics about the archive collection"""
 
-    check_data_folder(out_dir=out_dir)
+    check_data_folder(CONFIG)
 
     from core.models import Snapshot
     from django.contrib.auth import get_user_model
@@ -609,8 +607,8 @@ def add(urls: Union[str, List[str]],
         run_subcommand('init', stdin=None, pwd=out_dir)
 
     # Load list of links from the existing index
-    check_data_folder(out_dir=out_dir)
-    check_dependencies()
+    check_data_folder(CONFIG)
+    check_dependencies(CONFIG)
     new_links: List[Link] = []
     all_links = load_main_index(out_dir=out_dir)
 
@@ -705,7 +703,7 @@ def remove(filter_str: Optional[str]=None,
            out_dir: Path=OUTPUT_DIR) -> List[Link]:
     """Remove the specified URLs from the archive"""
     
-    check_data_folder(out_dir=out_dir)
+    check_data_folder(CONFIG)
 
     if snapshots is None:
         if filter_str and filter_patterns:
@@ -792,8 +790,8 @@ def update(resume: Optional[float]=None,
     from core.models import ArchiveResult
     from .search import index_links
 
-    check_data_folder(out_dir=out_dir)
-    check_dependencies()
+    check_data_folder(CONFIG)
+    check_dependencies(CONFIG)
     new_links: List[Link] = [] # TODO: Remove input argument: only_new
 
     extractors = extractors.split(",") if extractors else []
@@ -863,7 +861,7 @@ def list_all(filter_patterns_str: Optional[str]=None,
              out_dir: Path=OUTPUT_DIR) -> Iterable[Link]:
     """List, filter, and export information about archive entries"""
     
-    check_data_folder(out_dir=out_dir)
+    check_data_folder(CONFIG)
 
     if filter_patterns and filter_patterns_str:
         stderr(
@@ -911,7 +909,7 @@ def list_links(snapshots: Optional[QuerySet]=None,
                before: Optional[float]=None,
                out_dir: Path=OUTPUT_DIR) -> Iterable[Link]:
     
-    check_data_folder(out_dir=out_dir)
+    check_data_folder(CONFIG)
 
     if snapshots:
         all_snapshots = snapshots
@@ -935,7 +933,7 @@ def list_folders(links: List[Link],
                  status: str,
                  out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     
-    check_data_folder(out_dir=out_dir)
+    check_data_folder(CONFIG)
 
     STATUS_FUNCTIONS = {
         "indexed": get_indexed_folders,
@@ -1080,7 +1078,7 @@ def config(config_options_str: Optional[str]=None,
            out_dir: Path=OUTPUT_DIR) -> None:
     """Get and set your ArchiveBox project configuration values"""
 
-    check_data_folder(out_dir=out_dir)
+    check_data_folder(CONFIG)
 
     if config_options and config_options_str:
         stderr(
@@ -1183,7 +1181,7 @@ def schedule(add: bool=False,
              out_dir: Path=OUTPUT_DIR):
     """Set ArchiveBox to regularly import URLs at specific times using cron"""
     
-    check_data_folder(out_dir=out_dir)
+    check_data_folder(CONFIG)
 
     Path(LOGS_DIR).mkdir(exist_ok=True)
 
@@ -1324,7 +1322,7 @@ def server(runserver_args: Optional[List[str]]=None,
     config.SHOW_PROGRESS = False
     config.DEBUG = config.DEBUG or debug
 
-    check_data_folder(out_dir=out_dir)
+    check_data_folder(CONFIG)
 
     from django.core.management import call_command
     from django.contrib.auth.models import User
@@ -1417,7 +1415,7 @@ def server(runserver_args: Optional[List[str]]=None,
 def manage(args: Optional[List[str]]=None, out_dir: Path=OUTPUT_DIR) -> None:
     """Run an ArchiveBox Django management command"""
 
-    check_data_folder(out_dir=out_dir)
+    check_data_folder(CONFIG)
     from django.core.management import execute_from_command_line
 
     if (args and "createsuperuser" in args) and (IN_DOCKER and not IS_TTY):
@@ -1432,7 +1430,7 @@ def manage(args: Optional[List[str]]=None, out_dir: Path=OUTPUT_DIR) -> None:
 def shell(out_dir: Path=OUTPUT_DIR) -> None:
     """Enter an interactive ArchiveBox Django shell"""
 
-    check_data_folder(out_dir=out_dir)
+    check_data_folder(CONFIG)
 
     from django.core.management import call_command
     call_command("shell_plus")

+ 1 - 1
archivebox/manage.py

@@ -7,7 +7,7 @@ if __name__ == '__main__':
     # versions of ./manage.py commands whenever possible. When that's not possible
     # (e.g. makemigrations), you can comment out this check temporarily
 
-    allowed_commands = ['makemigrations', 'migrate', 'startapp','squashmigrations', 'generate_stubs']
+    allowed_commands = ['makemigrations', 'migrate', 'startapp','squashmigrations', 'generate_stubs', 'test']
 
     if not any(cmd in sys.argv for cmd in allowed_commands):
         print("[X] Don't run ./manage.py directly (unless you are a developer running makemigrations):")

+ 0 - 0
archivebox/misc/__init__.py


+ 159 - 0
archivebox/misc/checks.py

@@ -0,0 +1,159 @@
+__package__ = 'archivebox.misc'
+
+# TODO: migrate all of these to new plugantic/base_check.py Check system
+
+import sys
+from benedict import benedict
+from pathlib import Path
+
+from .logging import stderr, hint
+
+
+def check_system_config(config: benedict) -> None:
+    ### Check system environment
+    if config['USER'] == 'root' or str(config['PUID']) == "0":
+        stderr('[!] ArchiveBox should never be run as root!', color='red')
+        stderr('    For more information, see the security overview documentation:')
+        stderr('        https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#do-not-run-as-root')
+        
+        if config['IN_DOCKER']:
+            attempted_command = ' '.join(sys.argv[:3])
+            stderr('')
+            stderr('    {lightred}Hint{reset}: When using Docker, you must run commands with {green}docker run{reset} instead of {lightyellow}docker exec{reset}, e.g.:'.format(**config['ANSI']))
+            stderr(f'        docker compose run archivebox {attempted_command}')
+            stderr(f'        docker run -it -v $PWD/data:/data archivebox/archivebox {attempted_command}')
+            stderr('        or:')
+            stderr(f'        docker compose exec --user=archivebox archivebox /bin/bash -c "archivebox {attempted_command}"')
+            stderr(f'        docker exec -it --user=archivebox <container id> /bin/bash -c "archivebox {attempted_command}"')
+        
+        raise SystemExit(2)
+
+    ### Check Python environment
+    if sys.version_info[:3] < (3, 7, 0):
+        stderr(f'[X] Python version is not new enough: {config["PYTHON_VERSION"]} (>3.6 is required)', color='red')
+        stderr('    See https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting#python for help upgrading your Python installation.')
+        raise SystemExit(2)
+
+    if int(config['DJANGO_VERSION'].split('.')[0]) < 3:
+        stderr(f'[X] Django version is not new enough: {config["DJANGO_VERSION"]} (>3.0 is required)', color='red')
+        stderr('    Upgrade django using pip or your system package manager: pip3 install --upgrade django')
+        raise SystemExit(2)
+
+    if config['PYTHON_ENCODING'] not in ('UTF-8', 'UTF8'):
+        stderr(f'[X] Your system is running python3 scripts with a bad locale setting: {config["PYTHON_ENCODING"]} (it should be UTF-8).', color='red')
+        stderr('    To fix it, add the line "export PYTHONIOENCODING=UTF-8" to your ~/.bashrc file (without quotes)')
+        stderr('    Or if you\'re using ubuntu/debian, run "dpkg-reconfigure locales"')
+        stderr('')
+        stderr('    Confirm that it\'s fixed by opening a new shell and running:')
+        stderr('        python3 -c "import sys; print(sys.stdout.encoding)"   # should output UTF-8')
+        raise SystemExit(2)
+
+    # stderr('[i] Using Chrome binary: {}'.format(shutil.which(CHROME_BINARY) or CHROME_BINARY))
+    # stderr('[i] Using Chrome data dir: {}'.format(os.path.abspath(CHROME_USER_DATA_DIR)))
+    if config['CHROME_USER_DATA_DIR'] is not None and Path(config['CHROME_USER_DATA_DIR']).exists():
+        if not (Path(config['CHROME_USER_DATA_DIR']) / 'Default').exists():
+            stderr('[X] Could not find profile "Default" in CHROME_USER_DATA_DIR.', color='red')
+            stderr(f'    {config["CHROME_USER_DATA_DIR"]}')
+            stderr('    Make sure you set it to a Chrome user data directory containing a Default profile folder.')
+            stderr('    For more info see:')
+            stderr('        https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#CHROME_USER_DATA_DIR')
+            if '/Default' in str(config['CHROME_USER_DATA_DIR']):
+                stderr()
+                stderr('    Try removing /Default from the end e.g.:')
+                stderr('        CHROME_USER_DATA_DIR="{}"'.format(str(config['CHROME_USER_DATA_DIR']).split('/Default')[0]))
+            
+            # hard error is too annoying here, instead just set it to nothing
+            # raise SystemExit(2)
+            config['CHROME_USER_DATA_DIR'] = None
+    else:
+        config['CHROME_USER_DATA_DIR'] = None
+
+
+def check_dependencies(config: benedict, show_help: bool=True) -> None:
+    invalid_dependencies = [
+        (name, info) for name, info in config['DEPENDENCIES'].items()
+        if info['enabled'] and not info['is_valid']
+    ]
+    if invalid_dependencies and show_help:
+        stderr(f'[!] Warning: Missing {len(invalid_dependencies)} recommended dependencies', color='lightyellow')
+        for dependency, info in invalid_dependencies:
+            stderr(
+                '    ! {}: {} ({})'.format(
+                    dependency,
+                    info['path'] or 'unable to find binary',
+                    info['version'] or 'unable to detect version',
+                )
+            )
+            if dependency in ('YOUTUBEDL_BINARY', 'CHROME_BINARY', 'SINGLEFILE_BINARY', 'READABILITY_BINARY', 'MERCURY_BINARY'):
+                hint(('To install all packages automatically run: archivebox setup',
+                    f'or to disable it and silence this warning: archivebox config --set SAVE_{dependency.rsplit("_", 1)[0]}=False',
+                    ''), prefix='      ')
+        stderr('')
+
+    if config['TIMEOUT'] < 5:
+        stderr(f'[!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={config["TIMEOUT"]} seconds)', color='red')
+        stderr('    You must allow *at least* 5 seconds for indexing and archive methods to run successfully.')
+        stderr('    (Setting it to somewhere between 30 and 3000 seconds is recommended)')
+        stderr()
+        stderr('    If you want to make ArchiveBox run faster, disable specific archive methods instead:')
+        stderr('        https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles')
+        stderr()
+
+    elif config['USE_CHROME'] and config['TIMEOUT'] < 15:
+        stderr(f'[!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={config["TIMEOUT"]} seconds)', color='red')
+        stderr('    Chrome will fail to archive all sites if set to less than ~15 seconds.')
+        stderr('    (Setting it to somewhere between 30 and 300 seconds is recommended)')
+        stderr()
+        stderr('    If you want to make ArchiveBox run faster, disable specific archive methods instead:')
+        stderr('        https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles')
+        stderr()
+
+    if config['USE_YOUTUBEDL'] and config['MEDIA_TIMEOUT'] < 20:
+        stderr(f'[!] Warning: MEDIA_TIMEOUT is set too low! (currently set to MEDIA_TIMEOUT={config["MEDIA_TIMEOUT"]} seconds)', color='red')
+        stderr('    youtube-dl/yt-dlp will fail to archive any media if set to less than ~20 seconds.')
+        stderr('    (Setting it somewhere over 60 seconds is recommended)')
+        stderr()
+        stderr('    If you want to disable media archiving entirely, set SAVE_MEDIA=False instead:')
+        stderr('        https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#save_media')
+        stderr()
+
+        
+
+
+def check_data_folder(config: benedict) -> None:
+    output_dir = config['OUTPUT_DIR']
+
+    archive_dir_exists = (Path(output_dir) / 'archive').exists()
+    if not archive_dir_exists:
+        stderr('[X] No archivebox index found in the current directory.', color='red')
+        stderr(f'    {output_dir}', color='lightyellow')
+        stderr()
+        stderr('    {lightred}Hint{reset}: Are you running archivebox in the right folder?'.format(**config['ANSI']))
+        stderr('        cd path/to/your/archive/folder')
+        stderr('        archivebox [command]')
+        stderr()
+        stderr('    {lightred}Hint{reset}: To create a new archive collection or import existing data in this folder, run:'.format(**config['ANSI']))
+        stderr('        archivebox init')
+        raise SystemExit(2)
+
+
+def check_migrations(config: benedict):
+    output_dir = config['OUTPUT_DIR']
+    
+    from ..index.sql import list_migrations
+
+    pending_migrations = [name for status, name in list_migrations() if not status]
+
+    if pending_migrations:
+        stderr('[X] This collection was created with an older version of ArchiveBox and must be upgraded first.', color='lightyellow')
+        stderr(f'    {output_dir}')
+        stderr()
+        stderr(f'    To upgrade it to the latest version and apply the {len(pending_migrations)} pending migrations, run:')
+        stderr('        archivebox init')
+        raise SystemExit(3)
+
+    (Path(output_dir) / config['SOURCES_DIR_NAME']).mkdir(exist_ok=True)
+    (Path(output_dir) / config['LOGS_DIR_NAME']).mkdir(exist_ok=True)
+    (Path(output_dir) / config['CACHE_DIR_NAME']).mkdir(exist_ok=True)
+    (Path(output_dir) / config['LIB_DIR_NAME'] / 'bin').mkdir(exist_ok=True, parents=True)
+    (Path(output_dir) / config['PERSONAS_DIR_NAME'] / 'Default').mkdir(exist_ok=True, parents=True)

+ 30 - 0
archivebox/misc/debugging.py

@@ -0,0 +1,30 @@
+from functools import wraps
+from time import time
+
+def timed_function(func):
+    """
+    Very simple profiling decorator for debugging.
+    Usage:
+        @timed_function
+        def my_func():
+            ...
+    
+    More advanced alternatives:
+        - viztracer ../.venv/bin/archivebox manage check          # https://viztracer.readthedocs.io/en/latest/filter.html
+        - python -m cProfile -o archivebox.prof ../.venv/bin/archivebox manage check; snakeviz archivebox.prof
+        - Django Debug Toolbar + django-debug-toolbar-flamegraph
+        + Django Requests Tracker (requests-tracker)
+    """
+    @wraps(func)
+    def wrap(*args, **kwargs):
+        if args and hasattr(args[0], '__module__'):
+            module = args[0].__module__
+        else:
+            module = func.__module__
+        ts_start = time()
+        result = func(*args, **kwargs)
+        ts_end = time()
+        ms_elapsed = int((ts_end-ts_start) * 1000)
+        print(f'[DEBUG][{ms_elapsed}ms] {module}.{func.__name__}(...)')
+        return result
+    return wrap

+ 77 - 0
archivebox/misc/logging.py

@@ -0,0 +1,77 @@
+__package__ = 'archivebox.misc'
+
+# TODO: merge/dedupe this file with archivebox/logging_util.py
+
+import os
+import sys
+from typing import Optional, Union, Tuple, List
+from collections import defaultdict
+from benedict import benedict
+from rich.console import Console
+
+from ..config_stubs import ConfigDict
+
+SHOW_PROGRESS = None
+if os.environ.get('SHOW_PROGRESS', 'None') in ('True', '1', 'true', 'yes'):
+    SHOW_PROGRESS = True
+
+CONSOLE = Console(force_interactive=SHOW_PROGRESS)
+SHOW_PROGRESS = CONSOLE.is_interactive if SHOW_PROGRESS is None else SHOW_PROGRESS
+
+DEFAULT_CLI_COLORS = benedict(
+    {
+        "reset": "\033[00;00m",
+        "lightblue": "\033[01;30m",
+        "lightyellow": "\033[01;33m",
+        "lightred": "\033[01;35m",
+        "red": "\033[01;31m",
+        "green": "\033[01;32m",
+        "blue": "\033[01;34m",
+        "white": "\033[01;37m",
+        "black": "\033[01;30m",
+    }
+)
+ANSI = benedict({k: '' for k in DEFAULT_CLI_COLORS.keys()})
+
+COLOR_DICT = defaultdict(lambda: [(0, 0, 0), (0, 0, 0)], {
+    '00': [(0, 0, 0), (0, 0, 0)],
+    '30': [(0, 0, 0), (0, 0, 0)],
+    '31': [(255, 0, 0), (128, 0, 0)],
+    '32': [(0, 200, 0), (0, 128, 0)],
+    '33': [(255, 255, 0), (128, 128, 0)],
+    '34': [(0, 0, 255), (0, 0, 128)],
+    '35': [(255, 0, 255), (128, 0, 128)],
+    '36': [(0, 255, 255), (0, 128, 128)],
+    '37': [(255, 255, 255), (255, 255, 255)],
+})
+
+# Logging Helpers
+def stdout(*args, color: Optional[str]=None, prefix: str='', config: Optional[ConfigDict]=None) -> None:
+    ansi = DEFAULT_CLI_COLORS if (config or {}).get('USE_COLOR') else ANSI
+
+    if color:
+        strs = [ansi[color], ' '.join(str(a) for a in args), ansi['reset'], '\n']
+    else:
+        strs = [' '.join(str(a) for a in args), '\n']
+
+    sys.stdout.write(prefix + ''.join(strs))
+
+def stderr(*args, color: Optional[str]=None, prefix: str='', config: Optional[ConfigDict]=None) -> None:
+    ansi = DEFAULT_CLI_COLORS if (config or {}).get('USE_COLOR') else ANSI
+
+    if color:
+        strs = [ansi[color], ' '.join(str(a) for a in args), ansi['reset'], '\n']
+    else:
+        strs = [' '.join(str(a) for a in args), '\n']
+
+    sys.stderr.write(prefix + ''.join(strs))
+
+def hint(text: Union[Tuple[str, ...], List[str], str], prefix='    ', config: Optional[ConfigDict]=None) -> None:
+    ansi = DEFAULT_CLI_COLORS if (config or {}).get('USE_COLOR') else ANSI
+
+    if isinstance(text, str):
+        stderr('{}{lightred}Hint:{reset} {}'.format(prefix, text, **ansi))
+    else:
+        stderr('{}{lightred}Hint:{reset} {}'.format(prefix, text[0], **ansi))
+        for line in text[1:]:
+            stderr('{}      {}'.format(prefix, line))

+ 0 - 1
archivebox/monkey_patches.py

@@ -10,7 +10,6 @@ import datetime
 from django.utils import timezone
 timezone.utc = datetime.timezone.utc
 
-
 # monkey patch django-signals-webhooks to change how it shows up in Admin UI
 # from signal_webhooks.apps import DjangoSignalWebhooksConfig
 # DjangoSignalWebhooksConfig.verbose_name = 'API'

+ 3 - 3
archivebox/package-lock.json

@@ -371,9 +371,9 @@
       "license": "Apache-2.0"
     },
     "node_modules/bare-events": {
-      "version": "2.4.2",
-      "resolved": "https://registry.npmjs.org/bare-events/-/bare-events-2.4.2.tgz",
-      "integrity": "sha512-qMKFd2qG/36aA4GwvKq8MxnPgCQAmBWmSyLWsJcbn8v03wvIPQ/hG1Ms8bPzndZxMDoHpxez5VOS+gC9Yi24/Q==",
+      "version": "2.5.0",
+      "resolved": "https://registry.npmjs.org/bare-events/-/bare-events-2.5.0.tgz",
+      "integrity": "sha512-/E8dDe9dsbLyh2qrZ64PEPadOQ0F4gbl1sUJOrmph7xOiIxfY8vwab/4bFLh4Y88/Hk/ujKcrQKc+ps0mv873A==",
       "license": "Apache-2.0",
       "optional": true
     },

+ 3 - 3
archivebox/plugantic/base_binary.py

@@ -3,6 +3,7 @@ __package__ = "archivebox.plugantic"
 from typing import Dict, List
 from typing_extensions import Self
 
+from benedict import benedict
 from pydantic import Field, InstanceOf, validate_call
 from pydantic_pkgr import (
     Binary,
@@ -17,7 +18,6 @@ from pydantic_pkgr import (
 from django.conf import settings
 
 from .base_hook import BaseHook, HookType
-from ..config_stubs import AttrDict
 
 
 class BaseBinProvider(BaseHook, BinProvider):
@@ -38,7 +38,7 @@ class BaseBinProvider(BaseHook, BinProvider):
     def register(self, settings, parent_plugin=None):
         # self._plugin = parent_plugin                                      # for debugging only, never rely on this!
 
-        settings.BINPROVIDERS = getattr(settings, "BINPROVIDERS", None) or AttrDict({})
+        settings.BINPROVIDERS = getattr(settings, "BINPROVIDERS", None) or benedict({})
         settings.BINPROVIDERS[self.id] = self
 
         super().register(settings, parent_plugin=parent_plugin)
@@ -58,7 +58,7 @@ class BaseBinary(BaseHook, Binary):
     def register(self, settings, parent_plugin=None):
         # self._plugin = parent_plugin                                      # for debugging only, never rely on this!
 
-        settings.BINARIES = getattr(settings, "BINARIES", None) or AttrDict({})
+        settings.BINARIES = getattr(settings, "BINARIES", None) or benedict({})
         settings.BINARIES[self.id] = self
 
         super().register(settings, parent_plugin=parent_plugin)

+ 2 - 5
archivebox/plugantic/base_check.py

@@ -28,7 +28,7 @@ class BaseCheck(BaseHook):
     def register(self, settings, parent_plugin=None):
         # self._plugin = parent_plugin  # backref to parent is for debugging only, never rely on this!
 
-        self.register_with_django_check_system()  # (SIDE EFFECT)
+        self.register_with_django_check_system(settings)  # (SIDE EFFECT)
 
         # install hook into settings.CHECKS
         settings.CHECKS = getattr(settings, "CHECKS", None) or AttrDict({})
@@ -37,12 +37,9 @@ class BaseCheck(BaseHook):
         # record installed hook in settings.HOOKS
         super().register(settings, parent_plugin=parent_plugin)
 
-    def register_with_django_check_system(self):
-
+    def register_with_django_check_system(self, settings):
         def run_check(app_configs, **kwargs) -> List[Warning]:
-            from django.conf import settings
             import logging
-
             return self.check(settings, logging.getLogger("checks"))
 
         run_check.__name__ = self.id

+ 2 - 3
archivebox/plugantic/base_hook.py

@@ -96,14 +96,13 @@ class BaseHook(BaseModel):
         # e.g. /admin/environment/config/LdapConfig/
         return f"/admin/environment/{self.hook_type.lower()}/{self.id}/"
 
-
     def register(self, settings, parent_plugin=None):
         """Load a record of an installed hook into global Django settings.HOOKS at runtime."""
         self._plugin = parent_plugin         # for debugging only, never rely on this!
 
         # assert json.dumps(self.model_json_schema(), indent=4), f"Hook {self.hook_module} has invalid JSON schema."
 
-        print('  -', self.hook_module, '.register()')
+        # print('  -', self.hook_module, '.register()')
 
         # record installed hook in settings.HOOKS
         settings.HOOKS[self.id] = self
@@ -118,7 +117,7 @@ class BaseHook(BaseModel):
     def ready(self, settings):
         """Runs any runtime code needed when AppConfig.ready() is called (after all models are imported)."""
 
-        print('  -', self.hook_module, '.ready()')
+        # print('  -', self.hook_module, '.ready()')
 
         assert self.id in settings.HOOKS, f"Tried to ready hook {self.hook_module} but it is not registered in settings.HOOKS."
 

+ 14 - 11
archivebox/plugantic/base_plugin.py

@@ -1,6 +1,5 @@
 __package__ = 'archivebox.plugantic'
 
-import json
 import inspect
 from pathlib import Path
 
@@ -18,10 +17,11 @@ from pydantic import (
     computed_field,
     validate_call,
 )
+from benedict import benedict
 
 from .base_hook import BaseHook, HookType
 
-from ..config import AttrDict
+from ..config import bump_startup_progress_bar
 
 
 class BasePlugin(BaseModel):
@@ -90,7 +90,8 @@ class BasePlugin(BaseModel):
         
         assert self.app_label and self.app_label and self.verbose_name, f'{self.__class__.__name__} is missing .name or .app_label or .verbose_name'
         
-        assert json.dumps(self.model_json_schema(), indent=4), f"Plugin {self.plugin_module} has invalid JSON schema."
+        # assert json.dumps(self.model_json_schema(), indent=4), f"Plugin {self.plugin_module} has invalid JSON schema."
+        
         return self
     
     @property
@@ -114,13 +115,13 @@ class BasePlugin(BaseModel):
 
     @property
     def HOOKS_BY_ID(self) -> Dict[str, InstanceOf[BaseHook]]:
-        return AttrDict({hook.id: hook for hook in self.hooks})
+        return benedict({hook.id: hook for hook in self.hooks})
 
     @property
     def HOOKS_BY_TYPE(self) -> Dict[HookType, Dict[str, InstanceOf[BaseHook]]]:
-        hooks = AttrDict({})
+        hooks = benedict({})
         for hook in self.hooks:
-            hooks[hook.hook_type] = hooks.get(hook.hook_type) or AttrDict({})
+            hooks[hook.hook_type] = hooks.get(hook.hook_type) or benedict({})
             hooks[hook.hook_type][hook.id] = hook
         return hooks
 
@@ -131,10 +132,10 @@ class BasePlugin(BaseModel):
             from django.conf import settings as django_settings
             settings = django_settings
             
-        print()
-        print(self.plugin_module_full, '.register()')
+        # print()
+        # print(self.plugin_module_full, '.register()')
 
-        assert json.dumps(self.model_json_schema(), indent=4), f'Plugin {self.plugin_module} has invalid JSON schema.'
+        # assert json.dumps(self.model_json_schema(), indent=4), f'Plugin {self.plugin_module} has invalid JSON schema.'
 
         assert self.id not in settings.PLUGINS, f'Tried to register plugin {self.plugin_module} but it conflicts with existing plugin of the same name ({self.app_label}).'
 
@@ -149,6 +150,7 @@ class BasePlugin(BaseModel):
 
         settings.PLUGINS[self.id]._is_registered = True
         # print('√ REGISTERED PLUGIN:', self.plugin_module)
+        bump_startup_progress_bar()
 
     def ready(self, settings=None):
         """Runs any runtime code needed when AppConfig.ready() is called (after all models are imported)."""
@@ -157,8 +159,8 @@ class BasePlugin(BaseModel):
             from django.conf import settings as django_settings
             settings = django_settings
 
-        print()
-        print(self.plugin_module_full, '.ready()')
+        # print()
+        # print(self.plugin_module_full, '.ready()')
 
         assert (
             self.id in settings.PLUGINS and settings.PLUGINS[self.id]._is_registered
@@ -171,6 +173,7 @@ class BasePlugin(BaseModel):
             hook.ready(settings)
         
         settings.PLUGINS[self.id]._is_ready = True
+        bump_startup_progress_bar()
 
     # @validate_call
     # def install_binaries(self) -> Self:

+ 0 - 335
archivebox/plugantic/ini_to_toml.py

@@ -83,338 +83,3 @@ class JSONSchemaWithLambdas(GenerateJsonSchema):
     # for computed_field properties render them like this instead:
     # inspect.getsource(field.wrapped_property.fget).split('def ', 1)[-1].split('\n', 1)[-1].strip().strip('return '),
 
-
-
-### Basic Assertions
-
-# test_input = """
-# [SERVER_CONFIG]
-# IS_TTY=False
-# USE_COLOR=False
-# SHOW_PROGRESS=False
-# IN_DOCKER=False
-# IN_QEMU=False
-# PUID=501
-# PGID=20
-# OUTPUT_DIR=/opt/archivebox/data
-# CONFIG_FILE=/opt/archivebox/data/ArchiveBox.conf
-# ONLY_NEW=True
-# TIMEOUT=60
-# MEDIA_TIMEOUT=3600
-# OUTPUT_PERMISSIONS=644
-# RESTRICT_FILE_NAMES=windows
-# URL_DENYLIST=\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$
-# URL_ALLOWLIST=None
-# ADMIN_USERNAME=None
-# ADMIN_PASSWORD=None
-# ENFORCE_ATOMIC_WRITES=True
-# TAG_SEPARATOR_PATTERN=[,]
-# SECRET_KEY=xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
-# BIND_ADDR=127.0.0.1:8000
-# ALLOWED_HOSTS=*
-# DEBUG=False
-# PUBLIC_INDEX=True
-# PUBLIC_SNAPSHOTS=True
-# PUBLIC_ADD_VIEW=False
-# FOOTER_INFO=Content is hosted for personal archiving purposes only.  Contact server owner for any takedown requests.
-# SNAPSHOTS_PER_PAGE=40
-# CUSTOM_TEMPLATES_DIR=None
-# TIME_ZONE=UTC
-# TIMEZONE=UTC
-# REVERSE_PROXY_USER_HEADER=Remote-User
-# REVERSE_PROXY_WHITELIST=
-# LOGOUT_REDIRECT_URL=/
-# PREVIEW_ORIGINALS=True
-# LDAP=False
-# LDAP_SERVER_URI=None
-# LDAP_BIND_DN=None
-# LDAP_BIND_PASSWORD=None
-# LDAP_USER_BASE=None
-# LDAP_USER_FILTER=None
-# LDAP_USERNAME_ATTR=None
-# LDAP_FIRSTNAME_ATTR=None
-# LDAP_LASTNAME_ATTR=None
-# LDAP_EMAIL_ATTR=None
-# LDAP_CREATE_SUPERUSER=False
-# SAVE_TITLE=True
-# SAVE_FAVICON=True
-# SAVE_WGET=True
-# SAVE_WGET_REQUISITES=True
-# SAVE_SINGLEFILE=True
-# SAVE_READABILITY=True
-# SAVE_MERCURY=True
-# SAVE_HTMLTOTEXT=True
-# SAVE_PDF=True
-# SAVE_SCREENSHOT=True
-# SAVE_DOM=True
-# SAVE_HEADERS=True
-# SAVE_WARC=True
-# SAVE_GIT=True
-# SAVE_MEDIA=True
-# SAVE_ARCHIVE_DOT_ORG=True
-# RESOLUTION=1440,2000
-# GIT_DOMAINS=github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht
-# CHECK_SSL_VALIDITY=True
-# MEDIA_MAX_SIZE=750m
-# USER_AGENT=None
-# CURL_USER_AGENT=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/0.8.0 (+https://github.com/ArchiveBox/ArchiveBox/) curl/curl 8.4.0 (x86_64-apple-darwin23.0)
-# WGET_USER_AGENT=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/0.8.0 (+https://github.com/ArchiveBox/ArchiveBox/) wget/GNU Wget 1.24.5
-# CHROME_USER_AGENT=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/0.8.0 (+https://github.com/ArchiveBox/ArchiveBox/)
-# COOKIES_FILE=None
-# CHROME_USER_DATA_DIR=None
-# CHROME_TIMEOUT=0
-# CHROME_HEADLESS=True
-# CHROME_SANDBOX=True
-# CHROME_EXTRA_ARGS=[]
-# YOUTUBEDL_ARGS=['--restrict-filenames', '--trim-filenames', '128', '--write-description', '--write-info-json', '--write-annotations', '--write-thumbnail', '--no-call-home', '--write-sub', '--write-auto-subs', '--convert-subs=srt', '--yes-playlist', '--continue', '--no-abort-on-error', '--ignore-errors', '--geo-bypass', '--add-metadata', '--format=(bv*+ba/b)[filesize<=750m][filesize_approx<=?750m]/(bv*+ba/b)']
-# YOUTUBEDL_EXTRA_ARGS=[]
-# WGET_ARGS=['--no-verbose', '--adjust-extension', '--convert-links', '--force-directories', '--backup-converted', '--span-hosts', '--no-parent', '-e', 'robots=off']
-# WGET_EXTRA_ARGS=[]
-# CURL_ARGS=['--silent', '--location', '--compressed']
-# CURL_EXTRA_ARGS=[]
-# GIT_ARGS=['--recursive']
-# SINGLEFILE_ARGS=[]
-# SINGLEFILE_EXTRA_ARGS=[]
-# MERCURY_ARGS=['--format=text']
-# MERCURY_EXTRA_ARGS=[]
-# FAVICON_PROVIDER=https://www.google.com/s2/favicons?domain={}
-# USE_INDEXING_BACKEND=True
-# USE_SEARCHING_BACKEND=True
-# SEARCH_BACKEND_ENGINE=ripgrep
-# SEARCH_BACKEND_HOST_NAME=localhost
-# SEARCH_BACKEND_PORT=1491
-# SEARCH_BACKEND_PASSWORD=SecretPassword
-# SEARCH_PROCESS_HTML=True
-# SONIC_COLLECTION=archivebox
-# SONIC_BUCKET=snapshots
-# SEARCH_BACKEND_TIMEOUT=90
-# FTS_SEPARATE_DATABASE=True
-# FTS_TOKENIZERS=porter unicode61 remove_diacritics 2
-# FTS_SQLITE_MAX_LENGTH=1000000000
-# USE_CURL=True
-# USE_WGET=True
-# USE_SINGLEFILE=True
-# USE_READABILITY=True
-# USE_MERCURY=True
-# USE_GIT=True
-# USE_CHROME=True
-# USE_NODE=True
-# USE_YOUTUBEDL=True
-# USE_RIPGREP=True
-# CURL_BINARY=curl
-# GIT_BINARY=git
-# WGET_BINARY=wget
-# SINGLEFILE_BINARY=single-file
-# READABILITY_BINARY=readability-extractor
-# MERCURY_BINARY=postlight-parser
-# YOUTUBEDL_BINARY=yt-dlp
-# NODE_BINARY=node
-# RIPGREP_BINARY=rg
-# CHROME_BINARY=chrome
-# POCKET_CONSUMER_KEY=None
-# USER=squash
-# PACKAGE_DIR=/opt/archivebox/archivebox
-# TEMPLATES_DIR=/opt/archivebox/archivebox/templates
-# ARCHIVE_DIR=/opt/archivebox/data/archive
-# SOURCES_DIR=/opt/archivebox/data/sources
-# LOGS_DIR=/opt/archivebox/data/logs
-# PERSONAS_DIR=/opt/archivebox/data/personas
-# URL_DENYLIST_PTN=re.compile('\\.(css|js|otf|ttf|woff|woff2|gstatic\\.com|googleapis\\.com/css)(\\?.*)?$', re.IGNORECASE|re.MULTILINE)
-# URL_ALLOWLIST_PTN=None
-# DIR_OUTPUT_PERMISSIONS=755
-# ARCHIVEBOX_BINARY=/opt/archivebox/.venv/bin/archivebox
-# VERSION=0.8.0
-# COMMIT_HASH=102e87578c6036bb0132dd1ebd17f8f05ffc880f
-# BUILD_TIME=2024-05-15 03:28:05 1715768885
-# VERSIONS_AVAILABLE=None
-# CAN_UPGRADE=False
-# PYTHON_BINARY=/opt/archivebox/.venv/bin/python3.10
-# PYTHON_ENCODING=UTF-8
-# PYTHON_VERSION=3.10.14
-# DJANGO_BINARY=/opt/archivebox/.venv/lib/python3.10/site-packages/django/__init__.py
-# DJANGO_VERSION=5.0.6 final (0)
-# SQLITE_BINARY=/opt/homebrew/Cellar/[email protected]/3.10.14/Frameworks/Python.framework/Versions/3.10/lib/python3.10/sqlite3/dbapi2.py
-# SQLITE_VERSION=2.6.0
-# CURL_VERSION=curl 8.4.0 (x86_64-apple-darwin23.0)
-# WGET_VERSION=GNU Wget 1.24.5
-# WGET_AUTO_COMPRESSION=True
-# RIPGREP_VERSION=ripgrep 14.1.0
-# SINGLEFILE_VERSION=None
-# READABILITY_VERSION=None
-# MERCURY_VERSION=None
-# GIT_VERSION=git version 2.44.0
-# YOUTUBEDL_VERSION=2024.04.09
-# CHROME_VERSION=Google Chrome 124.0.6367.207
-# NODE_VERSION=v21.7.3
-# """
-
-
-# expected_output = TOML_HEADER + '''[SERVER_CONFIG]
-# IS_TTY = false
-# USE_COLOR = false
-# SHOW_PROGRESS = false
-# IN_DOCKER = false
-# IN_QEMU = false
-# PUID = 501
-# PGID = 20
-# OUTPUT_DIR = "/opt/archivebox/data"
-# CONFIG_FILE = "/opt/archivebox/data/ArchiveBox.conf"
-# ONLY_NEW = true
-# TIMEOUT = 60
-# MEDIA_TIMEOUT = 3600
-# OUTPUT_PERMISSIONS = 644
-# RESTRICT_FILE_NAMES = "windows"
-# URL_DENYLIST = "\\\\.(css|js|otf|ttf|woff|woff2|gstatic\\\\.com|googleapis\\\\.com/css)(\\\\?.*)?$"
-# URL_ALLOWLIST = null
-# ADMIN_USERNAME = null
-# ADMIN_PASSWORD = null
-# ENFORCE_ATOMIC_WRITES = true
-# TAG_SEPARATOR_PATTERN = "[,]"
-# SECRET_KEY = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
-# BIND_ADDR = "127.0.0.1:8000"
-# ALLOWED_HOSTS = "*"
-# DEBUG = false
-# PUBLIC_INDEX = true
-# PUBLIC_SNAPSHOTS = true
-# PUBLIC_ADD_VIEW = false
-# FOOTER_INFO = "Content is hosted for personal archiving purposes only.  Contact server owner for any takedown requests."
-# SNAPSHOTS_PER_PAGE = 40
-# CUSTOM_TEMPLATES_DIR = null
-# TIME_ZONE = "UTC"
-# TIMEZONE = "UTC"
-# REVERSE_PROXY_USER_HEADER = "Remote-User"
-# REVERSE_PROXY_WHITELIST = ""
-# LOGOUT_REDIRECT_URL = "/"
-# PREVIEW_ORIGINALS = true
-# LDAP = false
-# LDAP_SERVER_URI = null
-# LDAP_BIND_DN = null
-# LDAP_BIND_PASSWORD = null
-# LDAP_USER_BASE = null
-# LDAP_USER_FILTER = null
-# LDAP_USERNAME_ATTR = null
-# LDAP_FIRSTNAME_ATTR = null
-# LDAP_LASTNAME_ATTR = null
-# LDAP_EMAIL_ATTR = null
-# LDAP_CREATE_SUPERUSER = false
-# SAVE_TITLE = true
-# SAVE_FAVICON = true
-# SAVE_WGET = true
-# SAVE_WGET_REQUISITES = true
-# SAVE_SINGLEFILE = true
-# SAVE_READABILITY = true
-# SAVE_MERCURY = true
-# SAVE_HTMLTOTEXT = true
-# SAVE_PDF = true
-# SAVE_SCREENSHOT = true
-# SAVE_DOM = true
-# SAVE_HEADERS = true
-# SAVE_WARC = true
-# SAVE_GIT = true
-# SAVE_MEDIA = true
-# SAVE_ARCHIVE_DOT_ORG = true
-# RESOLUTION = [1440, 2000]
-# GIT_DOMAINS = "github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht"
-# CHECK_SSL_VALIDITY = true
-# MEDIA_MAX_SIZE = "750m"
-# USER_AGENT = null
-# CURL_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/0.8.0 (+https://github.com/ArchiveBox/ArchiveBox/) curl/curl 8.4.0 (x86_64-apple-darwin23.0)"
-# WGET_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/0.8.0 (+https://github.com/ArchiveBox/ArchiveBox/) wget/GNU Wget 1.24.5"
-# CHROME_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/0.8.0 (+https://github.com/ArchiveBox/ArchiveBox/)"
-# COOKIES_FILE = null
-# CHROME_USER_DATA_DIR = null
-# CHROME_TIMEOUT = false
-# CHROME_HEADLESS = true
-# CHROME_SANDBOX = true
-# CHROME_EXTRA_ARGS = []
-# YOUTUBEDL_ARGS = ["--restrict-filenames", "--trim-filenames", "128", "--write-description", "--write-info-json", "--write-annotations", "--write-thumbnail", "--no-call-home", "--write-sub", "--write-auto-subs", "--convert-subs=srt", "--yes-playlist", "--continue", "--no-abort-on-error", "--ignore-errors", "--geo-bypass", "--add-metadata", "--format=(bv*+ba/b)[filesize<=750m][filesize_approx<=?750m]/(bv*+ba/b)"]
-# YOUTUBEDL_EXTRA_ARGS = []
-# WGET_ARGS = ["--no-verbose", "--adjust-extension", "--convert-links", "--force-directories", "--backup-converted", "--span-hosts", "--no-parent", "-e", "robots=off"]
-# WGET_EXTRA_ARGS = []
-# CURL_ARGS = ["--silent", "--location", "--compressed"]
-# CURL_EXTRA_ARGS = []
-# GIT_ARGS = ["--recursive"]
-# SINGLEFILE_ARGS = []
-# SINGLEFILE_EXTRA_ARGS = []
-# MERCURY_ARGS = ["--format=text"]
-# MERCURY_EXTRA_ARGS = []
-# FAVICON_PROVIDER = "https://www.google.com/s2/favicons?domain={}"
-# USE_INDEXING_BACKEND = true
-# USE_SEARCHING_BACKEND = true
-# SEARCH_BACKEND_ENGINE = "ripgrep"
-# SEARCH_BACKEND_HOST_NAME = "localhost"
-# SEARCH_BACKEND_PORT = 1491
-# SEARCH_BACKEND_PASSWORD = "SecretPassword"
-# SEARCH_PROCESS_HTML = true
-# SONIC_COLLECTION = "archivebox"
-# SONIC_BUCKET = "snapshots"
-# SEARCH_BACKEND_TIMEOUT = 90
-# FTS_SEPARATE_DATABASE = true
-# FTS_TOKENIZERS = "porter unicode61 remove_diacritics 2"
-# FTS_SQLITE_MAX_LENGTH = 1000000000
-# USE_CURL = true
-# USE_WGET = true
-# USE_SINGLEFILE = true
-# USE_READABILITY = true
-# USE_MERCURY = true
-# USE_GIT = true
-# USE_CHROME = true
-# USE_NODE = true
-# USE_YOUTUBEDL = true
-# USE_RIPGREP = true
-# CURL_BINARY = "curl"
-# GIT_BINARY = "git"
-# WGET_BINARY = "wget"
-# SINGLEFILE_BINARY = "single-file"
-# READABILITY_BINARY = "readability-extractor"
-# MERCURY_BINARY = "postlight-parser"
-# YOUTUBEDL_BINARY = "yt-dlp"
-# NODE_BINARY = "node"
-# RIPGREP_BINARY = "rg"
-# CHROME_BINARY = "chrome"
-# POCKET_CONSUMER_KEY = null
-# USER = "squash"
-# PACKAGE_DIR = "/opt/archivebox/archivebox"
-# TEMPLATES_DIR = "/opt/archivebox/archivebox/templates"
-# ARCHIVE_DIR = "/opt/archivebox/data/archive"
-# SOURCES_DIR = "/opt/archivebox/data/sources"
-# LOGS_DIR = "/opt/archivebox/data/logs"
-# PERSONAS_DIR = "/opt/archivebox/data/personas"
-# URL_DENYLIST_PTN = "re.compile(\'\\\\.(css|js|otf|ttf|woff|woff2|gstatic\\\\.com|googleapis\\\\.com/css)(\\\\?.*)?$\', re.IGNORECASE|re.MULTILINE)"
-# URL_ALLOWLIST_PTN = null
-# DIR_OUTPUT_PERMISSIONS = 755
-# ARCHIVEBOX_BINARY = "/opt/archivebox/.venv/bin/archivebox"
-# VERSION = "0.8.0"
-# COMMIT_HASH = "102e87578c6036bb0132dd1ebd17f8f05ffc880f"
-# BUILD_TIME = "2024-05-15 03:28:05 1715768885"
-# VERSIONS_AVAILABLE = null
-# CAN_UPGRADE = false
-# PYTHON_BINARY = "/opt/archivebox/.venv/bin/python3.10"
-# PYTHON_ENCODING = "UTF-8"
-# PYTHON_VERSION = "3.10.14"
-# DJANGO_BINARY = "/opt/archivebox/.venv/lib/python3.10/site-packages/django/__init__.py"
-# DJANGO_VERSION = "5.0.6 final (0)"
-# SQLITE_BINARY = "/opt/homebrew/Cellar/[email protected]/3.10.14/Frameworks/Python.framework/Versions/3.10/lib/python3.10/sqlite3/dbapi2.py"
-# SQLITE_VERSION = "2.6.0"
-# CURL_VERSION = "curl 8.4.0 (x86_64-apple-darwin23.0)"
-# WGET_VERSION = "GNU Wget 1.24.5"
-# WGET_AUTO_COMPRESSION = true
-# RIPGREP_VERSION = "ripgrep 14.1.0"
-# SINGLEFILE_VERSION = null
-# READABILITY_VERSION = null
-# MERCURY_VERSION = null
-# GIT_VERSION = "git version 2.44.0"
-# YOUTUBEDL_VERSION = "2024.04.09"
-# CHROME_VERSION = "Google Chrome 124.0.6367.207"
-# NODE_VERSION = "v21.7.3"'''
-
-
-# first_output = convert(test_input)      # make sure ini -> toml parses correctly
-# second_output = convert(first_output)   # make sure toml -> toml parses/dumps consistently
-# assert first_output == second_output == expected_output  # make sure parsing is indempotent
-
-# # DEBUGGING
-# import sys
-# import difflib
-# sys.stdout.writelines(difflib.context_diff(first_output, second_output, fromfile='first', tofile='second'))
-# print(repr(second_output))

+ 9 - 31
archivebox/plugins_extractor/chrome/apps.py

@@ -1,3 +1,5 @@
+__package__ = 'archivebox.plugins_extractor.chrome'
+
 import platform
 from pathlib import Path
 from typing import List, Optional, Dict, ClassVar
@@ -77,40 +79,16 @@ def create_macos_app_symlink(target: Path, shortcut: Path):
 ###################### Config ##########################
 
 
-class ChromeDependencyConfigs(BaseConfigSet):
+class ChromeConfig(BaseConfigSet):
     section: ClassVar[ConfigSectionName] = "DEPENDENCY_CONFIG"
 
-    CHROME_BINARY: str = Field(default='chrome')
-    CHROME_ARGS: Optional[List[str]] = Field(default=None)
-    CHROME_EXTRA_ARGS: List[str] = []
-    CHROME_DEFAULT_ARGS: List[str] = ['--timeout={TIMEOUT-10}']
-    
-    # def load(self) -> Self:
-    #     # for each field in the model, load its value
-    #     # load from each source in order of precedence (lowest to highest):
-    #     # - schema default
-    #     # - ArchiveBox.conf INI file
-    #     # - environment variables
-    #     # - command-line arguments
-        
-    #     LOADED_VALUES: Dict[str, Any] = {}
-
-    #     for field_name, field in self.__fields__.items():
-    #         def_value   = field.default_factory() if field.default_factory else field.default
-    #         ini_value   = settings.INI_CONFIG.get_value(field_name)
-    #         env_value   = settings.ENV_CONFIG.get_value(field_name)
-    #         cli_value   = settings.CLI_CONFIG.get_value(field_name)
-    #         run_value   = settings.RUN_CONFIG.get_value(field_name)
-    #         value = run_value or cli_value or env_value or ini_value or def_value
-
-class ChromeConfigs(ChromeDependencyConfigs):
-    # section: ConfigSectionName = 'ALL_CONFIGS'
-    pass
+    CHROME_BINARY: str                      = Field(default='chrome')
+    CHROME_ARGS: List[str] | None           = Field(default=None)
+    CHROME_EXTRA_ARGS: List[str]            = Field(default=[])
+    CHROME_DEFAULT_ARGS: List[str]          = Field(default=lambda: ['--timeout={TIMEOUT-10}'])
 
-DEFAULT_GLOBAL_CONFIG = {
-}
 
-CHROME_CONFIG = ChromeConfigs(**DEFAULT_GLOBAL_CONFIG)
+CHROME_CONFIG = ChromeConfig()
 
 
 class ChromeBinary(BaseBinary):
@@ -133,6 +111,7 @@ class ChromeBinary(BaseBinary):
     def symlink_to_lib(binary, bin_dir=settings.CONFIG.BIN_DIR) -> None:
         if not (binary.abspath and binary.abspath.exists()):
             return
+        
         bin_dir.mkdir(parents=True, exist_ok=True)
         symlink = bin_dir / binary.name
         
@@ -146,7 +125,6 @@ class ChromeBinary(BaseBinary):
 
 CHROME_BINARY = ChromeBinary()
 
-PLUGIN_BINARIES = [CHROME_BINARY]
 
 class ChromePlugin(BasePlugin):
     app_label: str = 'chrome'

+ 1 - 0
archivebox/plugins_pkg/pip/apps.py

@@ -149,6 +149,7 @@ class CheckUserIsNotRoot(BaseCheck):
             )
         logger.debug('[√] UID is not root')
         return errors
+
     
 class CheckPipEnvironment(BaseCheck):
     label: str = "CheckPipEnvironment"

+ 1 - 1
archivebox/search/__init__.py

@@ -14,7 +14,7 @@ from .utils import get_indexable_content, log_index_started
 
 
 def import_backend():
-    for backend in settings.SEARCH_BACKENDS:
+    for backend in settings.SEARCH_BACKENDS.values():
         if backend.name == settings.CONFIGS.SearchBackendConfig.SEARCH_BACKEND_ENGINE:
             return backend
     raise Exception(f'Could not load {settings.CONFIGS.SearchBackendConfig.SEARCH_BACKEND_ENGINE} as search backend')