finish migrating almost all config to new system

Nick Sweeting, 1 year ago
parent commit d21bc86075

+ 0 - 37
archivebox/abx/archivebox/base_hook.py

@@ -13,43 +13,6 @@ HookType = Literal['CONFIG', 'BINPROVIDER', 'BINARY', 'EXTRACTOR', 'REPLAYER', '
 hook_type_names: Tuple[HookType] = get_args(HookType)
 
 class BaseHook(BaseModel):
-    """
-    A Plugin consists of a list of Hooks, applied to django.conf.settings when AppConfig.ready() -> Plugin.register() is called.
-    Plugin.register() then calls each Hook.register() on the provided settings.
-    each Hook.register() function (ideally pure) takes a django.conf.settings as input and returns a new one back.
-    or 
-    it modifies django.conf.settings in-place to add changes corresponding to its HookType.
-    e.g. for a HookType.CONFIG, the Hook.register() function places the hook in settings.CONFIG (and settings.HOOKS)
-    An example of an impure Hook would be a CHECK that modifies settings but also calls django.core.checks.register(check).
-    In practice any object that subclasses BaseHook and provides a .register() function can behave as a Hook.
-
-    setup_django() -> imports all settings.INSTALLED_APPS...
-        # django imports AppConfig, models, migrations, admins, etc. for all installed apps
-        # django then calls AppConfig.ready() on each installed app...
-
-        plugins_pkg.npm.NpmPlugin().AppConfig.ready()                    # called by django
-            plugins_pkg.npm.NpmPlugin().register(settings) ->
-                plugins_pkg.npm.NpmConfigSet().register(settings)
-                    abx.archivebox.base_configset.BaseConfigSet().register(settings)
-                        abx.archivebox.base_hook.BaseHook().register(settings, parent_plugin=plugins_pkg.npm.NpmPlugin())
-
-                ...
-        ...
-
-    Both core ArchiveBox code and plugin code depend on python >= 3.10 and django >= 5.0 w/ sqlite and a filesystem.
-    Core ArchiveBox code can depend only on python and the pip libraries it ships with, and can never depend on plugin code / node / other binaries.
-    Plugin code can depend on archivebox core, other django apps, other pip libraries, and other plugins.
-    Plugins can provide BinProviders + Binaries which can depend on arbitrary other binaries / package managers like curl / wget / yt-dlp / etc.
-
-    The execution interface between plugins is simply calling builtinplugins.npm.... functions directly; Django handles
-    importing all plugin code. There is no need to manually register methods/classes; registration is only needed to call
-    impure setup functions or to provide runtime state.
-    settings.CONFIGS / settings.BINPROVIDERS / settings.BINARIES /... etc. are reserved for dynamic runtime state only.
-    This state is exposed to the broader system in a flat namespace, e.g. CONFIG.IS_DOCKER=True, or BINARIES = [
-        ..., Binary('node', abspath='/usr/local/bin/node', version='22.2.0'), ...
-    ]
-
-    """
     model_config = ConfigDict(
         extra="allow",
         arbitrary_types_allowed=True,
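
The deleted docstring above described the hook registration flow in prose. A minimal sketch of that flow, using stand-in classes rather than the real pydantic-based abx classes (which also track HookTypes and mutate django.conf.settings):

```python
# Sketch only: FakeSettings/Hook/Plugin are stand-ins for django.conf.settings
# and the abx BaseHook/BasePlugin classes described in the removed docstring.
from typing import List

class FakeSettings:
    """Stand-in for django.conf.settings with the flat namespaces hooks write to."""
    def __init__(self):
        self.HOOKS = {}
        self.CONFIGS = {}

class Hook:
    hook_type = 'CONFIG'

    def __init__(self, name: str):
        self.name = name

    def register(self, settings: FakeSettings) -> None:
        # a CONFIG hook places itself in settings.CONFIGS (and settings.HOOKS)
        settings.HOOKS[self.name] = self
        settings.CONFIGS[self.name] = self

class Plugin:
    def __init__(self, hooks: List[Hook]):
        self.hooks = hooks

    def register(self, settings: FakeSettings) -> None:
        # called from AppConfig.ready(); delegates to each Hook.register()
        for hook in self.hooks:
            hook.register(settings)

settings = FakeSettings()
Plugin([Hook('NpmConfigSet')]).register(settings)
assert 'NpmConfigSet' in settings.CONFIGS
```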

+ 4 - 4
archivebox/api/v1_cli.py

@@ -13,7 +13,7 @@ from ..main import (
     schedule,
 )
 from archivebox.misc.util import ansi_to_html
-from ..config.legacy import ONLY_NEW
+from archivebox.config import ARCHIVING_CONFIG
 
 
 from .auth import API_AUTH_METHODS
@@ -58,7 +58,7 @@ class AddCommandSchema(Schema):
     urls: List[str]
     tag: str = ""
     depth: int = 0
-    update: bool = not ONLY_NEW  # Default to the opposite of ONLY_NEW
+    update: bool = not ARCHIVING_CONFIG.ONLY_NEW  # Default to the opposite of ARCHIVING_CONFIG.ONLY_NEW
     update_all: bool = False
     index_only: bool = False
     overwrite: bool = False
@@ -68,7 +68,7 @@ class AddCommandSchema(Schema):
 
 class UpdateCommandSchema(Schema):
     resume: Optional[float] = 0
-    only_new: bool = ONLY_NEW
+    only_new: bool = ARCHIVING_CONFIG.ONLY_NEW
     index_only: bool = False
     overwrite: bool = False
     after: Optional[float] = 0
@@ -85,7 +85,7 @@ class ScheduleCommandSchema(Schema):
     tag: str = ''
     depth: int = 0
     overwrite: bool = False
-    update: bool = not ONLY_NEW
+    update: bool = not ARCHIVING_CONFIG.ONLY_NEW
     clear: bool = False
 
 class ListCommandSchema(Schema):
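
Note that defaults like `update: bool = not ARCHIVING_CONFIG.ONLY_NEW` are evaluated once, when the Schema class body runs at import time, not per-request. A small illustration of this Python behavior (not ArchiveBox code):

```python
class FakeConfig:
    ONLY_NEW = True

class AddCommandSchema:
    update: bool = not FakeConfig.ONLY_NEW   # evaluated here -> False

FakeConfig.ONLY_NEW = False                  # a later runtime change...
print(AddCommandSchema.update)               # ...still prints False
```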

+ 2 - 5
archivebox/cli/__init__.py

@@ -152,18 +152,15 @@ def run_subcommand(subcommand: str,
     subcommand_args = subcommand_args or []
 
     if subcommand not in meta_cmds:
-        from ..config.legacy import setup_django, CONFIG
+        from archivebox.config.legacy import setup_django
 
         cmd_requires_db = subcommand in archive_cmds
         init_pending = '--init' in subcommand_args or '--quick-init' in subcommand_args
 
-        if cmd_requires_db:
-            check_data_folder(CONFIG)
-
         setup_django(in_memory_db=subcommand in fake_db, check_db=cmd_requires_db and not init_pending)
 
         if cmd_requires_db:
-            check_migrations(CONFIG)
+            check_migrations()
 
     module = import_module('.archivebox_{}'.format(subcommand), __package__)
     module.main(args=subcommand_args, stdin=stdin, pwd=pwd)    # type: ignore

+ 2 - 1
archivebox/config/__init__.py

@@ -1,6 +1,6 @@
 __package__ = 'archivebox.config'
 
-from .constants import CONSTANTS, PACKAGE_DIR, DATA_DIR, ARCHIVE_DIR, VERSION
+from .constants import CONSTANTS, CONSTANTS_CONFIG, PACKAGE_DIR, DATA_DIR, ARCHIVE_DIR, VERSION
 from .defaults import (
     SHELL_CONFIG,
     STORAGE_CONFIG,
@@ -23,4 +23,5 @@ __all__ = [
     'SERVER_CONFIG',
     'ARCHIVING_CONFIG',
     'SEARCH_BACKEND_CONFIG',
+    'CONSTANTS_CONFIG',
 ]
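
With CONSTANTS_CONFIG re-exported, call sites can import every config surface from one place. A short usage sketch (assuming CONSTANTS_CONFIG is mapping-like, as core/views.py below treats it):

```python
from archivebox.config import ARCHIVING_CONFIG, CONSTANTS_CONFIG

print(ARCHIVING_CONFIG.ONLY_NEW)                # attribute-style access on config sets

for key in list(CONSTANTS_CONFIG.keys())[:5]:   # dict-style access on constants
    print(key, '=', CONSTANTS_CONFIG[key])
```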

+ 27 - 42
archivebox/config/legacy.py

@@ -60,6 +60,7 @@ from .defaults import SHELL_CONFIG, GENERAL_CONFIG, ARCHIVING_CONFIG, SERVER_CON
 from archivebox.plugins_auth.ldap.apps import LDAP_CONFIG
 from archivebox.plugins_extractor.favicon.apps import FAVICON_CONFIG
 from archivebox.plugins_extractor.wget.apps import WGET_CONFIG
+from archivebox.plugins_extractor.curl.apps import CURL_CONFIG
 
 ANSI = SHELL_CONFIG.ANSI
 LDAP = LDAP_CONFIG.LDAP_ENABLED
@@ -81,9 +82,11 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
     
     'LDAP_CONFIG': LDAP_CONFIG.as_legacy_config_schema(),
     
-    'FAVICON_CONFIG': FAVICON_CONFIG.as_legacy_config_schema(),
+    # 'FAVICON_CONFIG': FAVICON_CONFIG.as_legacy_config_schema(),
     
-    'WGET_CONFIG': WGET_CONFIG.as_legacy_config_schema(),
+    # 'WGET_CONFIG': WGET_CONFIG.as_legacy_config_schema(),
+    
+    # 'CURL_CONFIG': CURL_CONFIG.as_legacy_config_schema(),
 
 
     'ARCHIVE_METHOD_TOGGLES': {
@@ -109,7 +112,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
 
     'ARCHIVE_METHOD_OPTIONS': {
         'RESOLUTION':               {'type': str,   'default': '1440,2000', 'aliases': ('SCREENSHOT_RESOLUTION','WINDOW_SIZE')},
-        'GIT_DOMAINS':              {'type': str,   'default': 'github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht'},
+        # 'GIT_DOMAINS':              {'type': str,   'default': 'github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht'},
         'CHECK_SSL_VALIDITY':       {'type': bool,  'default': True},
         'MEDIA_MAX_SIZE':           {'type': str,   'default': '750m'},
 
@@ -144,15 +147,6 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
                                                                 ]},
         'YOUTUBEDL_EXTRA_ARGS':     {'type': list,  'default': None},
 
-
-        'CURL_ARGS':                {'type': list,  'default': ['--silent',
-                                                                '--location',
-                                                                '--compressed'
-                                                               ]},
-        'CURL_EXTRA_ARGS':          {'type': list,  'default': None},
-        'GIT_ARGS':                 {'type': list,  'default': ['--recursive']},
-        'SINGLEFILE_ARGS':          {'type': list,  'default': None},
-        'SINGLEFILE_EXTRA_ARGS':    {'type': list,  'default': None},
     },
 
     'DEPENDENCY_CONFIG': {
@@ -164,9 +158,9 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
         'USE_YOUTUBEDL':            {'type': bool,  'default': True},
         'USE_RIPGREP':              {'type': bool,  'default': True},
 
-        'CURL_BINARY':              {'type': str,   'default': 'curl'},
-        'GIT_BINARY':               {'type': str,   'default': 'git'},
-        'NODE_BINARY':              {'type': str,   'default': 'node'},
+        # 'GIT_BINARY':               {'type': str,   'default': 'git'},
+        # 'CURL_BINARY':              {'type': str,   'default': 'curl'},
+        # 'NODE_BINARY':              {'type': str,   'default': 'node'},
         # 'YOUTUBEDL_BINARY':         {'type': str,   'default': 'yt-dlp'},   # also can accept youtube-dl
         # 'SINGLEFILE_BINARY':        {'type': str,   'default': lambda c: bin_path('single-file')},
         # 'READABILITY_BINARY':       {'type': str,   'default': lambda c: bin_path('readability-extractor')},
@@ -209,21 +203,12 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
     'URL_DENYLIST_PTN':         {'default': lambda c: c['URL_DENYLIST'] and re.compile(c['URL_DENYLIST'] or '', CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS)},
     'URL_ALLOWLIST_PTN':        {'default': lambda c: c['URL_ALLOWLIST'] and re.compile(c['URL_ALLOWLIST'] or '', CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS)},
 
+    # 'USE_GIT':                  {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
+    # 'GIT_VERSION':              {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None},
+    # 'SAVE_GIT':                 {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
 
-    'USE_CURL':                 {'default': lambda c: c['USE_CURL'] and (c['SAVE_FAVICON'] or c['SAVE_TITLE'] or c['SAVE_ARCHIVE_DOT_ORG'])},
-    'CURL_VERSION':             {'default': lambda c: bin_version(c['CURL_BINARY']) if c['USE_CURL'] else None},
-    # 'CURL_USER_AGENT':          {'default': lambda c: c['CURL_USER_AGENT'].format(**c)},
-    'CURL_ARGS':                {'default': lambda c: c['CURL_ARGS'] or []},
-    'CURL_EXTRA_ARGS':          {'default': lambda c: c['CURL_EXTRA_ARGS'] or []},
-    'SAVE_FAVICON':             {'default': lambda c: c['USE_CURL'] and c['SAVE_FAVICON']},
-    'SAVE_ARCHIVE_DOT_ORG':     {'default': lambda c: c['USE_CURL'] and c['SAVE_ARCHIVE_DOT_ORG']},
-
-    'USE_GIT':                  {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
-    'GIT_VERSION':              {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None},
-    'SAVE_GIT':                 {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
 
-
-    'DEPENDENCIES':             {'default': lambda c: get_dependency_info(c)},
+    # 'DEPENDENCIES':             {'default': lambda c: get_dependency_info(c)},
     # 'CODE_LOCATIONS':           {'default': lambda c: get_code_locations(c)},
     # 'DATA_LOCATIONS':           {'default': lambda c: get_data_locations(c)},
 
@@ -613,13 +598,13 @@ def get_dependency_info(config: benedict) -> ConfigValue:
         #     'is_valid': True,
         # },
         
-        'CURL_BINARY': {
-            'path': bin_path(config['CURL_BINARY']),
-            'version': config['CURL_VERSION'],
-            'hash': bin_hash(config['CURL_BINARY']),
-            'enabled': config['USE_CURL'],
-            'is_valid': bool(config['CURL_VERSION']),
-        },
+        # 'CURL_BINARY': {
+        #     'path': bin_path(config['CURL_BINARY']),
+        #     'version': config['CURL_VERSION'],
+        #     'hash': bin_hash(config['CURL_BINARY']),
+        #     'enabled': config['USE_CURL'],
+        #     'is_valid': bool(config['CURL_VERSION']),
+        # },
         # 'WGET_BINARY': {
         #     'path': bin_path(config['WGET_BINARY']),
         #     'version': config['WGET_VERSION'],
@@ -641,13 +626,13 @@ def get_dependency_info(config: benedict) -> ConfigValue:
         #     'enabled': config['USE_MERCURY'],
         #     'is_valid': bool(config['MERCURY_VERSION']),
         # },
-        'GIT_BINARY': {
-            'path': bin_path(config['GIT_BINARY']),
-            'version': config['GIT_VERSION'],
-            'hash': bin_hash(config['GIT_BINARY']),
-            'enabled': config['USE_GIT'],
-            'is_valid': bool(config['GIT_VERSION']),
-        },
+        # 'GIT_BINARY': {
+        #     'path': bin_path(config['GIT_BINARY']),
+        #     'version': config['GIT_VERSION'],
+        #     'hash': bin_hash(config['GIT_BINARY']),
+        #     'enabled': config['USE_GIT'],
+        #     'is_valid': bool(config['GIT_VERSION']),
+        # },
         # 'SINGLEFILE_BINARY': {
         #     'path': bin_path(config['SINGLEFILE_BINARY']),
         #     'version': config['SINGLEFILE_VERSION'],

+ 2 - 1
archivebox/config/views.py

@@ -76,7 +76,7 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
 
     relevant_configs = {
         key: val
-        for key, val in settings.CONFIG.items()
+        for key, val in settings.FLAT_CONFIG.items()
         if '_BINARY' in key or '_VERSION' in key
     }
 
@@ -105,6 +105,7 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
                 f'<a href="/admin/environment/config/{config_key}/">{config_key}</a>'
                 for config_key, config_value in relevant_configs.items()
                     if str(binary.name).lower().replace('-', '').replace('_', '').replace('ytdlp', 'youtubedl') in config_key.lower()
+                    or str(config_value).lower().endswith(str(binary.name).lower())
                     # or binary.name.lower().replace('-', '').replace('_', '') in str(config_value).lower()
             )))
             # if not binary.provider_overrides:
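
The matching rule added above, restated as a standalone function (the `str(...)` guard assumed here protects against non-string config values such as parsed version objects):

```python
def config_matches_binary(config_key: str, config_value, binary_name: str) -> bool:
    """True if a config key/value is relevant to the given binary."""
    normalized = binary_name.lower().replace('-', '').replace('_', '').replace('ytdlp', 'youtubedl')
    return (
        normalized in config_key.lower()
        or str(config_value).lower().endswith(binary_name.lower())
    )

assert config_matches_binary('CURL_BINARY', 'curl', 'curl')               # name in key
assert config_matches_binary('WGET_VERSION', '1.21.4', 'wget')            # name in key
assert config_matches_binary('SOME_TOOL', '/usr/local/bin/node', 'node')  # abspath ends with name
```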

+ 1 - 1
archivebox/core/admin.py

@@ -36,7 +36,7 @@ from main import remove
 from extractors import archive_links
 
 
-CONFIG = settings.CONFIG
+CONFIG = settings.FLAT_CONFIG
 
 GLOBAL_CONTEXT = {'VERSION': VERSION, 'VERSIONS_AVAILABLE': [], 'CAN_UPGRADE': False}
 

+ 2 - 4
archivebox/core/auth.py

@@ -1,13 +1,11 @@
 __package__ = 'archivebox.core'
 
 
-from ..config.legacy import (
-    LDAP
-)
+from archivebox.plugins_auth.ldap.apps import LDAP_CONFIG
 
 def register_signals():
 
-    if LDAP:
+    if LDAP_CONFIG.LDAP_ENABLED:
         import django_auth_ldap.backend
         from .auth_ldap import create_user
 

+ 2 - 4
archivebox/core/auth_ldap.py

@@ -1,9 +1,7 @@
-from ..config.legacy import (
-    LDAP_CREATE_SUPERUSER
-)
+from archivebox.plugins_auth.ldap.apps import LDAP_CONFIG
 
 def create_user(sender, user=None, ldap_user=None, **kwargs):
-    if not user.id and LDAP_CREATE_SUPERUSER:
+    if not user.id and LDAP_CONFIG.LDAP_CREATE_SUPERUSER:
         user.is_superuser = True
 
     user.is_staff = True

+ 5 - 5
archivebox/core/middleware.py

@@ -5,7 +5,7 @@ from django.utils import timezone
 from django.contrib.auth.middleware import RemoteUserMiddleware
 from django.core.exceptions import ImproperlyConfigured
 
-from ..config.legacy import PUBLIC_SNAPSHOTS, REVERSE_PROXY_USER_HEADER, REVERSE_PROXY_WHITELIST
+from archivebox.config import SERVER_CONFIG
 
 
 def detect_timezone(request, activate: bool=True):
@@ -32,7 +32,7 @@ def CacheControlMiddleware(get_response):
         response = get_response(request)
 
         if '/archive/' in request.path or '/static/' in request.path:
-            policy = 'public' if PUBLIC_SNAPSHOTS else 'private'
+            policy = 'public' if SERVER_CONFIG.PUBLIC_SNAPSHOTS else 'private'
             response['Cache-Control'] = f'{policy}, max-age=60, stale-while-revalidate=300'
             # print('Set Cache-Control header to', response['Cache-Control'])
         return response
@@ -40,15 +40,15 @@ def CacheControlMiddleware(get_response):
     return middleware
 
 class ReverseProxyAuthMiddleware(RemoteUserMiddleware):
-    header = 'HTTP_{normalized}'.format(normalized=REVERSE_PROXY_USER_HEADER.replace('-', '_').upper())
+    header = 'HTTP_{normalized}'.format(normalized=SERVER_CONFIG.REVERSE_PROXY_USER_HEADER.replace('-', '_').upper())
 
     def process_request(self, request):
-        if REVERSE_PROXY_WHITELIST == '':
+        if SERVER_CONFIG.REVERSE_PROXY_WHITELIST == '':
             return
 
         ip = request.META.get('REMOTE_ADDR')
 
-        for cidr in REVERSE_PROXY_WHITELIST.split(','):
+        for cidr in SERVER_CONFIG.REVERSE_PROXY_WHITELIST.split(','):
             try:
                 network = ipaddress.ip_network(cidr)
             except ValueError:
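
The whitelist check the middleware performs, sketched standalone with only the stdlib (SERVER_CONFIG.REVERSE_PROXY_WHITELIST is a comma-separated list of CIDR ranges; the empty string disables reverse-proxy auth entirely):

```python
import ipaddress

def ip_in_whitelist(remote_addr: str, whitelist: str) -> bool:
    for cidr in whitelist.split(','):
        try:
            network = ipaddress.ip_network(cidr)
        except ValueError:
            continue      # skip malformed CIDR entries
        if ipaddress.ip_address(remote_addr) in network:
            return True
    return False

assert ip_in_whitelist('10.0.0.5', '10.0.0.0/8,192.168.0.0/16')
assert not ip_in_whitelist('8.8.8.8', '10.0.0.0/8')
```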

+ 5 - 7
archivebox/core/settings.py

@@ -13,9 +13,7 @@ import abx.archivebox
 import abx.archivebox.use
 import abx.django.use
 
-from archivebox.config import VERSION, DATA_DIR, PACKAGE_DIR, ARCHIVE_DIR, CONSTANTS      # noqa
-
-from ..config.legacy import CONFIG
+from archivebox.config import VERSION, DATA_DIR, PACKAGE_DIR, ARCHIVE_DIR, CONSTANTS, SHELL_CONFIG, SERVER_CONFIG      # noqa
 
 IS_MIGRATING = 'makemigrations' in sys.argv[:3] or 'migrate' in sys.argv[:3]
 IS_TESTING = 'test' in sys.argv[:3] or 'PYTEST_CURRENT_TEST' in os.environ
@@ -80,7 +78,7 @@ LOGOUT_REDIRECT_URL = os.environ.get('LOGOUT_REDIRECT_URL', '/')
 PASSWORD_RESET_URL = '/accounts/password_reset/'
 APPEND_SLASH = True
 
-DEBUG = CONFIG.DEBUG or ('--debug' in sys.argv)
+DEBUG = SHELL_CONFIG.DEBUG or ('--debug' in sys.argv)
 
 
 INSTALLED_APPS = [
@@ -364,10 +362,10 @@ STORAGES = {
 ### Security Settings
 ################################################################################
 
-SECRET_KEY = CONFIG.SECRET_KEY or get_random_string(50, 'abcdefghijklmnopqrstuvwxyz0123456789_')
+SECRET_KEY = SERVER_CONFIG.SECRET_KEY or get_random_string(50, 'abcdefghijklmnopqrstuvwxyz0123456789_')
 
-ALLOWED_HOSTS = CONFIG.ALLOWED_HOSTS.split(',')
-CSRF_TRUSTED_ORIGINS = list(set(CONFIG.CSRF_TRUSTED_ORIGINS.split(',')))
+ALLOWED_HOSTS = SERVER_CONFIG.ALLOWED_HOSTS.split(',')
+CSRF_TRUSTED_ORIGINS = list(set(SERVER_CONFIG.CSRF_TRUSTED_ORIGINS.split(',')))
 
 # automatically fix case when user sets ALLOWED_HOSTS (e.g. to archivebox.example.com)
 # but forgets to add https://archivebox.example.com to CSRF_TRUSTED_ORIGINS
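
The auto-fix mentioned in this comment sits outside the hunk; a plausible sketch of it (an assumption, not the actual ArchiveBox code) merges https:// origins derived from ALLOWED_HOSTS into CSRF_TRUSTED_ORIGINS:

```python
ALLOWED_HOSTS = ['archivebox.example.com', 'localhost']
CSRF_TRUSTED_ORIGINS = ['http://localhost:8000']

for host in ALLOWED_HOSTS:
    if host and host != '*':
        origin = f'https://{host}'
        if origin not in CSRF_TRUSTED_ORIGINS:
            CSRF_TRUSTED_ORIGINS.append(origin)

# ['http://localhost:8000', 'https://archivebox.example.com', 'https://localhost']
print(CSRF_TRUSTED_ORIGINS)
```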

+ 1 - 1
archivebox/core/urls.py

@@ -10,7 +10,7 @@ from .views import HomepageView, SnapshotView, PublicIndexView, AddView, HealthC
 from .serve_static import serve_static
 
 # GLOBAL_CONTEXT doesn't work as-is, disabled for now: https://github.com/ArchiveBox/ArchiveBox/discussions/1306
-# from .config.legacy import VERSION, VERSIONS_AVAILABLE, CAN_UPGRADE
+# from archivebox.config import VERSION, VERSIONS_AVAILABLE, CAN_UPGRADE
 # GLOBAL_CONTEXT = {'VERSION': VERSION, 'VERSIONS_AVAILABLE': VERSIONS_AVAILABLE, 'CAN_UPGRADE': CAN_UPGRADE}
 
 

+ 55 - 57
archivebox/core/views.py

@@ -1,7 +1,7 @@
 __package__ = 'archivebox.core'
 
-from typing import Callable
-from benedict import benedict
+import inspect
+from typing import Callable, get_type_hints
 from pathlib import Path
 
 from django.shortcuts import render, redirect
@@ -27,21 +27,13 @@ from core.admin import result_url
 
 from queues.tasks import bg_add
 
-from archivebox.config import CONSTANTS, DATA_DIR, VERSION, SHELL_CONFIG, SERVER_CONFIG
-from ..plugins_extractor.archivedotorg.apps import ARCHIVEDOTORG_CONFIG
+from archivebox.config import CONSTANTS_CONFIG, DATA_DIR, VERSION, SHELL_CONFIG, SERVER_CONFIG
+from archivebox.misc.util import base_url, htmlencode, ts_to_date_str
 
-from ..config.legacy import (
-    CONFIG_SCHEMA,
-    DYNAMIC_CONFIG_SCHEMA,
-    USER_CONFIG,
-    CONFIG,
-)
+from .serve_static import serve_static_with_byterange_support
+from ..plugins_extractor.archivedotorg.apps import ARCHIVEDOTORG_CONFIG
 from ..logging_util import printable_filesize
-from archivebox.misc.util import base_url, htmlencode, ts_to_date_str
 from ..search import query_search_index
-from .serve_static import serve_static_with_byterange_support
-
-CONFIG = benedict({**CONSTANTS, **CONFIG, **settings.FLAT_CONFIG})
 
 
 class HomepageView(View):
@@ -502,27 +494,43 @@ class HealthCheckView(View):
 
 
 def find_config_section(key: str) -> str:
-    if key in CONSTANTS:
+    if key in CONSTANTS_CONFIG:
         return 'CONSTANT'
     matching_sections = [
-        name for name, opts in CONFIG_SCHEMA.items() if key in opts
+        section.id for section in settings.CONFIGS.values() if key in section.model_fields
     ]
     section = matching_sections[0] if matching_sections else 'DYNAMIC'
     return section
 
 def find_config_default(key: str) -> str:
-    default_val = USER_CONFIG.get(key, {}).get('default', lambda: None)
+    if key in CONSTANTS_CONFIG:
+        return str(CONSTANTS_CONFIG[key])
+    
+    default_val = None
+
+    for config in settings.CONFIGS.values():
+        if key in config.model_fields:
+            default_val = config.model_fields[key].default
+            break
+        
     if isinstance(default_val, Callable):
-        return None
+        default_val = inspect.getsource(default_val).split('lambda', 1)[-1].split(':', 1)[-1].replace('\n', ' ').strip()
+        if default_val.count(')') > default_val.count('('):
+            default_val = default_val[:-1]
     else:
-        default_val = repr(default_val)
+        default_val = str(default_val)
+        
+        
     return default_val
 
 def find_config_type(key: str) -> str:
-    if key in USER_CONFIG:
-        return str(USER_CONFIG[key]['type'])
-    elif key in DYNAMIC_CONFIG_SCHEMA:
-        return str(type(CONFIG[key]))
+    for config in settings.CONFIGS.values():
+        if hasattr(config, key):
+            type_hints = get_type_hints(config)
+            try:
+                return str(type_hints[key].__name__)
+            except AttributeError:
+                return str(type_hints[key])
     return 'str'
 
 def key_is_safe(key: str) -> bool:
@@ -543,40 +551,29 @@ def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext:
         "Value": [],
         "Default": [],
         # "Documentation": [],
-        "Aliases": [],
+        # "Aliases": [],
     }
 
-    for section in CONFIG_SCHEMA.keys():
-        for key in CONFIG_SCHEMA[section].keys():
-            rows['Section'].append(section)   # section.replace('_', ' ').title().replace(' Config', '')
+    for section in reversed(list(settings.CONFIGS.values())):
+        for key, field in section.model_fields.items():
+            rows['Section'].append(section.id)   # section.replace('_', ' ').title().replace(' Config', '')
             rows['Key'].append(ItemLink(key, key=key))
-            rows['Type'].append(mark_safe(f'<code>{find_config_type(key)}</code>'))
-            rows['Value'].append(mark_safe(f'<code>{CONFIG[key]}</code>') if key_is_safe(key) else '******** (redacted)')
-            rows['Default'].append(mark_safe(f'<a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig.py+{key}&type=code"><code style="text-decoration: underline">{find_config_default(key) or "See here..."}</code></a>'))
+            rows['Type'].append(format_html('<code>{}</code>', find_config_type(key)))
+            rows['Value'].append(mark_safe(f'<code>{getattr(section, key)}</code>') if key_is_safe(key) else '******** (redacted)')
+            rows['Default'].append(mark_safe(f'<a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig+{key}&type=code"><code style="text-decoration: underline">{find_config_default(key) or "See here..."}</code></a>'))
             # rows['Documentation'].append(mark_safe(f'Wiki: <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#{key.lower()}">{key}</a>'))
-            rows['Aliases'].append(', '.join(CONFIG_SCHEMA[section][key].get('aliases', [])))
-
-    section = 'DYNAMIC'
-    for key in DYNAMIC_CONFIG_SCHEMA.keys():
-        if key in CONSTANTS:
-            continue
-        rows['Section'].append(section)   # section.replace('_', ' ').title().replace(' Config', '')
-        rows['Key'].append(ItemLink(key, key=key))
-        rows['Type'].append(mark_safe(f'<code>{find_config_type(key)}</code>'))
-        rows['Value'].append(mark_safe(f'<code>{CONFIG[key]}</code>') if key_is_safe(key) else '******** (redacted)')
-        rows['Default'].append(mark_safe(f'<a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig.py+{key}&type=code"><code style="text-decoration: underline">{find_config_default(key) or "See here..."}</code></a>'))
-        # rows['Documentation'].append(mark_safe(f'Wiki: <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#{key.lower()}">{key}</a>'))
-        rows['Aliases'].append(ItemLink(key, key=key) if key in USER_CONFIG else '')
+            # rows['Aliases'].append(', '.join(find_config_aliases(key)))
 
+   
     section = 'CONSTANT'
-    for key in CONSTANTS.keys():
+    for key in CONSTANTS_CONFIG.keys():
         rows['Section'].append(section)   # section.replace('_', ' ').title().replace(' Config', '')
         rows['Key'].append(ItemLink(key, key=key))
-        rows['Type'].append(mark_safe(f'<code>{find_config_type(key)}</code>'))
-        rows['Value'].append(mark_safe(f'<code>{CONFIG[key]}</code>') if key_is_safe(key) else '******** (redacted)')
-        rows['Default'].append(mark_safe(f'<a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig.py+{key}&type=code"><code style="text-decoration: underline">{find_config_default(key) or "See here..."}</code></a>'))
+        rows['Type'].append(format_html('<code>{}</code>', getattr(type(CONSTANTS_CONFIG[key]), '__name__', repr(CONSTANTS_CONFIG[key]))))
+        rows['Value'].append(format_html('<code>{}</code>', CONSTANTS_CONFIG[key]) if key_is_safe(key) else '******** (redacted)')
+        rows['Default'].append(mark_safe(f'<a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig+{key}&type=code"><code style="text-decoration: underline">{find_config_default(key) or "See here..."}</code></a>'))
         # rows['Documentation'].append(mark_safe(f'Wiki: <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#{key.lower()}">{key}</a>'))
-        rows['Aliases'].append(ItemLink(key, key=key) if key in USER_CONFIG else '')
+        # rows['Aliases'].append('')
 
 
     return TableContext(
@@ -589,11 +586,12 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont
 
     assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
 
-    aliases = USER_CONFIG.get(key, {}).get("aliases", [])
+    # aliases = USER_CONFIG.get(key, {}).get("aliases", [])
+    aliases = []
 
-    if key in CONSTANTS:
+    if key in CONSTANTS_CONFIG:
         section_header = mark_safe(f'[CONSTANTS]   &nbsp; <b><code style="color: lightgray">{key}</code></b> &nbsp; <small>(read-only, hardcoded by ArchiveBox)</small>')
-    elif key in USER_CONFIG:
+    elif key in settings.FLAT_CONFIG:
         section_header = mark_safe(f'data / ArchiveBox.conf &nbsp; [{find_config_section(key)}]  &nbsp; <b><code style="color: lightgray">{key}</code></b>')
     else:
         section_header = mark_safe(f'[DYNAMIC CONFIG]   &nbsp; <b><code style="color: lightgray">{key}</code></b> &nbsp; <small>(read-only, calculated at runtime)</small>')
@@ -609,7 +607,7 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont
                 "fields": {
                     'Key': key,
                     'Type': find_config_type(key),
-                    'Value': CONFIG[key] if key_is_safe(key) else '********',
+                    'Value': settings.FLAT_CONFIG[key] if key_is_safe(key) else '********',
                 },
                 "help_texts": {
                     'Key': mark_safe(f'''
@@ -619,25 +617,25 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont
                         </span>
                     '''),
                     'Type': mark_safe(f'''
-                        <a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig.py+{key}&type=code">
-                            See full definition in <code>archivebox/config.py</code>...
+                        <a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig+{key}&type=code">
+                            See full definition in <code>archivebox/config</code>...
                         </a>
                     '''),
                     'Value': mark_safe(f'''
                         {'<b style="color: red">Value is redacted for your security. (Passwords, secrets, API tokens, etc. cannot be viewed in the Web UI)</b><br/><br/>' if not key_is_safe(key) else ''}
                         <br/><hr/><br/>
                         Default: &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; 
-                        <a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig.py+{key}&type=code">
+                        <a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig+{key}&type=code">
                             <code>{find_config_default(key) or '↗️ See in ArchiveBox source code...'}</code>
                         </a>
                         <br/><br/>
-                        <p style="display: {"block" if key in USER_CONFIG else "none"}">
+                        <p style="display: {"block" if key in settings.FLAT_CONFIG else "none"}">
                             <i>To change this value, edit <code>data/ArchiveBox.conf</code> or run:</i>
                             <br/><br/>
                             <code>archivebox config --set {key}="{
                                 val.strip("'")
                                 if (val := find_config_default(key)) else
-                                (repr(CONFIG[key] if key_is_safe(key) else '********')).strip("'")
+                                (repr(settings.FLAT_CONFIG[key] if key_is_safe(key) else '********')).strip("'")
                             }"</code>
                         </p>
                     '''),
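
The lambda-source trick used in find_config_default() above, demonstrated standalone: inspect.getsource() returns the defining line, and everything after `lambda ...:` is kept as a human-readable default expression (this requires the source file to be available, so it works for installed code but not in a REPL):

```python
import inspect

field_default = lambda c: c['USE_CURL'] and c['SAVE_FAVICON']

src = inspect.getsource(field_default)
expr = src.split('lambda', 1)[-1].split(':', 1)[-1].replace('\n', ' ').strip()
if expr.count(')') > expr.count('('):
    expr = expr[:-1]   # drop a trailing ')' leaked from an enclosing Field(...) call

print(expr)   # c['USE_CURL'] and c['SAVE_FAVICON']
```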

+ 18 - 26
archivebox/extractors/archive_org.py

@@ -7,21 +7,10 @@ from collections import defaultdict
 
 from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
 from archivebox.misc.system import run, chmod_file
-from archivebox.misc.util import (
-    enforce_types,
-    is_static_file,
-    dedupe,
-)
-from ..config.legacy import (
-    TIMEOUT,
-    CURL_ARGS,
-    CURL_EXTRA_ARGS,
-    CHECK_SSL_VALIDITY,
-    SAVE_ARCHIVE_DOT_ORG,
-    CURL_BINARY,
-    CURL_VERSION,
-    CURL_USER_AGENT,
-)
+from archivebox.misc.util import enforce_types, is_static_file, dedupe
+from archivebox.plugins_extractor.archivedotorg.apps import ARCHIVEDOTORG_CONFIG
+from archivebox.plugins_extractor.curl.apps import CURL_CONFIG, CURL_BINARY
+
 from ..logging_util import TimedProgress
 
 
@@ -39,27 +28,30 @@ def should_save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, overwr
         # if open(path, 'r', encoding='utf-8').read().strip() != 'None':
         return False
 
-    return SAVE_ARCHIVE_DOT_ORG
+    return ARCHIVEDOTORG_CONFIG.SAVE_ARCHIVE_DOT_ORG
 
 @enforce_types
-def save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+def save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, timeout: int=CURL_CONFIG.CURL_TIMEOUT) -> ArchiveResult:
     """submit site to archive.org for archiving via their service, save returned archive url"""
 
+    curl_binary = CURL_BINARY.load()
+    assert curl_binary.abspath and curl_binary.version
+
     out_dir = out_dir or Path(link.link_dir)
     output: ArchiveOutput = get_output_path()
     archive_org_url = None
     submit_url = 'https://web.archive.org/save/{}'.format(link.url)
     # later options take precedence
     options = [
-        *CURL_ARGS,
-        *CURL_EXTRA_ARGS,
+        *CURL_CONFIG.CURL_ARGS,
+        *CURL_CONFIG.CURL_EXTRA_ARGS,
         '--head',
         '--max-time', str(timeout),
-        *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
-        *([] if CHECK_SSL_VALIDITY else ['--insecure']),
+        *(['--user-agent', '{}'.format(CURL_CONFIG.CURL_USER_AGENT)] if CURL_CONFIG.CURL_USER_AGENT else []),
+        *([] if CURL_CONFIG.CURL_CHECK_SSL_VALIDITY else ['--insecure']),
     ]
     cmd = [
-        CURL_BINARY,
+        str(curl_binary.abspath),
         *dedupe(options),
         submit_url,
     ]
@@ -97,22 +89,22 @@ def save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, timeout: int=
     return ArchiveResult(
         cmd=cmd,
         pwd=str(out_dir),
-        cmd_version=CURL_VERSION,
+        cmd_version=str(curl_binary.version),
         output=output,
         status=status,
         **timer.stats,
     )
 
 @enforce_types
-def parse_archive_dot_org_response(response: bytes) -> Tuple[List[str], List[str]]:
+def parse_archive_dot_org_response(response: str) -> Tuple[List[str], List[str]]:
     # Parse archive.org response headers
     headers: Dict[str, List[str]] = defaultdict(list)
 
     # lowercase all the header names and store in dict
     for header in response.splitlines():
-        if b':' not in header or not header.strip():
+        if ':' not in header or not header.strip():
             continue
-        name, val = header.decode().split(':', 1)
+        name, val = header.split(':', 1)
         headers[name.lower().strip()].append(val.strip())
 
     # Get successful archive url in "content-location" header or any errors
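
The header parsing above now operates on str instead of bytes (the run() wrapper presumably returns decoded text on this code path). The same logic, sketched standalone:

```python
from collections import defaultdict
from typing import Dict, List

def parse_headers(response: str) -> Dict[str, List[str]]:
    headers: Dict[str, List[str]] = defaultdict(list)
    for header in response.splitlines():
        if ':' not in header or not header.strip():
            continue
        name, val = header.split(':', 1)          # split on the first colon only
        headers[name.lower().strip()].append(val.strip())
    return headers

raw = 'HTTP/2 200\ncontent-location: /web/20240101000000/https://example.com'
print(parse_headers(raw)['content-location'])
# ['/web/20240101000000/https://example.com']
```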

+ 16 - 18
archivebox/extractors/favicon.py

@@ -2,16 +2,11 @@ __package__ = 'archivebox.extractors'
 
 from pathlib import Path
 
-from typing import Optional
-
-from ..index.schema import Link, ArchiveResult, ArchiveOutput
 from archivebox.misc.system import chmod_file, run
-from archivebox.misc.util import (
-    enforce_types,
-    domain,
-    dedupe,
-)
-from ..config.legacy import CONFIG
+from archivebox.misc.util import enforce_types, domain, dedupe
+from archivebox.plugins_extractor.favicon.apps import FAVICON_CONFIG
+from archivebox.plugins_extractor.curl.apps import CURL_CONFIG, CURL_BINARY
+from ..index.schema import Link, ArchiveResult, ArchiveOutput
 from ..logging_util import TimedProgress
 
 
@@ -22,7 +17,7 @@ def should_save_favicon(link: Link, out_dir: str | Path | None=None, overwrite:
     if not overwrite and (out_dir / 'favicon.ico').exists():
         return False
 
-    return CONFIG.SAVE_FAVICON
+    return FAVICON_CONFIG.SAVE_FAVICON
 
 @enforce_types
 def get_output_path():
@@ -30,26 +25,29 @@ def get_output_path():
 
 
 @enforce_types
-def save_favicon(link: Link, out_dir: str | Path | None=None, timeout: int=CONFIG.TIMEOUT) -> ArchiveResult:
+def save_favicon(link: Link, out_dir: str | Path | None=None, timeout: int=CURL_CONFIG.CURL_TIMEOUT) -> ArchiveResult:
     """download site favicon from google's favicon api"""
 
+    curl_binary = CURL_BINARY.load()
+    assert curl_binary.abspath and curl_binary.version
+
     out_dir = Path(out_dir or link.link_dir)
     assert out_dir.exists()
 
     output: ArchiveOutput = 'favicon.ico'
     # later options take precedence
     options = [
-        *CONFIG.CURL_ARGS,
-        *CONFIG.CURL_EXTRA_ARGS,
+        *CURL_CONFIG.CURL_ARGS,
+        *CURL_CONFIG.CURL_EXTRA_ARGS,
         '--max-time', str(timeout),
         '--output', str(output),
-        *(['--user-agent', '{}'.format(CONFIG.CURL_USER_AGENT)] if CONFIG.CURL_USER_AGENT else []),
-        *([] if CONFIG.CHECK_SSL_VALIDITY else ['--insecure']),
+        *(['--user-agent', '{}'.format(CURL_CONFIG.CURL_USER_AGENT)] if CURL_CONFIG.CURL_USER_AGENT else []),
+        *([] if CURL_CONFIG.CURL_CHECK_SSL_VALIDITY else ['--insecure']),
     ]
     cmd = [
-        CONFIG.CURL_BINARY,
+        str(curl_binary.abspath),
         *dedupe(options),
-        CONFIG.FAVICON_PROVIDER.format(domain(link.url)),
+        FAVICON_CONFIG.FAVICON_PROVIDER.format(domain(link.url)),
     ]
     status = 'failed'
     timer = TimedProgress(timeout, prefix='      ')
@@ -65,7 +63,7 @@ def save_favicon(link: Link, out_dir: str | Path | None=None, timeout: int=CONFI
     return ArchiveResult(
         cmd=cmd,
         pwd=str(out_dir),
-        cmd_version=CONFIG.CURL_VERSION,
+        cmd_version=str(curl_binary.version),
         output=output,
         status=status,
         **timer.stats,

+ 12 - 9
archivebox/extractors/git.py

@@ -4,7 +4,6 @@ __package__ = 'archivebox.extractors'
 from pathlib import Path
 from typing import Optional
 
-from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
 from archivebox.misc.system import run, chmod_file
 from archivebox.misc.util import (
     enforce_types,
@@ -14,8 +13,9 @@ from archivebox.misc.util import (
     without_query,
     without_fragment,
 )
-from ..config.legacy import CONFIG
+from archivebox.plugins_extractor.git.apps import GIT_CONFIG, GIT_BINARY
 from ..logging_util import TimedProgress
+from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
 
 
 def get_output_path():
@@ -42,28 +42,31 @@ def should_save_git(link: Link, out_dir: Optional[Path]=None, overwrite: Optiona
         return False
 
     is_clonable_url = (
-        (domain(link.url) in CONFIG.GIT_DOMAINS)
+        (domain(link.url) in GIT_CONFIG.GIT_DOMAINS)
         or (extension(link.url) == 'git')
     )
     if not is_clonable_url:
         return False
 
-    return CONFIG.SAVE_GIT
+    return GIT_CONFIG.SAVE_GIT
 
 
 @enforce_types
-def save_git(link: Link, out_dir: Optional[Path]=None, timeout: int=CONFIG.TIMEOUT) -> ArchiveResult:
+def save_git(link: Link, out_dir: Optional[Path]=None, timeout: int=GIT_CONFIG.GIT_TIMEOUT) -> ArchiveResult:
     """download full site using git"""
+    
+    git_binary = GIT_BINARY.load()
+    assert git_binary.abspath and git_binary.version
 
     out_dir = out_dir or Path(link.link_dir)
     output: ArchiveOutput = get_output_path()
     output_path = out_dir / output
     output_path.mkdir(exist_ok=True)
     cmd = [
-        CONFIG.GIT_BINARY,
+        str(git_binary.abspath),
         'clone',
-        *CONFIG.GIT_ARGS,
-        *([] if CONFIG.CHECK_SSL_VALIDITY else ['-c', 'http.sslVerify=false']),
+        *GIT_CONFIG.GIT_ARGS,
+        *([] if GIT_CONFIG.GIT_CHECK_SSL_VALIDITY else ['-c', 'http.sslVerify=false']),
         without_query(without_fragment(link.url)),
     ]
     status = 'succeeded'
@@ -88,7 +91,7 @@ def save_git(link: Link, out_dir: Optional[Path]=None, timeout: int=CONFIG.TIMEO
     return ArchiveResult(
         cmd=cmd,
         pwd=str(out_dir),
-        cmd_version=CONFIG.GIT_VERSION,
+        cmd_version=str(git_binary.version),
         output=output,
         status=status,
         **timer.stats,

+ 20 - 25
archivebox/extractors/headers.py

@@ -4,23 +4,14 @@ from pathlib import Path
 
 from typing import Optional
 
-from ..index.schema import Link, ArchiveResult, ArchiveOutput
 from archivebox.misc.system import atomic_write
 from archivebox.misc.util import (
     enforce_types,
     get_headers,
     dedupe,
 )
-from ..config.legacy import (
-    TIMEOUT,
-    CURL_BINARY,
-    CURL_ARGS,
-    CURL_EXTRA_ARGS,
-    CURL_USER_AGENT,
-    CURL_VERSION,
-    CHECK_SSL_VALIDITY,
-    SAVE_HEADERS
-)
+from archivebox.plugins_extractor.curl.apps import CURL_CONFIG, CURL_BINARY
+from ..index.schema import Link, ArchiveResult, ArchiveOutput
 from ..logging_util import TimedProgress
 
 def get_output_path():
@@ -29,34 +20,38 @@ def get_output_path():
 
 @enforce_types
 def should_save_headers(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
-    out_dir = out_dir or Path(link.link_dir)
-    if not overwrite and (out_dir / get_output_path()).exists():
+    out_dir_path = Path(out_dir or link.link_dir)
+    assert out_dir_path
+    if not overwrite and (out_dir_path / get_output_path()).exists():
         return False
 
-    return SAVE_HEADERS
+    return CURL_CONFIG.SAVE_HEADERS
 
 
 @enforce_types
-def save_headers(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+def save_headers(link: Link, out_dir: Optional[str]=None, timeout: int=CURL_CONFIG.CURL_TIMEOUT) -> ArchiveResult:
     """Download site headers"""
 
-    out_dir = Path(out_dir or link.link_dir)
-    output_folder = out_dir.absolute()
+    curl_binary = CURL_BINARY.load()
+    assert curl_binary.abspath and curl_binary.version
+
+    out_dir_path = Path(out_dir or link.link_dir)
+    output_folder = out_dir_path.absolute()
     output: ArchiveOutput = get_output_path()
 
     status = 'succeeded'
-    timer = TimedProgress(timeout, prefix='      ')
+    timer = TimedProgress(timeout + 1, prefix='      ')
     # later options take precedence
     options = [
-        *CURL_ARGS,
-        *CURL_EXTRA_ARGS,
+        *CURL_CONFIG.CURL_ARGS,
+        *CURL_CONFIG.CURL_EXTRA_ARGS,
         '--head',
         '--max-time', str(timeout),
-        *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
-        *([] if CHECK_SSL_VALIDITY else ['--insecure']),
+        *(['--user-agent', '{}'.format(CURL_CONFIG.CURL_USER_AGENT)] if CURL_CONFIG.CURL_USER_AGENT else []),
+        *([] if CURL_CONFIG.CURL_CHECK_SSL_VALIDITY else ['--insecure']),
     ]
     cmd = [
-        CURL_BINARY,
+        str(curl_binary.abspath),
         *dedupe(options),
         link.url,
     ]
@@ -72,8 +67,8 @@ def save_headers(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT)
 
     return ArchiveResult(
         cmd=cmd,
-        pwd=str(out_dir),
-        cmd_version=CURL_VERSION,
+        pwd=str(out_dir_path),
+        cmd_version=str(curl_binary.version),
         output=output,
         status=status,
         **timer.stats,

+ 7 - 12
archivebox/extractors/htmltotext.py

@@ -5,18 +5,13 @@ import io
 from pathlib import Path
 from typing import Optional
 
-from archivebox.config import VERSION
-from ..config.legacy import (
-    SAVE_HTMLTOTEXT,
-    TIMEOUT,
-)
-from ..index.schema import Link, ArchiveResult, ArchiveError
-from ..logging_util import TimedProgress
+from archivebox.config import VERSION, ARCHIVING_CONFIG
+from archivebox.config.legacy import SAVE_HTMLTOTEXT
 from archivebox.misc.system import atomic_write
-from archivebox.misc.util import (
-    enforce_types,
-    is_static_file,
-)
+from archivebox.misc.util import enforce_types, is_static_file
+
+from ..logging_util import TimedProgress
+from ..index.schema import Link, ArchiveResult, ArchiveError
 from .title import get_html
 
 
@@ -122,7 +117,7 @@ def should_save_htmltotext(link: Link, out_dir: Optional[Path]=None, overwrite:
 
 
 @enforce_types
-def save_htmltotext(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+def save_htmltotext(link: Link, out_dir: Optional[Path]=None, timeout: int=ARCHIVING_CONFIG.TIMEOUT) -> ArchiveResult:
     """extract search-indexing-friendly text from an HTML document"""
 
     out_dir = Path(out_dir or link.link_dir)

+ 14 - 20
archivebox/extractors/title.py

@@ -5,23 +5,14 @@ from html.parser import HTMLParser
 from pathlib import Path
 from typing import Optional
 
-from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
 from archivebox.misc.util import (
     enforce_types,
     download_url,
     htmldecode,
     dedupe,
 )
-from ..config.legacy import (
-    TIMEOUT,
-    CHECK_SSL_VALIDITY,
-    SAVE_TITLE,
-    CURL_BINARY,
-    CURL_ARGS,
-    CURL_EXTRA_ARGS,
-    CURL_VERSION,
-    CURL_USER_AGENT,
-)
+from archivebox.plugins_extractor.curl.apps import CURL_CONFIG, CURL_BINARY
+from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
 from ..logging_util import TimedProgress
 
 
@@ -62,7 +53,7 @@ class TitleParser(HTMLParser):
 
 
 @enforce_types
-def get_html(link: Link, path: Path, timeout: int=TIMEOUT) -> str:
+def get_html(link: Link, path: Path, timeout: int=CURL_CONFIG.CURL_TIMEOUT) -> str:
     """
     Try to find wget, singlefile and then dom files.
     If none is found, download the url again.
@@ -98,7 +89,7 @@ def should_save_title(link: Link, out_dir: Optional[str]=None, overwrite: Option
     if not overwrite and link.title and not link.title.lower().startswith('http'):
         return False
 
-    return SAVE_TITLE
+    return CURL_CONFIG.SAVE_TITLE
 
 def extract_title_with_regex(html):
     match = re.search(HTML_TITLE_REGEX, html)
@@ -106,22 +97,25 @@ def extract_title_with_regex(html):
     return output
 
 @enforce_types
-def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=CURL_CONFIG.CURL_TIMEOUT) -> ArchiveResult:
     """try to guess the page's title from its content"""
 
     from core.models import Snapshot
 
+    curl_binary = CURL_BINARY.load()
+    assert curl_binary.abspath and curl_binary.version
+
     output: ArchiveOutput = None
     # later options take precedence
     options = [
-        *CURL_ARGS,
-        *CURL_EXTRA_ARGS,
+        *CURL_CONFIG.CURL_ARGS,
+        *CURL_CONFIG.CURL_EXTRA_ARGS,
         '--max-time', str(timeout),
-        *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
-        *([] if CHECK_SSL_VALIDITY else ['--insecure']),
+        *(['--user-agent', '{}'.format(CURL_CONFIG.CURL_USER_AGENT)] if CURL_CONFIG.CURL_USER_AGENT else []),
+        *([] if CURL_CONFIG.CURL_CHECK_SSL_VALIDITY else ['--insecure']),
     ]
     cmd = [
-        CURL_BINARY,
+        str(curl_binary.abspath),
         *dedupe(options),
         link.url,
     ]
@@ -161,7 +155,7 @@ def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -
     return ArchiveResult(
         cmd=cmd,
         pwd=str(out_dir),
-        cmd_version=CURL_VERSION,
+        cmd_version=str(curl_binary.version),
         output=output,
         status=status,
         **timer.stats,

+ 22 - 38
archivebox/main.py

@@ -430,7 +430,7 @@ def init(force: bool=False, quick: bool=False, install: bool=False, out_dir: Pat
 def status(out_dir: Path=DATA_DIR) -> None:
     """Print out some info and statistics about the archive collection"""
 
-    check_data_folder(CONFIG)
+    check_data_folder()
 
     from core.models import Snapshot
     from django.contrib.auth import get_user_model
@@ -573,7 +573,7 @@ def add(urls: Union[str, List[str]],
         run_subcommand('init', stdin=None, pwd=out_dir)
 
     # Load list of links from the existing index
-    check_data_folder(CONFIG)
+    check_data_folder()
 
     # worker = start_cli_workers()
     
@@ -673,7 +673,7 @@ def remove(filter_str: Optional[str]=None,
            out_dir: Path=DATA_DIR) -> List[Link]:
     """Remove the specified URLs from the archive"""
     
-    check_data_folder(CONFIG)
+    check_data_folder()
 
     if snapshots is None:
         if filter_str and filter_patterns:
@@ -762,7 +762,7 @@ def update(resume: Optional[float]=None,
     # from .queues.supervisor_util import start_cli_workers
     
 
-    check_data_folder(CONFIG)
+    check_data_folder()
     # start_cli_workers()
     new_links: List[Link] = [] # TODO: Remove input argument: only_new
 
@@ -833,7 +833,7 @@ def list_all(filter_patterns_str: Optional[str]=None,
              out_dir: Path=DATA_DIR) -> Iterable[Link]:
     """List, filter, and export information about archive entries"""
     
-    check_data_folder(CONFIG)
+    check_data_folder()
 
     if filter_patterns and filter_patterns_str:
         stderr(
@@ -881,7 +881,7 @@ def list_links(snapshots: Optional[QuerySet]=None,
                before: Optional[float]=None,
                out_dir: Path=DATA_DIR) -> Iterable[Link]:
     
-    check_data_folder(CONFIG)
+    check_data_folder()
 
     if snapshots:
         all_snapshots = snapshots
@@ -905,7 +905,7 @@ def list_folders(links: List[Link],
                  status: str,
                  out_dir: Path=DATA_DIR) -> Dict[str, Optional[Link]]:
     
-    check_data_folder(CONFIG)
+    check_data_folder()
 
     STATUS_FUNCTIONS = {
         "indexed": get_indexed_folders,
@@ -926,7 +926,7 @@ def list_folders(links: List[Link],
         raise ValueError('Status not recognized.')
 
 @enforce_types
-def setup(out_dir: Path=DATA_DIR) -> None:
+def install(out_dir: Path=DATA_DIR) -> None:
     """Automatically install all ArchiveBox dependencies and extras"""
 
     from rich import print
@@ -937,40 +937,20 @@ def setup(out_dir: Path=DATA_DIR) -> None:
 
     stderr('\n[+] Installing ArchiveBox dependencies automatically...', color='green')
 
-    for binary in settings.BINARIES.values():
+    for binary in reversed(list(settings.BINARIES.values())):
         try:
             print(binary.load_or_install().model_dump(exclude={'binproviders_supported', 'loaded_binprovider', 'provider_overrides', 'loaded_abspaths', 'bin_dir', 'loaded_respath'}))
         except Exception as e:
             print(f'[X] Failed to install {binary.name}: {e}')
 
-    # from plugins_extractor.curl.apps import CURL_BINARY
-    # print(CURL_BINARY.load_or_install().model_dump(exclude={'binproviders_supported', 'loaded_binprovider', 'provider_overrides', 'loaded_abspaths', 'bin_dir', 'loaded_respath'}))
-
-    # from plugins_extractor.wget.apps import WGET_BINARY
-    # print(WGET_BINARY.load_or_install().model_dump(exclude={'binproviders_supported', 'loaded_binprovider', 'provider_overrides', 'loaded_abspaths', 'bin_dir', 'loaded_respath'}))
-
-    # from plugins_extractor.ytdlp.apps import YTDLP_BINARY
-    # print(YTDLP_BINARY.load_or_install().model_dump(exclude={'binproviders_supported', 'loaded_binprovider', 'provider_overrides', 'loaded_abspaths', 'bin_dir', 'loaded_respath'}))
-
-    # from plugins_extractor.chrome.apps import CHROME_BINARY
-    # print(CHROME_BINARY.load_or_install().model_dump(exclude={'binproviders_supported', 'loaded_binprovider', 'provider_overrides', 'loaded_abspaths', 'bin_dir', 'loaded_respath'}))
-
-    # from plugins_extractor.singlefile.apps import SINGLEFILE_BINARY
-    # print(SINGLEFILE_BINARY.load_or_install().model_dump(exclude={'binproviders_supported', 'loaded_binprovider', 'provider_overrides', 'loaded_abspaths', 'bin_dir', 'loaded_respath'}))
-    
-    # from plugins_extractor.readability.apps import READABILITY_BINARY
-    # print(READABILITY_BINARY.load_or_install().model_dump(exclude={'binproviders_supported', 'loaded_binprovider', 'provider_overrides', 'loaded_abspaths', 'bin_dir', 'loaded_respath'}))
-    
-    # from plugins_extractor.mercury.apps import MERCURY_BINARY
-    # print(MERCURY_BINARY.load_or_install().model_dump(exclude={'binproviders_supported', 'loaded_binprovider', 'provider_overrides', 'loaded_abspaths', 'bin_dir', 'loaded_respath'}))
-    
 
     from django.contrib.auth import get_user_model
     User = get_user_model()
 
     if not User.objects.filter(is_superuser=True).exists():
-        stderr('\n[+] Creating new admin user for the Web UI...', color='green')
-        run_subcommand('manage', subcommand_args=['createsuperuser'], pwd=out_dir)
+        stderr('\n[+] Don\'t forget to create a new admin user for the Web UI...', color='green')
+        stderr('    archivebox manage createsuperuser')
+        # run_subcommand('manage', subcommand_args=['createsuperuser'], pwd=out_dir)
     
     stderr('\n[√] Set up ArchiveBox and its dependencies successfully.', color='green')
     
@@ -978,6 +958,10 @@ def setup(out_dir: Path=DATA_DIR) -> None:
     
     run_shell([ARCHIVEBOX_BINARY.load().abspath, '--version'], capture_output=False, cwd=out_dir)
 
+# backwards-compatibility:
+setup = install
+
+
 @enforce_types
 def config(config_options_str: Optional[str]=None,
            config_options: Optional[List[str]]=None,
@@ -989,7 +973,7 @@ def config(config_options_str: Optional[str]=None,
 
     from rich import print
 
-    check_data_folder(CONFIG)
+    check_data_folder()
     if config_options and config_options_str:
         stderr(
             '[X] You should either pass config values as an arguments '
@@ -1090,8 +1074,8 @@ def schedule(add: bool=False,
              out_dir: Path=DATA_DIR):
     """Set ArchiveBox to regularly import URLs at specific times using cron"""
     
-    check_data_folder(CONFIG)
-    from plugins_pkg.pip.apps import ARCHIVEBOX_BINARY
+    check_data_folder()
+    from archivebox.plugins_pkg.pip.apps import ARCHIVEBOX_BINARY
 
     Path(CONSTANTS.LOGS_DIR).mkdir(exist_ok=True)
 
@@ -1228,7 +1212,7 @@ def server(runserver_args: Optional[List[str]]=None,
         print()
 
 
-    check_data_folder(CONFIG)
+    check_data_folder()
 
     from django.core.management import call_command
     from django.contrib.auth.models import User
@@ -1280,7 +1264,7 @@ def server(runserver_args: Optional[List[str]]=None,
 def manage(args: Optional[List[str]]=None, out_dir: Path=DATA_DIR) -> None:
     """Run an ArchiveBox Django management command"""
 
-    check_data_folder(CONFIG)
+    check_data_folder()
     from django.core.management import execute_from_command_line
 
     if (args and "createsuperuser" in args) and (SHELL_CONFIG.IN_DOCKER and not SHELL_CONFIG.IS_TTY):
@@ -1297,7 +1281,7 @@ def manage(args: Optional[List[str]]=None, out_dir: Path=DATA_DIR) -> None:
 def shell(out_dir: Path=DATA_DIR) -> None:
     """Enter an interactive ArchiveBox Django shell"""
 
-    check_data_folder(CONFIG)
+    check_data_folder()
 
     from django.core.management import call_command
     call_command("shell_plus")
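
The rename of setup() to install() keeps the old name working via a module-level alias; because the alias points at the same (already-decorated) function object, @enforce_types still applies. The pattern in isolation:

```python
def install():
    return 'installed'

setup = install              # backwards-compatible alias, not a copy
assert setup is install
assert setup() == 'installed'
```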

+ 2 - 4
archivebox/misc/checks.py

@@ -1,13 +1,11 @@
 __package__ = 'archivebox.misc'
 
-from benedict import benedict
-
 from archivebox.config import DATA_DIR, ARCHIVE_DIR, CONSTANTS, SHELL_CONFIG
 
 from .logging import stderr
 
 
-def check_data_folder(config: benedict) -> None:
+def check_data_folder() -> None:
 
     archive_dir_exists = ARCHIVE_DIR.exists()
     if not archive_dir_exists:
@@ -23,7 +21,7 @@ def check_data_folder(config: benedict) -> None:
         raise SystemExit(2)
 
 
-def check_migrations(config: benedict):
+def check_migrations():
     from ..index.sql import list_migrations
 
     pending_migrations = [name for status, name in list_migrations() if not status]
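
check_migrations() now gathers its own inputs instead of taking a benedict config. The pending-migration filter it uses, demonstrated with stub data (list_migrations() presumably yields (applied, name) pairs):

```python
def list_migrations():
    # stub standing in for archivebox.index.sql.list_migrations()
    return [(True, '0001_initial'), (False, '0002_new_config')]

pending_migrations = [name for status, name in list_migrations() if not status]
print(pending_migrations)   # ['0002_new_config']
```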

+ 17 - 12
archivebox/plugins_extractor/curl/apps.py

@@ -1,10 +1,10 @@
 __package__ = 'plugins_extractor.curl'
 
-from typing import List, Optional, Dict
+from typing import List, Optional
 from pathlib import Path
 
 from pydantic import InstanceOf, Field
-from pydantic_pkgr import BinProvider, BinName, bin_abspath, BinProviderName, ProviderLookupDict
+from pydantic_pkgr import BinProvider, BinName
 
 from abx.archivebox.base_plugin import BasePlugin, BaseHook
 from abx.archivebox.base_configset import BaseConfigSet
@@ -12,15 +12,26 @@ from abx.archivebox.base_binary import BaseBinary, env, apt, brew
 # from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
 
 from archivebox.config import ARCHIVING_CONFIG
-
+from archivebox.plugins_extractor.favicon.apps import FAVICON_CONFIG
+from archivebox.plugins_extractor.archivedotorg.apps import ARCHIVEDOTORG_CONFIG
 
 class CurlConfig(BaseConfigSet):
-
-    SAVE_CURL: bool = True
     
-    # USE_CURL: bool = Field(default=lambda c: c.SAVE_HEADERS or c.SAVE_FAVICON)
+    SAVE_TITLE: bool = Field(default=True)
+    SAVE_HEADERS: bool = Field(default=True)
+    USE_CURL: bool = Field(default=lambda c: 
+        ARCHIVEDOTORG_CONFIG.SAVE_ARCHIVE_DOT_ORG
+        or FAVICON_CONFIG.SAVE_FAVICON
+        or c.SAVE_HEADERS
+        or c.SAVE_TITLE
+    )
     
     CURL_BINARY: str = Field(default='curl')
+    CURL_ARGS: List[str] = [
+        '--silent',
+        '--location',
+        '--compressed',
+    ]
     CURL_EXTRA_ARGS: List[str] = []
     
     CURL_TIMEOUT: int =  Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
@@ -35,12 +46,6 @@ CURL_CONFIG = CurlConfig()
 class CurlBinary(BaseBinary):
     name: BinName = CURL_CONFIG.CURL_BINARY
     binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
-    
-    provider_overrides: Dict[BinProviderName, ProviderLookupDict] = {
-        brew.name: {
-            'abspath': lambda: bin_abspath(CURL_CONFIG.CURL_BINARY, PATH=f'/opt/homebrew/opt/curl/bin:{brew.PATH}'),
-        },
-    }
 
 CURL_BINARY = CurlBinary()
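
The USE_CURL default above is a callable that receives the config set instance as `c` (the resolution of callable Field defaults happens inside abx's BaseConfigSet, which is not shown in this diff). The derived logic, restated with stubs:

```python
from types import SimpleNamespace

# stubs standing in for the real config set instances
ARCHIVEDOTORG_CONFIG = SimpleNamespace(SAVE_ARCHIVE_DOT_ORG=False)
FAVICON_CONFIG = SimpleNamespace(SAVE_FAVICON=False)
curl_config = SimpleNamespace(SAVE_HEADERS=False, SAVE_TITLE=True)

use_curl = (
    ARCHIVEDOTORG_CONFIG.SAVE_ARCHIVE_DOT_ORG
    or FAVICON_CONFIG.SAVE_FAVICON
    or curl_config.SAVE_HEADERS
    or curl_config.SAVE_TITLE
)
print(use_curl)   # True: curl is still needed for title extraction
```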
 

+ 2 - 8
archivebox/plugins_extractor/wget/apps.py

@@ -1,13 +1,13 @@
 __package__ = 'plugins_extractor.wget'
 
 import sys
-from typing import List, Optional, Dict
+from typing import List, Optional
 from pathlib import Path
 from subprocess import run, DEVNULL
 
 from rich import print
 from pydantic import InstanceOf, Field, model_validator
-from pydantic_pkgr import BinProvider, BinName, bin_abspath, BinProviderName, ProviderLookupDict
+from pydantic_pkgr import BinProvider, BinName
 
 from abx.archivebox.base_plugin import BasePlugin, BaseHook
 from abx.archivebox.base_configset import BaseConfigSet
@@ -80,12 +80,6 @@ WGET_CONFIG = WgetConfig()
 class WgetBinary(BaseBinary):
     name: BinName = WGET_CONFIG.WGET_BINARY
     binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
-    
-    provider_overrides: Dict[BinProviderName, ProviderLookupDict] = {
-        brew.name: {
-            'abspath': lambda: bin_abspath(WGET_CONFIG.WGET_BINARY, PATH=f'/opt/homebrew/opt/wget/bin:{brew.PATH}'),
-        },
-    }
 
 WGET_BINARY = WgetBinary()
 

+ 6 - 6
archivebox/search/__init__.py

@@ -11,7 +11,7 @@ from archivebox.misc.util import enforce_types
 from archivebox.misc.logging import stderr
 from archivebox.config.legacy import ANSI
 
-# from archivebox.archivebox.config import settings.CONFIGS.SearchBackendConfig
+from archivebox.config import SEARCH_BACKEND_CONFIG
 
 
 def log_index_started(url):
@@ -58,13 +58,13 @@ def get_indexable_content(results: QuerySet):
 
 def import_backend():
     for backend in settings.SEARCH_BACKENDS.values():
-        if backend.name == settings.CONFIGS.SearchBackendConfig.SEARCH_BACKEND_ENGINE:
+        if backend.name == SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE:
             return backend
-    raise Exception(f'Could not load {settings.CONFIGS.SearchBackendConfig.SEARCH_BACKEND_ENGINE} as search backend')
+    raise Exception(f'Could not load {SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE} as search backend')
 
 @enforce_types
 def write_search_index(link: Link, texts: Union[List[str], None]=None, out_dir: Path=settings.DATA_DIR, skip_text_index: bool=False) -> None:
-    if not settings.CONFIGS.SearchBackendConfig.USE_INDEXING_BACKEND:
+    if not SEARCH_BACKEND_CONFIG.USE_INDEXING_BACKEND:
         return
 
     if not skip_text_index and texts:
@@ -86,7 +86,7 @@ def write_search_index(link: Link, texts: Union[List[str], None]=None, out_dir:
 def query_search_index(query: str, out_dir: Path=settings.DATA_DIR) -> QuerySet:
     from core.models import Snapshot
 
-    if settings.CONFIGS.SearchBackendConfig.USE_SEARCHING_BACKEND:
+    if SEARCH_BACKEND_CONFIG.USE_SEARCHING_BACKEND:
         backend = import_backend()
         try:
             snapshot_pks = backend.search(query)
@@ -106,7 +106,7 @@ def query_search_index(query: str, out_dir: Path=settings.DATA_DIR) -> QuerySet:
 
 @enforce_types
 def flush_search_index(snapshots: QuerySet):
-    if not settings.CONFIGS.SearchBackendConfig.USE_INDEXING_BACKEND or not snapshots:
+    if not SEARCH_BACKEND_CONFIG.USE_INDEXING_BACKEND or not snapshots:
         return
     backend = import_backend()
     snapshot_pks = (str(pk) for pk in snapshots.values_list('pk', flat=True))
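
import_backend() is a plain registry scan keyed on the configured engine name; the same pattern in miniature:

```python
class Backend:
    def __init__(self, name: str):
        self.name = name

SEARCH_BACKENDS = {'ripgrep': Backend('ripgrep'), 'sqlite': Backend('sqlite')}
SEARCH_BACKEND_ENGINE = 'ripgrep'

def import_backend():
    for backend in SEARCH_BACKENDS.values():
        if backend.name == SEARCH_BACKEND_ENGINE:
            return backend
    raise Exception(f'Could not load {SEARCH_BACKEND_ENGINE} as search backend')

print(import_backend().name)   # ripgrep
```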

+ 1 - 1
archivebox/vendor/pydantic-pkgr

@@ -1 +1 @@
-Subproject commit 4f9486ab86a65f83ad1bfd94320795b8e09871aa
+Subproject commit 4f31b355fbf319a54b38953795b17b1b04db4348