2
0
Эх сурвалжийг харах

finish migrating almost all config to new system

Nick Sweeting 1 жил өмнө
parent
commit
d21bc86075

+ 0 - 37
archivebox/abx/archivebox/base_hook.py

@@ -13,43 +13,6 @@ HookType = Literal['CONFIG', 'BINPROVIDER', 'BINARY', 'EXTRACTOR', 'REPLAYER', '
 hook_type_names: Tuple[HookType] = get_args(HookType)
 hook_type_names: Tuple[HookType] = get_args(HookType)
 
 
 class BaseHook(BaseModel):
 class BaseHook(BaseModel):
-    """
-    A Plugin consists of a list of Hooks, applied to django.conf.settings when AppConfig.ready() -> Plugin.register() is called.
-    Plugin.register() then calls each Hook.register() on the provided settings.
-    each Hook.register() function (ideally pure) takes a django.conf.settings as input and returns a new one back.
-    or 
-    it modifies django.conf.settings in-place to add changes corresponding to its HookType.
-    e.g. for a HookType.CONFIG, the Hook.register() function places the hook in settings.CONFIG (and settings.HOOKS)
-    An example of an impure Hook would be a CHECK that modifies settings but also calls django.core.checks.register(check).
-    In practice any object that subclasses BaseHook and provides a .register() function can behave as a Hook.
-
-    setup_django() -> imports all settings.INSTALLED_APPS...
-        # django imports AppConfig, models, migrations, admins, etc. for all installed apps
-        # django then calls AppConfig.ready() on each installed app...
-
-        plugins_pkg.npm.NpmPlugin().AppConfig.ready()                    # called by django
-            plugins_pkg.npm.NpmPlugin().register(settings) ->
-                plugins_pkg.npm.NpmConfigSet().register(settings)
-                    abx.archivebox.base_configset.BaseConfigSet().register(settings)
-                        abx.archivebox.base_hook.BaseHook().register(settings, parent_plugin=plugins_pkg.npm.NpmPlugin())
-
-                ...
-        ...
-
-    Both core ArchiveBox code and plugin code depend on python >= 3.10 and django >= 5.0 w/ sqlite and a filesystem.
-    Core ArchiveBox code can depend only on python and the pip libraries it ships with, and can never depend on plugin code / node / other binaries.
-    Plugin code can depend on archivebox core, other django apps, other pip libraries, and other plugins.
-    Plugins can provide BinProviders + Binaries which can depend on arbitrary other binaries / package managers like curl / wget / yt-dlp / etc.
-
-    The execution interface between plugins is simply calling builtinplugins.npm.... functions directly, django handles
-    importing all plugin code. There is no need to manually register methods/classes, only register to call
-    impure setup functions or provide runtime state.
-    settings.CONFIGS / settings.BINPROVIDERS / settings.BINARIES /... etc. are reserved for dynamic runtime state only.
-    This state is exposed to the broader system in a flat namespace, e.g. CONFIG.IS_DOCKER=True, or BINARIES = [
-        ..., Binary('node', abspath='/usr/local/bin/node', version='22.2.0'), ...
-    ]
-
-    """
     model_config = ConfigDict(
     model_config = ConfigDict(
         extra="allow",
         extra="allow",
         arbitrary_types_allowed=True,
         arbitrary_types_allowed=True,

+ 4 - 4
archivebox/api/v1_cli.py

@@ -13,7 +13,7 @@ from ..main import (
     schedule,
     schedule,
 )
 )
 from archivebox.misc.util import ansi_to_html
 from archivebox.misc.util import ansi_to_html
-from ..config.legacy import ONLY_NEW
+from archivebox.config import ARCHIVING_CONFIG
 
 
 
 
 from .auth import API_AUTH_METHODS
 from .auth import API_AUTH_METHODS
@@ -58,7 +58,7 @@ class AddCommandSchema(Schema):
     urls: List[str]
     urls: List[str]
     tag: str = ""
     tag: str = ""
     depth: int = 0
     depth: int = 0
-    update: bool = not ONLY_NEW  # Default to the opposite of ONLY_NEW
+    update: bool = not ARCHIVING_CONFIG.ONLY_NEW  # Default to the opposite of ARCHIVING_CONFIG.ONLY_NEW
     update_all: bool = False
     update_all: bool = False
     index_only: bool = False
     index_only: bool = False
     overwrite: bool = False
     overwrite: bool = False
@@ -68,7 +68,7 @@ class AddCommandSchema(Schema):
 
 
 class UpdateCommandSchema(Schema):
 class UpdateCommandSchema(Schema):
     resume: Optional[float] = 0
     resume: Optional[float] = 0
-    only_new: bool = ONLY_NEW
+    only_new: bool = ARCHIVING_CONFIG.ONLY_NEW
     index_only: bool = False
     index_only: bool = False
     overwrite: bool = False
     overwrite: bool = False
     after: Optional[float] = 0
     after: Optional[float] = 0
@@ -85,7 +85,7 @@ class ScheduleCommandSchema(Schema):
     tag: str = ''
     tag: str = ''
     depth: int = 0
     depth: int = 0
     overwrite: bool = False
     overwrite: bool = False
-    update: bool = not ONLY_NEW
+    update: bool = not ARCHIVING_CONFIG.ONLY_NEW
     clear: bool = False
     clear: bool = False
 
 
 class ListCommandSchema(Schema):
 class ListCommandSchema(Schema):

+ 2 - 5
archivebox/cli/__init__.py

@@ -152,18 +152,15 @@ def run_subcommand(subcommand: str,
     subcommand_args = subcommand_args or []
     subcommand_args = subcommand_args or []
 
 
     if subcommand not in meta_cmds:
     if subcommand not in meta_cmds:
-        from ..config.legacy import setup_django, CONFIG
+        from archivebox.config.legacy import setup_django
 
 
         cmd_requires_db = subcommand in archive_cmds
         cmd_requires_db = subcommand in archive_cmds
         init_pending = '--init' in subcommand_args or '--quick-init' in subcommand_args
         init_pending = '--init' in subcommand_args or '--quick-init' in subcommand_args
 
 
-        if cmd_requires_db:
-            check_data_folder(CONFIG)
-
         setup_django(in_memory_db=subcommand in fake_db, check_db=cmd_requires_db and not init_pending)
         setup_django(in_memory_db=subcommand in fake_db, check_db=cmd_requires_db and not init_pending)
 
 
         if cmd_requires_db:
         if cmd_requires_db:
-            check_migrations(CONFIG)
+            check_migrations()
 
 
     module = import_module('.archivebox_{}'.format(subcommand), __package__)
     module = import_module('.archivebox_{}'.format(subcommand), __package__)
     module.main(args=subcommand_args, stdin=stdin, pwd=pwd)    # type: ignore
     module.main(args=subcommand_args, stdin=stdin, pwd=pwd)    # type: ignore

+ 2 - 1
archivebox/config/__init__.py

@@ -1,6 +1,6 @@
 __package__ = 'archivebox.config'
 __package__ = 'archivebox.config'
 
 
-from .constants import CONSTANTS, PACKAGE_DIR, DATA_DIR, ARCHIVE_DIR, VERSION
+from .constants import CONSTANTS, CONSTANTS_CONFIG, PACKAGE_DIR, DATA_DIR, ARCHIVE_DIR, VERSION
 from .defaults import (
 from .defaults import (
     SHELL_CONFIG,
     SHELL_CONFIG,
     STORAGE_CONFIG,
     STORAGE_CONFIG,
@@ -23,4 +23,5 @@ __all__ = [
     'SERVER_CONFIG',
     'SERVER_CONFIG',
     'ARCHIVING_CONFIG',
     'ARCHIVING_CONFIG',
     'SEARCH_BACKEND_CONFIG',
     'SEARCH_BACKEND_CONFIG',
+    'CONSTANTS_CONFIG',
 ]
 ]

+ 27 - 42
archivebox/config/legacy.py

@@ -60,6 +60,7 @@ from .defaults import SHELL_CONFIG, GENERAL_CONFIG, ARCHIVING_CONFIG, SERVER_CON
 from archivebox.plugins_auth.ldap.apps import LDAP_CONFIG
 from archivebox.plugins_auth.ldap.apps import LDAP_CONFIG
 from archivebox.plugins_extractor.favicon.apps import FAVICON_CONFIG
 from archivebox.plugins_extractor.favicon.apps import FAVICON_CONFIG
 from archivebox.plugins_extractor.wget.apps import WGET_CONFIG
 from archivebox.plugins_extractor.wget.apps import WGET_CONFIG
+from archivebox.plugins_extractor.curl.apps import CURL_CONFIG
 
 
 ANSI = SHELL_CONFIG.ANSI
 ANSI = SHELL_CONFIG.ANSI
 LDAP = LDAP_CONFIG.LDAP_ENABLED
 LDAP = LDAP_CONFIG.LDAP_ENABLED
@@ -81,9 +82,11 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
     
     
     'LDAP_CONFIG': LDAP_CONFIG.as_legacy_config_schema(),
     'LDAP_CONFIG': LDAP_CONFIG.as_legacy_config_schema(),
     
     
-    'FAVICON_CONFIG': FAVICON_CONFIG.as_legacy_config_schema(),
+    # 'FAVICON_CONFIG': FAVICON_CONFIG.as_legacy_config_schema(),
     
     
-    'WGET_CONFIG': WGET_CONFIG.as_legacy_config_schema(),
+    # 'WGET_CONFIG': WGET_CONFIG.as_legacy_config_schema(),
+    
+    # 'CURL_CONFIG': CURL_CONFIG.as_legacy_config_schema(),
 
 
 
 
     'ARCHIVE_METHOD_TOGGLES': {
     'ARCHIVE_METHOD_TOGGLES': {
@@ -109,7 +112,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
 
 
     'ARCHIVE_METHOD_OPTIONS': {
     'ARCHIVE_METHOD_OPTIONS': {
         'RESOLUTION':               {'type': str,   'default': '1440,2000', 'aliases': ('SCREENSHOT_RESOLUTION','WINDOW_SIZE')},
         'RESOLUTION':               {'type': str,   'default': '1440,2000', 'aliases': ('SCREENSHOT_RESOLUTION','WINDOW_SIZE')},
-        'GIT_DOMAINS':              {'type': str,   'default': 'github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht'},
+        # 'GIT_DOMAINS':              {'type': str,   'default': 'github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht'},
         'CHECK_SSL_VALIDITY':       {'type': bool,  'default': True},
         'CHECK_SSL_VALIDITY':       {'type': bool,  'default': True},
         'MEDIA_MAX_SIZE':           {'type': str,   'default': '750m'},
         'MEDIA_MAX_SIZE':           {'type': str,   'default': '750m'},
 
 
@@ -144,15 +147,6 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
                                                                 ]},
                                                                 ]},
         'YOUTUBEDL_EXTRA_ARGS':     {'type': list,  'default': None},
         'YOUTUBEDL_EXTRA_ARGS':     {'type': list,  'default': None},
 
 
-
-        'CURL_ARGS':                {'type': list,  'default': ['--silent',
-                                                                '--location',
-                                                                '--compressed'
-                                                               ]},
-        'CURL_EXTRA_ARGS':          {'type': list,  'default': None},
-        'GIT_ARGS':                 {'type': list,  'default': ['--recursive']},
-        'SINGLEFILE_ARGS':          {'type': list,  'default': None},
-        'SINGLEFILE_EXTRA_ARGS':    {'type': list,  'default': None},
     },
     },
 
 
     'DEPENDENCY_CONFIG': {
     'DEPENDENCY_CONFIG': {
@@ -164,9 +158,9 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
         'USE_YOUTUBEDL':            {'type': bool,  'default': True},
         'USE_YOUTUBEDL':            {'type': bool,  'default': True},
         'USE_RIPGREP':              {'type': bool,  'default': True},
         'USE_RIPGREP':              {'type': bool,  'default': True},
 
 
-        'CURL_BINARY':              {'type': str,   'default': 'curl'},
-        'GIT_BINARY':               {'type': str,   'default': 'git'},
-        'NODE_BINARY':              {'type': str,   'default': 'node'},
+        # 'GIT_BINARY':               {'type': str,   'default': 'git'},
+        # 'CURL_BINARY':              {'type': str,   'default': 'curl'},
+        # 'NODE_BINARY':              {'type': str,   'default': 'node'},
         # 'YOUTUBEDL_BINARY':         {'type': str,   'default': 'yt-dlp'},   # also can accept youtube-dl
         # 'YOUTUBEDL_BINARY':         {'type': str,   'default': 'yt-dlp'},   # also can accept youtube-dl
         # 'SINGLEFILE_BINARY':        {'type': str,   'default': lambda c: bin_path('single-file')},
         # 'SINGLEFILE_BINARY':        {'type': str,   'default': lambda c: bin_path('single-file')},
         # 'READABILITY_BINARY':       {'type': str,   'default': lambda c: bin_path('readability-extractor')},
         # 'READABILITY_BINARY':       {'type': str,   'default': lambda c: bin_path('readability-extractor')},
@@ -209,21 +203,12 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
     'URL_DENYLIST_PTN':         {'default': lambda c: c['URL_DENYLIST'] and re.compile(c['URL_DENYLIST'] or '', CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS)},
     'URL_DENYLIST_PTN':         {'default': lambda c: c['URL_DENYLIST'] and re.compile(c['URL_DENYLIST'] or '', CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS)},
     'URL_ALLOWLIST_PTN':        {'default': lambda c: c['URL_ALLOWLIST'] and re.compile(c['URL_ALLOWLIST'] or '', CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS)},
     'URL_ALLOWLIST_PTN':        {'default': lambda c: c['URL_ALLOWLIST'] and re.compile(c['URL_ALLOWLIST'] or '', CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS)},
 
 
+    # 'USE_GIT':                  {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
+    # 'GIT_VERSION':              {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None},
+    # 'SAVE_GIT':                 {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
 
 
-    'USE_CURL':                 {'default': lambda c: c['USE_CURL'] and (c['SAVE_FAVICON'] or c['SAVE_TITLE'] or c['SAVE_ARCHIVE_DOT_ORG'])},
-    'CURL_VERSION':             {'default': lambda c: bin_version(c['CURL_BINARY']) if c['USE_CURL'] else None},
-    # 'CURL_USER_AGENT':          {'default': lambda c: c['CURL_USER_AGENT'].format(**c)},
-    'CURL_ARGS':                {'default': lambda c: c['CURL_ARGS'] or []},
-    'CURL_EXTRA_ARGS':          {'default': lambda c: c['CURL_EXTRA_ARGS'] or []},
-    'SAVE_FAVICON':             {'default': lambda c: c['USE_CURL'] and c['SAVE_FAVICON']},
-    'SAVE_ARCHIVE_DOT_ORG':     {'default': lambda c: c['USE_CURL'] and c['SAVE_ARCHIVE_DOT_ORG']},
-
-    'USE_GIT':                  {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
-    'GIT_VERSION':              {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None},
-    'SAVE_GIT':                 {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
 
 
-
-    'DEPENDENCIES':             {'default': lambda c: get_dependency_info(c)},
+    # 'DEPENDENCIES':             {'default': lambda c: get_dependency_info(c)},
     # 'CODE_LOCATIONS':           {'default': lambda c: get_code_locations(c)},
     # 'CODE_LOCATIONS':           {'default': lambda c: get_code_locations(c)},
     # 'DATA_LOCATIONS':           {'default': lambda c: get_data_locations(c)},
     # 'DATA_LOCATIONS':           {'default': lambda c: get_data_locations(c)},
 
 
@@ -613,13 +598,13 @@ def get_dependency_info(config: benedict) -> ConfigValue:
         #     'is_valid': True,
         #     'is_valid': True,
         # },
         # },
         
         
-        'CURL_BINARY': {
-            'path': bin_path(config['CURL_BINARY']),
-            'version': config['CURL_VERSION'],
-            'hash': bin_hash(config['CURL_BINARY']),
-            'enabled': config['USE_CURL'],
-            'is_valid': bool(config['CURL_VERSION']),
-        },
+        # 'CURL_BINARY': {
+        #     'path': bin_path(config['CURL_BINARY']),
+        #     'version': config['CURL_VERSION'],
+        #     'hash': bin_hash(config['CURL_BINARY']),
+        #     'enabled': config['USE_CURL'],
+        #     'is_valid': bool(config['CURL_VERSION']),
+        # },
         # 'WGET_BINARY': {
         # 'WGET_BINARY': {
         #     'path': bin_path(config['WGET_BINARY']),
         #     'path': bin_path(config['WGET_BINARY']),
         #     'version': config['WGET_VERSION'],
         #     'version': config['WGET_VERSION'],
@@ -641,13 +626,13 @@ def get_dependency_info(config: benedict) -> ConfigValue:
         #     'enabled': config['USE_MERCURY'],
         #     'enabled': config['USE_MERCURY'],
         #     'is_valid': bool(config['MERCURY_VERSION']),
         #     'is_valid': bool(config['MERCURY_VERSION']),
         # },
         # },
-        'GIT_BINARY': {
-            'path': bin_path(config['GIT_BINARY']),
-            'version': config['GIT_VERSION'],
-            'hash': bin_hash(config['GIT_BINARY']),
-            'enabled': config['USE_GIT'],
-            'is_valid': bool(config['GIT_VERSION']),
-        },
+        # 'GIT_BINARY': {
+        #     'path': bin_path(config['GIT_BINARY']),
+        #     'version': config['GIT_VERSION'],
+        #     'hash': bin_hash(config['GIT_BINARY']),
+        #     'enabled': config['USE_GIT'],
+        #     'is_valid': bool(config['GIT_VERSION']),
+        # },
         # 'SINGLEFILE_BINARY': {
         # 'SINGLEFILE_BINARY': {
         #     'path': bin_path(config['SINGLEFILE_BINARY']),
         #     'path': bin_path(config['SINGLEFILE_BINARY']),
         #     'version': config['SINGLEFILE_VERSION'],
         #     'version': config['SINGLEFILE_VERSION'],

+ 2 - 1
archivebox/config/views.py

@@ -76,7 +76,7 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
 
 
     relevant_configs = {
     relevant_configs = {
         key: val
         key: val
-        for key, val in settings.CONFIG.items()
+        for key, val in settings.FLAT_CONFIG.items()
         if '_BINARY' in key or '_VERSION' in key
         if '_BINARY' in key or '_VERSION' in key
     }
     }
 
 
@@ -105,6 +105,7 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
                 f'<a href="/admin/environment/config/{config_key}/">{config_key}</a>'
                 f'<a href="/admin/environment/config/{config_key}/">{config_key}</a>'
                 for config_key, config_value in relevant_configs.items()
                 for config_key, config_value in relevant_configs.items()
                     if str(binary.name).lower().replace('-', '').replace('_', '').replace('ytdlp', 'youtubedl') in config_key.lower()
                     if str(binary.name).lower().replace('-', '').replace('_', '').replace('ytdlp', 'youtubedl') in config_key.lower()
+                    or config_value.lower().endswith(binary.name.lower())
                     # or binary.name.lower().replace('-', '').replace('_', '') in str(config_value).lower()
                     # or binary.name.lower().replace('-', '').replace('_', '') in str(config_value).lower()
             )))
             )))
             # if not binary.provider_overrides:
             # if not binary.provider_overrides:

+ 1 - 1
archivebox/core/admin.py

@@ -36,7 +36,7 @@ from main import remove
 from extractors import archive_links
 from extractors import archive_links
 
 
 
 
-CONFIG = settings.CONFIG
+CONFIG = settings.FLAT_CONFIG
 
 
 GLOBAL_CONTEXT = {'VERSION': VERSION, 'VERSIONS_AVAILABLE': [], 'CAN_UPGRADE': False}
 GLOBAL_CONTEXT = {'VERSION': VERSION, 'VERSIONS_AVAILABLE': [], 'CAN_UPGRADE': False}
 
 

+ 2 - 4
archivebox/core/auth.py

@@ -1,13 +1,11 @@
 __package__ = 'archivebox.core'
 __package__ = 'archivebox.core'
 
 
 
 
-from ..config.legacy import (
-    LDAP
-)
+from archivebox.plugins_auth.ldap.apps import LDAP_CONFIG
 
 
 def register_signals():
 def register_signals():
 
 
-    if LDAP:
+    if LDAP_CONFIG.LDAP_ENABLED:
         import django_auth_ldap.backend
         import django_auth_ldap.backend
         from .auth_ldap import create_user
         from .auth_ldap import create_user
 
 

+ 2 - 4
archivebox/core/auth_ldap.py

@@ -1,9 +1,7 @@
-from ..config.legacy import (
-    LDAP_CREATE_SUPERUSER
-)
+from archivebox.plugins_auth.ldap.apps import LDAP_CONFIG
 
 
 def create_user(sender, user=None, ldap_user=None, **kwargs):
 def create_user(sender, user=None, ldap_user=None, **kwargs):
-    if not user.id and LDAP_CREATE_SUPERUSER:
+    if not user.id and LDAP_CONFIG.LDAP_CREATE_SUPERUSER:
         user.is_superuser = True
         user.is_superuser = True
 
 
     user.is_staff = True
     user.is_staff = True

+ 5 - 5
archivebox/core/middleware.py

@@ -5,7 +5,7 @@ from django.utils import timezone
 from django.contrib.auth.middleware import RemoteUserMiddleware
 from django.contrib.auth.middleware import RemoteUserMiddleware
 from django.core.exceptions import ImproperlyConfigured
 from django.core.exceptions import ImproperlyConfigured
 
 
-from ..config.legacy import PUBLIC_SNAPSHOTS, REVERSE_PROXY_USER_HEADER, REVERSE_PROXY_WHITELIST
+from archivebox.config import SERVER_CONFIG
 
 
 
 
 def detect_timezone(request, activate: bool=True):
 def detect_timezone(request, activate: bool=True):
@@ -32,7 +32,7 @@ def CacheControlMiddleware(get_response):
         response = get_response(request)
         response = get_response(request)
 
 
         if '/archive/' in request.path or '/static/' in request.path:
         if '/archive/' in request.path or '/static/' in request.path:
-            policy = 'public' if PUBLIC_SNAPSHOTS else 'private'
+            policy = 'public' if SERVER_CONFIG.PUBLIC_SNAPSHOTS else 'private'
             response['Cache-Control'] = f'{policy}, max-age=60, stale-while-revalidate=300'
             response['Cache-Control'] = f'{policy}, max-age=60, stale-while-revalidate=300'
             # print('Set Cache-Control header to', response['Cache-Control'])
             # print('Set Cache-Control header to', response['Cache-Control'])
         return response
         return response
@@ -40,15 +40,15 @@ def CacheControlMiddleware(get_response):
     return middleware
     return middleware
 
 
 class ReverseProxyAuthMiddleware(RemoteUserMiddleware):
 class ReverseProxyAuthMiddleware(RemoteUserMiddleware):
-    header = 'HTTP_{normalized}'.format(normalized=REVERSE_PROXY_USER_HEADER.replace('-', '_').upper())
+    header = 'HTTP_{normalized}'.format(normalized=SERVER_CONFIG.REVERSE_PROXY_USER_HEADER.replace('-', '_').upper())
 
 
     def process_request(self, request):
     def process_request(self, request):
-        if REVERSE_PROXY_WHITELIST == '':
+        if SERVER_CONFIG.REVERSE_PROXY_WHITELIST == '':
             return
             return
 
 
         ip = request.META.get('REMOTE_ADDR')
         ip = request.META.get('REMOTE_ADDR')
 
 
-        for cidr in REVERSE_PROXY_WHITELIST.split(','):
+        for cidr in SERVER_CONFIG.REVERSE_PROXY_WHITELIST.split(','):
             try:
             try:
                 network = ipaddress.ip_network(cidr)
                 network = ipaddress.ip_network(cidr)
             except ValueError:
             except ValueError:

+ 5 - 7
archivebox/core/settings.py

@@ -13,9 +13,7 @@ import abx.archivebox
 import abx.archivebox.use
 import abx.archivebox.use
 import abx.django.use
 import abx.django.use
 
 
-from archivebox.config import VERSION, DATA_DIR, PACKAGE_DIR, ARCHIVE_DIR, CONSTANTS      # noqa
-
-from ..config.legacy import CONFIG
+from archivebox.config import VERSION, DATA_DIR, PACKAGE_DIR, ARCHIVE_DIR, CONSTANTS, SHELL_CONFIG, SERVER_CONFIG      # noqa
 
 
 IS_MIGRATING = 'makemigrations' in sys.argv[:3] or 'migrate' in sys.argv[:3]
 IS_MIGRATING = 'makemigrations' in sys.argv[:3] or 'migrate' in sys.argv[:3]
 IS_TESTING = 'test' in sys.argv[:3] or 'PYTEST_CURRENT_TEST' in os.environ
 IS_TESTING = 'test' in sys.argv[:3] or 'PYTEST_CURRENT_TEST' in os.environ
@@ -80,7 +78,7 @@ LOGOUT_REDIRECT_URL = os.environ.get('LOGOUT_REDIRECT_URL', '/')
 PASSWORD_RESET_URL = '/accounts/password_reset/'
 PASSWORD_RESET_URL = '/accounts/password_reset/'
 APPEND_SLASH = True
 APPEND_SLASH = True
 
 
-DEBUG = CONFIG.DEBUG or ('--debug' in sys.argv)
+DEBUG = SHELL_CONFIG.DEBUG or ('--debug' in sys.argv)
 
 
 
 
 INSTALLED_APPS = [
 INSTALLED_APPS = [
@@ -364,10 +362,10 @@ STORAGES = {
 ### Security Settings
 ### Security Settings
 ################################################################################
 ################################################################################
 
 
-SECRET_KEY = CONFIG.SECRET_KEY or get_random_string(50, 'abcdefghijklmnopqrstuvwxyz0123456789_')
+SECRET_KEY = SERVER_CONFIG.SECRET_KEY or get_random_string(50, 'abcdefghijklmnopqrstuvwxyz0123456789_')
 
 
-ALLOWED_HOSTS = CONFIG.ALLOWED_HOSTS.split(',')
-CSRF_TRUSTED_ORIGINS = list(set(CONFIG.CSRF_TRUSTED_ORIGINS.split(',')))
+ALLOWED_HOSTS = SERVER_CONFIG.ALLOWED_HOSTS.split(',')
+CSRF_TRUSTED_ORIGINS = list(set(SERVER_CONFIG.CSRF_TRUSTED_ORIGINS.split(',')))
 
 
 # automatically fix case when user sets ALLOWED_HOSTS (e.g. to archivebox.example.com)
 # automatically fix case when user sets ALLOWED_HOSTS (e.g. to archivebox.example.com)
 # but forgets to add https://archivebox.example.com to CSRF_TRUSTED_ORIGINS
 # but forgets to add https://archivebox.example.com to CSRF_TRUSTED_ORIGINS

+ 1 - 1
archivebox/core/urls.py

@@ -10,7 +10,7 @@ from .views import HomepageView, SnapshotView, PublicIndexView, AddView, HealthC
 from .serve_static import serve_static
 from .serve_static import serve_static
 
 
 # GLOBAL_CONTEXT doesn't work as-is, disabled for now: https://github.com/ArchiveBox/ArchiveBox/discussions/1306
 # GLOBAL_CONTEXT doesn't work as-is, disabled for now: https://github.com/ArchiveBox/ArchiveBox/discussions/1306
-# from .config.legacy import VERSION, VERSIONS_AVAILABLE, CAN_UPGRADE
+# from archivebox.config import VERSION, VERSIONS_AVAILABLE, CAN_UPGRADE
 # GLOBAL_CONTEXT = {'VERSION': VERSION, 'VERSIONS_AVAILABLE': VERSIONS_AVAILABLE, 'CAN_UPGRADE': CAN_UPGRADE}
 # GLOBAL_CONTEXT = {'VERSION': VERSION, 'VERSIONS_AVAILABLE': VERSIONS_AVAILABLE, 'CAN_UPGRADE': CAN_UPGRADE}
 
 
 
 

+ 55 - 57
archivebox/core/views.py

@@ -1,7 +1,7 @@
 __package__ = 'archivebox.core'
 __package__ = 'archivebox.core'
 
 
-from typing import Callable
-from benedict import benedict
+import inspect
+from typing import Callable, get_type_hints
 from pathlib import Path
 from pathlib import Path
 
 
 from django.shortcuts import render, redirect
 from django.shortcuts import render, redirect
@@ -27,21 +27,13 @@ from core.admin import result_url
 
 
 from queues.tasks import bg_add
 from queues.tasks import bg_add
 
 
-from archivebox.config import CONSTANTS, DATA_DIR, VERSION, SHELL_CONFIG, SERVER_CONFIG
-from ..plugins_extractor.archivedotorg.apps import ARCHIVEDOTORG_CONFIG
+from archivebox.config import CONSTANTS_CONFIG, DATA_DIR, VERSION, SHELL_CONFIG, SERVER_CONFIG
+from archivebox.misc.util import base_url, htmlencode, ts_to_date_str
 
 
-from ..config.legacy import (
-    CONFIG_SCHEMA,
-    DYNAMIC_CONFIG_SCHEMA,
-    USER_CONFIG,
-    CONFIG,
-)
+from .serve_static import serve_static_with_byterange_support
+from ..plugins_extractor.archivedotorg.apps import ARCHIVEDOTORG_CONFIG
 from ..logging_util import printable_filesize
 from ..logging_util import printable_filesize
-from archivebox.misc.util import base_url, htmlencode, ts_to_date_str
 from ..search import query_search_index
 from ..search import query_search_index
-from .serve_static import serve_static_with_byterange_support
-
-CONFIG = benedict({**CONSTANTS, **CONFIG, **settings.FLAT_CONFIG})
 
 
 
 
 class HomepageView(View):
 class HomepageView(View):
@@ -502,27 +494,43 @@ class HealthCheckView(View):
 
 
 
 
 def find_config_section(key: str) -> str:
 def find_config_section(key: str) -> str:
-    if key in CONSTANTS:
+    if key in CONSTANTS_CONFIG:
         return 'CONSTANT'
         return 'CONSTANT'
     matching_sections = [
     matching_sections = [
-        name for name, opts in CONFIG_SCHEMA.items() if key in opts
+        section.id for section in settings.CONFIGS.values() if key in section.model_fields
     ]
     ]
     section = matching_sections[0] if matching_sections else 'DYNAMIC'
     section = matching_sections[0] if matching_sections else 'DYNAMIC'
     return section
     return section
 
 
 def find_config_default(key: str) -> str:
 def find_config_default(key: str) -> str:
-    default_val = USER_CONFIG.get(key, {}).get('default', lambda: None)
+    if key in CONSTANTS_CONFIG:
+        return str(CONSTANTS_CONFIG[key])
+    
+    default_val = None
+
+    for config in settings.CONFIGS.values():
+        if key in config.model_fields:
+            default_val = config.model_fields[key].default
+            break
+        
     if isinstance(default_val, Callable):
     if isinstance(default_val, Callable):
-        return None
+        default_val = inspect.getsource(default_val).split('lambda', 1)[-1].split(':', 1)[-1].replace('\n', ' ').strip()
+        if default_val.count(')') > default_val.count('('):
+            default_val = default_val[:-1]
     else:
     else:
-        default_val = repr(default_val)
+        default_val = str(default_val)
+        
+        
     return default_val
     return default_val
 
 
 def find_config_type(key: str) -> str:
 def find_config_type(key: str) -> str:
-    if key in USER_CONFIG:
-        return str(USER_CONFIG[key]['type'])
-    elif key in DYNAMIC_CONFIG_SCHEMA:
-        return str(type(CONFIG[key]))
+    for config in settings.CONFIGS.values():
+        if hasattr(config, key):
+            type_hints = get_type_hints(config)
+            try:
+                return str(type_hints[key].__name__)
+            except AttributeError:
+                return str(type_hints[key])
     return 'str'
     return 'str'
 
 
 def key_is_safe(key: str) -> bool:
 def key_is_safe(key: str) -> bool:
@@ -543,40 +551,29 @@ def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext:
         "Value": [],
         "Value": [],
         "Default": [],
         "Default": [],
         # "Documentation": [],
         # "Documentation": [],
-        "Aliases": [],
+        # "Aliases": [],
     }
     }
 
 
-    for section in CONFIG_SCHEMA.keys():
-        for key in CONFIG_SCHEMA[section].keys():
-            rows['Section'].append(section)   # section.replace('_', ' ').title().replace(' Config', '')
+    for section in reversed(list(settings.CONFIGS.values())):
+        for key, field in section.model_fields.items():
+            rows['Section'].append(section.id)   # section.replace('_', ' ').title().replace(' Config', '')
             rows['Key'].append(ItemLink(key, key=key))
             rows['Key'].append(ItemLink(key, key=key))
-            rows['Type'].append(mark_safe(f'<code>{find_config_type(key)}</code>'))
-            rows['Value'].append(mark_safe(f'<code>{CONFIG[key]}</code>') if key_is_safe(key) else '******** (redacted)')
-            rows['Default'].append(mark_safe(f'<a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig.py+{key}&type=code"><code style="text-decoration: underline">{find_config_default(key) or "See here..."}</code></a>'))
+            rows['Type'].append(format_html('<code>{}</code>', find_config_type(key)))
+            rows['Value'].append(mark_safe(f'<code>{getattr(section, key)}</code>') if key_is_safe(key) else '******** (redacted)')
+            rows['Default'].append(mark_safe(f'<a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig+{key}&type=code"><code style="text-decoration: underline">{find_config_default(key) or "See here..."}</code></a>'))
             # rows['Documentation'].append(mark_safe(f'Wiki: <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#{key.lower()}">{key}</a>'))
             # rows['Documentation'].append(mark_safe(f'Wiki: <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#{key.lower()}">{key}</a>'))
-            rows['Aliases'].append(', '.join(CONFIG_SCHEMA[section][key].get('aliases', [])))
-
-    section = 'DYNAMIC'
-    for key in DYNAMIC_CONFIG_SCHEMA.keys():
-        if key in CONSTANTS:
-            continue
-        rows['Section'].append(section)   # section.replace('_', ' ').title().replace(' Config', '')
-        rows['Key'].append(ItemLink(key, key=key))
-        rows['Type'].append(mark_safe(f'<code>{find_config_type(key)}</code>'))
-        rows['Value'].append(mark_safe(f'<code>{CONFIG[key]}</code>') if key_is_safe(key) else '******** (redacted)')
-        rows['Default'].append(mark_safe(f'<a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig.py+{key}&type=code"><code style="text-decoration: underline">{find_config_default(key) or "See here..."}</code></a>'))
-        # rows['Documentation'].append(mark_safe(f'Wiki: <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#{key.lower()}">{key}</a>'))
-        rows['Aliases'].append(ItemLink(key, key=key) if key in USER_CONFIG else '')
+            # rows['Aliases'].append(', '.join(find_config_aliases(key)))
 
 
+   
     section = 'CONSTANT'
     section = 'CONSTANT'
-    for key in CONSTANTS.keys():
+    for key in CONSTANTS_CONFIG.keys():
         rows['Section'].append(section)   # section.replace('_', ' ').title().replace(' Config', '')
         rows['Section'].append(section)   # section.replace('_', ' ').title().replace(' Config', '')
         rows['Key'].append(ItemLink(key, key=key))
         rows['Key'].append(ItemLink(key, key=key))
-        rows['Type'].append(mark_safe(f'<code>{find_config_type(key)}</code>'))
-        rows['Value'].append(mark_safe(f'<code>{CONFIG[key]}</code>') if key_is_safe(key) else '******** (redacted)')
-        rows['Default'].append(mark_safe(f'<a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig.py+{key}&type=code"><code style="text-decoration: underline">{find_config_default(key) or "See here..."}</code></a>'))
+        rows['Type'].append(format_html('<code>{}</code>', getattr(type(CONSTANTS_CONFIG[key]), '__name__', repr(CONSTANTS_CONFIG[key]))))
+        rows['Value'].append(format_html('<code>{}</code>', CONSTANTS_CONFIG[key]) if key_is_safe(key) else '******** (redacted)')
+        rows['Default'].append(mark_safe(f'<a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig+{key}&type=code"><code style="text-decoration: underline">{find_config_default(key) or "See here..."}</code></a>'))
         # rows['Documentation'].append(mark_safe(f'Wiki: <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#{key.lower()}">{key}</a>'))
         # rows['Documentation'].append(mark_safe(f'Wiki: <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#{key.lower()}">{key}</a>'))
-        rows['Aliases'].append(ItemLink(key, key=key) if key in USER_CONFIG else '')
+        # rows['Aliases'].append('')
 
 
 
 
     return TableContext(
     return TableContext(
@@ -589,11 +586,12 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont
 
 
     assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
     assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
 
 
-    aliases = USER_CONFIG.get(key, {}).get("aliases", [])
+    # aliases = USER_CONFIG.get(key, {}).get("aliases", [])
+    aliases = []
 
 
-    if key in CONSTANTS:
+    if key in CONSTANTS_CONFIG:
         section_header = mark_safe(f'[CONSTANTS]   &nbsp; <b><code style="color: lightgray">{key}</code></b> &nbsp; <small>(read-only, hardcoded by ArchiveBox)</small>')
         section_header = mark_safe(f'[CONSTANTS]   &nbsp; <b><code style="color: lightgray">{key}</code></b> &nbsp; <small>(read-only, hardcoded by ArchiveBox)</small>')
-    elif key in USER_CONFIG:
+    elif key in settings.FLAT_CONFIG:
         section_header = mark_safe(f'data / ArchiveBox.conf &nbsp; [{find_config_section(key)}]  &nbsp; <b><code style="color: lightgray">{key}</code></b>')
         section_header = mark_safe(f'data / ArchiveBox.conf &nbsp; [{find_config_section(key)}]  &nbsp; <b><code style="color: lightgray">{key}</code></b>')
     else:
     else:
         section_header = mark_safe(f'[DYNAMIC CONFIG]   &nbsp; <b><code style="color: lightgray">{key}</code></b> &nbsp; <small>(read-only, calculated at runtime)</small>')
         section_header = mark_safe(f'[DYNAMIC CONFIG]   &nbsp; <b><code style="color: lightgray">{key}</code></b> &nbsp; <small>(read-only, calculated at runtime)</small>')
@@ -609,7 +607,7 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont
                 "fields": {
                 "fields": {
                     'Key': key,
                     'Key': key,
                     'Type': find_config_type(key),
                     'Type': find_config_type(key),
-                    'Value': CONFIG[key] if key_is_safe(key) else '********',
+                    'Value': settings.FLAT_CONFIG[key] if key_is_safe(key) else '********',
                 },
                 },
                 "help_texts": {
                 "help_texts": {
                     'Key': mark_safe(f'''
                     'Key': mark_safe(f'''
@@ -619,25 +617,25 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont
                         </span>
                         </span>
                     '''),
                     '''),
                     'Type': mark_safe(f'''
                     'Type': mark_safe(f'''
-                        <a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig.py+{key}&type=code">
-                            See full definition in <code>archivebox/config.py</code>...
+                        <a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig+{key}&type=code">
+                            See full definition in <code>archivebox/config</code>...
                         </a>
                         </a>
                     '''),
                     '''),
                     'Value': mark_safe(f'''
                     'Value': mark_safe(f'''
                         {'<b style="color: red">Value is redacted for your security. (Passwords, secrets, API tokens, etc. cannot be viewed in the Web UI)</b><br/><br/>' if not key_is_safe(key) else ''}
                         {'<b style="color: red">Value is redacted for your security. (Passwords, secrets, API tokens, etc. cannot be viewed in the Web UI)</b><br/><br/>' if not key_is_safe(key) else ''}
                         <br/><hr/><br/>
                         <br/><hr/><br/>
                         Default: &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; 
                         Default: &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; 
-                        <a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig.py+{key}&type=code">
+                        <a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig+{key}&type=code">
                             <code>{find_config_default(key) or '↗️ See in ArchiveBox source code...'}</code>
                             <code>{find_config_default(key) or '↗️ See in ArchiveBox source code...'}</code>
                         </a>
                         </a>
                         <br/><br/>
                         <br/><br/>
-                        <p style="display: {"block" if key in USER_CONFIG else "none"}">
+                        <p style="display: {"block" if key in settings.FLAT_CONFIG else "none"}">
                             <i>To change this value, edit <code>data/ArchiveBox.conf</code> or run:</i>
                             <i>To change this value, edit <code>data/ArchiveBox.conf</code> or run:</i>
                             <br/><br/>
                             <br/><br/>
                             <code>archivebox config --set {key}="{
                             <code>archivebox config --set {key}="{
                                 val.strip("'")
                                 val.strip("'")
                                 if (val := find_config_default(key)) else
                                 if (val := find_config_default(key)) else
-                                (repr(CONFIG[key] if key_is_safe(key) else '********')).strip("'")
+                                (repr(settings.FLAT_CONFIG[key] if key_is_safe(key) else '********')).strip("'")
                             }"</code>
                             }"</code>
                         </p>
                         </p>
                     '''),
                     '''),

+ 18 - 26
archivebox/extractors/archive_org.py

@@ -7,21 +7,10 @@ from collections import defaultdict
 
 
 from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
 from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
 from archivebox.misc.system import run, chmod_file
 from archivebox.misc.system import run, chmod_file
-from archivebox.misc.util import (
-    enforce_types,
-    is_static_file,
-    dedupe,
-)
-from ..config.legacy import (
-    TIMEOUT,
-    CURL_ARGS,
-    CURL_EXTRA_ARGS,
-    CHECK_SSL_VALIDITY,
-    SAVE_ARCHIVE_DOT_ORG,
-    CURL_BINARY,
-    CURL_VERSION,
-    CURL_USER_AGENT,
-)
+from archivebox.misc.util import enforce_types, is_static_file, dedupe
+from archivebox.plugins_extractor.archivedotorg.apps import ARCHIVEDOTORG_CONFIG
+from archivebox.plugins_extractor.curl.apps import CURL_CONFIG, CURL_BINARY
+
 from ..logging_util import TimedProgress
 from ..logging_util import TimedProgress
 
 
 
 
@@ -39,27 +28,30 @@ def should_save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, overwr
         # if open(path, 'r', encoding='utf-8').read().strip() != 'None':
         # if open(path, 'r', encoding='utf-8').read().strip() != 'None':
         return False
         return False
 
 
-    return SAVE_ARCHIVE_DOT_ORG
+    return ARCHIVEDOTORG_CONFIG.SAVE_ARCHIVE_DOT_ORG
 
 
 @enforce_types
 @enforce_types
-def save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+def save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, timeout: int=CURL_CONFIG.CURL_TIMEOUT) -> ArchiveResult:
     """submit site to archive.org for archiving via their service, save returned archive url"""
     """submit site to archive.org for archiving via their service, save returned archive url"""
 
 
+    curl_binary = CURL_BINARY.load()
+    assert curl_binary.abspath and curl_binary.version
+
     out_dir = out_dir or Path(link.link_dir)
     out_dir = out_dir or Path(link.link_dir)
     output: ArchiveOutput = get_output_path()
     output: ArchiveOutput = get_output_path()
     archive_org_url = None
     archive_org_url = None
     submit_url = 'https://web.archive.org/save/{}'.format(link.url)
     submit_url = 'https://web.archive.org/save/{}'.format(link.url)
     # later options take precedence
     # later options take precedence
     options = [
     options = [
-        *CURL_ARGS,
-        *CURL_EXTRA_ARGS,
+        *CURL_CONFIG.CURL_ARGS,
+        *CURL_CONFIG.CURL_EXTRA_ARGS,
         '--head',
         '--head',
         '--max-time', str(timeout),
         '--max-time', str(timeout),
-        *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
-        *([] if CHECK_SSL_VALIDITY else ['--insecure']),
+        *(['--user-agent', '{}'.format(CURL_CONFIG.CURL_USER_AGENT)] if CURL_CONFIG.CURL_USER_AGENT else []),
+        *([] if CURL_CONFIG.CURL_CHECK_SSL_VALIDITY else ['--insecure']),
     ]
     ]
     cmd = [
     cmd = [
-        CURL_BINARY,
+        str(curl_binary.abspath),
         *dedupe(options),
         *dedupe(options),
         submit_url,
         submit_url,
     ]
     ]
@@ -97,22 +89,22 @@ def save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, timeout: int=
     return ArchiveResult(
     return ArchiveResult(
         cmd=cmd,
         cmd=cmd,
         pwd=str(out_dir),
         pwd=str(out_dir),
-        cmd_version=CURL_VERSION,
+        cmd_version=str(curl_binary.version),
         output=output,
         output=output,
         status=status,
         status=status,
         **timer.stats,
         **timer.stats,
     )
     )
 
 
 @enforce_types
 @enforce_types
-def parse_archive_dot_org_response(response: bytes) -> Tuple[List[str], List[str]]:
+def parse_archive_dot_org_response(response: str) -> Tuple[List[str], List[str]]:
     # Parse archive.org response headers
     # Parse archive.org response headers
     headers: Dict[str, List[str]] = defaultdict(list)
     headers: Dict[str, List[str]] = defaultdict(list)
 
 
     # lowercase all the header names and store in dict
     # lowercase all the header names and store in dict
     for header in response.splitlines():
     for header in response.splitlines():
-        if b':' not in header or not header.strip():
+        if ':' not in header or not header.strip():
             continue
             continue
-        name, val = header.decode().split(':', 1)
+        name, val = header.split(':', 1)
         headers[name.lower().strip()].append(val.strip())
         headers[name.lower().strip()].append(val.strip())
 
 
     # Get successful archive url in "content-location" header or any errors
     # Get successful archive url in "content-location" header or any errors

+ 16 - 18
archivebox/extractors/favicon.py

@@ -2,16 +2,11 @@ __package__ = 'archivebox.extractors'
 
 
 from pathlib import Path
 from pathlib import Path
 
 
-from typing import Optional
-
-from ..index.schema import Link, ArchiveResult, ArchiveOutput
 from archivebox.misc.system import chmod_file, run
 from archivebox.misc.system import chmod_file, run
-from archivebox.misc.util import (
-    enforce_types,
-    domain,
-    dedupe,
-)
-from ..config.legacy import CONFIG
+from archivebox.misc.util import enforce_types, domain, dedupe
+from archivebox.plugins_extractor.favicon.apps import FAVICON_CONFIG
+from archivebox.plugins_extractor.curl.apps import CURL_CONFIG, CURL_BINARY
+from ..index.schema import Link, ArchiveResult, ArchiveOutput
 from ..logging_util import TimedProgress
 from ..logging_util import TimedProgress
 
 
 
 
@@ -22,7 +17,7 @@ def should_save_favicon(link: Link, out_dir: str | Path | None=None, overwrite:
     if not overwrite and (out_dir / 'favicon.ico').exists():
     if not overwrite and (out_dir / 'favicon.ico').exists():
         return False
         return False
 
 
-    return CONFIG.SAVE_FAVICON
+    return FAVICON_CONFIG.SAVE_FAVICON
 
 
 @enforce_types
 @enforce_types
 def get_output_path():
 def get_output_path():
@@ -30,26 +25,29 @@ def get_output_path():
 
 
 
 
 @enforce_types
 @enforce_types
-def save_favicon(link: Link, out_dir: str | Path | None=None, timeout: int=CONFIG.TIMEOUT) -> ArchiveResult:
+def save_favicon(link: Link, out_dir: str | Path | None=None, timeout: int=CURL_CONFIG.CURL_TIMEOUT) -> ArchiveResult:
     """download site favicon from google's favicon api"""
     """download site favicon from google's favicon api"""
 
 
+    curl_binary = CURL_BINARY.load()
+    assert curl_binary.abspath and curl_binary.version
+
     out_dir = Path(out_dir or link.link_dir)
     out_dir = Path(out_dir or link.link_dir)
     assert out_dir.exists()
     assert out_dir.exists()
 
 
     output: ArchiveOutput = 'favicon.ico'
     output: ArchiveOutput = 'favicon.ico'
     # later options take precedence
     # later options take precedence
     options = [
     options = [
-        *CONFIG.CURL_ARGS,
-        *CONFIG.CURL_EXTRA_ARGS,
+        *CURL_CONFIG.CURL_ARGS,
+        *CURL_CONFIG.CURL_EXTRA_ARGS,
         '--max-time', str(timeout),
         '--max-time', str(timeout),
         '--output', str(output),
         '--output', str(output),
-        *(['--user-agent', '{}'.format(CONFIG.CURL_USER_AGENT)] if CONFIG.CURL_USER_AGENT else []),
-        *([] if CONFIG.CHECK_SSL_VALIDITY else ['--insecure']),
+        *(['--user-agent', '{}'.format(CURL_CONFIG.CURL_USER_AGENT)] if CURL_CONFIG.CURL_USER_AGENT else []),
+        *([] if CURL_CONFIG.CURL_CHECK_SSL_VALIDITY else ['--insecure']),
     ]
     ]
     cmd = [
     cmd = [
-        CONFIG.CURL_BINARY,
+        str(curl_binary.abspath),
         *dedupe(options),
         *dedupe(options),
-        CONFIG.FAVICON_PROVIDER.format(domain(link.url)),
+        FAVICON_CONFIG.FAVICON_PROVIDER.format(domain(link.url)),
     ]
     ]
     status = 'failed'
     status = 'failed'
     timer = TimedProgress(timeout, prefix='      ')
     timer = TimedProgress(timeout, prefix='      ')
@@ -65,7 +63,7 @@ def save_favicon(link: Link, out_dir: str | Path | None=None, timeout: int=CONFI
     return ArchiveResult(
     return ArchiveResult(
         cmd=cmd,
         cmd=cmd,
         pwd=str(out_dir),
         pwd=str(out_dir),
-        cmd_version=CONFIG.CURL_VERSION,
+        cmd_version=str(curl_binary.version),
         output=output,
         output=output,
         status=status,
         status=status,
         **timer.stats,
         **timer.stats,

+ 12 - 9
archivebox/extractors/git.py

@@ -4,7 +4,6 @@ __package__ = 'archivebox.extractors'
 from pathlib import Path
 from pathlib import Path
 from typing import Optional
 from typing import Optional
 
 
-from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
 from archivebox.misc.system import run, chmod_file
 from archivebox.misc.system import run, chmod_file
 from archivebox.misc.util import (
 from archivebox.misc.util import (
     enforce_types,
     enforce_types,
@@ -14,8 +13,9 @@ from archivebox.misc.util import (
     without_query,
     without_query,
     without_fragment,
     without_fragment,
 )
 )
-from ..config.legacy import CONFIG
+from archivebox.plugins_extractor.git.apps import GIT_CONFIG, GIT_BINARY
 from ..logging_util import TimedProgress
 from ..logging_util import TimedProgress
+from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
 
 
 
 
 def get_output_path():
 def get_output_path():
@@ -42,28 +42,31 @@ def should_save_git(link: Link, out_dir: Optional[Path]=None, overwrite: Optiona
         return False
         return False
 
 
     is_clonable_url = (
     is_clonable_url = (
-        (domain(link.url) in CONFIG.GIT_DOMAINS)
+        (domain(link.url) in GIT_CONFIG.GIT_DOMAINS)
         or (extension(link.url) == 'git')
         or (extension(link.url) == 'git')
     )
     )
     if not is_clonable_url:
     if not is_clonable_url:
         return False
         return False
 
 
-    return CONFIG.SAVE_GIT
+    return GIT_CONFIG.SAVE_GIT
 
 
 
 
 @enforce_types
 @enforce_types
-def save_git(link: Link, out_dir: Optional[Path]=None, timeout: int=CONFIG.TIMEOUT) -> ArchiveResult:
+def save_git(link: Link, out_dir: Optional[Path]=None, timeout: int=GIT_CONFIG.GIT_TIMEOUT) -> ArchiveResult:
     """download full site using git"""
     """download full site using git"""
+    
+    git_binary = GIT_BINARY.load()
+    assert git_binary.abspath and git_binary.version
 
 
     out_dir = out_dir or Path(link.link_dir)
     out_dir = out_dir or Path(link.link_dir)
     output: ArchiveOutput = get_output_path()
     output: ArchiveOutput = get_output_path()
     output_path = out_dir / output
     output_path = out_dir / output
     output_path.mkdir(exist_ok=True)
     output_path.mkdir(exist_ok=True)
     cmd = [
     cmd = [
-        CONFIG.GIT_BINARY,
+        str(git_binary.abspath),
         'clone',
         'clone',
-        *CONFIG.GIT_ARGS,
-        *([] if CONFIG.CHECK_SSL_VALIDITY else ['-c', 'http.sslVerify=false']),
+        *GIT_CONFIG.GIT_ARGS,
+        *([] if GIT_CONFIG.GIT_CHECK_SSL_VALIDITY else ['-c', 'http.sslVerify=false']),
         without_query(without_fragment(link.url)),
         without_query(without_fragment(link.url)),
     ]
     ]
     status = 'succeeded'
     status = 'succeeded'
@@ -88,7 +91,7 @@ def save_git(link: Link, out_dir: Optional[Path]=None, timeout: int=CONFIG.TIMEO
     return ArchiveResult(
     return ArchiveResult(
         cmd=cmd,
         cmd=cmd,
         pwd=str(out_dir),
         pwd=str(out_dir),
-        cmd_version=CONFIG.GIT_VERSION,
+        cmd_version=str(git_binary.version),
         output=output,
         output=output,
         status=status,
         status=status,
         **timer.stats,
         **timer.stats,

+ 20 - 25
archivebox/extractors/headers.py

@@ -4,23 +4,14 @@ from pathlib import Path
 
 
 from typing import Optional
 from typing import Optional
 
 
-from ..index.schema import Link, ArchiveResult, ArchiveOutput
 from archivebox.misc.system import atomic_write
 from archivebox.misc.system import atomic_write
 from archivebox.misc.util import (
 from archivebox.misc.util import (
     enforce_types,
     enforce_types,
     get_headers,
     get_headers,
     dedupe,
     dedupe,
 )
 )
-from ..config.legacy import (
-    TIMEOUT,
-    CURL_BINARY,
-    CURL_ARGS,
-    CURL_EXTRA_ARGS,
-    CURL_USER_AGENT,
-    CURL_VERSION,
-    CHECK_SSL_VALIDITY,
-    SAVE_HEADERS
-)
+from archivebox.plugins_extractor.curl.apps import CURL_CONFIG, CURL_BINARY
+from ..index.schema import Link, ArchiveResult, ArchiveOutput
 from ..logging_util import TimedProgress
 from ..logging_util import TimedProgress
 
 
 def get_output_path():
 def get_output_path():
@@ -29,34 +20,38 @@ def get_output_path():
 
 
 @enforce_types
 @enforce_types
 def should_save_headers(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
 def should_save_headers(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
-    out_dir = out_dir or Path(link.link_dir)
-    if not overwrite and (out_dir / get_output_path()).exists():
+    out_dir_path = Path(out_dir or link.link_dir)
+    assert out_dir_path
+    if not overwrite and (out_dir_path / get_output_path()).exists():
         return False
         return False
 
 
-    return SAVE_HEADERS
+    return CURL_CONFIG.SAVE_HEADERS
 
 
 
 
 @enforce_types
 @enforce_types
-def save_headers(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+def save_headers(link: Link, out_dir: Optional[str]=None, timeout: int=CURL_CONFIG.CURL_TIMEOUT) -> ArchiveResult:
     """Download site headers"""
     """Download site headers"""
 
 
-    out_dir = Path(out_dir or link.link_dir)
-    output_folder = out_dir.absolute()
+    curl_binary = CURL_BINARY.load()
+    assert curl_binary.abspath and curl_binary.version
+
+    out_dir_path = Path(out_dir or link.link_dir)
+    output_folder = out_dir_path.absolute()
     output: ArchiveOutput = get_output_path()
     output: ArchiveOutput = get_output_path()
 
 
     status = 'succeeded'
     status = 'succeeded'
-    timer = TimedProgress(timeout, prefix='      ')
+    timer = TimedProgress(timeout + 1, prefix='      ')
     # later options take precedence
     # later options take precedence
     options = [
     options = [
-        *CURL_ARGS,
-        *CURL_EXTRA_ARGS,
+        *CURL_CONFIG.CURL_ARGS,
+        *CURL_CONFIG.CURL_EXTRA_ARGS,
         '--head',
         '--head',
         '--max-time', str(timeout),
         '--max-time', str(timeout),
-        *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
-        *([] if CHECK_SSL_VALIDITY else ['--insecure']),
+        *(['--user-agent', '{}'.format(CURL_CONFIG.CURL_USER_AGENT)] if CURL_CONFIG.CURL_USER_AGENT else []),
+        *([] if CURL_CONFIG.CURL_CHECK_SSL_VALIDITY else ['--insecure']),
     ]
     ]
     cmd = [
     cmd = [
-        CURL_BINARY,
+        str(curl_binary.abspath),
         *dedupe(options),
         *dedupe(options),
         link.url,
         link.url,
     ]
     ]
@@ -72,8 +67,8 @@ def save_headers(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT)
 
 
     return ArchiveResult(
     return ArchiveResult(
         cmd=cmd,
         cmd=cmd,
-        pwd=str(out_dir),
-        cmd_version=CURL_VERSION,
+        pwd=str(out_dir_path),
+        cmd_version=str(curl_binary.version),
         output=output,
         output=output,
         status=status,
         status=status,
         **timer.stats,
         **timer.stats,

+ 7 - 12
archivebox/extractors/htmltotext.py

@@ -5,18 +5,13 @@ import io
 from pathlib import Path
 from pathlib import Path
 from typing import Optional
 from typing import Optional
 
 
-from archivebox.config import VERSION
-from ..config.legacy import (
-    SAVE_HTMLTOTEXT,
-    TIMEOUT,
-)
-from ..index.schema import Link, ArchiveResult, ArchiveError
-from ..logging_util import TimedProgress
+from archivebox.config import VERSION, ARCHIVING_CONFIG
+from archivebox.config.legacy import SAVE_HTMLTOTEXT
 from archivebox.misc.system import atomic_write
 from archivebox.misc.system import atomic_write
-from archivebox.misc.util import (
-    enforce_types,
-    is_static_file,
-)
+from archivebox.misc.util import enforce_types, is_static_file
+
+from ..logging_util import TimedProgress
+from ..index.schema import Link, ArchiveResult, ArchiveError
 from .title import get_html
 from .title import get_html
 
 
 
 
@@ -122,7 +117,7 @@ def should_save_htmltotext(link: Link, out_dir: Optional[Path]=None, overwrite:
 
 
 
 
 @enforce_types
 @enforce_types
-def save_htmltotext(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+def save_htmltotext(link: Link, out_dir: Optional[Path]=None, timeout: int=ARCHIVING_CONFIG.TIMEOUT) -> ArchiveResult:
     """extract search-indexing-friendly text from an HTML document"""
     """extract search-indexing-friendly text from an HTML document"""
 
 
     out_dir = Path(out_dir or link.link_dir)
     out_dir = Path(out_dir or link.link_dir)

+ 14 - 20
archivebox/extractors/title.py

@@ -5,23 +5,14 @@ from html.parser import HTMLParser
 from pathlib import Path
 from pathlib import Path
 from typing import Optional
 from typing import Optional
 
 
-from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
 from archivebox.misc.util import (
 from archivebox.misc.util import (
     enforce_types,
     enforce_types,
     download_url,
     download_url,
     htmldecode,
     htmldecode,
     dedupe,
     dedupe,
 )
 )
-from ..config.legacy import (
-    TIMEOUT,
-    CHECK_SSL_VALIDITY,
-    SAVE_TITLE,
-    CURL_BINARY,
-    CURL_ARGS,
-    CURL_EXTRA_ARGS,
-    CURL_VERSION,
-    CURL_USER_AGENT,
-)
+from archivebox.plugins_extractor.curl.apps import CURL_CONFIG, CURL_BINARY
+from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
 from ..logging_util import TimedProgress
 from ..logging_util import TimedProgress
 
 
 
 
@@ -62,7 +53,7 @@ class TitleParser(HTMLParser):
 
 
 
 
 @enforce_types
 @enforce_types
-def get_html(link: Link, path: Path, timeout: int=TIMEOUT) -> str:
+def get_html(link: Link, path: Path, timeout: int=CURL_CONFIG.CURL_TIMEOUT) -> str:
     """
     """
     Try to find wget, singlefile and then dom files.
     Try to find wget, singlefile and then dom files.
     If none is found, download the url again.
     If none is found, download the url again.
@@ -98,7 +89,7 @@ def should_save_title(link: Link, out_dir: Optional[str]=None, overwrite: Option
     if not overwrite and link.title and not link.title.lower().startswith('http'):
     if not overwrite and link.title and not link.title.lower().startswith('http'):
         return False
         return False
 
 
-    return SAVE_TITLE
+    return CURL_CONFIG.SAVE_TITLE
 
 
 def extract_title_with_regex(html):
 def extract_title_with_regex(html):
     match = re.search(HTML_TITLE_REGEX, html)
     match = re.search(HTML_TITLE_REGEX, html)
@@ -106,22 +97,25 @@ def extract_title_with_regex(html):
     return output
     return output
 
 
 @enforce_types
 @enforce_types
-def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=CURL_CONFIG.CURL_TIMEOUT) -> ArchiveResult:
     """try to guess the page's title from its content"""
     """try to guess the page's title from its content"""
 
 
     from core.models import Snapshot
     from core.models import Snapshot
 
 
+    curl_binary = CURL_BINARY.load()
+    assert curl_binary.abspath and curl_binary.version
+
     output: ArchiveOutput = None
     output: ArchiveOutput = None
     # later options take precedence
     # later options take precedence
     options = [
     options = [
-        *CURL_ARGS,
-        *CURL_EXTRA_ARGS,
+        *CURL_CONFIG.CURL_ARGS,
+        *CURL_CONFIG.CURL_EXTRA_ARGS,
         '--max-time', str(timeout),
         '--max-time', str(timeout),
-        *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
-        *([] if CHECK_SSL_VALIDITY else ['--insecure']),
+        *(['--user-agent', '{}'.format(CURL_CONFIG.CURL_USER_AGENT)] if CURL_CONFIG.CURL_USER_AGENT else []),
+        *([] if CURL_CONFIG.CURL_CHECK_SSL_VALIDITY else ['--insecure']),
     ]
     ]
     cmd = [
     cmd = [
-        CURL_BINARY,
+        str(curl_binary.abspath),
         *dedupe(options),
         *dedupe(options),
         link.url,
         link.url,
     ]
     ]
@@ -161,7 +155,7 @@ def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -
     return ArchiveResult(
     return ArchiveResult(
         cmd=cmd,
         cmd=cmd,
         pwd=str(out_dir),
         pwd=str(out_dir),
-        cmd_version=CURL_VERSION,
+        cmd_version=str(curl_binary.version),
         output=output,
         output=output,
         status=status,
         status=status,
         **timer.stats,
         **timer.stats,

+ 22 - 38
archivebox/main.py

@@ -430,7 +430,7 @@ def init(force: bool=False, quick: bool=False, install: bool=False, out_dir: Pat
 def status(out_dir: Path=DATA_DIR) -> None:
 def status(out_dir: Path=DATA_DIR) -> None:
     """Print out some info and statistics about the archive collection"""
     """Print out some info and statistics about the archive collection"""
 
 
-    check_data_folder(CONFIG)
+    check_data_folder()
 
 
     from core.models import Snapshot
     from core.models import Snapshot
     from django.contrib.auth import get_user_model
     from django.contrib.auth import get_user_model
@@ -573,7 +573,7 @@ def add(urls: Union[str, List[str]],
         run_subcommand('init', stdin=None, pwd=out_dir)
         run_subcommand('init', stdin=None, pwd=out_dir)
 
 
     # Load list of links from the existing index
     # Load list of links from the existing index
-    check_data_folder(CONFIG)
+    check_data_folder()
 
 
     # worker = start_cli_workers()
     # worker = start_cli_workers()
     
     
@@ -673,7 +673,7 @@ def remove(filter_str: Optional[str]=None,
            out_dir: Path=DATA_DIR) -> List[Link]:
            out_dir: Path=DATA_DIR) -> List[Link]:
     """Remove the specified URLs from the archive"""
     """Remove the specified URLs from the archive"""
     
     
-    check_data_folder(CONFIG)
+    check_data_folder()
 
 
     if snapshots is None:
     if snapshots is None:
         if filter_str and filter_patterns:
         if filter_str and filter_patterns:
@@ -762,7 +762,7 @@ def update(resume: Optional[float]=None,
     # from .queues.supervisor_util import start_cli_workers
     # from .queues.supervisor_util import start_cli_workers
     
     
 
 
-    check_data_folder(CONFIG)
+    check_data_folder()
     # start_cli_workers()
     # start_cli_workers()
     new_links: List[Link] = [] # TODO: Remove input argument: only_new
     new_links: List[Link] = [] # TODO: Remove input argument: only_new
 
 
@@ -833,7 +833,7 @@ def list_all(filter_patterns_str: Optional[str]=None,
              out_dir: Path=DATA_DIR) -> Iterable[Link]:
              out_dir: Path=DATA_DIR) -> Iterable[Link]:
     """List, filter, and export information about archive entries"""
     """List, filter, and export information about archive entries"""
     
     
-    check_data_folder(CONFIG)
+    check_data_folder()
 
 
     if filter_patterns and filter_patterns_str:
     if filter_patterns and filter_patterns_str:
         stderr(
         stderr(
@@ -881,7 +881,7 @@ def list_links(snapshots: Optional[QuerySet]=None,
                before: Optional[float]=None,
                before: Optional[float]=None,
                out_dir: Path=DATA_DIR) -> Iterable[Link]:
                out_dir: Path=DATA_DIR) -> Iterable[Link]:
     
     
-    check_data_folder(CONFIG)
+    check_data_folder()
 
 
     if snapshots:
     if snapshots:
         all_snapshots = snapshots
         all_snapshots = snapshots
@@ -905,7 +905,7 @@ def list_folders(links: List[Link],
                  status: str,
                  status: str,
                  out_dir: Path=DATA_DIR) -> Dict[str, Optional[Link]]:
                  out_dir: Path=DATA_DIR) -> Dict[str, Optional[Link]]:
     
     
-    check_data_folder(CONFIG)
+    check_data_folder()
 
 
     STATUS_FUNCTIONS = {
     STATUS_FUNCTIONS = {
         "indexed": get_indexed_folders,
         "indexed": get_indexed_folders,
@@ -926,7 +926,7 @@ def list_folders(links: List[Link],
         raise ValueError('Status not recognized.')
         raise ValueError('Status not recognized.')
 
 
 @enforce_types
 @enforce_types
-def setup(out_dir: Path=DATA_DIR) -> None:
+def install(out_dir: Path=DATA_DIR) -> None:
     """Automatically install all ArchiveBox dependencies and extras"""
     """Automatically install all ArchiveBox dependencies and extras"""
 
 
     from rich import print
     from rich import print
@@ -937,40 +937,20 @@ def setup(out_dir: Path=DATA_DIR) -> None:
 
 
     stderr('\n[+] Installing ArchiveBox dependencies automatically...', color='green')
     stderr('\n[+] Installing ArchiveBox dependencies automatically...', color='green')
 
 
-    for binary in settings.BINARIES.values():
+    for binary in reversed(list(settings.BINARIES.values())):
         try:
         try:
             print(binary.load_or_install().model_dump(exclude={'binproviders_supported', 'loaded_binprovider', 'provider_overrides', 'loaded_abspaths', 'bin_dir', 'loaded_respath'}))
             print(binary.load_or_install().model_dump(exclude={'binproviders_supported', 'loaded_binprovider', 'provider_overrides', 'loaded_abspaths', 'bin_dir', 'loaded_respath'}))
         except Exception as e:
         except Exception as e:
             print(f'[X] Failed to install {binary.name}: {e}')
             print(f'[X] Failed to install {binary.name}: {e}')
 
 
-    # from plugins_extractor.curl.apps import CURL_BINARY
-    # print(CURL_BINARY.load_or_install().model_dump(exclude={'binproviders_supported', 'loaded_binprovider', 'provider_overrides', 'loaded_abspaths', 'bin_dir', 'loaded_respath'}))
-
-    # from plugins_extractor.wget.apps import WGET_BINARY
-    # print(WGET_BINARY.load_or_install().model_dump(exclude={'binproviders_supported', 'loaded_binprovider', 'provider_overrides', 'loaded_abspaths', 'bin_dir', 'loaded_respath'}))
-
-    # from plugins_extractor.ytdlp.apps import YTDLP_BINARY
-    # print(YTDLP_BINARY.load_or_install().model_dump(exclude={'binproviders_supported', 'loaded_binprovider', 'provider_overrides', 'loaded_abspaths', 'bin_dir', 'loaded_respath'}))
-
-    # from plugins_extractor.chrome.apps import CHROME_BINARY
-    # print(CHROME_BINARY.load_or_install().model_dump(exclude={'binproviders_supported', 'loaded_binprovider', 'provider_overrides', 'loaded_abspaths', 'bin_dir', 'loaded_respath'}))
-
-    # from plugins_extractor.singlefile.apps import SINGLEFILE_BINARY
-    # print(SINGLEFILE_BINARY.load_or_install().model_dump(exclude={'binproviders_supported', 'loaded_binprovider', 'provider_overrides', 'loaded_abspaths', 'bin_dir', 'loaded_respath'}))
-    
-    # from plugins_extractor.readability.apps import READABILITY_BINARY
-    # print(READABILITY_BINARY.load_or_install().model_dump(exclude={'binproviders_supported', 'loaded_binprovider', 'provider_overrides', 'loaded_abspaths', 'bin_dir', 'loaded_respath'}))
-    
-    # from plugins_extractor.mercury.apps import MERCURY_BINARY
-    # print(MERCURY_BINARY.load_or_install().model_dump(exclude={'binproviders_supported', 'loaded_binprovider', 'provider_overrides', 'loaded_abspaths', 'bin_dir', 'loaded_respath'}))
-    
 
 
     from django.contrib.auth import get_user_model
     from django.contrib.auth import get_user_model
     User = get_user_model()
     User = get_user_model()
 
 
     if not User.objects.filter(is_superuser=True).exists():
     if not User.objects.filter(is_superuser=True).exists():
-        stderr('\n[+] Creating new admin user for the Web UI...', color='green')
-        run_subcommand('manage', subcommand_args=['createsuperuser'], pwd=out_dir)
+        stderr('\n[+] Don\'t forget to create a new admin user for the Web UI...', color='green')
+        stderr('    archivebox manage createsuperuser')
+        # run_subcommand('manage', subcommand_args=['createsuperuser'], pwd=out_dir)
     
     
     stderr('\n[√] Set up ArchiveBox and its dependencies successfully.', color='green')
     stderr('\n[√] Set up ArchiveBox and its dependencies successfully.', color='green')
     
     
@@ -978,6 +958,10 @@ def setup(out_dir: Path=DATA_DIR) -> None:
     
     
     run_shell([ARCHIVEBOX_BINARY.load().abspath, '--version'], capture_output=False, cwd=out_dir)
     run_shell([ARCHIVEBOX_BINARY.load().abspath, '--version'], capture_output=False, cwd=out_dir)
 
 
+# backwards-compatibility:
+setup = install
+
+
 @enforce_types
 @enforce_types
 def config(config_options_str: Optional[str]=None,
 def config(config_options_str: Optional[str]=None,
            config_options: Optional[List[str]]=None,
            config_options: Optional[List[str]]=None,
@@ -989,7 +973,7 @@ def config(config_options_str: Optional[str]=None,
 
 
     from rich import print
     from rich import print
 
 
-    check_data_folder(CONFIG)
+    check_data_folder()
     if config_options and config_options_str:
     if config_options and config_options_str:
         stderr(
         stderr(
             '[X] You should either pass config values as an arguments '
             '[X] You should either pass config values as an arguments '
@@ -1090,8 +1074,8 @@ def schedule(add: bool=False,
              out_dir: Path=DATA_DIR):
              out_dir: Path=DATA_DIR):
     """Set ArchiveBox to regularly import URLs at specific times using cron"""
     """Set ArchiveBox to regularly import URLs at specific times using cron"""
     
     
-    check_data_folder(CONFIG)
-    from plugins_pkg.pip.apps import ARCHIVEBOX_BINARY
+    check_data_folder()
+    from archivebox.plugins_pkg.pip.apps import ARCHIVEBOX_BINARY
 
 
     Path(CONSTANTS.LOGS_DIR).mkdir(exist_ok=True)
     Path(CONSTANTS.LOGS_DIR).mkdir(exist_ok=True)
 
 
@@ -1228,7 +1212,7 @@ def server(runserver_args: Optional[List[str]]=None,
         print()
         print()
 
 
 
 
-    check_data_folder(CONFIG)
+    check_data_folder()
 
 
     from django.core.management import call_command
     from django.core.management import call_command
     from django.contrib.auth.models import User
     from django.contrib.auth.models import User
@@ -1280,7 +1264,7 @@ def server(runserver_args: Optional[List[str]]=None,
 def manage(args: Optional[List[str]]=None, out_dir: Path=DATA_DIR) -> None:
 def manage(args: Optional[List[str]]=None, out_dir: Path=DATA_DIR) -> None:
     """Run an ArchiveBox Django management command"""
     """Run an ArchiveBox Django management command"""
 
 
-    check_data_folder(CONFIG)
+    check_data_folder()
     from django.core.management import execute_from_command_line
     from django.core.management import execute_from_command_line
 
 
     if (args and "createsuperuser" in args) and (SHELL_CONFIG.IN_DOCKER and not SHELL_CONFIG.IS_TTY):
     if (args and "createsuperuser" in args) and (SHELL_CONFIG.IN_DOCKER and not SHELL_CONFIG.IS_TTY):
@@ -1297,7 +1281,7 @@ def manage(args: Optional[List[str]]=None, out_dir: Path=DATA_DIR) -> None:
 def shell(out_dir: Path=DATA_DIR) -> None:
 def shell(out_dir: Path=DATA_DIR) -> None:
     """Enter an interactive ArchiveBox Django shell"""
     """Enter an interactive ArchiveBox Django shell"""
 
 
-    check_data_folder(CONFIG)
+    check_data_folder()
 
 
     from django.core.management import call_command
     from django.core.management import call_command
     call_command("shell_plus")
     call_command("shell_plus")

+ 2 - 4
archivebox/misc/checks.py

@@ -1,13 +1,11 @@
 __package__ = 'archivebox.misc'
 __package__ = 'archivebox.misc'
 
 
-from benedict import benedict
-
 from archivebox.config import DATA_DIR, ARCHIVE_DIR, CONSTANTS, SHELL_CONFIG
 from archivebox.config import DATA_DIR, ARCHIVE_DIR, CONSTANTS, SHELL_CONFIG
 
 
 from .logging import stderr
 from .logging import stderr
 
 
 
 
-def check_data_folder(config: benedict) -> None:
+def check_data_folder() -> None:
 
 
     archive_dir_exists = ARCHIVE_DIR.exists()
     archive_dir_exists = ARCHIVE_DIR.exists()
     if not archive_dir_exists:
     if not archive_dir_exists:
@@ -23,7 +21,7 @@ def check_data_folder(config: benedict) -> None:
         raise SystemExit(2)
         raise SystemExit(2)
 
 
 
 
-def check_migrations(config: benedict):
+def check_migrations():
     from ..index.sql import list_migrations
     from ..index.sql import list_migrations
 
 
     pending_migrations = [name for status, name in list_migrations() if not status]
     pending_migrations = [name for status, name in list_migrations() if not status]

+ 17 - 12
archivebox/plugins_extractor/curl/apps.py

@@ -1,10 +1,10 @@
 __package__ = 'plugins_extractor.curl'
 __package__ = 'plugins_extractor.curl'
 
 
-from typing import List, Optional, Dict
+from typing import List, Optional
 from pathlib import Path
 from pathlib import Path
 
 
 from pydantic import InstanceOf, Field
 from pydantic import InstanceOf, Field
-from pydantic_pkgr import BinProvider, BinName, bin_abspath, BinProviderName, ProviderLookupDict
+from pydantic_pkgr import BinProvider, BinName
 
 
 from abx.archivebox.base_plugin import BasePlugin, BaseHook
 from abx.archivebox.base_plugin import BasePlugin, BaseHook
 from abx.archivebox.base_configset import BaseConfigSet
 from abx.archivebox.base_configset import BaseConfigSet
@@ -12,15 +12,26 @@ from abx.archivebox.base_binary import BaseBinary, env, apt, brew
 # from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
 # from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
 
 
 from archivebox.config import ARCHIVING_CONFIG
 from archivebox.config import ARCHIVING_CONFIG
-
+from archivebox.plugins_extractor.favicon.apps import FAVICON_CONFIG
+from archivebox.plugins_extractor.archivedotorg.apps import ARCHIVEDOTORG_CONFIG
 
 
 class CurlConfig(BaseConfigSet):
 class CurlConfig(BaseConfigSet):
-
-    SAVE_CURL: bool = True
     
     
-    # USE_CURL: bool = Field(default=lambda c: c.SAVE_HEADERS or c.SAVE_FAVICON)
+    SAVE_TITLE: bool = Field(default=True)
+    SAVE_HEADERS: bool = Field(default=True)
+    USE_CURL: bool = Field(default=lambda c: 
+        ARCHIVEDOTORG_CONFIG.SAVE_ARCHIVE_DOT_ORG
+        or FAVICON_CONFIG.SAVE_FAVICON
+        or c.SAVE_HEADERS
+        or c.SAVE_TITLE
+    )
     
     
     CURL_BINARY: str = Field(default='curl')
     CURL_BINARY: str = Field(default='curl')
+    CURL_ARGS: List[str] = [
+        '--silent',
+        '--location',
+        '--compressed',
+    ]
     CURL_EXTRA_ARGS: List[str] = []
     CURL_EXTRA_ARGS: List[str] = []
     
     
     CURL_TIMEOUT: int =  Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
     CURL_TIMEOUT: int =  Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
@@ -35,12 +46,6 @@ CURL_CONFIG = CurlConfig()
 class CurlBinary(BaseBinary):
 class CurlBinary(BaseBinary):
     name: BinName = CURL_CONFIG.CURL_BINARY
     name: BinName = CURL_CONFIG.CURL_BINARY
     binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
     binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
-    
-    provider_overrides: Dict[BinProviderName, ProviderLookupDict] = {
-        brew.name: {
-            'abspath': lambda: bin_abspath(CURL_CONFIG.CURL_BINARY, PATH=f'/opt/homebrew/opt/curl/bin:{brew.PATH}'),
-        },
-    }
 
 
 CURL_BINARY = CurlBinary()
 CURL_BINARY = CurlBinary()
 
 

+ 2 - 8
archivebox/plugins_extractor/wget/apps.py

@@ -1,13 +1,13 @@
 __package__ = 'plugins_extractor.wget'
 __package__ = 'plugins_extractor.wget'
 
 
 import sys
 import sys
-from typing import List, Optional, Dict
+from typing import List, Optional
 from pathlib import Path
 from pathlib import Path
 from subprocess import run, DEVNULL
 from subprocess import run, DEVNULL
 
 
 from rich import print
 from rich import print
 from pydantic import InstanceOf, Field, model_validator
 from pydantic import InstanceOf, Field, model_validator
-from pydantic_pkgr import BinProvider, BinName, bin_abspath, BinProviderName, ProviderLookupDict
+from pydantic_pkgr import BinProvider, BinName
 
 
 from abx.archivebox.base_plugin import BasePlugin, BaseHook
 from abx.archivebox.base_plugin import BasePlugin, BaseHook
 from abx.archivebox.base_configset import BaseConfigSet
 from abx.archivebox.base_configset import BaseConfigSet
@@ -80,12 +80,6 @@ WGET_CONFIG = WgetConfig()
 class WgetBinary(BaseBinary):
 class WgetBinary(BaseBinary):
     name: BinName = WGET_CONFIG.WGET_BINARY
     name: BinName = WGET_CONFIG.WGET_BINARY
     binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
     binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
-    
-    provider_overrides: Dict[BinProviderName, ProviderLookupDict] = {
-        brew.name: {
-            'abspath': lambda: bin_abspath(WGET_CONFIG.WGET_BINARY, PATH=f'/opt/homebrew/opt/wget/bin:{brew.PATH}'),
-        },
-    }
 
 
 WGET_BINARY = WgetBinary()
 WGET_BINARY = WgetBinary()
 
 

+ 6 - 6
archivebox/search/__init__.py

@@ -11,7 +11,7 @@ from archivebox.misc.util import enforce_types
 from archivebox.misc.logging import stderr
 from archivebox.misc.logging import stderr
 from archivebox.config.legacy import ANSI
 from archivebox.config.legacy import ANSI
 
 
-# from archivebox.archivebox.config import settings.CONFIGS.SearchBackendConfig
+from archivebox.config import SEARCH_BACKEND_CONFIG
 
 
 
 
 def log_index_started(url):
 def log_index_started(url):
@@ -58,13 +58,13 @@ def get_indexable_content(results: QuerySet):
 
 
 def import_backend():
 def import_backend():
     for backend in settings.SEARCH_BACKENDS.values():
     for backend in settings.SEARCH_BACKENDS.values():
-        if backend.name == settings.CONFIGS.SearchBackendConfig.SEARCH_BACKEND_ENGINE:
+        if backend.name == SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE:
             return backend
             return backend
-    raise Exception(f'Could not load {settings.CONFIGS.SearchBackendConfig.SEARCH_BACKEND_ENGINE} as search backend')
+    raise Exception(f'Could not load {SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE} as search backend')
 
 
 @enforce_types
 @enforce_types
 def write_search_index(link: Link, texts: Union[List[str], None]=None, out_dir: Path=settings.DATA_DIR, skip_text_index: bool=False) -> None:
 def write_search_index(link: Link, texts: Union[List[str], None]=None, out_dir: Path=settings.DATA_DIR, skip_text_index: bool=False) -> None:
-    if not settings.CONFIGS.SearchBackendConfig.USE_INDEXING_BACKEND:
+    if not SEARCH_BACKEND_CONFIG.USE_INDEXING_BACKEND:
         return
         return
 
 
     if not skip_text_index and texts:
     if not skip_text_index and texts:
@@ -86,7 +86,7 @@ def write_search_index(link: Link, texts: Union[List[str], None]=None, out_dir:
 def query_search_index(query: str, out_dir: Path=settings.DATA_DIR) -> QuerySet:
 def query_search_index(query: str, out_dir: Path=settings.DATA_DIR) -> QuerySet:
     from core.models import Snapshot
     from core.models import Snapshot
 
 
-    if settings.CONFIGS.SearchBackendConfig.USE_SEARCHING_BACKEND:
+    if SEARCH_BACKEND_CONFIG.USE_SEARCHING_BACKEND:
         backend = import_backend()
         backend = import_backend()
         try:
         try:
             snapshot_pks = backend.search(query)
             snapshot_pks = backend.search(query)
@@ -106,7 +106,7 @@ def query_search_index(query: str, out_dir: Path=settings.DATA_DIR) -> QuerySet:
 
 
 @enforce_types
 @enforce_types
 def flush_search_index(snapshots: QuerySet):
 def flush_search_index(snapshots: QuerySet):
-    if not settings.CONFIGS.SearchBackendConfig.USE_INDEXING_BACKEND or not snapshots:
+    if not SEARCH_BACKEND_CONFIG.USE_INDEXING_BACKEND or not snapshots:
         return
         return
     backend = import_backend()
     backend = import_backend()
     snapshot_pks = (str(pk) for pk in snapshots.values_list('pk', flat=True))
     snapshot_pks = (str(pk) for pk in snapshots.values_list('pk', flat=True))

+ 1 - 1
archivebox/vendor/pydantic-pkgr

@@ -1 +1 @@
-Subproject commit 4f9486ab86a65f83ad1bfd94320795b8e09871aa
+Subproject commit 4f31b355fbf319a54b38953795b17b1b04db4348