
move almost all config into new archivebox.CONSTANTS

Nick Sweeting 1 year ago
parent
commit
bb65b2dbec
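
The practical effect of this commit: core paths and filenames become attributes of a NamedTuple exposed at package import time, instead of entries computed inside the legacy config dict. A minimal usage sketch, assuming the package is imported from inside an existing collection (data) directory; the values in comments are illustrative:

import archivebox

# version string detected at import time (see archivebox/__init__.py below)
print(archivebox.VERSION)

# constants are attributes of the ConstantsConfig NamedTuple defined in archivebox/constants.py
print(archivebox.CONSTANTS.ARCHIVE_DIR)       # e.g. /data/archive
print(archivebox.CONSTANTS.DATABASE_FILE)     # e.g. /data/index.sqlite3

# dict-style access is kept for legacy callers via items()/keys()/values()/_asdict()
for key, value in archivebox.CONSTANTS.items():
    print(key, value)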

+ 8 - 5
archivebox/__init__.py

@@ -1,14 +1,15 @@
 __package__ = 'archivebox'
 
+
 # print('INSTALLING MONKEY PATCHES')
+from .monkey_patches import *                    # noqa
+# print('DONE INSTALLING MONKEY PATCHES')
 
-from .monkey_patches import *
 
 import os
-import importlib
+import importlib.metadata
 from pathlib import Path
 
-
 PACKAGE_DIR = Path(__file__).resolve().parent    # archivebox source code dir
 DATA_DIR = Path(os.curdir).resolve()             # archivebox user data dir
 
@@ -28,7 +29,9 @@ def _detect_installed_version():
 
     raise Exception('Failed to detect installed archivebox version!')
 
+VERSION = _detect_installed_version()
 
-__version__ = _detect_installed_version()
+__version__ = VERSION
 
-# print('DONE INSTALLING MONKEY PATCHES')
+
+from .constants import CONSTANTS
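
The _detect_installed_version() helper used above replaces the old get_version() that this commit removes from config.py (see further down). A hedged reconstruction of its fallback chain, assuming it mirrors the old logic but reads pyproject.toml line by line:

import importlib.metadata
from pathlib import Path

def _detect_installed_version_sketch(package_dir: Path) -> str:
    # 1) normal case: version from the installed package metadata
    try:
        return importlib.metadata.version('archivebox')
    except importlib.metadata.PackageNotFoundError:
        # 2) editable/source checkout: read the version out of pyproject.toml
        try:
            for line in (package_dir / 'pyproject.toml').read_text().splitlines():
                if line.startswith('version = '):
                    return line.split(' = ', 1)[-1].strip('"')
        except FileNotFoundError:
            # 3) building docs: pyproject.toml is not available
            return 'dev'
    raise Exception('Failed to detect installed archivebox version!')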

+ 57 - 427
archivebox/config.py

@@ -26,10 +26,7 @@ import io
 import re
 import sys
 import json
-import inspect
-import getpass
 import shutil
-import requests
 import archivebox
 
 from hashlib import md5
@@ -38,7 +35,6 @@ from datetime import datetime, timezone
 from typing import Optional, Type, Tuple, Dict
 from subprocess import run, PIPE, DEVNULL, STDOUT, TimeoutExpired
 from configparser import ConfigParser
-import importlib.metadata
 
 from pydantic_pkgr import SemVer
 from rich.progress import Progress
@@ -49,7 +45,6 @@ from django.db.backends.sqlite3.base import Database as sqlite3
 
 from .config_stubs import (
     AttrDict,
-    SimpleConfigValueDict,
     ConfigValue,
     ConfigDict,
     ConfigDefaultValue,
@@ -61,7 +56,7 @@ from .misc.logging import (
     ANSI,
     COLOR_DICT,
     stderr,
-    hint,
+    hint,      # noqa
 )
 
 # print('STARTING CONFIG LOADING')
@@ -165,8 +160,8 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
         'MEDIA_MAX_SIZE':           {'type': str,   'default': '750m'},
 
         'USER_AGENT':               {'type': str,   'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)'},
-        'CURL_USER_AGENT':          {'type': str,   'default': lambda c: c['USER_AGENT'] + ' curl/{CURL_VERSION}'},
-        'WGET_USER_AGENT':          {'type': str,   'default': lambda c: c['USER_AGENT'] + ' wget/{WGET_VERSION}'},
+        'CURL_USER_AGENT':          {'type': str,   'default': lambda c: c['USER_AGENT']}, # + ' curl/{CURL_VERSION}'},
+        'WGET_USER_AGENT':          {'type': str,   'default': lambda c: c['USER_AGENT']}, #  + ' wget/{WGET_VERSION}'},
 
         'COOKIES_FILE':             {'type': str,   'default': None},
 
@@ -254,12 +249,12 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
         'CURL_BINARY':              {'type': str,   'default': 'curl'},
         'GIT_BINARY':               {'type': str,   'default': 'git'},
         'WGET_BINARY':              {'type': str,   'default': 'wget'},     # also can accept wget2
-        'SINGLEFILE_BINARY':        {'type': str,   'default': lambda c: bin_path('single-file')},
-        'READABILITY_BINARY':       {'type': str,   'default': lambda c: bin_path('readability-extractor')},
         'MERCURY_BINARY':           {'type': str,   'default': lambda c: bin_path('postlight-parser')},
-        'YOUTUBEDL_BINARY':         {'type': str,   'default': 'yt-dlp'},   # also can accept youtube-dl
         'NODE_BINARY':              {'type': str,   'default': 'node'},
-        'RIPGREP_BINARY':           {'type': str,   'default': 'rg'},
+        # 'YOUTUBEDL_BINARY':         {'type': str,   'default': 'yt-dlp'},   # also can accept youtube-dl
+        # 'SINGLEFILE_BINARY':        {'type': str,   'default': lambda c: bin_path('single-file')},
+        # 'READABILITY_BINARY':       {'type': str,   'default': lambda c: bin_path('readability-extractor')},
+        # 'RIPGREP_BINARY':           {'type': str,   'default': 'rg'},
 
         'POCKET_CONSUMER_KEY':      {'type': str,   'default': None},
         'POCKET_ACCESS_TOKENS':     {'type': dict,  'default': {}},
@@ -308,212 +303,16 @@ CONFIG_FILENAME = 'ArchiveBox.conf'
 
 
 
-STATICFILE_EXTENSIONS = {
-    # 99.999% of the time, URLs ending in these extensions are static files
-    # that can be downloaded as-is, not html pages that need to be rendered
-    'gif', 'jpeg', 'jpg', 'png', 'tif', 'tiff', 'wbmp', 'ico', 'jng', 'bmp',
-    'svg', 'svgz', 'webp', 'ps', 'eps', 'ai',
-    'mp3', 'mp4', 'm4a', 'mpeg', 'mpg', 'mkv', 'mov', 'webm', 'm4v',
-    'flv', 'wmv', 'avi', 'ogg', 'ts', 'm3u8',
-    'pdf', 'txt', 'rtf', 'rtfd', 'doc', 'docx', 'ppt', 'pptx', 'xls', 'xlsx',
-    'atom', 'rss', 'css', 'js', 'json',
-    'dmg', 'iso', 'img',
-    'rar', 'war', 'hqx', 'zip', 'gz', 'bz2', '7z',
-
-    # Less common extensions to consider adding later
-    # jar, swf, bin, com, exe, dll, deb
-    # ear, hqx, eot, wmlc, kml, kmz, cco, jardiff, jnlp, run, msi, msp, msm,
-    # pl pm, prc pdb, rar, rpm, sea, sit, tcl tk, der, pem, crt, xpi, xspf,
-    # ra, mng, asx, asf, 3gpp, 3gp, mid, midi, kar, jad, wml, htc, mml
-
-    # These are always treated as pages, not as static files, never add them:
-    # html, htm, shtml, xhtml, xml, aspx, php, cgi
-}
-
-# When initializing archivebox in a new directory, we check to make sure the dir is
-# actually empty so that we dont clobber someone's home directory or desktop by accident.
-# These files are exceptions to the is_empty check when we're trying to init a new dir,
-# as they could be from a previous archivebox version, system artifacts, dependencies, etc.
-ALLOWED_IN_OUTPUT_DIR = {
-    ".gitignore",
-    "lost+found",
-    ".DS_Store",
-    ".venv",
-    "venv",
-    "virtualenv",
-    ".virtualenv",
-    "node_modules",
-    "package.json",
-    "package-lock.json",
-    "yarn.lock",
-    "static",
-    "sonic",
-    "search.sqlite3",
-    CRONTABS_DIR_NAME,
-    ARCHIVE_DIR_NAME,
-    SOURCES_DIR_NAME,
-    LOGS_DIR_NAME,
-    CACHE_DIR_NAME,
-    LIB_DIR_NAME,
-    PERSONAS_DIR_NAME,
-    SQL_INDEX_FILENAME,
-    f"{SQL_INDEX_FILENAME}-wal",
-    f"{SQL_INDEX_FILENAME}-shm",
-    "queue.sqlite3",
-    "queue.sqlite3-wal",
-    "queue.sqlite3-shm",
-    JSON_INDEX_FILENAME,
-    HTML_INDEX_FILENAME,
-    ROBOTS_TXT_FILENAME,
-    FAVICON_FILENAME,
-    CONFIG_FILENAME,
-    f"{CONFIG_FILENAME}.bak",
-    "static_index.json",
-}
-
 
 ALLOWDENYLIST_REGEX_FLAGS: int = re.IGNORECASE | re.UNICODE | re.MULTILINE
 
 
-CONSTANTS = {
-    "PACKAGE_DIR_NAME":             {'default': lambda c: PACKAGE_DIR_NAME},
-    "LIB_DIR_NAME":                 {'default': lambda c: LIB_DIR_NAME},
-    "TEMPLATES_DIR_NAME":           {'default': lambda c: TEMPLATES_DIR_NAME},
-    "ARCHIVE_DIR_NAME":             {'default': lambda c: ARCHIVE_DIR_NAME},
-    "SOURCES_DIR_NAME":             {'default': lambda c: SOURCES_DIR_NAME},
-    "LOGS_DIR_NAME":                {'default': lambda c: LOGS_DIR_NAME},
-    "CACHE_DIR_NAME":               {'default': lambda c: CACHE_DIR_NAME},
-    "PERSONAS_DIR_NAME":            {'default': lambda c: PERSONAS_DIR_NAME},
-    "CRONTABS_DIR_NAME":            {'default': lambda c: CRONTABS_DIR_NAME},
-    "SQL_INDEX_FILENAME":           {'default': lambda c: SQL_INDEX_FILENAME},
-    "JSON_INDEX_FILENAME":          {'default': lambda c: JSON_INDEX_FILENAME},
-    "HTML_INDEX_FILENAME":          {'default': lambda c: HTML_INDEX_FILENAME},
-    "ROBOTS_TXT_FILENAME":          {'default': lambda c: ROBOTS_TXT_FILENAME},
-    "FAVICON_FILENAME":             {'default': lambda c: FAVICON_FILENAME},
-    "CONFIG_FILENAME":              {'default': lambda c: CONFIG_FILENAME},
-    "DEFAULT_CLI_COLORS":           {'default': lambda c: DEFAULT_CLI_COLORS},
-    "ANSI":                         {'default': lambda c: ANSI},
-    "COLOR_DICT":                   {'default': lambda c: COLOR_DICT},
-    "STATICFILE_EXTENSIONS":        {'default': lambda c: STATICFILE_EXTENSIONS},
-    "ALLOWED_IN_OUTPUT_DIR":        {'default': lambda c: ALLOWED_IN_OUTPUT_DIR},
-    # "ALLOWDENYLIST_REGEX_FLAGS":    {'default': lambda c: ALLOWDENYLIST_REGEX_FLAGS},
-}
+CONSTANTS = archivebox.CONSTANTS._asdict()
 
 ############################## Version Config ##################################
 
-def get_system_user() -> str:
-    # some host OS's are unable to provide a username (k3s, Windows), making this complicated
-    # uid 999 is especially problematic and breaks many attempts
-    SYSTEM_USER = None
-    FALLBACK_USER_PLACHOLDER = f'user_{os.getuid()}'
 
-    # Option 1
-    try:
-        import pwd
-        SYSTEM_USER = SYSTEM_USER or pwd.getpwuid(os.geteuid()).pw_name
-    except (ModuleNotFoundError, Exception):
-        pass
 
-    # Option 2
-    try:
-        SYSTEM_USER = SYSTEM_USER or getpass.getuser()
-    except Exception:
-        pass
-
-    # Option 3
-    try:
-        SYSTEM_USER = SYSTEM_USER or os.getlogin()
-    except Exception:
-        pass
-
-    return SYSTEM_USER or FALLBACK_USER_PLACHOLDER
-
-def get_version(config):
-    try:
-        return importlib.metadata.version(__package__ or 'archivebox')
-    except importlib.metadata.PackageNotFoundError:
-        try:
-            pyproject_config = (config['PACKAGE_DIR'] / 'pyproject.toml').read_text()
-            for line in pyproject_config:
-                if line.startswith('version = '):
-                    return line.split(' = ', 1)[-1].strip('"')
-        except FileNotFoundError:
-            # building docs, pyproject.toml is not available
-            return 'dev'
-
-    raise Exception('Failed to detect installed archivebox version!')
-
-def get_commit_hash(config) -> Optional[str]:
-    try:
-        git_dir = config['PACKAGE_DIR'] / '../.git'
-        ref = (git_dir / 'HEAD').read_text().strip().split(' ')[-1]
-        commit_hash = git_dir.joinpath(ref).read_text().strip()
-        return commit_hash
-    except Exception:
-        pass
-
-    try:
-        return list((config['PACKAGE_DIR'] / '../.git/refs/heads/').glob('*'))[0].read_text().strip()
-    except Exception:
-        pass
-    
-    return None
-
-def get_build_time(config) -> str:
-    if config['IN_DOCKER']:
-        docker_build_end_time = Path('/VERSION.txt').read_text().rsplit('BUILD_END_TIME=')[-1].split('\n', 1)[0]
-        return docker_build_end_time
-
-    src_last_modified_unix_timestamp = (config['PACKAGE_DIR'] / 'config.py').stat().st_mtime
-    return datetime.fromtimestamp(src_last_modified_unix_timestamp).strftime('%Y-%m-%d %H:%M:%S %s')
-
-def get_versions_available_on_github(config):
-    """
-    returns a dictionary containing the ArchiveBox GitHub release info for
-    the recommended upgrade version and the currently installed version
-    """
-    
-    # we only want to perform the (relatively expensive) check for new versions
-    # when its most relevant, e.g. when the user runs a long-running command
-    subcommand_run_by_user = sys.argv[3] if len(sys.argv) > 3 else 'help'
-    long_running_commands = ('add', 'schedule', 'update', 'status', 'server')
-    if subcommand_run_by_user not in long_running_commands:
-        return None
-    
-    github_releases_api = "https://api.github.com/repos/ArchiveBox/ArchiveBox/releases"
-    response = requests.get(github_releases_api)
-    if response.status_code != 200:
-        stderr(f'[!] Warning: GitHub API call to check for new ArchiveBox version failed! (status={response.status_code})', color='lightyellow', config=config)
-        return None
-    all_releases = response.json()
-
-    installed_version = parse_version_string(config['VERSION'])
-
-    # find current version or nearest older version (to link to)
-    current_version = None
-    for idx, release in enumerate(all_releases):
-        release_version = parse_version_string(release['tag_name'])
-        if release_version <= installed_version:
-            current_version = release
-            break
-
-    current_version = current_version or all_releases[-1]
-    
-    # recommended version is whatever comes after current_version in the release list
-    # (perhaps too conservative to only recommend upgrading one version at a time, but it's safest)
-    try:
-        recommended_version = all_releases[idx+1]
-    except IndexError:
-        recommended_version = None
-
-    return {'recommended_version': recommended_version, 'current_version': current_version}
-
-def can_upgrade(config):
-    if config['VERSIONS_AVAILABLE'] and config['VERSIONS_AVAILABLE']['recommended_version']:
-        recommended_version = parse_version_string(config['VERSIONS_AVAILABLE']['recommended_version']['tag_name'])
-        current_version = parse_version_string(config['VERSIONS_AVAILABLE']['current_version']['tag_name'])
-        return recommended_version > current_version
-    return False
 
 
 ############################## Derived Config ##################################
@@ -523,55 +322,25 @@ def can_upgrade(config):
 # These are derived/computed values calculated *after* all user-provided config values are ingested
 # they appear in `archivebox config` output and are intended to be read-only for the user
 DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
-    **CONSTANTS,
+    **{
+        key: {'default': lambda c: val}
+        for key, val in archivebox.CONSTANTS.items()
+    },
 
-    'TERM_WIDTH':               {'default': lambda c: lambda: shutil.get_terminal_size((100, 10)).columns},
-    'USER':                     {'default': lambda c: get_system_user()},
-    'ANSI':                     {'default': lambda c: DEFAULT_CLI_COLORS if c['USE_COLOR'] else AttrDict({k: '' for k in DEFAULT_CLI_COLORS.keys()})},
 
-    'PACKAGE_DIR':              {'default': lambda c: Path(__file__).resolve().parent},
+    'PACKAGE_DIR':              {'default': lambda c: archivebox.PACKAGE_DIR.resolve()},
     'TEMPLATES_DIR':            {'default': lambda c: c['PACKAGE_DIR'] / TEMPLATES_DIR_NAME},
     'CUSTOM_TEMPLATES_DIR':     {'default': lambda c: c['CUSTOM_TEMPLATES_DIR'] and Path(c['CUSTOM_TEMPLATES_DIR'])},
 
-    'OUTPUT_DIR':               {'default': lambda c: Path(c['OUTPUT_DIR']).resolve() if c['OUTPUT_DIR'] else Path(os.curdir).resolve()},
-    'ARCHIVE_DIR':              {'default': lambda c: c['OUTPUT_DIR'] / ARCHIVE_DIR_NAME},
-    'SOURCES_DIR':              {'default': lambda c: c['OUTPUT_DIR'] / SOURCES_DIR_NAME},
-    'LOGS_DIR':                 {'default': lambda c: c['OUTPUT_DIR'] / LOGS_DIR_NAME},
-    'CACHE_DIR':                {'default': lambda c: c['OUTPUT_DIR'] / CACHE_DIR_NAME},
-    'LIB_DIR':                  {'default': lambda c: c['OUTPUT_DIR'] / LIB_DIR_NAME},
-    'BIN_DIR':                  {'default': lambda c: c['OUTPUT_DIR'] / LIB_DIR_NAME / 'bin'},
-    'PERSONAS_DIR':             {'default': lambda c: c['OUTPUT_DIR'] / PERSONAS_DIR_NAME},
-    'CONFIG_FILE':              {'default': lambda c: Path(c['CONFIG_FILE']).resolve() if c['CONFIG_FILE'] else c['OUTPUT_DIR'] / CONFIG_FILENAME},
-    'COOKIES_FILE':             {'default': lambda c: c['COOKIES_FILE'] and Path(c['COOKIES_FILE']).resolve()},
 
     'URL_DENYLIST_PTN':         {'default': lambda c: c['URL_DENYLIST'] and re.compile(c['URL_DENYLIST'] or '', ALLOWDENYLIST_REGEX_FLAGS)},
     'URL_ALLOWLIST_PTN':        {'default': lambda c: c['URL_ALLOWLIST'] and re.compile(c['URL_ALLOWLIST'] or '', ALLOWDENYLIST_REGEX_FLAGS)},
     'DIR_OUTPUT_PERMISSIONS':   {'default': lambda c: c['OUTPUT_PERMISSIONS'].replace('6', '7').replace('4', '5')},  # exec is always needed to list directories
 
-    'ARCHIVEBOX_BINARY':        {'default': lambda c: sys.argv[0] or bin_path('archivebox')},
-    'NODE_BIN_PATH':            {'default': lambda c: str((Path(c["OUTPUT_DIR"]).absolute() / 'node_modules' / '.bin'))},
-
-    'VERSION':                  {'default': lambda c: get_version(c).split('+', 1)[0]},     # remove +editable from user-displayed version string
-    'COMMIT_HASH':              {'default': lambda c: get_commit_hash(c)},                  # short git commit hash of codebase HEAD commit
-    'BUILD_TIME':               {'default': lambda c: get_build_time(c)},                   # docker build completed time or python src last modified time
-    
-    'VERSIONS_AVAILABLE':       {'default': lambda c: False},             # get_versions_available_on_github(c)},
-    'CAN_UPGRADE':              {'default': lambda c: False},             # can_upgrade(c)},
-
-    'PYTHON_BINARY':            {'default': lambda c: sys.executable},
-    'PYTHON_VERSION':           {'default': lambda c: '{}.{}.{}'.format(*sys.version_info[:3])},
-
-    'DJANGO_BINARY':            {'default': lambda c: inspect.getfile(django)},
-    'DJANGO_VERSION':           {'default': lambda c: '{}.{}.{}'.format(*django.VERSION[:3])},
-    
-    'SQLITE_BINARY':            {'default': lambda c: inspect.getfile(sqlite3)},
-    'SQLITE_VERSION':           {'default': lambda c: sqlite3.version},
-    #'SQLITE_JOURNAL_MODE':      {'default': lambda c: 'wal'},         # set at runtime below, interesting if changed later but unused for now because its always expected to be wal
-    #'SQLITE_OPTIONS':           {'default': lambda c: ['JSON1']},     # set at runtime below
 
     'USE_CURL':                 {'default': lambda c: c['USE_CURL'] and (c['SAVE_FAVICON'] or c['SAVE_TITLE'] or c['SAVE_ARCHIVE_DOT_ORG'])},
     'CURL_VERSION':             {'default': lambda c: bin_version(c['CURL_BINARY']) if c['USE_CURL'] else None},
-    'CURL_USER_AGENT':          {'default': lambda c: c['CURL_USER_AGENT'].format(**c)},
+    # 'CURL_USER_AGENT':          {'default': lambda c: c['CURL_USER_AGENT'].format(**c)},
     'CURL_ARGS':                {'default': lambda c: c['CURL_ARGS'] or []},
     'CURL_EXTRA_ARGS':          {'default': lambda c: c['CURL_EXTRA_ARGS'] or []},
     'SAVE_FAVICON':             {'default': lambda c: c['USE_CURL'] and c['SAVE_FAVICON']},
@@ -580,23 +349,14 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
     'USE_WGET':                 {'default': lambda c: c['USE_WGET'] and (c['SAVE_WGET'] or c['SAVE_WARC'])},
     'WGET_VERSION':             {'default': lambda c: bin_version(c['WGET_BINARY']) if c['USE_WGET'] else None},
     'WGET_AUTO_COMPRESSION':    {'default': lambda c: wget_supports_compression(c) if c['USE_WGET'] else False},
-    'WGET_USER_AGENT':          {'default': lambda c: c['WGET_USER_AGENT'].format(**c)},
+    # 'WGET_USER_AGENT':          {'default': lambda c: c['WGET_USER_AGENT'].format(**c)},
     'SAVE_WGET':                {'default': lambda c: c['USE_WGET'] and c['SAVE_WGET']},
     'SAVE_WARC':                {'default': lambda c: c['USE_WGET'] and c['SAVE_WARC']},
     'WGET_ARGS':                {'default': lambda c: c['WGET_ARGS'] or []},
     'WGET_EXTRA_ARGS':          {'default': lambda c: c['WGET_EXTRA_ARGS'] or []},
 
-    # 'RIPGREP_VERSION':          {'default': lambda c: bin_version(c['RIPGREP_BINARY']) if c['USE_RIPGREP'] else None},
-
-    'USE_SINGLEFILE':           {'default': lambda c: c['USE_SINGLEFILE'] and c['SAVE_SINGLEFILE']},
-    'SINGLEFILE_VERSION':       {'default': lambda c: bin_version(c['SINGLEFILE_BINARY']) if c['USE_SINGLEFILE'] else None},
-    'SINGLEFILE_ARGS':          {'default': lambda c: c['SINGLEFILE_ARGS'] or []},
-    'SINGLEFILE_EXTRA_ARGS':    {'default': lambda c: c['SINGLEFILE_EXTRA_ARGS'] or []},
-
-    'USE_READABILITY':          {'default': lambda c: c['USE_READABILITY'] and c['SAVE_READABILITY']},
-    'READABILITY_VERSION':      {'default': lambda c: bin_version(c['READABILITY_BINARY']) if c['USE_READABILITY'] else None},
-
     'USE_MERCURY':              {'default': lambda c: c['USE_MERCURY'] and c['SAVE_MERCURY']},
+    'SAVE_MERCURY':             {'default': lambda c: c['USE_MERCURY'] and c['USE_NODE']},
     'MERCURY_VERSION':          {'default': lambda c: '1.0.0' if shutil.which(str(bin_path(c['MERCURY_BINARY']))) else None},  # mercury doesnt expose version info until this is merged https://github.com/postlight/parser/pull/750
     'MERCURY_ARGS':             {'default': lambda c: c['MERCURY_ARGS'] or []},
     'MERCURY_EXTRA_ARGS':       {'default': lambda c: c['MERCURY_EXTRA_ARGS'] or []},
@@ -605,21 +365,12 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
     'GIT_VERSION':              {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None},
     'SAVE_GIT':                 {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
 
-    'USE_YOUTUBEDL':            {'default': lambda c: c['USE_YOUTUBEDL'] and c['SAVE_MEDIA']},
-    'YOUTUBEDL_VERSION':        {'default': lambda c: bin_version(c['YOUTUBEDL_BINARY']) if c['USE_YOUTUBEDL'] else None},
-    'SAVE_MEDIA':               {'default': lambda c: c['USE_YOUTUBEDL'] and c['SAVE_MEDIA']},
-    'YOUTUBEDL_ARGS':           {'default': lambda c: c['YOUTUBEDL_ARGS'] or []},
-    'YOUTUBEDL_EXTRA_ARGS':     {'default': lambda c: c['YOUTUBEDL_EXTRA_ARGS'] or []},
-
-    'SAVE_READABILITY':         {'default': lambda c: c['USE_READABILITY'] and c['USE_NODE']},
-    'SAVE_MERCURY':             {'default': lambda c: c['USE_MERCURY'] and c['USE_NODE']},
-
-    'USE_NODE':                 {'default': lambda c: c['USE_NODE'] and (c['SAVE_READABILITY'] or c['SAVE_SINGLEFILE'] or c['SAVE_MERCURY'])},
+    'USE_NODE':                 {'default': lambda c: True},
     'NODE_VERSION':             {'default': lambda c: bin_version(c['NODE_BINARY']) if c['USE_NODE'] else None},
 
     'DEPENDENCIES':             {'default': lambda c: get_dependency_info(c)},
-    'CODE_LOCATIONS':           {'default': lambda c: get_code_locations(c)},
-    'DATA_LOCATIONS':           {'default': lambda c: get_data_locations(c)},
+    # 'CODE_LOCATIONS':           {'default': lambda c: get_code_locations(c)},
+    # 'DATA_LOCATIONS':           {'default': lambda c: get_data_locations(c)},
 
     'SAVE_ALLOWLIST_PTN':       {'default': lambda c: c['SAVE_ALLOWLIST'] and {re.compile(k, ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_ALLOWLIST'].items()}},
     'SAVE_DENYLIST_PTN':        {'default': lambda c: c['SAVE_DENYLIST'] and {re.compile(k, ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_DENYLIST'].items()}},
@@ -696,12 +447,10 @@ def load_config_val(key: str,
     raise Exception('Config values can only be str, bool, int, or json')
 
 
-def load_config_file(out_dir: str | None=None) -> Optional[ConfigDict]:
+def load_config_file(out_dir: str | None=archivebox.DATA_DIR) -> Optional[ConfigDict]:
     """load the ini-formatted config file from OUTPUT_DIR/Archivebox.conf"""
 
-    out_dir = out_dir or Path(os.getenv('OUTPUT_DIR', '.')).resolve()
-    assert out_dir and out_dir.is_dir()
-    config_path = Path(out_dir) / CONFIG_FILENAME
+    config_path = archivebox.CONSTANTS.CONFIG_FILE
     if config_path.exists():
         config_file = ConfigParser()
         config_file.optionxform = str
@@ -718,7 +467,7 @@ def load_config_file(out_dir: str | None=None) -> Optional[ConfigDict]:
     return None
 
 
-def write_config_file(config: Dict[str, str], out_dir: str | None=None) -> ConfigDict:
+def write_config_file(config: Dict[str, str], out_dir: str | None=archivebox.DATA_DIR) -> ConfigDict:
     """load the ini-formatted config file from OUTPUT_DIR/Archivebox.conf"""
 
     from .system import atomic_write
@@ -737,8 +486,7 @@ def write_config_file(config: Dict[str, str], out_dir: str | None=None) -> Confi
 
     """)
 
-    out_dir = out_dir or Path(os.getenv('OUTPUT_DIR', '.')).resolve()
-    config_path = Path(out_dir) /  CONFIG_FILENAME
+    config_path = archivebox.CONSTANTS.CONFIG_FILE
 
     if not config_path.exists():
         atomic_write(config_path, CONFIG_HEADER)
@@ -833,7 +581,7 @@ def load_config(defaults: ConfigDefaultDict,
             stderr('        https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration')
             stderr()
             # raise
-            raise SystemExit(2)
+            # raise SystemExit(2)
 
     return AttrDict(extended_config)
 
@@ -984,98 +732,6 @@ def wget_supports_compression(config):
     except (FileNotFoundError, OSError):
         return False
 
-def get_code_locations(config: ConfigDict) -> SimpleConfigValueDict:
-    return {
-        'PACKAGE_DIR': {
-            'path': (config['PACKAGE_DIR']).resolve(),
-            'enabled': True,
-            'is_valid': (config['PACKAGE_DIR'] / '__main__.py').exists(),
-        },
-        'TEMPLATES_DIR': {
-            'path': (config['TEMPLATES_DIR']).resolve(),
-            'enabled': True,
-            'is_valid': (config['TEMPLATES_DIR'] / 'static').exists(),
-        },
-        'LIB_DIR': {
-            'path': (config['LIB_DIR']).resolve(),
-            'enabled': True,
-            'is_valid': config['LIB_DIR'].is_dir(),
-        },
-        # 'NODE_MODULES_DIR': {
-        #     'path': ,
-        #     'enabled': ,
-        #     'is_valid': (...).exists(),
-        # },
-    }
-
-def get_data_locations(config: ConfigDict) -> ConfigValue:
-    return {
-        # OLD: migrating to personas
-        # 'CHROME_USER_DATA_DIR': {
-        #     'path': os.path.abspath(config['CHROME_USER_DATA_DIR']),
-        #     'enabled': config['USE_CHROME'] and config['CHROME_USER_DATA_DIR'],
-        #     'is_valid': False if config['CHROME_USER_DATA_DIR'] is None else (Path(config['CHROME_USER_DATA_DIR']) / 'Default').exists(),
-        # },
-        # 'COOKIES_FILE': {
-        #     'path': os.path.abspath(config['COOKIES_FILE']),
-        #     'enabled': config['USE_WGET'] and config['COOKIES_FILE'],
-        #     'is_valid': False if config['COOKIES_FILE'] is None else Path(config['COOKIES_FILE']).exists(),
-        # },
-        "OUTPUT_DIR": {
-            "path": config["OUTPUT_DIR"].resolve(),
-            "enabled": True,
-            "is_valid": (config["OUTPUT_DIR"] / SQL_INDEX_FILENAME).exists(),
-            "is_mount": os.path.ismount(config["OUTPUT_DIR"].resolve()),
-        },
-        "CONFIG_FILE": {
-            "path": config["CONFIG_FILE"].resolve(),
-            "enabled": True,
-            "is_valid": config["CONFIG_FILE"].exists(),
-        },
-        "SQL_INDEX": {
-            "path": (config["OUTPUT_DIR"] / SQL_INDEX_FILENAME).resolve(),
-            "enabled": True,
-            "is_valid": (config["OUTPUT_DIR"] / SQL_INDEX_FILENAME).exists(),
-            "is_mount": os.path.ismount((config["OUTPUT_DIR"] / SQL_INDEX_FILENAME).resolve()),
-        },
-        "ARCHIVE_DIR": {
-            "path": config["ARCHIVE_DIR"].resolve(),
-            "enabled": True,
-            "is_valid": config["ARCHIVE_DIR"].exists(),
-            "is_mount": os.path.ismount(config["ARCHIVE_DIR"].resolve()),
-        },
-        "SOURCES_DIR": {
-            "path": config["SOURCES_DIR"].resolve(),
-            "enabled": True,
-            "is_valid": config["SOURCES_DIR"].exists(),
-        },
-        "PERSONAS_DIR": {
-            "path": config["PERSONAS_DIR"].resolve(),
-            "enabled": True,
-            "is_valid": config["PERSONAS_DIR"].exists(),
-        },
-        "LOGS_DIR": {
-            "path": config["LOGS_DIR"].resolve(),
-            "enabled": True,
-            "is_valid": config["LOGS_DIR"].exists(),
-        },
-        "CACHE_DIR": {
-            "path": config["CACHE_DIR"].resolve(),
-            "enabled": True,
-            "is_valid": config["CACHE_DIR"].exists(),
-        },
-        "CUSTOM_TEMPLATES_DIR": {
-            "path": config["CUSTOM_TEMPLATES_DIR"] and Path(config["CUSTOM_TEMPLATES_DIR"]).resolve(),
-            "enabled": bool(config["CUSTOM_TEMPLATES_DIR"]),
-            "is_valid": config["CUSTOM_TEMPLATES_DIR"] and Path(config["CUSTOM_TEMPLATES_DIR"]).exists(),
-        },
-        # managed by bin/docker_entrypoint.sh and python-crontab:
-        # 'CRONTABS_DIR': {
-        #     'path': config['CRONTABS_DIR'].resolve(),
-        #     'enabled': True,
-        #     'is_valid': config['CRONTABS_DIR'].exists(),
-        # },
-    }
 
 def get_dependency_info(config: ConfigDict) -> ConfigValue:
     return {
@@ -1129,20 +785,6 @@ def get_dependency_info(config: ConfigDict) -> ConfigValue:
             'enabled': config['USE_NODE'],
             'is_valid': bool(config['NODE_VERSION']),
         },
-        'SINGLEFILE_BINARY': {
-            'path': bin_path(config['SINGLEFILE_BINARY']),
-            'version': config['SINGLEFILE_VERSION'],
-            'hash': bin_hash(config['SINGLEFILE_BINARY']),
-            'enabled': config['USE_SINGLEFILE'],
-            'is_valid': bool(config['SINGLEFILE_VERSION']),
-        },
-        'READABILITY_BINARY': {
-            'path': bin_path(config['READABILITY_BINARY']),
-            'version': config['READABILITY_VERSION'],
-            'hash': bin_hash(config['READABILITY_BINARY']),
-            'enabled': config['USE_READABILITY'],
-            'is_valid': bool(config['READABILITY_VERSION']),
-        },
         'MERCURY_BINARY': {
             'path': bin_path(config['MERCURY_BINARY']),
             'version': config['MERCURY_VERSION'],
@@ -1157,13 +799,27 @@ def get_dependency_info(config: ConfigDict) -> ConfigValue:
             'enabled': config['USE_GIT'],
             'is_valid': bool(config['GIT_VERSION']),
         },
-        'YOUTUBEDL_BINARY': {
-            'path': bin_path(config['YOUTUBEDL_BINARY']),
-            'version': config['YOUTUBEDL_VERSION'],
-            'hash': bin_hash(config['YOUTUBEDL_BINARY']),
-            'enabled': config['USE_YOUTUBEDL'],
-            'is_valid': bool(config['YOUTUBEDL_VERSION']),
-        },
+        # 'SINGLEFILE_BINARY': {
+        #     'path': bin_path(config['SINGLEFILE_BINARY']),
+        #     'version': config['SINGLEFILE_VERSION'],
+        #     'hash': bin_hash(config['SINGLEFILE_BINARY']),
+        #     'enabled': config['USE_SINGLEFILE'],
+        #     'is_valid': bool(config['SINGLEFILE_VERSION']),
+        # },
+        # 'READABILITY_BINARY': {
+        #     'path': bin_path(config['READABILITY_BINARY']),
+        #     'version': config['READABILITY_VERSION'],
+        #     'hash': bin_hash(config['READABILITY_BINARY']),
+        #     'enabled': config['USE_READABILITY'],
+        #     'is_valid': bool(config['READABILITY_VERSION']),
+        # },
+        # 'YOUTUBEDL_BINARY': {
+        #     'path': bin_path(config['YOUTUBEDL_BINARY']),
+        #     'version': config['YOUTUBEDL_VERSION'],
+        #     'hash': bin_hash(config['YOUTUBEDL_BINARY']),
+        #     'enabled': config['USE_YOUTUBEDL'],
+        #     'is_valid': bool(config['YOUTUBEDL_VERSION']),
+        # },
         # 'CHROME_BINARY': {
         #     'path': bin_path(config['CHROME_BINARY']),
         #     'version': config['CHROME_VERSION'],
@@ -1227,10 +883,6 @@ assert TIMEZONE == 'UTC', 'The server timezone should always be set to UTC'  # n
 os.environ["TZ"] = TIMEZONE                                                  # noqa: F821
 os.umask(0o777 - int(DIR_OUTPUT_PERMISSIONS, base=8))                        # noqa: F821
 
-# add ./node_modules/.bin to $PATH so we can use node scripts in extractors
-sys.path.append(CONFIG.NODE_BIN_PATH)
-
-
 ########################### Config Validity Checkers ###########################
 
 if not CONFIG.USE_COLOR:
@@ -1256,6 +908,7 @@ def bump_startup_progress_bar():
 
 def setup_django_minimal():
     sys.path.append(str(archivebox.PACKAGE_DIR))
+    os.environ.setdefault('OUTPUT_DIR', str(archivebox.DATA_DIR))
     os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings')
     django.setup()
 
@@ -1267,29 +920,18 @@ def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG,
     with Progress(transient=True, expand=True, console=CONSOLE) as INITIAL_STARTUP_PROGRESS:
         INITIAL_STARTUP_PROGRESS_TASK = INITIAL_STARTUP_PROGRESS.add_task("[green]Loading modules...", total=25)
 
-        output_dir = out_dir or Path(config['OUTPUT_DIR'])
+        output_dir = out_dir or archivebox.DATA_DIR
 
-        assert isinstance(output_dir, Path) and isinstance(config['PACKAGE_DIR'], Path)
+        assert isinstance(output_dir, Path) and isinstance(archivebox.PACKAGE_DIR, Path)
 
         bump_startup_progress_bar()
         try:
             from django.core.management import call_command
 
-            sys.path.append(str(config['PACKAGE_DIR']))
-            os.environ.setdefault('OUTPUT_DIR', str(output_dir))
-            assert (config['PACKAGE_DIR'] / 'core' / 'settings.py').exists(), 'settings.py was not found at archivebox/core/settings.py'
+            sys.path.append(str(archivebox.PACKAGE_DIR))
+            os.environ.setdefault('OUTPUT_DIR', str(archivebox.DATA_DIR))
+            os.environ.setdefault("ARCHIVEBOX_DATABASE_NAME", ":memory:")
             os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings')
-
-            # Check to make sure JSON extension is available in our Sqlite3 instance
-            try:
-                cursor = sqlite3.connect(':memory:').cursor()
-                cursor.execute('SELECT JSON(\'{"a": "b"}\')')
-            except sqlite3.OperationalError as exc:
-                stderr(f'[X] Your SQLite3 version is missing the required JSON1 extension: {exc}', color='red')
-                hint([
-                    'Upgrade your Python version or install the extension manually:',
-                    'https://code.djangoproject.com/wiki/JSON1Extension'
-                ])
                 
             bump_startup_progress_bar()
 
@@ -1310,28 +952,16 @@ def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG,
             bump_startup_progress_bar()
 
             from django.conf import settings
+            
+            from plugins_sys.config.apps import SHELL_CONFIG
 
             # log startup message to the error log
             with open(settings.ERROR_LOG, "a", encoding='utf-8') as f:
                 command = ' '.join(sys.argv)
                 ts = datetime.now(timezone.utc).strftime('%Y-%m-%d__%H:%M:%S')
-                f.write(f"\n> {command}; TS={ts} VERSION={config['VERSION']} IN_DOCKER={config['IN_DOCKER']} IS_TTY={config['IS_TTY']}\n")
+                f.write(f"\n> {command}; TS={ts} VERSION={archivebox.VERSION} IN_DOCKER={SHELL_CONFIG.IN_DOCKER} IS_TTY={SHELL_CONFIG.IS_TTY}\n")
 
             if check_db:
-                # Enable WAL mode in sqlite3
-                from django.db import connection
-                with connection.cursor() as cursor:
-
-                    # Set Journal mode to WAL to allow for multiple writers
-                    current_mode = cursor.execute("PRAGMA journal_mode")
-                    if current_mode != 'wal':
-                        cursor.execute("PRAGMA journal_mode=wal;")
-
-                    # Set max blocking delay for concurrent writes and write sync mode
-                    # https://litestream.io/tips/#busy-timeout
-                    cursor.execute("PRAGMA busy_timeout = 5000;")
-                    cursor.execute("PRAGMA synchronous = NORMAL;")
-
                 # Create cache table in DB if needed
                 try:
                     from django.core.cache import cache
@@ -1348,9 +978,9 @@ def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG,
                 for conn in connections.all():
                     conn.close_if_unusable_or_obsolete()
 
-                sql_index_path = Path(output_dir) / SQL_INDEX_FILENAME
+                sql_index_path = archivebox.CONSTANTS.DATABASE_FILE
                 assert sql_index_path.exists(), (
-                    f'No database file {SQL_INDEX_FILENAME} found in: {config["OUTPUT_DIR"]} (Are you in an ArchiveBox collection directory?)')
+                    f'No database file {sql_index_path} found in: {archivebox.DATA_DIR} (Are you in an ArchiveBox collection directory?)')
 
                 bump_startup_progress_bar()
 
@@ -1363,7 +993,7 @@ def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG,
 
                     logfire.configure()
                     logfire.instrument_django(is_sql_commentor_enabled=True)
-                    logfire.info(f'Started ArchiveBox v{CONFIG.VERSION}', argv=sys.argv)
+                    logfire.info(f'Started ArchiveBox v{archivebox.VERSION}', argv=sys.argv)
 
         except KeyboardInterrupt:
             raise SystemExit(2)

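With the constants centralized, load_config_file() and write_config_file() no longer rederive the config path from the OUTPUT_DIR environment variable; they read archivebox.CONSTANTS.CONFIG_FILE directly. A rough sketch of the simplified lookup (read_user_config is a hypothetical name, not a function from this commit):

from configparser import ConfigParser
import archivebox

def read_user_config() -> dict:
    # same shape as load_config_file() above: the path comes straight from CONSTANTS
    config_path = archivebox.CONSTANTS.CONFIG_FILE        # <data dir>/ArchiveBox.conf
    if not config_path.exists():
        return {}
    parser = ConfigParser()
    parser.optionxform = str      # preserve key casing, as the real loader does
    parser.read(config_path, encoding='utf-8')
    return {section: dict(parser[section]) for section in parser.sections()}
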
+ 249 - 0
archivebox/constants.py

@@ -0,0 +1,249 @@
+__package__ = 'archivebox'
+
+
+import os
+from types import MappingProxyType
+from typing import Set, Dict, NamedTuple, Tuple
+from pathlib import Path
+
+from benedict import benedict
+
+import archivebox
+
+from .misc.logging import DEFAULT_CLI_COLORS
+
+###################### Config ##########################
+
+class ConstantsConfig(NamedTuple):
+    
+    VERSION: str = archivebox.__version__
+    
+    DEFAULT_CLI_COLORS: Dict[str, str]        = DEFAULT_CLI_COLORS
+    DISABLED_CLI_COLORS: Dict[str, str]       = benedict({k: '' for k in DEFAULT_CLI_COLORS})
+    
+    PACKAGE_DIR: Path = archivebox.PACKAGE_DIR
+    PACKAGE_DIR_NAME: str = archivebox.PACKAGE_DIR.name
+    TEMPLATES_DIR_NAME: str = 'templates'
+    TEMPLATES_DIR: Path                 = archivebox.PACKAGE_DIR / TEMPLATES_DIR_NAME
+    STATIC_DIR: Path                    = TEMPLATES_DIR / 'static'
+    USER_PLUGINS_DIR_NAME: str          = 'user_plugins'
+    CUSTOM_TEMPLATES_DIR_NAME: str      = 'user_templates'
+    
+    DATA_DIR: Path = archivebox.DATA_DIR
+    ARCHIVE_DIR_NAME: str = 'archive'
+    SOURCES_DIR_NAME: str = 'sources'
+    PERSONAS_DIR_NAME: str = 'personas'
+    CRONTABS_DIR_NAME: str = 'crontabs'
+    CACHE_DIR_NAME: str = 'cache'
+    LOGS_DIR_NAME: str = 'logs'
+    LIB_DIR_NAME: str = 'lib'
+    TMP_DIR_NAME: str = 'tmp'
+    OUTPUT_DIR: Path                    = archivebox.DATA_DIR
+    ARCHIVE_DIR: Path                   = archivebox.DATA_DIR / ARCHIVE_DIR_NAME
+    SOURCES_DIR: Path                   = archivebox.DATA_DIR / SOURCES_DIR_NAME
+    PERSONAS_DIR: Path                  = archivebox.DATA_DIR / PERSONAS_DIR_NAME
+    CACHE_DIR: Path                     = archivebox.DATA_DIR / CACHE_DIR_NAME
+    LOGS_DIR: Path                      = archivebox.DATA_DIR / LOGS_DIR_NAME
+    LIB_DIR: Path                       = archivebox.DATA_DIR / LIB_DIR_NAME
+    TMP_DIR: Path                       = archivebox.DATA_DIR / TMP_DIR_NAME
+    CUSTOM_TEMPLATES_DIR: Path          = archivebox.DATA_DIR / CUSTOM_TEMPLATES_DIR_NAME
+    USER_PLUGINS_DIR: Path              = archivebox.DATA_DIR / USER_PLUGINS_DIR_NAME
+    
+    LIB_PIP_DIR: Path                   = LIB_DIR / 'pip'
+    LIB_NPM_DIR: Path                   = LIB_DIR / 'npm'
+    LIB_BROWSERS_DIR: Path              = LIB_DIR / 'browsers'
+    LIB_BIN_DIR: Path                   = LIB_DIR / 'bin'
+    BIN_DIR: Path                       = LIB_BIN_DIR
+    
+    CONFIG_FILENAME: str = 'ArchiveBox.conf'
+    SQL_INDEX_FILENAME: str = 'index.sqlite3'
+    
+    CONFIG_FILE: Path                   = archivebox.DATA_DIR / CONFIG_FILENAME
+    DATABASE_FILE: Path                 = archivebox.DATA_DIR / SQL_INDEX_FILENAME
+    QUEUE_DATABASE_FILE: Path           = archivebox.DATA_DIR / SQL_INDEX_FILENAME.replace('index.', 'queue.')
+    
+    JSON_INDEX_FILENAME: str = 'index.json'
+    HTML_INDEX_FILENAME: str = 'index.html'
+    ROBOTS_TXT_FILENAME: str = 'robots.txt'
+    FAVICON_FILENAME: str = 'favicon.ico'
+    
+    STATICFILE_EXTENSIONS: frozenset[str] = frozenset((
+        # 99.999% of the time, URLs ending in these extensions are static files
+        # that can be downloaded as-is, not html pages that need to be rendered
+        'gif', 'jpeg', 'jpg', 'png', 'tif', 'tiff', 'wbmp', 'ico', 'jng', 'bmp',
+        'svg', 'svgz', 'webp', 'ps', 'eps', 'ai',
+        'mp3', 'mp4', 'm4a', 'mpeg', 'mpg', 'mkv', 'mov', 'webm', 'm4v',
+        'flv', 'wmv', 'avi', 'ogg', 'ts', 'm3u8',
+        'pdf', 'txt', 'rtf', 'rtfd', 'doc', 'docx', 'ppt', 'pptx', 'xls', 'xlsx',
+        'atom', 'rss', 'css', 'js', 'json',
+        'dmg', 'iso', 'img',
+        'rar', 'war', 'hqx', 'zip', 'gz', 'bz2', '7z',
+
+        # Less common extensions to consider adding later
+        # jar, swf, bin, com, exe, dll, deb
+        # ear, hqx, eot, wmlc, kml, kmz, cco, jardiff, jnlp, run, msi, msp, msm,
+        # pl pm, prc pdb, rar, rpm, sea, sit, tcl tk, der, pem, crt, xpi, xspf,
+        # ra, mng, asx, asf, 3gpp, 3gp, mid, midi, kar, jad, wml, htc, mml
+
+        # These are always treated as pages, not as static files, never add them:
+        # html, htm, shtml, xhtml, xml, aspx, php, cgi
+    ))
+    
+    INGORED_PATHS: frozenset[str] = frozenset((
+        ".git",
+        ".svn",
+        ".DS_Store",
+        ".gitignore",
+        "lost+found",
+        ".DS_Store",
+        ".env",
+        "Dockerfile",
+    ))
+    PIP_RELATED_NAMES: frozenset[str] = frozenset((
+        ".venv",
+        "venv",
+        "virtualenv",
+        ".virtualenv",
+    ))
+    NPM_RELATED_NAMES: frozenset[str] = frozenset((
+        "node_modules",
+        "package.json",
+        "package-lock.json",
+        "yarn.lock",
+    ))
+    
+    DATA_DIR_NAMES: frozenset[str] = frozenset((
+        ARCHIVE_DIR_NAME,
+        SOURCES_DIR_NAME,
+        LOGS_DIR_NAME,
+        CACHE_DIR_NAME,
+        LIB_DIR_NAME,
+        PERSONAS_DIR_NAME,
+        CUSTOM_TEMPLATES_DIR_NAME,
+        USER_PLUGINS_DIR_NAME,
+    ))
+    DATA_DIRS: frozenset[Path] = frozenset(archivebox.DATA_DIR / dirname for dirname in DATA_DIR_NAMES)
+    DATA_FILE_NAMES: frozenset[str] = frozenset((
+        CONFIG_FILENAME,
+        SQL_INDEX_FILENAME,
+        f"{SQL_INDEX_FILENAME}-wal",
+        f"{SQL_INDEX_FILENAME}-shm",
+        "queue.sqlite3",
+        "queue.sqlite3-wal",
+        "queue.sqlite3-shm",
+        "search.sqlite3",
+        JSON_INDEX_FILENAME,
+        HTML_INDEX_FILENAME,
+        ROBOTS_TXT_FILENAME,
+        FAVICON_FILENAME,
+        CONFIG_FILENAME,
+        f"{CONFIG_FILENAME}.bak",
+        "static_index.json",
+    ))
+    
+    # When initializing archivebox in a new directory, we check to make sure the dir is
+    # actually empty so that we dont clobber someone's home directory or desktop by accident.
+    # These files are exceptions to the is_empty check when we're trying to init a new dir,
+    # as they could be from a previous archivebox version, system artifacts, dependencies, etc.
+    ALLOWED_IN_OUTPUT_DIR: frozenset[str] = frozenset((
+        *INGORED_PATHS,
+        *PIP_RELATED_NAMES,
+        *NPM_RELATED_NAMES,
+        *DATA_DIR_NAMES,
+        *DATA_FILE_NAMES,
+        "static",                # created by old static exports <v0.6.0
+        "sonic",                 # created by docker bind mount
+    ))
+    
+    CODE_LOCATIONS = MappingProxyType(benedict({
+        'PACKAGE_DIR': {
+            'path': (archivebox.PACKAGE_DIR).resolve(),
+            'enabled': True,
+            'is_valid': (archivebox.PACKAGE_DIR / '__main__.py').exists(),
+        },
+        'LIB_DIR': {
+            'path': LIB_DIR.resolve(),
+            'enabled': True,
+            'is_valid': LIB_DIR.is_dir(),
+        },
+        'RUNTIME_CONFIG': {
+            'path': TMP_DIR.resolve(),
+            'enabled': True,
+            'is_valid': TMP_DIR.is_dir(),
+        },
+        'TEMPLATES_DIR': {
+            'path': TEMPLATES_DIR.resolve(),
+            'enabled': True,
+            'is_valid': STATIC_DIR.exists(),
+        },
+        'CUSTOM_TEMPLATES_DIR': {
+            'path': CUSTOM_TEMPLATES_DIR.resolve(),
+            'enabled': True,
+            'is_valid': CUSTOM_TEMPLATES_DIR.is_dir(),
+        },
+    }))
+        
+    DATA_LOCATIONS = MappingProxyType(benedict({
+        "OUTPUT_DIR": {
+            "path": archivebox.DATA_DIR.resolve(),
+            "enabled": True,
+            "is_valid": DATABASE_FILE.exists(),
+            "is_mount": os.path.ismount(archivebox.DATA_DIR.resolve()),
+        },
+        "CONFIG_FILE": {
+            "path": CONFIG_FILE.resolve(),
+            "enabled": True,
+            "is_valid": CONFIG_FILE.exists(),
+        },
+        "SQL_INDEX": {
+            "path": DATABASE_FILE.resolve(),
+            "enabled": True,
+            "is_valid": DATABASE_FILE.exists(),
+            "is_mount": os.path.ismount(DATABASE_FILE.resolve()),
+        },
+        "QUEUE_DATABASE": {
+            "path": QUEUE_DATABASE_FILE.resolve(),
+            "enabled": True,
+            "is_valid": QUEUE_DATABASE_FILE.exists(),
+            "is_mount": os.path.ismount(QUEUE_DATABASE_FILE.resolve()),
+        },
+        "ARCHIVE_DIR": {
+            "path": ARCHIVE_DIR.resolve(),
+            "enabled": True,
+            "is_valid": ARCHIVE_DIR.exists(),
+            "is_mount": os.path.ismount(ARCHIVE_DIR.resolve()),
+        },
+        "SOURCES_DIR": {
+            "path": SOURCES_DIR.resolve(),
+            "enabled": True,
+            "is_valid": SOURCES_DIR.exists(),
+        },
+        "PERSONAS_DIR": {
+            "path": PERSONAS_DIR.resolve(),
+            "enabled": PERSONAS_DIR.exists(),
+            "is_valid": PERSONAS_DIR.exists(),
+        },
+        "LOGS_DIR": {
+            "path": LOGS_DIR.resolve(),
+            "enabled": True,
+            "is_valid": LOGS_DIR.is_dir(),
+        },
+        "CACHE_DIR": {
+            "path": CACHE_DIR.resolve(),
+            "enabled": True,
+            "is_valid": CACHE_DIR.is_dir(),
+        },
+    }))
+    
+    def items(self):
+        return self._asdict().items()
+    
+    def keys(self):
+        return self._asdict().keys()
+    
+    def values(self):
+        return self._asdict().values()
+
+
+CONSTANTS = ConstantsConfig()
+CONSTANTS_CONFIG = CONSTANTS

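The ALLOWED_IN_OUTPUT_DIR set backs the "is this directory safe to init into" check described in the comment above it. A minimal sketch of how a caller could consume it (dir_is_initializable is a hypothetical name, not part of this commit):

from pathlib import Path
from archivebox.constants import CONSTANTS

def dir_is_initializable(path: Path) -> bool:
    """True if the directory only contains entries archivebox is allowed to coexist with."""
    unexpected = [
        entry.name
        for entry in path.iterdir()
        if entry.name not in CONSTANTS.ALLOWED_IN_OUTPUT_DIR
    ]
    return not unexpected
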
+ 3 - 3
archivebox/core/admin.py

@@ -2,7 +2,6 @@ __package__ = 'archivebox.core'
 
 import os
 
-import threading
 from pathlib import Path
 
 from django.contrib import admin, messages
@@ -19,6 +18,7 @@ from django.template import Template, RequestContext
 from django.conf import settings
 from django import forms
 
+import archivebox
 
 from signal_webhooks.admin import WebhookAdmin
 from signal_webhooks.utils import get_webhook_model
@@ -34,13 +34,13 @@ from queues.tasks import bg_archive_links, bg_archive_link, bg_add
 
 from index.html import snapshot_icons
 from logging_util import printable_filesize
-from main import add, remove
+from main import remove
 from extractors import archive_links
 
 
 CONFIG = settings.CONFIG
 
-GLOBAL_CONTEXT = {'VERSION': CONFIG.VERSION, 'VERSIONS_AVAILABLE': CONFIG.VERSIONS_AVAILABLE, 'CAN_UPGRADE': CONFIG.CAN_UPGRADE}
+GLOBAL_CONTEXT = {'VERSION': archivebox.VERSION, 'VERSIONS_AVAILABLE': [], 'CAN_UPGRADE': False}
 
 # Admin URLs
 # /admin/

+ 31 - 39
archivebox/core/settings.py

@@ -2,36 +2,27 @@ __package__ = 'archivebox.core'
 
 import os
 import sys
-import re
-import logging
 import inspect
-import tempfile
-import archivebox
 
 from typing import Dict
 from pathlib import Path
 
-import django
+from benedict import benedict
 from django.utils.crypto import get_random_string
 
+import archivebox
+
 from ..config import CONFIG
-from ..config_stubs import AttrDict
-assert isinstance(CONFIG, AttrDict)
 
 IS_MIGRATING = 'makemigrations' in sys.argv[:3] or 'migrate' in sys.argv[:3]
 IS_TESTING = 'test' in sys.argv[:3] or 'PYTEST_CURRENT_TEST' in os.environ
 IS_SHELL = 'shell' in sys.argv[:3] or 'shell_plus' in sys.argv[:3]
 
 
+VERSION = archivebox.__version__
 PACKAGE_DIR = archivebox.PACKAGE_DIR
-assert PACKAGE_DIR == CONFIG.PACKAGE_DIR
-
 DATA_DIR = archivebox.DATA_DIR
-assert DATA_DIR == CONFIG.OUTPUT_DIR
-ARCHIVE_DIR = DATA_DIR / 'archive'
-assert ARCHIVE_DIR == CONFIG.ARCHIVE_DIR
-
-VERSION = archivebox.__version__
+ARCHIVE_DIR = archivebox.DATA_DIR / 'archive'
 
 ################################################################################
 ### ArchiveBox Plugin Settings
@@ -39,17 +30,16 @@ VERSION = archivebox.__version__
 
 
 def find_plugins_in_dir(plugins_dir: Path, prefix: str) -> Dict[str, Path]:
-    """{"plugins_pkg.pip": "/app/archivebox/plugins_pkg/pip", "user_plugins.other": "/data/user_plugins/other",...}"""
     return {
         f"{prefix}.{plugin_entrypoint.parent.name}": plugin_entrypoint.parent
         for plugin_entrypoint in sorted(plugins_dir.glob("*/apps.py"))   # key=get_plugin_order  # Someday enforcing plugin import order may be required, but right now it's not needed
-    }
+    }   # "plugins_pkg.pip": "/app/archivebox/plugins_pkg/pip"
     
 PLUGIN_DIRS = {
     'plugins_sys':          PACKAGE_DIR / 'plugins_sys',
     'plugins_pkg':          PACKAGE_DIR / 'plugins_pkg',
     'plugins_auth':         PACKAGE_DIR / 'plugins_auth',
-    'plugins_search':         PACKAGE_DIR / 'plugins_search',
+    'plugins_search':       PACKAGE_DIR / 'plugins_search',
     'plugins_extractor':    PACKAGE_DIR / 'plugins_extractor',
     'user_plugins':         DATA_DIR / 'user_plugins',
 }
@@ -59,17 +49,17 @@ for plugin_prefix, plugin_dir in PLUGIN_DIRS.items():
 
 
 ### Plugins Globals (filled by plugin_type.pluginname.apps.PluginName.register() after Django startup)
-PLUGINS = AttrDict({})
-HOOKS = AttrDict({})
+PLUGINS = benedict({})
+HOOKS = benedict({})
 
-# Created later by Hook.register(settings) when each Plugin.register(settings) is called
-# CONFIGS = AttrDict({})
-# BINPROVIDERS = AttrDict({})
-# BINARIES = AttrDict({})
-# EXTRACTORS = AttrDict({})
-# REPLAYERS = AttrDict({})
-# CHECKS = AttrDict({})
-# ADMINDATAVIEWS = AttrDict({})
+# Created later by Plugin.register(settings) -> Hook.register(settings):
+# CONFIGS = benedict({})
+# BINPROVIDERS = benedict({})
+# BINARIES = benedict({})
+# EXTRACTORS = benedict({})
+# REPLAYERS = benedict({})
+# CHECKS = benedict({})
+# ADMINDATAVIEWS = benedict({})
 
 
 ################################################################################
@@ -113,7 +103,7 @@ INSTALLED_APPS = [
     'api',                       # Django-Ninja-based Rest API interfaces, config, APIToken model, etc.
 
     # ArchiveBox plugins
-    *INSTALLED_PLUGINS.keys(),   # all plugin django-apps found in archivebox/*_plugins and data/user_plugins,
+    *INSTALLED_PLUGINS.keys(),   # all plugin django-apps found in archivebox/plugins_* and data/user_plugins,
     # plugin.register(settings) is called at import of each plugin (in the order they are listed here), then plugin.ready() is called at AppConfig.ready() time
 
     # 3rd-party apps from PyPI that need to be loaded last
@@ -164,7 +154,7 @@ if LDAP_CONFIG.LDAP_ENABLED:
 ################################################################################
 
 STATIC_URL = '/static/'
-
+TEMPLATES_DIR_NAME = 'templates'
 STATICFILES_DIRS = [
     *([str(CONFIG.CUSTOM_TEMPLATES_DIR / 'static')] if CONFIG.CUSTOM_TEMPLATES_DIR else []),
     *[
@@ -172,7 +162,7 @@ STATICFILES_DIRS = [
         for plugin_dir in PLUGIN_DIRS.values()
         if (plugin_dir / 'static').is_dir()
     ],
-    str(PACKAGE_DIR / CONFIG.TEMPLATES_DIR_NAME / 'static'),
+    str(PACKAGE_DIR / TEMPLATES_DIR_NAME / 'static'),
 ]
 
 TEMPLATE_DIRS = [
@@ -182,9 +172,9 @@ TEMPLATE_DIRS = [
         for plugin_dir in PLUGIN_DIRS.values()
         if (plugin_dir / 'templates').is_dir()
     ],
-    str(PACKAGE_DIR / CONFIG.TEMPLATES_DIR_NAME / 'core'),
-    str(PACKAGE_DIR / CONFIG.TEMPLATES_DIR_NAME / 'admin'),
-    str(PACKAGE_DIR / CONFIG.TEMPLATES_DIR_NAME),
+    str(PACKAGE_DIR / TEMPLATES_DIR_NAME / 'core'),
+    str(PACKAGE_DIR / TEMPLATES_DIR_NAME / 'admin'),
+    str(PACKAGE_DIR / TEMPLATES_DIR_NAME),
 ]
 
 TEMPLATES = [
@@ -208,13 +198,14 @@ TEMPLATES = [
 ### External Service Settings
 ################################################################################
 
+from ..plugins_sys.config.constants import CONSTANTS
 
-CACHE_DB_FILENAME = 'cache.sqlite3'
-CACHE_DB_PATH = CONFIG.CACHE_DIR / CACHE_DB_FILENAME
-CACHE_DB_TABLE = 'django_cache'
+# CACHE_DB_FILENAME = 'cache.sqlite3'
+# CACHE_DB_PATH = CONSTANTS.CACHE_DIR / CACHE_DB_FILENAME
+# CACHE_DB_TABLE = 'django_cache'
 
-DATABASE_FILE = DATA_DIR / CONFIG.SQL_INDEX_FILENAME
-DATABASE_NAME = os.environ.get("ARCHIVEBOX_DATABASE_NAME", str(DATABASE_FILE))
+DATABASE_FILE = DATA_DIR / CONSTANTS.SQL_INDEX_FILENAME
+DATABASE_NAME = os.environ.get("ARCHIVEBOX_DATABASE_NAME", str(CONSTANTS.DATABASE_FILE))
 
 QUEUE_DATABASE_NAME = DATABASE_NAME.replace('index.sqlite3', 'queue.sqlite3')
 
@@ -222,6 +213,7 @@ SQLITE_CONNECTION_OPTIONS = {
     "TIME_ZONE": CONFIG.TIMEZONE,
     "OPTIONS": {
         # https://gcollazo.com/optimal-sqlite-settings-for-django/
+        # # https://litestream.io/tips/#busy-timeout
         "timeout": 5,
         "check_same_thread": False,
         "transaction_mode": "IMMEDIATE",
@@ -345,7 +337,7 @@ STORAGES = {
         "BACKEND": "django.core.files.storage.FileSystemStorage",
         "OPTIONS": {
             "base_url": "/archive/",
-            "location": CONFIG.ARCHIVE_DIR,
+            "location": ARCHIVE_DIR,
         },
     },
     # "personas": {

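The DATABASE_NAME lookup pairs with the os.environ.setdefault("ARCHIVEBOX_DATABASE_NAME", ":memory:") call added to setup_django() above: the environment variable, if set before core.settings is imported, overrides the on-disk index path from CONSTANTS. A hedged usage sketch (':memory:' is SQLite's standard in-memory alias; the env var name comes from this diff):

import os

# must be exported before Django settings are loaded, otherwise DATABASE_NAME
# has already been resolved to CONSTANTS.DATABASE_FILE (<data dir>/index.sqlite3)
os.environ["ARCHIVEBOX_DATABASE_NAME"] = ":memory:"

from archivebox.config import setup_django
setup_django(check_db=False)    # Django will now point the SQL index at an in-memory SQLite DB
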
+ 3 - 2
archivebox/extractors/__init__.py

@@ -14,7 +14,6 @@ from ..config import (
     SAVE_ALLOWLIST_PTN,
     SAVE_DENYLIST_PTN,
 )
-from ..core.settings import ERROR_LOG
 from ..index.schema import ArchiveResult, Link
 from ..index.sql import write_link_to_sql_index
 from ..index import (
@@ -109,6 +108,8 @@ def ignore_methods(to_ignore: List[str]) -> Iterable[str]:
 def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None, created_by_id: int | None=None) -> Link:
     """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp"""
 
+    from django.conf import settings
+
     from ..search import write_search_index
 
     # TODO: Remove when the input is changed to be a snapshot. Suboptimal approach.
@@ -169,7 +170,7 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
                     stats['skipped'] += 1
             except Exception as e:
                 # https://github.com/ArchiveBox/ArchiveBox/issues/984#issuecomment-1150541627
-                with open(ERROR_LOG, "a", encoding='utf-8') as f:
+                with open(settings.ERROR_LOG, "a", encoding='utf-8') as f:
                     command = ' '.join(sys.argv)
                     ts = datetime.now(timezone.utc).strftime('%Y-%m-%d__%H:%M:%S')
                     f.write(("\n" + 'Exception in archive_methods.save_{}(Link(url={})) command={}; ts={}'.format(

+ 3 - 2
archivebox/extractors/htmltotext.py

@@ -1,5 +1,7 @@
 __package__ = 'archivebox.extractors'
 
+import archivebox
+
 from html.parser import HTMLParser
 import io
 from pathlib import Path
@@ -8,7 +10,6 @@ from typing import Optional
 from ..config import (
     SAVE_HTMLTOTEXT,
     TIMEOUT,
-    VERSION,
 )
 from ..index.schema import Link, ArchiveResult, ArchiveError
 from ..logging_util import TimedProgress
@@ -153,7 +154,7 @@ def save_htmltotext(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
     return ArchiveResult(
         cmd=cmd,
         pwd=str(out_dir),
-        cmd_version=VERSION,
+        cmd_version=archivebox.__version__,
         output=output,
         status=status,
         index_texts=[extracted_text] if extracted_text else [],

+ 27 - 29
archivebox/extractors/readability.py

@@ -8,17 +8,7 @@ import json
 
 from ..index.schema import Link, ArchiveResult, ArchiveError
 from ..system import run, atomic_write
-from ..util import (
-    enforce_types,
-    is_static_file,
-)
-from ..config import (
-    TIMEOUT,
-    CURL_BINARY,
-    SAVE_READABILITY,
-    DEPENDENCIES,
-    READABILITY_VERSION,
-)
+from ..util import enforce_types, is_static_file
 from ..logging_util import TimedProgress
 from .title import get_html
 
@@ -31,22 +21,29 @@ def get_embed_path(archiveresult=None):
 
 @enforce_types
 def should_save_readability(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
+    from plugins_extractor.readability.apps import READABILITY_CONFIG
+    
     if is_static_file(link.url):
         return False
 
-    out_dir = out_dir or Path(link.link_dir)
-    if not overwrite and (out_dir / get_output_path()).exists():
+    output_subdir = (Path(out_dir or link.link_dir) / get_output_path())
+    if not overwrite and output_subdir.exists():
         return False
 
-    return SAVE_READABILITY
+    return READABILITY_CONFIG.SAVE_READABILITY
 
 
 @enforce_types
-def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=0) -> ArchiveResult:
     """download reader friendly version using @mozilla/readability"""
-
-    out_dir = Path(out_dir or link.link_dir)
-    output_folder = out_dir.absolute() / get_output_path()
+    
+    from plugins_extractor.readability.apps import READABILITY_CONFIG, READABILITY_BINARY
+    
+    READABILITY_BIN = READABILITY_BINARY.load()
+    assert READABILITY_BIN.abspath and READABILITY_BIN.version
+
+    timeout = timeout or READABILITY_CONFIG.READABILITY_TIMEOUT
+    output_subdir = Path(out_dir or link.link_dir).absolute() / get_output_path()
     output = get_output_path()
 
     # Readability Docs: https://github.com/mozilla/readability
@@ -54,13 +51,14 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
     status = 'succeeded'
     # fake command to show the user so they have something to try debugging if get_html fails
     cmd = [
-        CURL_BINARY,
-        link.url
+        str(READABILITY_BIN.abspath),
+        '{dom,singlefile}.html',
+        link.url,
     ]
     readability_content = None
     timer = TimedProgress(timeout, prefix='      ')
     try:
-        document = get_html(link, out_dir)
+        document = get_html(link, Path(out_dir or link.link_dir))
         temp_doc = NamedTemporaryFile(delete=False)
         temp_doc.write(document.encode("utf-8"))
         temp_doc.close()
@@ -69,26 +67,26 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
             raise ArchiveError('Readability could not find HTML to parse for article text')
 
         cmd = [
-            DEPENDENCIES['READABILITY_BINARY']['path'],
+            str(READABILITY_BIN.abspath),
             temp_doc.name,
             link.url,
         ]
-        result = run(cmd, cwd=out_dir, timeout=timeout)
+        result = run(cmd, cwd=out_dir, timeout=timeout, text=True)
         try:
             result_json = json.loads(result.stdout)
             assert result_json and 'content' in result_json, 'Readability output is not valid JSON'
         except json.JSONDecodeError:
             raise ArchiveError('Readability was not able to archive the page (invalid JSON)', result.stdout + result.stderr)
 
-        output_folder.mkdir(exist_ok=True)
+        output_subdir.mkdir(exist_ok=True)
         readability_content = result_json.pop("textContent") 
-        atomic_write(str(output_folder / "content.html"), result_json.pop("content"))
-        atomic_write(str(output_folder / "content.txt"), readability_content)
-        atomic_write(str(output_folder / "article.json"), result_json)
+        atomic_write(str(output_subdir / "content.html"), result_json.pop("content"))
+        atomic_write(str(output_subdir / "content.txt"), readability_content)
+        atomic_write(str(output_subdir / "article.json"), result_json)
 
         output_tail = [
             line.strip()
-            for line in (result.stdout + result.stderr).decode().rsplit('\n', 5)[-5:]
+            for line in (result.stdout + result.stderr).rsplit('\n', 5)[-5:]
             if line.strip()
         ]
         hints = (
@@ -111,7 +109,7 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
     return ArchiveResult(
         cmd=cmd,
         pwd=str(out_dir),
-        cmd_version=READABILITY_VERSION,
+        cmd_version=str(READABILITY_BIN.version),
         output=output,
         status=status,
         index_texts=[readability_content] if readability_content else [],
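
The readability extractor now resolves its binary and settings through the plugin system instead of module-level config; a rough sketch of that lookup, assuming the plugins_extractor.readability plugin is registered and loadable:

    from plugins_extractor.readability.apps import READABILITY_CONFIG, READABILITY_BINARY

    readability = READABILITY_BINARY.load()            # locate the binary via the configured binproviders
    assert readability.abspath and readability.version

    if READABILITY_CONFIG.SAVE_READABILITY:
        timeout = READABILITY_CONFIG.READABILITY_TIMEOUT
        # save_readability() then runs [str(readability.abspath), <html file>, url] with this timeout
        ...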

+ 33 - 34
archivebox/index/__init__.py

@@ -11,20 +11,19 @@ from contextlib import contextmanager
 from urllib.parse import urlparse
 from django.db.models import QuerySet, Q
 
+
+import archivebox
+
 from ..util import (
     scheme,
     enforce_types,
     ExtendedEncoder,
 )
+from ..misc.logging import stderr
 from ..config import (
-    ARCHIVE_DIR_NAME,
-    SQL_INDEX_FILENAME,
-    JSON_INDEX_FILENAME,
-    OUTPUT_DIR,
     TIMEOUT,
     URL_DENYLIST_PTN,
     URL_ALLOWLIST_PTN,
-    stderr,
     OUTPUT_PERMISSIONS
 )
 from ..logging_util import (
@@ -224,28 +223,28 @@ def timed_index_update(out_path: Path):
 
 
 @enforce_types
-def write_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR, created_by_id: int | None=None) -> None:
+def write_main_index(links: List[Link], out_dir: Path=archivebox.DATA_DIR, created_by_id: int | None=None) -> None:
     """Writes links to sqlite3 file for a given list of links"""
 
     log_indexing_process_started(len(links))
 
     try:
-        with timed_index_update(out_dir / SQL_INDEX_FILENAME):
+        with timed_index_update(archivebox.CONSTANTS.DATABASE_FILE):
             write_sql_main_index(links, out_dir=out_dir, created_by_id=created_by_id)
-            os.chmod(out_dir / SQL_INDEX_FILENAME, int(OUTPUT_PERMISSIONS, base=8)) # set here because we don't write it with atomic writes
+            os.chmod(archivebox.CONSTANTS.DATABASE_FILE, int(OUTPUT_PERMISSIONS, base=8)) # set here because we don't write it with atomic writes
 
     except (KeyboardInterrupt, SystemExit):
         stderr('[!] Warning: Still writing index to disk...', color='lightyellow')
         stderr('    Run archivebox init to fix any inconsistencies from an ungraceful exit.')
-        with timed_index_update(out_dir / SQL_INDEX_FILENAME):
+        with timed_index_update(archivebox.CONSTANTS.DATABASE_FILE):
             write_sql_main_index(links, out_dir=out_dir, created_by_id=created_by_id)
-            os.chmod(out_dir / SQL_INDEX_FILENAME, int(OUTPUT_PERMISSIONS, base=8)) # set here because we don't write it with atomic writes
+            os.chmod(archivebox.CONSTANTS.DATABASE_FILE, int(OUTPUT_PERMISSIONS, base=8)) # set here because we don't write it with atomic writes
         raise SystemExit(0)
 
     log_indexing_process_finished()
 
 @enforce_types
-def load_main_index(out_dir: Path=OUTPUT_DIR, warn: bool=True) -> List[Link]:
+def load_main_index(out_dir: Path=archivebox.DATA_DIR, warn: bool=True) -> List[Link]:
     """parse and load existing index with any new links from import_path merged in"""
     from core.models import Snapshot
     try:
@@ -255,8 +254,8 @@ def load_main_index(out_dir: Path=OUTPUT_DIR, warn: bool=True) -> List[Link]:
         raise SystemExit(0)
 
 @enforce_types
-def load_main_index_meta(out_dir: Path=OUTPUT_DIR) -> Optional[dict]:
-    index_path = out_dir / JSON_INDEX_FILENAME
+def load_main_index_meta(out_dir: Path=archivebox.DATA_DIR) -> Optional[dict]:
+    index_path = out_dir / archivebox.CONSTANTS.JSON_INDEX_FILENAME
     if index_path.exists():
         with open(index_path, 'r', encoding='utf-8') as f:
             meta_dict = pyjson.load(f)
@@ -407,7 +406,7 @@ def snapshot_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type
         return search_filter(snapshots, filter_patterns, filter_type)
 
 
-def get_indexed_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_indexed_folders(snapshots, out_dir: Path=archivebox.DATA_DIR) -> Dict[str, Optional[Link]]:
     """indexed links without checking archive status or data directory validity"""
     links = (snapshot.as_link() for snapshot in snapshots.iterator(chunk_size=500))
     return {
@@ -415,7 +414,7 @@ def get_indexed_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Option
         for link in links
     }
 
-def get_archived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_archived_folders(snapshots, out_dir: Path=archivebox.DATA_DIR) -> Dict[str, Optional[Link]]:
     """indexed links that are archived with a valid data directory"""
     links = (snapshot.as_link() for snapshot in snapshots.iterator(chunk_size=500))
     return {
@@ -423,7 +422,7 @@ def get_archived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optio
         for link in filter(is_archived, links)
     }
 
-def get_unarchived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_unarchived_folders(snapshots, out_dir: Path=archivebox.DATA_DIR) -> Dict[str, Optional[Link]]:
     """indexed links that are unarchived with no data directory or an empty data directory"""
     links = (snapshot.as_link() for snapshot in snapshots.iterator(chunk_size=500))
     return {
@@ -431,12 +430,12 @@ def get_unarchived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Opt
         for link in filter(is_unarchived, links)
     }
 
-def get_present_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_present_folders(snapshots, out_dir: Path=archivebox.DATA_DIR) -> Dict[str, Optional[Link]]:
     """dirs that actually exist in the archive/ folder"""
 
     all_folders = {}
 
-    for entry in (out_dir / ARCHIVE_DIR_NAME).iterdir():
+    for entry in (out_dir / archivebox.CONSTANTS.ARCHIVE_DIR_NAME).iterdir():
         if entry.is_dir():
             link = None
             try:
@@ -448,7 +447,7 @@ def get_present_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Option
 
     return all_folders
 
-def get_valid_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_valid_folders(snapshots, out_dir: Path=archivebox.DATA_DIR) -> Dict[str, Optional[Link]]:
     """dirs with a valid index matched to the main index and archived content"""
     links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator(chunk_size=500)]
     return {
@@ -456,16 +455,16 @@ def get_valid_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional
         for link in filter(is_valid, links)
     }
 
-def get_invalid_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_invalid_folders(snapshots, out_dir: Path=archivebox.DATA_DIR) -> Dict[str, Optional[Link]]:
     """dirs that are invalid for any reason: corrupted/duplicate/orphaned/unrecognized"""
-    duplicate = get_duplicate_folders(snapshots, out_dir=OUTPUT_DIR)
-    orphaned = get_orphaned_folders(snapshots, out_dir=OUTPUT_DIR)
-    corrupted = get_corrupted_folders(snapshots, out_dir=OUTPUT_DIR)
-    unrecognized = get_unrecognized_folders(snapshots, out_dir=OUTPUT_DIR)
+    duplicate = get_duplicate_folders(snapshots, out_dir=out_dir)
+    orphaned = get_orphaned_folders(snapshots, out_dir=out_dir)
+    corrupted = get_corrupted_folders(snapshots, out_dir=out_dir)
+    unrecognized = get_unrecognized_folders(snapshots, out_dir=out_dir)
     return {**duplicate, **orphaned, **corrupted, **unrecognized}
 
 
-def get_duplicate_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_duplicate_folders(snapshots, out_dir: Path=archivebox.DATA_DIR) -> Dict[str, Optional[Link]]:
     """dirs that conflict with other directories that have the same link URL or timestamp"""
     by_url = {}
     by_timestamp = {}
@@ -473,7 +472,7 @@ def get_duplicate_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Opti
 
     data_folders = (
         str(entry)
-        for entry in (Path(out_dir) / ARCHIVE_DIR_NAME).iterdir()
+        for entry in archivebox.CONSTANTS.ARCHIVE_DIR.iterdir()
             if entry.is_dir() and not snapshots.filter(timestamp=entry.name).exists()
     )
 
@@ -499,11 +498,11 @@ def get_duplicate_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Opti
                 duplicate_folders[path] = link
     return duplicate_folders
 
-def get_orphaned_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_orphaned_folders(snapshots, out_dir: Path=archivebox.DATA_DIR) -> Dict[str, Optional[Link]]:
     """dirs that contain a valid index but aren't listed in the main index"""
     orphaned_folders = {}
 
-    for entry in (Path(out_dir) / ARCHIVE_DIR_NAME).iterdir():
+    for entry in archivebox.CONSTANTS.ARCHIVE_DIR.iterdir():
         if entry.is_dir():
             link = None
             try:
@@ -517,7 +516,7 @@ def get_orphaned_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optio
 
     return orphaned_folders
 
-def get_corrupted_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_corrupted_folders(snapshots, out_dir: Path=archivebox.DATA_DIR) -> Dict[str, Optional[Link]]:
     """dirs that don't contain a valid index and aren't listed in the main index"""
     corrupted = {}
     for snapshot in snapshots.iterator(chunk_size=500):
@@ -526,11 +525,11 @@ def get_corrupted_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Opti
             corrupted[link.link_dir] = link
     return corrupted
 
-def get_unrecognized_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_unrecognized_folders(snapshots, out_dir: Path=archivebox.DATA_DIR) -> Dict[str, Optional[Link]]:
     """dirs that don't contain recognizable archive data and aren't listed in the main index"""
     unrecognized_folders: Dict[str, Optional[Link]] = {}
 
-    for entry in (Path(out_dir) / ARCHIVE_DIR_NAME).iterdir():
+    for entry in (Path(out_dir) / archivebox.CONSTANTS.ARCHIVE_DIR_NAME).iterdir():
         if entry.is_dir():
             index_exists = (entry / "index.json").exists()
             link = None
@@ -595,10 +594,10 @@ def is_unarchived(link: Link) -> bool:
     return not link.is_archived
 
 
-def fix_invalid_folder_locations(out_dir: Path=OUTPUT_DIR) -> Tuple[List[str], List[str]]:
+def fix_invalid_folder_locations(out_dir: Path=archivebox.DATA_DIR) -> Tuple[List[str], List[str]]:
     fixed = []
     cant_fix = []
-    for entry in os.scandir(out_dir / ARCHIVE_DIR_NAME):
+    for entry in os.scandir(out_dir / archivebox.CONSTANTS.ARCHIVE_DIR_NAME):
         if entry.is_dir(follow_symlinks=True):
             if (Path(entry.path) / 'index.json').exists():
                 try:
@@ -609,7 +608,7 @@ def fix_invalid_folder_locations(out_dir: Path=OUTPUT_DIR) -> Tuple[List[str], L
                     continue
 
                 if not entry.path.endswith(f'/{link.timestamp}'):
-                    dest = out_dir / ARCHIVE_DIR_NAME / link.timestamp
+                    dest = out_dir / archivebox.CONSTANTS.ARCHIVE_DIR_NAME / link.timestamp
                     if dest.exists():
                         cant_fix.append(entry.path)
                     else:
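
All of the folder-status helpers above now default to archivebox.DATA_DIR and resolve archive paths via archivebox.CONSTANTS instead of the old OUTPUT_DIR/ARCHIVE_DIR_NAME config values; a rough usage sketch, assuming Django is set up inside an ArchiveBox data directory:

    import archivebox
    from core.models import Snapshot
    from archivebox.index import get_valid_folders, get_invalid_folders

    snapshots = Snapshot.objects.all()
    ok_dirs  = get_valid_folders(snapshots, out_dir=archivebox.DATA_DIR)
    # get_invalid_folders() scans archivebox.CONSTANTS.ARCHIVE_DIR for duplicate/orphaned/corrupted dirs
    bad_dirs = get_invalid_folders(snapshots, out_dir=archivebox.DATA_DIR)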

+ 13 - 11
archivebox/index/html.py

@@ -1,11 +1,12 @@
 __package__ = 'archivebox.index'
 
+import archivebox
 from pathlib import Path
 from datetime import datetime, timezone
 from collections import defaultdict
 from typing import List, Optional, Iterator, Mapping
 
-from django.utils.html import format_html, mark_safe
+from django.utils.html import format_html, mark_safe   # type: ignore
 from django.core.cache import cache
 
 from .schema import Link
@@ -19,10 +20,6 @@ from ..util import (
     urldecode,
 )
 from ..config import (
-    OUTPUT_DIR,
-    VERSION,
-    FOOTER_INFO,
-    HTML_INDEX_FILENAME,
     SAVE_ARCHIVE_DOT_ORG,
     PREVIEW_ORIGINALS,
 )
@@ -36,10 +33,12 @@ TITLE_LOADING_MSG = 'Not yet archived...'
 ### Main Links Index
 
 @enforce_types
-def parse_html_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[str]:
+def parse_html_main_index(out_dir: Path=archivebox.DATA_DIR) -> Iterator[str]:
     """parse an archive index html file and return the list of urls"""
 
-    index_path = Path(out_dir) / HTML_INDEX_FILENAME
+    from plugins_sys.config.constants import CONSTANTS
+
+    index_path = Path(out_dir) / CONSTANTS.HTML_INDEX_FILENAME
     if index_path.exists():
         with open(index_path, 'r', encoding='utf-8') as f:
             for line in f:
@@ -59,14 +58,16 @@ def generate_index_from_links(links: List[Link], with_headers: bool):
 def main_index_template(links: List[Link], template: str=MAIN_INDEX_TEMPLATE) -> str:
     """render the template for the entire main index"""
 
+    from plugins_sys.config.apps import SHELL_CONFIG, SERVER_CONFIG
+
     return render_django_template(template, {
-        'version': VERSION,
-        'git_sha': VERSION,  # not used anymore, but kept for backwards compatibility
+        'version': archivebox.VERSION,
+        'git_sha': SHELL_CONFIG.COMMIT_HASH or archivebox.VERSION,
         'num_links': str(len(links)),
         'date_updated': datetime.now(timezone.utc).strftime('%Y-%m-%d'),
         'time_updated': datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M'),
         'links': [link._asdict(extended=True) for link in links],
-        'FOOTER_INFO': FOOTER_INFO,
+        'FOOTER_INFO': SERVER_CONFIG.FOOTER_INFO,
     })
 
 
@@ -74,10 +75,11 @@ def main_index_template(links: List[Link], template: str=MAIN_INDEX_TEMPLATE) ->
 
 @enforce_types
 def write_html_link_details(link: Link, out_dir: Optional[str]=None) -> None:
+    from plugins_sys.config.constants import CONSTANTS
     out_dir = out_dir or link.link_dir
 
     rendered_html = link_details_template(link)
-    atomic_write(str(Path(out_dir) / HTML_INDEX_FILENAME), rendered_html)
+    atomic_write(str(Path(out_dir) / CONSTANTS.HTML_INDEX_FILENAME), rendered_html)
 
 
 @enforce_types
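
The HTML index templates now read their version and footer values from the SHELL_CONFIG/SERVER_CONFIG plugin configs at render time (imported inside the function bodies so Django settings are not touched at module import); the render context boils down to roughly:

    import archivebox
    from plugins_sys.config.apps import SHELL_CONFIG, SERVER_CONFIG

    template_context = {
        'version': archivebox.VERSION,
        'git_sha': SHELL_CONFIG.COMMIT_HASH or archivebox.VERSION,   # falls back when there is no git checkout
        'FOOTER_INFO': SERVER_CONFIG.FOOTER_INFO,
    }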

+ 40 - 34
archivebox/index/json.py

@@ -8,38 +8,36 @@ from pathlib import Path
 from datetime import datetime, timezone
 from typing import List, Optional, Iterator, Any, Union
 
+import archivebox
+
 from .schema import Link
 from ..system import atomic_write
 from ..util import enforce_types
-from ..config import (
-    VERSION,
-    OUTPUT_DIR,
-    FOOTER_INFO,
-    DEPENDENCIES,
-    JSON_INDEX_FILENAME,
-    ARCHIVE_DIR_NAME,
-    ANSI
-)
-
-
-MAIN_INDEX_HEADER = {
-    'info': 'This is an index of site data archived by ArchiveBox: The self-hosted web archive.',
-    'schema': 'archivebox.index.json',
-    'copyright_info': FOOTER_INFO,
-    'meta': {
-        'project': 'ArchiveBox',
-        'version': VERSION,
-        'git_sha': VERSION,  # not used anymore, but kept for backwards compatibility
-        'website': 'https://ArchiveBox.io',
-        'docs': 'https://github.com/ArchiveBox/ArchiveBox/wiki',
-        'source': 'https://github.com/ArchiveBox/ArchiveBox',
-        'issues': 'https://github.com/ArchiveBox/ArchiveBox/issues',
-        'dependencies': DEPENDENCIES,
-    },
-}
+
+
 
 @enforce_types
 def generate_json_index_from_links(links: List[Link], with_headers: bool):
+    from django.conf import settings
+    from plugins_sys.config.apps import SERVER_CONFIG
+    
+    MAIN_INDEX_HEADER = {
+        'info': 'This is an index of site data archived by ArchiveBox: The self-hosted web archive.',
+        'schema': 'archivebox.index.json',
+        'copyright_info': SERVER_CONFIG.FOOTER_INFO,
+        'meta': {
+            'project': 'ArchiveBox',
+            'version': archivebox.VERSION,
+            'git_sha': archivebox.VERSION,  # not used anymore, but kept for backwards compatibility
+            'website': 'https://ArchiveBox.io',
+            'docs': 'https://github.com/ArchiveBox/ArchiveBox/wiki',
+            'source': 'https://github.com/ArchiveBox/ArchiveBox',
+            'issues': 'https://github.com/ArchiveBox/ArchiveBox/issues',
+            'dependencies': settings.BINARIES.to_dict(),
+        },
+    }
+    
+    
     if with_headers:
         output = {
             **MAIN_INDEX_HEADER,
@@ -54,10 +52,12 @@ def generate_json_index_from_links(links: List[Link], with_headers: bool):
 
 
 @enforce_types
-def parse_json_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[Link]:
+def parse_json_main_index(out_dir: Path=archivebox.DATA_DIR) -> Iterator[Link]:
     """parse an archive index json file and return the list of links"""
 
-    index_path = Path(out_dir) / JSON_INDEX_FILENAME
+    from plugins_sys.config.constants import CONSTANTS
+
+    index_path = Path(out_dir) / CONSTANTS.JSON_INDEX_FILENAME
     if index_path.exists():
         with open(index_path, 'r', encoding='utf-8') as f:
             try:
@@ -77,14 +77,14 @@ def parse_json_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[Link]:
                     yield Link.from_json(link_json)
                 except KeyError:
                     try:
-                        detail_index_path = Path(OUTPUT_DIR) / ARCHIVE_DIR_NAME / link_json['timestamp']
+                        detail_index_path = CONSTANTS.ARCHIVE_DIR / link_json['timestamp']
                         yield parse_json_link_details(str(detail_index_path))
                     except KeyError: 
                         # as a last effort, try to guess the missing values out of existing ones
                         try:
                             yield Link.from_json(link_json, guess=True)
                         except KeyError:
-                            print("    {lightyellow}! Failed to load the index.json from {}".format(detail_index_path, **ANSI))
+                            # print("    {lightyellow}! Failed to load the index.json from {}".format(detail_index_path, **ANSI))
                             continue
     return ()
 
@@ -94,15 +94,19 @@ def parse_json_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[Link]:
 def write_json_link_details(link: Link, out_dir: Optional[str]=None) -> None:
     """write a json file with some info about the link"""
     
+    from plugins_sys.config.constants import CONSTANTS
+    
     out_dir = out_dir or link.link_dir
-    path = Path(out_dir) / JSON_INDEX_FILENAME
+    path = Path(out_dir) / CONSTANTS.JSON_INDEX_FILENAME
     atomic_write(str(path), link._asdict(extended=True))
 
 
 @enforce_types
-def parse_json_link_details(out_dir: Union[Path, str], guess: Optional[bool]=False) -> Optional[Link]:
+def parse_json_link_details(out_dir: Union[Path, str], guess: bool=False) -> Optional[Link]:
     """load the json link index from a given directory"""
-    existing_index = Path(out_dir) / JSON_INDEX_FILENAME
+    from plugins_sys.config.constants import CONSTANTS
+    
+    existing_index = Path(out_dir) / CONSTANTS.JSON_INDEX_FILENAME
     if existing_index.exists():
         with open(existing_index, 'r', encoding='utf-8') as f:
             try:
@@ -117,7 +121,9 @@ def parse_json_link_details(out_dir: Union[Path, str], guess: Optional[bool]=Fal
 def parse_json_links_details(out_dir: Union[Path, str]) -> Iterator[Link]:
     """read through all the archive data folders and return the parsed links"""
 
-    for entry in os.scandir(Path(out_dir) / ARCHIVE_DIR_NAME):
+    from plugins_sys.config.constants import CONSTANTS
+
+    for entry in os.scandir(CONSTANTS.ARCHIVE_DIR):
         if entry.is_dir(follow_symlinks=True):
             if (Path(entry.path) / 'index.json').exists():
                 try:
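
The static JSON index header is now assembled inside generate_json_index_from_links() so it reflects live settings (FOOTER_INFO, installed binaries) instead of values frozen at import time; a minimal sketch of producing a headered export, assuming links were already loaded from the SQL index:

    from archivebox.index.json import generate_json_index_from_links

    payload = generate_json_index_from_links(links, with_headers=True)
    # payload['meta']['dependencies'] now comes from settings.BINARIES.to_dict() rather than the old DEPENDENCIES dict
    print(payload['meta']['version'])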

+ 29 - 31
archivebox/logging_util.py

@@ -4,8 +4,11 @@ import re
 import os
 import sys
 import stat
+import shutil
 import time
 import argparse
+import archivebox
+
 from math import log
 from multiprocessing import Process
 from pathlib import Path
@@ -22,18 +25,7 @@ from rich.panel import Panel
 
 from .system import get_dir_size
 from .util import enforce_types
-from .config import (
-    ConfigDict,
-    OUTPUT_DIR,
-    VERSION,
-    ANSI,
-    IS_TTY,
-    IN_DOCKER,
-    TERM_WIDTH,
-    SHOW_PROGRESS,
-    SOURCES_DIR_NAME,
-    stderr,
-)
+from .misc.logging import ANSI, stderr
 
 @dataclass
 class RuntimeStats:
@@ -102,7 +94,7 @@ def reject_stdin(caller: str, stdin: Optional[IO]=sys.stdin) -> None:
     if not stdin:
         return None
 
-    if IN_DOCKER:
+    if os.environ.get('IN_DOCKER') in ('1', 'true', 'True', 'TRUE', 'yes'):
         # when TTY is disabled in docker we cant tell if stdin is being piped in or not
         # if we try to read stdin when its not piped we will hang indefinitely waiting for it
         return None
@@ -141,9 +133,14 @@ class TimedProgress:
 
     def __init__(self, seconds, prefix=''):
 
-        self.SHOW_PROGRESS = SHOW_PROGRESS
+        from plugins_sys.config.apps import SHELL_CONFIG
+
+        self.SHOW_PROGRESS = SHELL_CONFIG.SHOW_PROGRESS
+        self.ANSI = SHELL_CONFIG.ANSI
+        self.TERM_WIDTH = lambda: shutil.get_terminal_size().columns      # lambda so it live-updates when terminal is resized
+        
         if self.SHOW_PROGRESS:
-            self.p = Process(target=progress_bar, args=(seconds, prefix))
+            self.p = Process(target=progress_bar, args=(seconds, prefix, self.ANSI))
             self.p.start()
 
         self.stats = {'start_ts': datetime.now(timezone.utc), 'end_ts': None}
@@ -172,7 +169,7 @@ class TimedProgress:
 
                 # clear whole terminal line
                 try:
-                    sys.stdout.write('\r{}{}\r'.format((' ' * TERM_WIDTH()), ANSI['reset']))
+                    sys.stdout.write('\r{}{}\r'.format((' ' * self.TERM_WIDTH()), self.ANSI['reset']))
                 except (IOError, BrokenPipeError):
                     # ignore when the parent proc has stopped listening to our stdout
                     pass
@@ -181,9 +178,10 @@ class TimedProgress:
 
 
 @enforce_types
-def progress_bar(seconds: int, prefix: str='') -> None:
+def progress_bar(seconds: int, prefix: str='', ANSI: Dict[str, str]=ANSI) -> None:
     """show timer in the form of progress bar, with percentage and seconds remaining"""
-    chunk = '█' if (sys.stdout or sys.__stdout__).encoding.upper() == 'UTF-8' else '#'
+    output_buf = (sys.stdout or sys.__stdout__ or sys.stderr or sys.__stderr__)
+    chunk = '█' if output_buf and output_buf.encoding.upper() == 'UTF-8' else '#'
     last_width = TERM_WIDTH()
     chunks = last_width - len(prefix) - 20  # number of progress chunks to show (aka max bar width)
     try:
@@ -236,18 +234,15 @@ def log_cli_command(subcommand: str, subcommand_args: List[str], stdin: Optional
     args = ' '.join(subcommand_args)
     version_msg = '[dark_magenta]\\[i] [{now}] ArchiveBox v{VERSION}: [/dark_magenta][green4]archivebox [green3]{subcommand}[green2] {args}[/green2]'.format(
         now=datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S'),
-        VERSION=VERSION,
+        VERSION=archivebox.__version__,
         subcommand=subcommand,
         args=args,
     )
     # stderr()
     # stderr('[bright_black]    > {pwd}[/]'.format(pwd=pwd, **ANSI))
     # stderr()
-    if SHOW_PROGRESS:
-        print(Panel(version_msg), file=sys.stderr)
-    else:
-        print(version_msg, file=sys.stderr)
-
+    print(Panel(version_msg), file=sys.stderr)
+    
 ### Parsing Stage
 
 
@@ -261,7 +256,8 @@ def log_importing_started(urls: Union[str, List[str]], depth: int, index_only: b
     ))
 
 def log_source_saved(source_file: str):
-    print('    > Saved verbatim input to {}/{}'.format(SOURCES_DIR_NAME, source_file.rsplit('/', 1)[-1]))
+    from plugins_sys.config.constants import CONSTANTS
+    print('    > Saved verbatim input to {}/{}'.format(CONSTANTS.SOURCES_DIR_NAME, source_file.rsplit('/', 1)[-1]))
 
 def log_parsing_finished(num_parsed: int, parser_name: str):
     _LAST_RUN_STATS.parse_end_ts = datetime.now(timezone.utc)
@@ -293,12 +289,14 @@ def log_indexing_process_finished():
 
 
 def log_indexing_started(out_path: str):
-    if IS_TTY:
-        sys.stdout.write(f'    > ./{Path(out_path).relative_to(OUTPUT_DIR)}')
+    from plugins_sys.config.apps import SHELL_CONFIG
+    
+    if SHELL_CONFIG.IS_TTY:
+        sys.stdout.write(f'    > ./{Path(out_path).relative_to(archivebox.DATA_DIR)}')
 
 
 def log_indexing_finished(out_path: str):
-    print(f'\r    √ ./{Path(out_path).relative_to(OUTPUT_DIR)}')
+    print(f'\r    √ ./{Path(out_path).relative_to(archivebox.DATA_DIR)}')
 
 
 ### Archiving Stage
@@ -447,7 +445,7 @@ def log_archive_method_finished(result: "ArchiveResult"):
             )
 
         docker_hints = ()
-        if IN_DOCKER:
+        if os.environ.get('IN_DOCKER') in ('1', 'true', 'True', 'TRUE', 'yes'):
             docker_hints = (
                 '  docker run -it -v $PWD/data:/data archivebox/archivebox /bin/bash',
             )
@@ -534,7 +532,7 @@ def log_shell_welcome_msg():
 ### Helpers
 
 @enforce_types
-def pretty_path(path: Union[Path, str], pwd: Union[Path, str]=OUTPUT_DIR) -> str:
+def pretty_path(path: Union[Path, str], pwd: Union[Path, str]=archivebox.DATA_DIR) -> str:
     """convert paths like .../ArchiveBox/archivebox/../output/abc into output/abc"""
     pwd = str(Path(pwd))  # .resolve()
     path = str(path)
@@ -577,7 +575,7 @@ def printable_folders(folders: Dict[str, Optional["Link"]],
 
 
 @enforce_types
-def printable_config(config: ConfigDict, prefix: str='') -> str:
+def printable_config(config: dict, prefix: str='') -> str:
     return f'\n{prefix}'.join(
         f'{key}={val}'
         for key, val in config.items()
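
TimedProgress now pulls SHOW_PROGRESS and ANSI from SHELL_CONFIG at construction time and forwards ANSI to the child progress_bar process, so callers no longer depend on globals from config; usage stays the same as before (do_work() below is a placeholder):

    from archivebox.logging_util import TimedProgress

    timer = TimedProgress(60, prefix='      ')   # spawns progress_bar(60, '      ', SHELL_CONFIG.ANSI) when SHOW_PROGRESS is on
    try:
        do_work()                                # placeholder for the actual extractor step
    finally:
        timer.end()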

+ 50 - 63
archivebox/main.py

@@ -6,6 +6,8 @@ import shutil
 import platform
 import archivebox
 
+CONSTANTS = archivebox.CONSTANTS
+
 from typing import Dict, List, Optional, Iterable, IO, Union
 from pathlib import Path
 from datetime import date, datetime
@@ -66,47 +68,25 @@ from .index.html import (
 )
 from .index.csv import links_to_csv
 from .extractors import archive_links, archive_link, ignore_methods
-from .misc.logging import stderr, hint
+from .misc.logging import stderr, hint, ANSI
 from .misc.checks import check_data_folder, check_dependencies
 from .config import (
     setup_django_minimal,
     ConfigDict,
-    ANSI,
     IS_TTY,
     DEBUG,
     IN_DOCKER,
     IN_QEMU,
     PUID,
     PGID,
-    USER,
     TIMEZONE,
-    ENFORCE_ATOMIC_WRITES,
-    OUTPUT_PERMISSIONS,
     ONLY_NEW,
-    OUTPUT_DIR,
-    SOURCES_DIR,
-    ARCHIVE_DIR,
-    LOGS_DIR,
-    PACKAGE_DIR,
-    CONFIG_FILE,
-    ARCHIVE_DIR_NAME,
     JSON_INDEX_FILENAME,
     HTML_INDEX_FILENAME,
     SQL_INDEX_FILENAME,
-    ALLOWED_IN_OUTPUT_DIR,
     LDAP,
     write_config_file,
-    VERSION,
-    COMMIT_HASH,
-    BUILD_TIME,
-    CODE_LOCATIONS,
-    DATA_LOCATIONS,
     DEPENDENCIES,
-    YOUTUBEDL_BINARY,
-    YOUTUBEDL_VERSION,
-    SINGLEFILE_VERSION,
-    READABILITY_VERSION,
-    MERCURY_VERSION,
     load_all_config,
     CONFIG,
     USER_CONFIG,
@@ -114,7 +94,6 @@ from .config import (
     setup_django,
 )
 from .logging_util import (
-    TERM_WIDTH,
     TimedProgress,
     log_importing_started,
     log_crawl_started,
@@ -129,9 +108,14 @@ from .logging_util import (
     printable_dependency_version,
 )
 
+VERSION = archivebox.VERSION
+PACKAGE_DIR = archivebox.PACKAGE_DIR
+OUTPUT_DIR = archivebox.DATA_DIR
+ARCHIVE_DIR = archivebox.DATA_DIR / 'archive'
+
 
 @enforce_types
-def help(out_dir: Path=OUTPUT_DIR) -> None:
+def help(out_dir: Path=archivebox.DATA_DIR) -> None:
     """Print the ArchiveBox help message and usage"""
 
     all_subcommands = CLI_SUBCOMMANDS
@@ -207,7 +191,7 @@ def version(quiet: bool=False,
     """Print the ArchiveBox version and dependency information"""
     
     setup_django_minimal()
-    from plugins_sys.config.apps import SEARCH_BACKEND_CONFIG, STORAGE_CONFIG, SHELL_CONFIG
+    from plugins_sys.config.apps import SEARCH_BACKEND_CONFIG, STORAGE_CONFIG, SHELL_CONFIG, CONSTANTS
     from plugins_auth.ldap.apps import LDAP_CONFIG
     from django.conf import settings
     
@@ -223,8 +207,8 @@ def version(quiet: bool=False,
         p = platform.uname()
         print(
             'ArchiveBox v{}'.format(archivebox.__version__),
-            f'COMMIT_HASH={COMMIT_HASH[:7] if COMMIT_HASH else "unknown"}',
-            f'BUILD_TIME={BUILD_TIME}',
+            f'COMMIT_HASH={SHELL_CONFIG.COMMIT_HASH[:7] if SHELL_CONFIG.COMMIT_HASH else "unknown"}',
+            f'BUILD_TIME={SHELL_CONFIG.BUILD_TIME}',
         )
         print(
             f'IN_DOCKER={IN_DOCKER}',
@@ -234,7 +218,7 @@ def version(quiet: bool=False,
             f'PLATFORM={platform.platform()}',
             f'PYTHON={sys.implementation.name.title()}',
         )
-        OUTPUT_IS_REMOTE_FS = DATA_LOCATIONS['OUTPUT_DIR']['is_mount'] or DATA_LOCATIONS['ARCHIVE_DIR']['is_mount']
+        OUTPUT_IS_REMOTE_FS = CONSTANTS.DATA_LOCATIONS['OUTPUT_DIR']['is_mount'] or CONSTANTS.DATA_LOCATIONS['ARCHIVE_DIR']['is_mount']
         print(
             f'FS_ATOMIC={STORAGE_CONFIG.ENFORCE_ATOMIC_WRITES}',
             f'FS_REMOTE={OUTPUT_IS_REMOTE_FS}',
@@ -268,17 +252,18 @@ def version(quiet: bool=False,
             except Exception as e:
                 err = e
                 loaded_bin = binary
+                raise
             print('', '√' if loaded_bin.is_valid else 'X', '', loaded_bin.name.ljust(21), str(loaded_bin.version).ljust(15), loaded_bin.abspath or str(err))
    
         print()
         print('{white}[i] Source-code locations:{reset}'.format(**ANSI))
-        for name, path in CODE_LOCATIONS.items():
+        for name, path in CONSTANTS.CODE_LOCATIONS.items():
             print(printable_folder_status(name, path))
 
         print()
-        if DATA_LOCATIONS['OUTPUT_DIR']['is_valid']:
+        if CONSTANTS.DATABASE_FILE.exists() or CONSTANTS.ARCHIVE_DIR.exists() or CONSTANTS.CONFIG_FILE.exists():
             print('{white}[i] Data locations:{reset}'.format(**ANSI))
-            for name, path in DATA_LOCATIONS.items():
+            for name, path in CONSTANTS.DATA_LOCATIONS.items():
                 print(printable_folder_status(name, path))
         else:
             print()
@@ -303,19 +288,19 @@ def run(subcommand: str,
 
 
 @enforce_types
-def init(force: bool=False, quick: bool=False, setup: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
+def init(force: bool=False, quick: bool=False, setup: bool=False, out_dir: Path=archivebox.DATA_DIR) -> None:
     """Initialize a new ArchiveBox collection in the current directory"""
     
     from core.models import Snapshot
 
     out_dir.mkdir(exist_ok=True)
-    is_empty = not len(set(os.listdir(out_dir)) - ALLOWED_IN_OUTPUT_DIR)
+    is_empty = not len(set(os.listdir(out_dir)) - CONSTANTS.ALLOWED_IN_OUTPUT_DIR)
 
-    if (out_dir / JSON_INDEX_FILENAME).exists():
+    if (out_dir / archivebox.CONSTANTS.JSON_INDEX_FILENAME).exists():
         stderr("[!] This folder contains a JSON index. It is deprecated, and will no longer be kept up to date automatically.", color="lightyellow")
         stderr("    You can run `archivebox list --json --with-headers > static_index.json` to manually generate it.", color="lightyellow")
 
-    existing_index = (out_dir / SQL_INDEX_FILENAME).exists()
+    existing_index = archivebox.CONSTANTS.DATABASE_FILE.exists()
 
     if is_empty and not existing_index:
         print('{green}[+] Initializing a new ArchiveBox v{} collection...{reset}'.format(VERSION, **ANSI))
@@ -344,25 +329,24 @@ def init(force: bool=False, quick: bool=False, setup: bool=False, out_dir: Path=
     else:
         print('\n{green}[+] Building archive folder structure...{reset}'.format(**ANSI))
     
-    print(f'    + ./{ARCHIVE_DIR.relative_to(OUTPUT_DIR)}, ./{SOURCES_DIR.relative_to(OUTPUT_DIR)}, ./{LOGS_DIR.relative_to(OUTPUT_DIR)}...')
-    Path(SOURCES_DIR).mkdir(exist_ok=True)
-    Path(ARCHIVE_DIR).mkdir(exist_ok=True)
-    Path(LOGS_DIR).mkdir(exist_ok=True)
-    print(f'    + ./{CONFIG_FILE.relative_to(OUTPUT_DIR)}...')
+    print(f'    + ./{CONSTANTS.ARCHIVE_DIR.relative_to(OUTPUT_DIR)}, ./{CONSTANTS.SOURCES_DIR.relative_to(OUTPUT_DIR)}, ./{CONSTANTS.LOGS_DIR.relative_to(OUTPUT_DIR)}...')
+    Path(CONSTANTS.SOURCES_DIR).mkdir(exist_ok=True)
+    Path(CONSTANTS.ARCHIVE_DIR).mkdir(exist_ok=True)
+    Path(CONSTANTS.LOGS_DIR).mkdir(exist_ok=True)
+    print(f'    + ./{CONSTANTS.CONFIG_FILE.relative_to(OUTPUT_DIR)}...')
     write_config_file({}, out_dir=out_dir)
 
-    if (out_dir / SQL_INDEX_FILENAME).exists():
+    if CONSTANTS.DATABASE_FILE.exists():
         print('\n{green}[*] Verifying main SQL index and running any migrations needed...{reset}'.format(**ANSI))
     else:
         print('\n{green}[+] Building main SQL index and running initial migrations...{reset}'.format(**ANSI))
     
-    DATABASE_FILE = out_dir / SQL_INDEX_FILENAME
     for migration_line in apply_migrations(out_dir):
         print(f'    {migration_line}')
 
-    assert DATABASE_FILE.exists()
+    assert CONSTANTS.DATABASE_FILE.exists()
     print()
-    print(f'    √ ./{DATABASE_FILE.relative_to(OUTPUT_DIR)}')
+    print(f'    √ ./{CONSTANTS.DATABASE_FILE.relative_to(OUTPUT_DIR)}')
     
     # from django.contrib.auth.models import User
     # if IS_TTY and not User.objects.filter(is_superuser=True).exists():
@@ -477,7 +461,7 @@ def status(out_dir: Path=OUTPUT_DIR) -> None:
     check_data_folder(CONFIG)
 
     from core.models import Snapshot
-    from django.contrib.auth import get_user_model
+    from django.contrib.auth import get_user_model
+    from plugins_sys.config.apps import SHELL_CONFIG
     User = get_user_model()
 
     print('{green}[*] Scanning archive main index...{reset}'.format(**ANSI))
@@ -491,7 +475,7 @@ def status(out_dir: Path=OUTPUT_DIR) -> None:
     num_sql_links = links.count()
     num_link_details = sum(1 for link in parse_json_links_details(out_dir=out_dir))
     print(f'    > SQL Main Index: {num_sql_links} links'.ljust(36), f'(found in {SQL_INDEX_FILENAME})')
-    print(f'    > JSON Link Details: {num_link_details} links'.ljust(36), f'(found in {ARCHIVE_DIR_NAME}/*/index.json)')
+    print(f'    > JSON Link Details: {num_link_details} links'.ljust(36), f'(found in {ARCHIVE_DIR.name}/*/index.json)')
     print()
     print('{green}[*] Scanning archive data directories...{reset}'.format(**ANSI))
     print(ANSI['lightyellow'], f'   {ARCHIVE_DIR}/*', ANSI['reset'])
@@ -539,7 +523,7 @@ def status(out_dir: Path=OUTPUT_DIR) -> None:
     
     print()
     print('{green}[*] Scanning recent archive changes and user logins:{reset}'.format(**ANSI))
-    print(ANSI['lightyellow'], f'   {LOGS_DIR}/*', ANSI['reset'])
+    print(ANSI['lightyellow'], f'   {CONSTANTS.LOGS_DIR}/*', ANSI['reset'])
     users = get_admins().values_list('username', flat=True)
     print(f'    UI users {len(users)}: {", ".join(users)}')
     last_login = User.objects.order_by('last_login').last()
@@ -564,7 +548,7 @@ def status(out_dir: Path=OUTPUT_DIR) -> None:
                 f'   > {str(snapshot.downloaded_at)[:16]} '
                 f'[{snapshot.num_outputs} {("X", "√")[snapshot.is_archived]} {printable_filesize(snapshot.archive_size)}] '
                 f'"{snapshot.title}": {snapshot.url}'
-            )[:TERM_WIDTH()],
+            )[:SHELL_CONFIG.TERM_WIDTH],
             ANSI['reset'],
         )
     print(ANSI['black'], '   ...', ANSI['reset'])
@@ -976,7 +960,7 @@ def setup(out_dir: Path=OUTPUT_DIR) -> None:
 
     from rich import print
 
-    if not (out_dir / ARCHIVE_DIR_NAME).exists():
+    if not ARCHIVE_DIR.exists():
         run_subcommand('init', stdin=None, pwd=out_dir)
 
     setup_django(out_dir=out_dir, check_db=True)
@@ -992,9 +976,13 @@ def setup(out_dir: Path=OUTPUT_DIR) -> None:
     from plugins_extractor.singlefile.apps import SINGLEFILE_BINARY
     print(SINGLEFILE_BINARY.load_or_install().model_dump(exclude={'binproviders_supported', 'loaded_binprovider', 'provider_overrides', 'loaded_abspaths', 'bin_dir', 'loaded_respath'}))
     
+    from plugins_extractor.readability.apps import READABILITY_BINARY
+    print(READABILITY_BINARY.load_or_install().model_dump(exclude={'binproviders_supported', 'loaded_binprovider', 'provider_overrides', 'loaded_abspaths', 'bin_dir', 'loaded_respath'}))
+    
+    
     from plugins_pkg.npm.apps import npm
 
-    print(npm.load_or_install('readability-extractor', overrides={'packages': lambda: ['github:ArchiveBox/readability-extractor']}).model_dump(exclude={'binproviders_supported', 'loaded_binprovider', 'provider_overrides', 'loaded_abspaths', 'bin_dir', 'loaded_respath'}))
+    # TODO: move these to their own plugin binaries
     print(npm.load_or_install('postlight-parser',      overrides={'packages': lambda: ['@postlight/parser@^2.2.3'], 'version': lambda: '2.2.3'}).model_dump(exclude={'binproviders_supported', 'loaded_binprovider', 'provider_overrides', 'loaded_abspaths'}))
 
     from django.contrib.auth import get_user_model
@@ -1020,7 +1008,6 @@ def config(config_options_str: Optional[str]=None,
     """Get and set your ArchiveBox project configuration values"""
 
     check_data_folder(CONFIG)
-
     if config_options and config_options_str:
         stderr(
             '[X] You should either pass config values as an arguments '
@@ -1096,7 +1083,6 @@ def config(config_options_str: Optional[str]=None,
     elif reset:
         stderr('[X] This command is not implemented yet.', color='red')
         stderr('    Please manually remove the relevant lines from your config file:')
-        stderr(f'        {CONFIG_FILE}')
         raise SystemExit(2)
     else:
         stderr('[X] You must pass either --get or --set, or no arguments to get the whole config.', color='red')
@@ -1125,8 +1111,9 @@ def schedule(add: bool=False,
     check_data_folder(CONFIG)
     setup_django_minimal()
     from plugins_pkg.pip.apps import ARCHIVEBOX_BINARY
+    from plugins_sys.config.apps import SHELL_CONFIG, CONSTANTS
 
-    Path(LOGS_DIR).mkdir(exist_ok=True)
+    Path(CONSTANTS.LOGS_DIR).mkdir(exist_ok=True)
 
     cron = CronTab(user=True)
     cron = dedupe_cron_jobs(cron)
@@ -1155,7 +1142,7 @@ def schedule(add: bool=False,
                 f'"{import_path}"',
             ] if import_path else ['update']),
             '>>',
-            quoted(Path(LOGS_DIR) / 'schedule.log'),
+            quoted(Path(CONSTANTS.LOGS_DIR) / 'schedule.log'),
             '2>&1',
 
         ]
@@ -1167,7 +1154,7 @@ def schedule(add: bool=False,
         elif CronSlices.is_valid(every):
             new_job.setall(every)
         else:
-            stderr('{red}[X] Got invalid timeperiod for cron task.{reset}'.format(**ANSI))
+            stderr('{red}[X] Got invalid timeperiod for cron task.{reset}'.format(**SHELL_CONFIG.ANSI))
             stderr('    It must be one of minute/hour/day/month')
             stderr('    or a quoted cron-format schedule like:')
             stderr('        archivebox init --every=day --depth=1 https://example.com/some/rss/feed.xml')
@@ -1181,11 +1168,11 @@ def schedule(add: bool=False,
         existing_jobs = list(cron.find_comment(CRON_COMMENT))
 
         print()
-        print('{green}[√] Scheduled new ArchiveBox cron job for user: {} ({} jobs are active).{reset}'.format(USER, len(existing_jobs), **ANSI))
+        print('{green}[√] Scheduled new ArchiveBox cron job for user: {} ({} jobs are active).{reset}'.format(SHELL_CONFIG.USER, len(existing_jobs), **SHELL_CONFIG.ANSI))
         print('\n'.join(f'  > {cmd}' if str(cmd) == str(new_job) else f'    {cmd}' for cmd in existing_jobs))
         if total_runs > 60 and not quiet:
             stderr()
-            stderr('{lightyellow}[!] With the current cron config, ArchiveBox is estimated to run >{} times per year.{reset}'.format(total_runs, **ANSI))
+            stderr('{lightyellow}[!] With the current cron config, ArchiveBox is estimated to run >{} times per year.{reset}'.format(total_runs, **SHELL_CONFIG.ANSI))
             stderr('    Congrats on being an enthusiastic internet archiver! 👌')
             stderr()
             stderr('    Make sure you have enough storage space available to hold all the data.')
@@ -1195,7 +1182,7 @@ def schedule(add: bool=False,
         if existing_jobs:
             print('\n'.join(str(cmd) for cmd in existing_jobs))
         else:
-            stderr('{red}[X] There are no ArchiveBox cron jobs scheduled for your user ({}).{reset}'.format(USER, **ANSI))
+            stderr('{red}[X] There are no ArchiveBox cron jobs scheduled for your user ({}).{reset}'.format(SHELL_CONFIG.USER, **SHELL_CONFIG.ANSI))
             stderr('    To schedule a new job, run:')
             stderr('        archivebox schedule --every=[timeperiod] --depth=1 https://example.com/some/rss/feed.xml')
         raise SystemExit(0)
@@ -1206,11 +1193,11 @@ def schedule(add: bool=False,
 
     if foreground or run_all:
         if not existing_jobs:
-            stderr('{red}[X] You must schedule some jobs first before running in foreground mode.{reset}'.format(**ANSI))
+            stderr('{red}[X] You must schedule some jobs first before running in foreground mode.{reset}'.format(**SHELL_CONFIG.ANSI))
             stderr('    archivebox schedule --every=hour --depth=1 https://example.com/some/rss/feed.xml')
             raise SystemExit(1)
 
-        print('{green}[*] Running {} ArchiveBox jobs in foreground task scheduler...{reset}'.format(len(existing_jobs), **ANSI))
+        print('{green}[*] Running {} ArchiveBox jobs in foreground task scheduler...{reset}'.format(len(existing_jobs), **SHELL_CONFIG.ANSI))
         if run_all:
             try:
                 for job in existing_jobs:
@@ -1220,7 +1207,7 @@ def schedule(add: bool=False,
                     job.run()
                     sys.stdout.write(f'\r    √ {job.command.split("/archivebox ")[-1]}\n')
             except KeyboardInterrupt:
-                print('\n{green}[√] Stopped.{reset}'.format(**ANSI))
+                print('\n{green}[√] Stopped.{reset}'.format(**SHELL_CONFIG.ANSI))
                 raise SystemExit(1)
 
         if foreground:
@@ -1230,7 +1217,7 @@ def schedule(add: bool=False,
                 for result in cron.run_scheduler():
                     print(result)
             except KeyboardInterrupt:
-                print('\n{green}[√] Stopped.{reset}'.format(**ANSI))
+                print('\n{green}[√] Stopped.{reset}'.format(**SHELL_CONFIG.ANSI))
                 raise SystemExit(1)
 
     # if CAN_UPGRADE:
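
main.py now aliases the handful of paths it still needs from the archivebox package itself instead of importing dozens of names from config; the resulting pattern, in rough form:

    import archivebox

    CONSTANTS  = archivebox.CONSTANTS
    VERSION    = archivebox.VERSION
    OUTPUT_DIR = archivebox.DATA_DIR

    # CONSTANTS members are plain pathlib.Path values, e.g. as used by schedule():
    CONSTANTS.LOGS_DIR.mkdir(exist_ok=True)
    schedule_log = CONSTANTS.LOGS_DIR / 'schedule.log'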

+ 34 - 30
archivebox/misc/checks.py

@@ -5,51 +5,55 @@ __package__ = 'archivebox.misc'
 from benedict import benedict
 from pathlib import Path
 
-from .logging import stderr, hint
+import archivebox
+
+from .logging import stderr, hint, ANSI
 
 
 def check_dependencies(config: benedict, show_help: bool=True) -> None:
-    invalid_dependencies = [
-        (name, info) for name, info in config['DEPENDENCIES'].items()
-        if info['enabled'] and not info['is_valid']
-    ]
-    if invalid_dependencies and show_help:
-        stderr(f'[!] Warning: Missing {len(invalid_dependencies)} recommended dependencies', color='lightyellow')
-        for dependency, info in invalid_dependencies:
-            stderr(
-                '    ! {}: {} ({})'.format(
-                    dependency,
-                    info['path'] or 'unable to find binary',
-                    info['version'] or 'unable to detect version',
-                )
-            )
-            if dependency in ('YOUTUBEDL_BINARY', 'CHROME_BINARY', 'SINGLEFILE_BINARY', 'READABILITY_BINARY', 'MERCURY_BINARY'):
-                hint(('To install all packages automatically run: archivebox setup',
-                    f'or to disable it and silence this warning: archivebox config --set SAVE_{dependency.rsplit("_", 1)[0]}=False',
-                    ''), prefix='      ')
-        stderr('')
+    # don't do this on startup anymore, it's too slow
+    pass
+    # invalid_dependencies = [
+    #     (name, binary) for name, info in settings.BINARIES.items()
+    #     if not binary.
+    # ]
+    # if invalid_dependencies and show_help:
+    #     stderr(f'[!] Warning: Missing {len(invalid_dependencies)} recommended dependencies', color='lightyellow')
+    #     for dependency, info in invalid_dependencies:
+    #         stderr(
+    #             '    ! {}: {} ({})'.format(
+    #                 dependency,
+    #                 info['path'] or 'unable to find binary',
+    #                 info['version'] or 'unable to detect version',
+    #             )
+    #         )
+    #         if dependency in ('YOUTUBEDL_BINARY', 'CHROME_BINARY', 'SINGLEFILE_BINARY', 'READABILITY_BINARY', 'MERCURY_BINARY'):
+    #             hint(('To install all packages automatically run: archivebox setup',
+    #                 f'or to disable it and silence this warning: archivebox config --set SAVE_{dependency.rsplit("_", 1)[0]}=False',
+    #                 ''), prefix='      ')
+    #     stderr('')
 
 
 
 def check_data_folder(config: benedict) -> None:
-    output_dir = config['OUTPUT_DIR']
+    output_dir = archivebox.DATA_DIR
 
-    archive_dir_exists = (Path(output_dir) / 'archive').exists()
+    archive_dir_exists = (archivebox.CONSTANTS.ARCHIVE_DIR).exists()
     if not archive_dir_exists:
         stderr('[X] No archivebox index found in the current directory.', color='red')
         stderr(f'    {output_dir}', color='lightyellow')
         stderr()
-        stderr('    {lightred}Hint{reset}: Are you running archivebox in the right folder?'.format(**config['ANSI']))
+        stderr('    {lightred}Hint{reset}: Are you running archivebox in the right folder?'.format(**ANSI))
         stderr('        cd path/to/your/archive/folder')
         stderr('        archivebox [command]')
         stderr()
-        stderr('    {lightred}Hint{reset}: To create a new archive collection or import existing data in this folder, run:'.format(**config['ANSI']))
+        stderr('    {lightred}Hint{reset}: To create a new archive collection or import existing data in this folder, run:'.format(**ANSI))
         stderr('        archivebox init')
         raise SystemExit(2)
 
 
 def check_migrations(config: benedict):
-    output_dir = config['OUTPUT_DIR']
+    output_dir = archivebox.DATA_DIR
     
     from ..index.sql import list_migrations
 
@@ -63,8 +67,8 @@ def check_migrations(config: benedict):
         stderr('        archivebox init')
         raise SystemExit(3)
 
-    (Path(output_dir) / config['SOURCES_DIR_NAME']).mkdir(exist_ok=True)
-    (Path(output_dir) / config['LOGS_DIR_NAME']).mkdir(exist_ok=True)
-    (Path(output_dir) / config['CACHE_DIR_NAME']).mkdir(exist_ok=True)
-    (Path(output_dir) / config['LIB_DIR_NAME'] / 'bin').mkdir(exist_ok=True, parents=True)
-    (Path(output_dir) / config['PERSONAS_DIR_NAME'] / 'Default').mkdir(exist_ok=True, parents=True)
+    archivebox.CONSTANTS.SOURCES_DIR.mkdir(exist_ok=True)
+    archivebox.CONSTANTS.LOGS_DIR.mkdir(exist_ok=True)
+    archivebox.CONSTANTS.CACHE_DIR.mkdir(exist_ok=True)
+    (archivebox.CONSTANTS.LIB_DIR / 'bin').mkdir(exist_ok=True, parents=True)
+    (archivebox.CONSTANTS.PERSONAS_DIR / 'Default').mkdir(exist_ok=True, parents=True)
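
check_data_folder() and check_migrations() now key entirely off archivebox.CONSTANTS and ignore the paths in the config dict they are passed; a rough sketch of the new behaviour, assuming an initialized data directory:

    import archivebox
    from benedict import benedict
    from archivebox.misc.checks import check_data_folder

    # exits with SystemExit(2) if archivebox.CONSTANTS.ARCHIVE_DIR does not exist;
    # the config argument is kept only for backwards-compatible call sites
    check_data_folder(benedict({}))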

+ 3 - 5
archivebox/misc/logging.py

@@ -8,8 +8,6 @@ from collections import defaultdict
 from benedict import benedict
 from rich.console import Console
 
-from ..config_stubs import ConfigDict
-
 # SETUP RICH CONSOLE / TTY detection / COLOR / PROGRESS BARS
 CONSOLE = Console()
 IS_TTY = CONSOLE.is_interactive
@@ -43,7 +41,7 @@ COLOR_DICT = defaultdict(lambda: [(0, 0, 0), (0, 0, 0)], {
 })
 
 # Logging Helpers
-def stdout(*args, color: Optional[str]=None, prefix: str='', config: Optional[ConfigDict]=None) -> None:
+def stdout(*args, color: Optional[str]=None, prefix: str='', config: Optional[benedict]=None) -> None:
     ansi = DEFAULT_CLI_COLORS if (config or {}).get('USE_COLOR') else ANSI
 
     if color:
@@ -53,7 +51,7 @@ def stdout(*args, color: Optional[str]=None, prefix: str='', config: Optional[Co
 
     sys.stdout.write(prefix + ''.join(strs))
 
-def stderr(*args, color: Optional[str]=None, prefix: str='', config: Optional[ConfigDict]=None) -> None:
+def stderr(*args, color: Optional[str]=None, prefix: str='', config: Optional[benedict]=None) -> None:
     ansi = DEFAULT_CLI_COLORS if (config or {}).get('USE_COLOR') else ANSI
 
     if color:
@@ -63,7 +61,7 @@ def stderr(*args, color: Optional[str]=None, prefix: str='', config: Optional[Co
 
     sys.stderr.write(prefix + ''.join(strs))
 
-def hint(text: Union[Tuple[str, ...], List[str], str], prefix='    ', config: Optional[ConfigDict]=None) -> None:
+def hint(text: Union[Tuple[str, ...], List[str], str], prefix='    ', config: Optional[benedict]=None) -> None:
     ansi = DEFAULT_CLI_COLORS if (config or {}).get('USE_COLOR') else ANSI
 
     if isinstance(text, str):

+ 2 - 3
archivebox/parsers/pocket_api.py

@@ -2,25 +2,24 @@ __package__ = 'archivebox.parsers'
 
 
 import re
+import archivebox
 
 from typing import IO, Iterable, Optional
 from configparser import ConfigParser
 
-from pathlib import Path
 from pocket import Pocket
 
 from ..index.schema import Link
 from ..util import enforce_types
 from ..system import atomic_write
 from ..config import (
-    SOURCES_DIR,
     POCKET_CONSUMER_KEY,
     POCKET_ACCESS_TOKENS,
 )
 
 
 COUNT_PER_PAGE = 500
-API_DB_PATH = Path(SOURCES_DIR) / 'pocket_api.db'
+API_DB_PATH = archivebox.DATA_DIR / 'sources' / 'pocket_api.db'
 
 # search for broken protocols that sometimes come from the Pocket API
 _BROKEN_PROTOCOL_RE = re.compile('^(http[s]?)(:/(?!/))')

+ 3 - 7
archivebox/parsers/readwise_reader_api.py

@@ -3,23 +3,19 @@ __package__ = "archivebox.parsers"
 
 import re
 import requests
+import archivebox
 from datetime import datetime
 
 from typing import IO, Iterable, Optional
 from configparser import ConfigParser
 
-from pathlib import Path
-
 from ..index.schema import Link
 from ..util import enforce_types
 from ..system import atomic_write
-from ..config import (
-    SOURCES_DIR,
-    READWISE_READER_TOKENS,
-)
+from ..config import READWISE_READER_TOKENS
 
 
-API_DB_PATH = Path(SOURCES_DIR) / "readwise_reader_api.db"
+API_DB_PATH = archivebox.DATA_DIR / "sources" / "readwise_reader_api.db"
 
 
 class ReadwiseReaderAPI:

+ 8 - 4
archivebox/plugantic/base_binary.py

@@ -17,6 +17,8 @@ from pydantic_pkgr import (
 
 from django.conf import settings
 
+import archivebox
+
 from .base_hook import BaseHook, HookType
 
 
@@ -64,7 +66,9 @@ class BaseBinary(BaseHook, Binary):
         super().register(settings, parent_plugin=parent_plugin)
 
     @staticmethod
-    def symlink_to_lib(binary, bin_dir=settings.CONFIG.BIN_DIR) -> None:
+    def symlink_to_lib(binary, bin_dir=None) -> None:
+        bin_dir = bin_dir or archivebox.CONSTANTS.LIB_BIN_DIR
+        
         if not (binary.abspath and binary.abspath.exists()):
             return
         
@@ -77,19 +81,19 @@ class BaseBinary(BaseHook, Binary):
     @validate_call
     def load(self, **kwargs) -> Self:
         binary = super().load(**kwargs)
-        self.symlink_to_lib(binary=binary, bin_dir=settings.CONFIG.BIN_DIR)
+        self.symlink_to_lib(binary=binary, bin_dir=archivebox.CONSTANTS.LIB_BIN_DIR)
         return binary
     
     @validate_call
     def install(self, **kwargs) -> Self:
         binary = super().install(**kwargs)
-        self.symlink_to_lib(binary=binary, bin_dir=settings.CONFIG.BIN_DIR)
+        self.symlink_to_lib(binary=binary, bin_dir=archivebox.CONSTANTS.LIB_BIN_DIR)
         return binary
     
     @validate_call
     def load_or_install(self, **kwargs) -> Self:
         binary = super().load_or_install(**kwargs)
-        self.symlink_to_lib(binary=binary, bin_dir=settings.CONFIG.BIN_DIR)
+        self.symlink_to_lib(binary=binary, bin_dir=archivebox.CONSTANTS.LIB_BIN_DIR)
         return binary
     
     @property
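
Every load()/install()/load_or_install() call on a BaseBinary now funnels through symlink_to_lib(), so resolved binaries are mirrored into the data dir's lib/bin folder; a sketch using the singlefile binary that setup() loads above:

    import archivebox
    from plugins_extractor.singlefile.apps import SINGLEFILE_BINARY

    binary = SINGLEFILE_BINARY.load_or_install()
    print(binary.name, binary.version, binary.abspath)

    # side effect: a symlink to binary.abspath now lives under archivebox.CONSTANTS.LIB_BIN_DIR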

+ 29 - 14
archivebox/plugantic/base_configset.py

@@ -123,6 +123,10 @@ class ArchiveBoxBaseConfig(BaseSettings):
         validate_return=True,
         revalidate_instances="always",
     )
+    
+    load_from_defaults: ClassVar[bool] = True
+    load_from_configfile: ClassVar[bool] = True
+    load_from_environment: ClassVar[bool] = True
 
     @classmethod
     def settings_customise_sources(
@@ -140,20 +144,22 @@ class ArchiveBoxBaseConfig(BaseSettings):
         
         # import ipdb; ipdb.set_trace()
         
+        precedence_order = {}
+        
         # if ArchiveBox.conf does not exist yet, return defaults -> env order
         if not ARCHIVEBOX_CONFIG_FILE.is_file():
-            return (
-                init_settings,
-                env_settings,
-            )
+            precedence_order = {
+                'defaults': init_settings,
+                'environment': env_settings,
+            }
         
         # if ArchiveBox.conf exists and is in TOML format, return default -> TOML -> env order
         try:
-            return (
-                init_settings,
-                FlatTomlConfigSettingsSource(settings_cls, toml_file=ARCHIVEBOX_CONFIG_FILE),
-                env_settings,
-            )
+            precedence_order = precedence_order or {
+                'defaults': init_settings,
+                'configfile': FlatTomlConfigSettingsSource(settings_cls, toml_file=ARCHIVEBOX_CONFIG_FILE),
+                'environment': env_settings,
+            }
         except Exception as err:
             if err.__class__.__name__ != "TOMLDecodeError":
                 raise
@@ -165,11 +171,20 @@ class ArchiveBoxBaseConfig(BaseSettings):
             new_toml = ini_to_toml.convert(original_ini)
             ARCHIVEBOX_CONFIG_FILE.write_text(new_toml)
 
-            return (
-                init_settings,
-                FlatTomlConfigSettingsSource(settings_cls, toml_file=ARCHIVEBOX_CONFIG_FILE),
-                env_settings,
-            )
+            precedence_order = {
+                'defaults': init_settings,
+                'configfile': FlatTomlConfigSettingsSource(settings_cls, toml_file=ARCHIVEBOX_CONFIG_FILE),
+                'environment': env_settings,
+            }
+            
+        if not cls.load_from_environment:
+            precedence_order.pop('environment', None)
+        if not cls.load_from_configfile:
+            precedence_order.pop('configfile', None)
+        if not cls.load_from_defaults:
+            precedence_order.pop('defaults', None)
+
+        return tuple(precedence_order.values())
 
     @model_validator(mode="after")
     def fill_defaults(self):
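
For illustration (hypothetical subclass, not part of this commit): the three ClassVar toggles let a config class opt out of individual sources, since settings_customise_sources() now filters the precedence dict before returning it:

    from typing import ClassVar

    class DefaultsOnlyConfig(ArchiveBoxBaseConfig):
        # illustrative only: with both toggles off, only init_settings survives,
        # so instances are populated purely from each Field(...) default
        load_from_configfile: ClassVar[bool] = False
        load_from_environment: ClassVar[bool] = False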

+ 57 - 57
archivebox/plugantic/management/commands/pkg.py

@@ -1,72 +1,72 @@
-__package__ = 'archivebox.plugantic.management.commands'
+# __package__ = 'archivebox.plugantic.management.commands'
 
-from django.core.management.base import BaseCommand
-from django.conf import settings
+# from django.core.management.base import BaseCommand
+# from django.conf import settings
 
-from pydantic_pkgr import Binary, BinProvider, BrewProvider, EnvProvider, SemVer
-from pydantic_pkgr.binprovider import bin_abspath
+# from pydantic_pkgr import Binary, BinProvider, BrewProvider, EnvProvider, SemVer
+# from pydantic_pkgr.binprovider import bin_abspath
 
-from ....config import NODE_BIN_PATH, bin_path
-from ...base_binary import env
+# from ....config import bin_path
+# from ...base_binary import env
 
 
-class Command(BaseCommand):
-    def handle(self, *args, method, **options):
-        method(*args, **options)
+# class Command(BaseCommand):
+#     def handle(self, *args, method, **options):
+#         method(*args, **options)
 
-    def add_arguments(self, parser):
-        subparsers = parser.add_subparsers(title="sub-commands", required=True)
+#     def add_arguments(self, parser):
+#         subparsers = parser.add_subparsers(title="sub-commands", required=True)
 
-        list_parser = subparsers.add_parser("list", help="List archivebox runtime dependencies.")
-        list_parser.set_defaults(method=self.list)
+#         list_parser = subparsers.add_parser("list", help="List archivebox runtime dependencies.")
+#         list_parser.set_defaults(method=self.list)
 
-        install_parser = subparsers.add_parser("install", help="Install archivebox runtime dependencies.")
-        install_parser.add_argument("--update", action="store_true", help="Update dependencies to latest versions.")
-        install_parser.add_argument("package_names", nargs="+", type=str)
-        install_parser.set_defaults(method=self.install)
+#         install_parser = subparsers.add_parser("install", help="Install archivebox runtime dependencies.")
+#         install_parser.add_argument("--update", action="store_true", help="Update dependencies to latest versions.")
+#         install_parser.add_argument("package_names", nargs="+", type=str)
+#         install_parser.set_defaults(method=self.install)
 
-    def list(self, *args, **options):
-        self.stdout.write('################# PLUGINS ####################')
-        for plugin in settings.PLUGINS.values():
-            self.stdout.write(f'{plugin.name}:')
-            for binary in plugin.binaries:
-                try:
-                    binary = binary.load()
-                except Exception as e:
-                    # import ipdb; ipdb.set_trace()
-                    raise
-                self.stdout.write(f'    {binary.name.ljust(14)} {str(binary.version).ljust(11)} {binary.binprovider.INSTALLER_BIN.ljust(5)}  {binary.abspath}')
+#     def list(self, *args, **options):
+#         self.stdout.write('################# PLUGINS ####################')
+#         for plugin in settings.PLUGINS.values():
+#             self.stdout.write(f'{plugin.name}:')
+#             for binary in plugin.binaries:
+#                 try:
+#                     binary = binary.load()
+#                 except Exception as e:
+#                     # import ipdb; ipdb.set_trace()
+#                     raise
+#                 self.stdout.write(f'    {binary.name.ljust(14)} {str(binary.version).ljust(11)} {binary.binprovider.INSTALLER_BIN.ljust(5)}  {binary.abspath}')
 
-        self.stdout.write('\n################# LEGACY ####################')
-        for bin_key, dependency in settings.CONFIG.DEPENDENCIES.items():
-            bin_name = settings.CONFIG[bin_key]
+#         self.stdout.write('\n################# LEGACY ####################')
+#         for bin_key, dependency in settings.CONFIG.DEPENDENCIES.items():
+#             bin_name = settings.CONFIG[bin_key]
 
-            self.stdout.write(f'{bin_key}:     {bin_name}')
+#             self.stdout.write(f'{bin_key}:     {bin_name}')
 
-            # binary = Binary(name=package_name, providers=[env])
-            # print(binary)
+#             # binary = Binary(name=package_name, providers=[env])
+#             # print(binary)
 
-            # try:
-            #     loaded_bin = binary.load()
-            #     self.stdout.write(
-            #         self.style.SUCCESS(f'Successfully loaded {package_name}:') + str(loaded_bin)
-            #     )
-            # except Exception as e:
-            #     self.stderr.write(
-            #         self.style.ERROR(f"Error loading {package_name}: {e}")
-            #     )
+#             # try:
+#             #     loaded_bin = binary.load()
+#             #     self.stdout.write(
+#             #         self.style.SUCCESS(f'Successfully loaded {package_name}:') + str(loaded_bin)
+#             #     )
+#             # except Exception as e:
+#             #     self.stderr.write(
+#             #         self.style.ERROR(f"Error loading {package_name}: {e}")
+#             #     )
 
-    def install(self, *args, bright, **options):
-        for package_name in options["package_names"]:
-            binary = Binary(name=package_name, providers=[env])
-            print(binary)
+#     def install(self, *args, bright, **options):
+#         for package_name in options["package_names"]:
+#             binary = Binary(name=package_name, providers=[env])
+#             print(binary)
 
-            try:
-                loaded_bin = binary.load()
-                self.stdout.write(
-                    self.style.SUCCESS(f'Successfully loaded {package_name}:') + str(loaded_bin)
-                )
-            except Exception as e:
-                self.stderr.write(
-                    self.style.ERROR(f"Error loading {package_name}: {e}")
-                )
+#             try:
+#                 loaded_bin = binary.load()
+#                 self.stdout.write(
+#                     self.style.SUCCESS(f'Successfully loaded {package_name}:') + str(loaded_bin)
+#                 )
+#             except Exception as e:
+#                 self.stderr.write(
+#                     self.style.ERROR(f"Error loading {package_name}: {e}")
+#                 )

+ 3 - 1
archivebox/plugins_extractor/chrome/apps.py

@@ -18,6 +18,8 @@ from pydantic_pkgr import (
     bin_abspath,
 )
 
+import archivebox
+
 # Depends on other Django apps:
 from plugantic.base_plugin import BasePlugin
 from plugantic.base_configset import BaseConfigSet, ConfigSectionName
@@ -215,7 +217,7 @@ class ChromeBinary(BaseBinary):
     }
 
     @staticmethod
-    def symlink_to_lib(binary, bin_dir=settings.CONFIG.BIN_DIR) -> None:
+    def symlink_to_lib(binary, bin_dir=archivebox.CONSTANTS.LIB_BIN_DIR) -> None:
         if not (binary.abspath and binary.abspath.exists()):
             return
         

+ 103 - 0
archivebox/plugins_extractor/readability/apps.py

@@ -0,0 +1,103 @@
+__package__ = 'archivebox.plugins_extractor.readability'
+
+from pathlib import Path
+from typing import List, Dict, Optional, ClassVar
+# from typing_extensions import Self
+
+from django.conf import settings
+
+# Depends on other PyPI/vendor packages:
+from pydantic import InstanceOf, Field, validate_call
+from pydantic_pkgr import BinProvider, BinProviderName, ProviderLookupDict, BinName, ShallowBinary
+
+# Depends on other Django apps:
+from plugantic.base_plugin import BasePlugin
+from plugantic.base_configset import BaseConfigSet, ConfigSectionName
+from plugantic.base_binary import BaseBinary, env
+from plugantic.base_extractor import BaseExtractor
+from plugantic.base_hook import BaseHook
+
+# Depends on Other Plugins:
+from plugins_sys.config.apps import ARCHIVING_CONFIG
+from plugins_pkg.npm.apps import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER
+
+###################### Config ##########################
+
+class ReadabilityConfig(BaseConfigSet):
+    section: ClassVar[ConfigSectionName] = 'ARCHIVING_CONFIG'
+
+    SAVE_READABILITY: bool = Field(default=True, alias='USE_READABILITY')
+
+    READABILITY_TIMEOUT: int                 = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
+
+    READABILITY_BINARY: str = Field(default='readability-extractor')
+    # READABILITY_EXTRA_ARGS: List[str] = []                                # readability-extractor doesn't take any extra args
+
+
+READABILITY_CONFIG = ReadabilityConfig()
+
+
+READABILITY_PACKAGE_NAME = 'github:ArchiveBox/readability-extractor'
+
+class ReadabilityBinary(BaseBinary):
+    name: BinName = READABILITY_CONFIG.READABILITY_BINARY
+    binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env]
+
+    provider_overrides: Dict[BinProviderName, ProviderLookupDict] = {
+        LIB_NPM_BINPROVIDER.name: {"packages": lambda: [READABILITY_PACKAGE_NAME]},
+        SYS_NPM_BINPROVIDER.name: {"packages": lambda: []},    # prevent modifying system global npm packages
+    }
+    
+    @validate_call
+    def install(self, binprovider_name: Optional[BinProviderName]=None) -> ShallowBinary:
+        # force install to only use lib/npm provider, we never want to modify global NPM packages
+        return BaseBinary.install(self, binprovider_name=binprovider_name or LIB_NPM_BINPROVIDER.name)
+    
+    @validate_call
+    def load_or_install(self, binprovider_name: Optional[BinProviderName] = None) -> ShallowBinary:
+        # force install to only use lib/npm provider, we never want to modify global NPM packages
+        try:
+            return self.load()
+        except Exception:
+            return BaseBinary.install(self, binprovider_name=binprovider_name or LIB_NPM_BINPROVIDER.name)
+
+
+
+
+READABILITY_BINARY = ReadabilityBinary()
+
+
+class ReadabilityExtractor(BaseExtractor):
+    name: str = 'readability'
+    binary: BinName = READABILITY_BINARY.name
+
+    def get_output_path(self, snapshot) -> Path:
+        return Path(snapshot.link_dir) / 'readability' / 'content.html'
+
+
+READABILITY_BINARY = ReadabilityBinary()
+READABILITY_EXTRACTOR = ReadabilityExtractor()
+
+# class ReadabilityQueue(BaseQueue):
+#     name: str = 'singlefile'
+    
+#     binaries: List[InstanceOf[BaseBinary]] = [READABILITY_BINARY]
+
+# READABILITY_QUEUE = ReadabilityQueue()
+
+class ReadabilityPlugin(BasePlugin):
+    app_label: str = 'readability'
+    verbose_name: str = 'Readability'
+
+    hooks: List[InstanceOf[BaseHook]] = [
+        READABILITY_CONFIG,
+        READABILITY_BINARY,
+        READABILITY_EXTRACTOR,
+        # READABILITY_QUEUE,
+    ]
+
+
+
+PLUGIN = ReadabilityPlugin()
+PLUGIN.register(settings)
+DJANGO_APP = PLUGIN.AppConfig
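
A hedged usage sketch of the pieces above (assumes an initialized ArchiveBox data dir so the npm providers can resolve their paths):

    from plugins_extractor.readability.apps import READABILITY_BINARY, READABILITY_EXTRACTOR

    binary = READABILITY_BINARY.load_or_install()
    # installs github:ArchiveBox/readability-extractor via LIB_NPM_BINPROVIDER only;
    # the SYS_NPM_BINPROVIDER override returns an empty package list, so global npm is never modified
    print(binary.abspath, binary.version)

    # for a given Snapshot-like object, the extractor writes to <link_dir>/readability/content.html:
    # READABILITY_EXTRACTOR.get_output_path(snapshot)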

+ 8 - 4
archivebox/plugins_extractor/singlefile/apps.py

@@ -34,7 +34,7 @@ class SinglefileConfig(BaseConfigSet):
     SINGLEFILE_CHECK_SSL_VALIDITY: bool     = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
     SINGLEFILE_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE)
 
-    SINGLEFILE_BINARY: str = Field(default='wget')
+    SINGLEFILE_BINARY: str = Field(default='single-file')
     SINGLEFILE_EXTRA_ARGS: List[str] = []
 
 
@@ -46,17 +46,21 @@ SINGLEFILE_MAX_VERSION = '1.1.60'
 
 
 class SinglefileBinary(BaseBinary):
-    name: BinName = 'single-file'
+    name: BinName = SINGLEFILE_CONFIG.SINGLEFILE_BINARY
     binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env]
 
     provider_overrides: Dict[BinProviderName, ProviderLookupDict] = {
         env.name: {
             'abspath': lambda:
-                bin_abspath('single-file', PATH=env.PATH) or bin_abspath('single-file-node.js', PATH=env.PATH),
+                bin_abspath(SINGLEFILE_CONFIG.SINGLEFILE_BINARY, PATH=env.PATH)
+                or bin_abspath('single-file', PATH=env.PATH)
+                or bin_abspath('single-file-node.js', PATH=env.PATH),
         },
         LIB_NPM_BINPROVIDER.name: {
             "abspath": lambda:
-                bin_abspath("single-file", PATH=LIB_NPM_BINPROVIDER.PATH) or bin_abspath("single-file-node.js", PATH=LIB_NPM_BINPROVIDER.PATH),
+                bin_abspath(SINGLEFILE_CONFIG.SINGLEFILE_BINARY, PATH=LIB_NPM_BINPROVIDER.PATH)
+                or bin_abspath("single-file", PATH=LIB_NPM_BINPROVIDER.PATH)
+                or bin_abspath("single-file-node.js", PATH=LIB_NPM_BINPROVIDER.PATH),
             "packages": lambda:
                 [f"single-file-cli@>={SINGLEFILE_MIN_VERSION} <{SINGLEFILE_MAX_VERSION}"],
         },
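
The env-provider lambda above now tries the configured binary name first, then falls back to the two historical names. A hedged equivalent of that lookup order, using only names from this file:

    from pydantic_pkgr import bin_abspath
    from plugantic.base_binary import env
    from plugins_extractor.singlefile.apps import SINGLEFILE_CONFIG

    candidates = (SINGLEFILE_CONFIG.SINGLEFILE_BINARY, 'single-file', 'single-file-node.js')
    abspath = next((p for name in candidates if (p := bin_abspath(name, PATH=env.PATH))), None)
    print(abspath)   # first match on $PATH, or None if single-file isn't installed system-wide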

+ 14 - 6
archivebox/plugins_pkg/npm/apps.py

@@ -1,10 +1,13 @@
 __package__ = 'archivebox.plugins_pkg.npm'
 
+import archivebox
+
 from pathlib import Path
 from typing import List, Optional
 
 from django.conf import settings
-from pydantic import InstanceOf
+
+from pydantic import InstanceOf, model_validator
 
 from pydantic_pkgr import BinProvider, NpmProvider, BinName, PATHStr, BinProviderName
 
@@ -14,8 +17,6 @@ from plugantic.base_binary import BaseBinary, BaseBinProvider, env, apt, brew
 from plugantic.base_hook import BaseHook
 
 
-from ...config import CONFIG
-
 ###################### Config ##########################
 
 
@@ -35,17 +36,24 @@ DEFAULT_GLOBAL_CONFIG = {
 NPM_CONFIG = NpmDependencyConfigs(**DEFAULT_GLOBAL_CONFIG)
 
 
+OLD_NODE_BIN_PATH = archivebox.DATA_DIR / 'node_modules' / '.bin'
+NEW_NODE_BIN_PATH = archivebox.CONSTANTS.LIB_NPM_DIR / 'node_modules' / '.bin'
+
 class SystemNpmProvider(NpmProvider, BaseBinProvider):
     name: BinProviderName = "sys_npm"
-    PATH: PATHStr = str(CONFIG.NODE_BIN_PATH)
     
     npm_prefix: Optional[Path] = None
 
 class LibNpmProvider(NpmProvider, BaseBinProvider):
     name: BinProviderName = "lib_npm"
-    PATH: PATHStr = str(CONFIG.NODE_BIN_PATH)
+    PATH: PATHStr = str(OLD_NODE_BIN_PATH)
+    
+    npm_prefix: Optional[Path] = archivebox.CONSTANTS.LIB_NPM_DIR
     
-    npm_prefix: Optional[Path] = settings.CONFIG.LIB_DIR / 'npm'
+    @model_validator(mode='after')
+    def validate_path(self):
+        assert self.npm_prefix == NEW_NODE_BIN_PATH.parent.parent
+        return self
 
 
 SYS_NPM_BINPROVIDER = SystemNpmProvider()
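
For context, a hedged sketch of where each provider now points (names from this hunk only; assumes Django settings are loaded):

    import archivebox
    from plugins_pkg.npm.apps import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER

    print(SYS_NPM_BINPROVIDER.PATH)          # system npm: inherits the stock NpmProvider search path
    print(LIB_NPM_BINPROVIDER.npm_prefix)    # lib npm: archivebox.CONSTANTS.LIB_NPM_DIR
    # packages installed through LIB_NPM_BINPROVIDER end up under:
    print(archivebox.CONSTANTS.LIB_NPM_DIR / 'node_modules' / '.bin')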

+ 20 - 3
archivebox/plugins_pkg/pip/apps.py

@@ -1,13 +1,14 @@
+__package__ = 'archivebox.plugins_pkg.pip'
+
 import os
 import sys
 import inspect
 import archivebox
 from pathlib import Path
 from typing import List, Dict, Optional, ClassVar
-from pydantic import InstanceOf, Field
+from pydantic import InstanceOf, Field, model_validator
 
 import django
-
 from django.db.backends.sqlite3.base import Database as django_sqlite3     # type: ignore[import-type]
 from django.core.checks import Error, Tags
 from django.conf import settings
@@ -19,6 +20,8 @@ from plugantic.base_check import BaseCheck
 from plugantic.base_binary import BaseBinary, BaseBinProvider, env, apt, brew
 from plugantic.base_hook import BaseHook
 
+from ...misc.logging import hint
+
 
 ###################### Config ##########################
 
@@ -66,7 +69,7 @@ class LibPipBinProvider(PipProvider, BaseBinProvider):
     name: BinProviderName = "lib_pip"
     INSTALLER_BIN: BinName = "pip"
     
-    pip_venv: Optional[Path] = settings.CONFIG.OUTPUT_DIR / 'lib' / 'pip' / 'venv'
+    pip_venv: Optional[Path] = archivebox.CONSTANTS.LIB_PIP_DIR / 'venv'
 
 SYS_PIP_BINPROVIDER = SystemPipBinProvider()
 PIPX_PIP_BINPROVIDER = SystemPipxBinProvider()
@@ -117,6 +120,20 @@ class SqliteBinary(BaseBinary):
             "version": lambda: SemVer(django_sqlite3.version),
         },
     }
+    
+    @model_validator(mode='after')
+    def validate_json_extension_is_available(self):
+        # Check to make sure JSON extension is available in our Sqlite3 instance
+        try:
+            cursor = django_sqlite3.connect(':memory:').cursor()
+            cursor.execute('SELECT JSON(\'{"a": "b"}\')')
+        except django_sqlite3.OperationalError as exc:
+            print(f'[red][X] Your SQLite3 version is missing the required JSON1 extension: {exc}[/red]')
+            hint([
+                'Upgrade your Python version or install the extension manually:',
+                'https://code.djangoproject.com/wiki/JSON1Extension'
+            ])
+        return self
 
 SQLITE_BINARY = SqliteBinary()
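
The same capability check as the validator above, as a standalone stdlib-only snippet (hedged; the plugin version prints a hint instead of raising so startup can continue):

    import sqlite3

    try:
        sqlite3.connect(':memory:').execute('SELECT JSON(\'{"a": "b"}\')')
        print('SQLite JSON1 extension is available')
    except sqlite3.OperationalError as exc:
        print(f'SQLite JSON1 extension is missing: {exc}')
        print('see https://code.djangoproject.com/wiki/JSON1Extension')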
 

+ 7 - 7
archivebox/plugins_pkg/playwright/apps.py

@@ -19,6 +19,8 @@ from pydantic_pkgr import (
     DEFAULT_ENV_PATH,
 )
 
+import archivebox
+
 # Depends on other Django apps:
 from plugantic.base_plugin import BasePlugin
 from plugantic.base_configset import BaseConfigSet
@@ -42,12 +44,10 @@ class PlaywrightConfigs(BaseConfigSet):
     # PLAYWRIGHT_DEFAULT_ARGS: List[str] = ['--timeout={TIMEOUT-10}']
     pass
 
-DEFAULT_GLOBAL_CONFIG = {
-}
 
-PLAYWRIGHT_CONFIG = PlaywrightConfigs(**DEFAULT_GLOBAL_CONFIG)
+PLAYWRIGHT_CONFIG = PlaywrightConfigs()
 
-LIB_DIR_BROWSERS = settings.CONFIG.OUTPUT_DIR / "lib" / "browsers"
+LIB_DIR_BROWSERS = archivebox.CONSTANTS.LIB_BROWSERS_DIR
 
 
 
@@ -65,12 +65,12 @@ class PlaywrightBinProvider(BaseBinProvider):
     name: BinProviderName = "playwright"
     INSTALLER_BIN: BinName = PLAYWRIGHT_BINARY.name
 
-    PATH: PATHStr = f"{settings.CONFIG.BIN_DIR}:{DEFAULT_ENV_PATH}"
+    PATH: PATHStr = f"{archivebox.CONSTANTS.LIB_BIN_DIR}:{DEFAULT_ENV_PATH}"
 
     puppeteer_browsers_dir: Optional[Path] = (
-        Path("~/Library/Caches/ms-playwright").expanduser()
+        Path("~/Library/Caches/ms-playwright").expanduser()      # macos playwright cache dir
         if OPERATING_SYSTEM == "darwin" else
-        Path("~/.cache/ms-playwright").expanduser()
+        Path("~/.cache/ms-playwright").expanduser()              # linux playwright cache dir
     )
     puppeteer_install_args: List[str] = ["install"]  # --with-deps
 

+ 7 - 7
archivebox/plugins_pkg/puppeteer/apps.py

@@ -16,6 +16,8 @@ from pydantic_pkgr import (
     HostBinPath,
 )
 
+import archivebox
+
 # Depends on other Django apps:
 from plugantic.base_plugin import BasePlugin
 from plugantic.base_configset import BaseConfigSet
@@ -40,12 +42,10 @@ class PuppeteerConfigs(BaseConfigSet):
     # PUPPETEER_DEFAULT_ARGS: List[str] = ['--timeout={TIMEOUT-10}']
     pass
 
-DEFAULT_GLOBAL_CONFIG = {
-}
 
-PUPPETEER_CONFIG = PuppeteerConfigs(**DEFAULT_GLOBAL_CONFIG)
+PUPPETEER_CONFIG = PuppeteerConfigs()
 
-LIB_DIR_BROWSERS = settings.CONFIG.OUTPUT_DIR / "lib" / "browsers"
+LIB_DIR_BROWSERS = archivebox.CONSTANTS.LIB_BROWSERS_DIR
 
 
 class PuppeteerBinary(BaseBinary):
@@ -60,8 +60,8 @@ PUPPETEER_BINARY = PuppeteerBinary()
 class PuppeteerBinProvider(BaseBinProvider):
     name: BinProviderName = "puppeteer"
     INSTALLER_BIN: BinName = "npx"
-    
-    PATH: PATHStr = str(settings.CONFIG.BIN_DIR)
+
+    PATH: PATHStr = str(archivebox.CONSTANTS.LIB_BIN_DIR)
 
     puppeteer_browsers_dir: Optional[Path] = LIB_DIR_BROWSERS
     puppeteer_install_args: List[str] = ["@puppeteer/browsers", "install", "--path", str(LIB_DIR_BROWSERS)]
@@ -140,7 +140,7 @@ PUPPETEER_BINPROVIDER = PuppeteerBinProvider()
 
 # ALTERNATIVE INSTALL METHOD using Ansible:
 # install_playbook = self.plugin_dir / 'install_puppeteer.yml'
-# chrome_bin = run_playbook(install_playbook, data_dir=settings.CONFIG.OUTPUT_DIR, quiet=quiet).BINARIES.chrome
+# chrome_bin = run_playbook(install_playbook, data_dir=archivebox.DATA_DIR, quiet=quiet).BINARIES.chrome
 # return self.__class__.model_validate(
 #     {
 #         **self.model_dump(),
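
For context, a hedged illustration of the install command this provider assembles from INSTALLER_BIN and puppeteer_install_args above (the browser spec itself is appended at install time, which is an assumption here):

    import archivebox

    cmd = ['npx', '@puppeteer/browsers', 'install', '--path', str(archivebox.CONSTANTS.LIB_BROWSERS_DIR)]
    print(' '.join(cmd))   # + a browser spec such as 'chrome@stable' chosen by the caller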

+ 58 - 6
archivebox/plugins_sys/config/apps.py

@@ -1,18 +1,24 @@
+__package__ = 'archivebox.plugins_sys.config'
 import os
 import sys
+import shutil
 import platform
+import archivebox
 
-from typing import List, ClassVar
+from typing import List, ClassVar, Dict, Optional
+from datetime import datetime
 from pathlib import Path
-from pydantic import InstanceOf, Field, field_validator, model_validator
+from pydantic import InstanceOf, Field, field_validator, model_validator, computed_field
+from benedict import benedict
 from rich import print
 
 from django.conf import settings
-
+from django.utils.crypto import get_random_string
 from plugantic.base_plugin import BasePlugin
 from plugantic.base_configset import BaseConfigSet, ConfigSectionName
 from plugantic.base_hook import BaseHook
 
+from .constants import CONSTANTS, CONSTANTS_CONFIG
 
 ###################### Config ##########################
 
@@ -24,17 +30,57 @@ class ShellConfig(BaseConfigSet):
     
     IS_TTY: bool                        = Field(default=sys.stdout.isatty())
     USE_COLOR: bool                     = Field(default=lambda c: c.IS_TTY)
-    SHOW_PROGRESS: bool                 = Field(default=lambda c: (c.IS_TTY and platform.system() != 'darwin'))  # progress bars are buggy on mac, disable for now
+    SHOW_PROGRESS: bool                 = Field(default=lambda c: c.IS_TTY)
     
     IN_DOCKER: bool                     = Field(default=False)
     IN_QEMU: bool                       = Field(default=False)
     
+    USER: str                           = Field(default=Path('~').expanduser().resolve().name)
     PUID: int                           = Field(default=os.getuid())
     PGID: int                           = Field(default=os.getgid())
     
     PYTHON_ENCODING: str                = Field(default=(sys.__stdout__ or sys.stdout or sys.__stderr__ or sys.stderr).encoding.upper().replace('UTF8', 'UTF-8'))
 
+    ANSI: Dict[str, str]                = Field(default=lambda c: CONSTANTS.DEFAULT_CLI_COLORS if c.USE_COLOR else CONSTANTS.DISABLED_CLI_COLORS)
+
+    VERSIONS_AVAILABLE: bool = False             # .check_for_update.get_versions_available_on_github(c)
+    CAN_UPGRADE: bool = False                    # .check_for_update.can_upgrade(c)
+
     
+    @computed_field
+    @property
+    def TERM_WIDTH(self) -> int:
+        return shutil.get_terminal_size((100, 10)).columns
+    
+    @computed_field
+    @property
+    def COMMIT_HASH(self) -> Optional[str]:
+        try:
+            git_dir = archivebox.PACKAGE_DIR / '../.git'
+            ref = (git_dir / 'HEAD').read_text().strip().split(' ')[-1]
+            commit_hash = git_dir.joinpath(ref).read_text().strip()
+            return commit_hash
+        except Exception:
+            pass
+    
+        try:
+            return list((archivebox.PACKAGE_DIR / '../.git/refs/heads/').glob('*'))[0].read_text().strip()
+        except Exception:
+            pass
+        
+        return None
+    
+    @computed_field
+    @property
+    def BUILD_TIME(self) -> str:
+        if self.IN_DOCKER:
+            docker_build_end_time = Path('/VERSION.txt').read_text().rsplit('BUILD_END_TIME=')[-1].split('\n', 1)[0]
+            return docker_build_end_time
+    
+        src_last_modified_unix_timestamp = (archivebox.PACKAGE_DIR / 'config.py').stat().st_mtime
+        return datetime.fromtimestamp(src_last_modified_unix_timestamp).strftime('%Y-%m-%d %H:%M:%S %s')
+    
+
     @model_validator(mode='after')
     def validate_not_running_as_root(self):
         attempted_command = ' '.join(sys.argv[:3])
@@ -92,7 +138,7 @@ GENERAL_CONFIG = GeneralConfig()
 class ServerConfig(BaseConfigSet):
     section: ClassVar[ConfigSectionName] = 'SERVER_CONFIG'
 
-    SECRET_KEY: str                     = Field(default=None)
+    SECRET_KEY: str                     = Field(default=lambda: get_random_string(50, 'abcdefghijklmnopqrstuvwxyz0123456789_'))
     BIND_ADDR: str                      = Field(default=lambda: ['127.0.0.1:8000', '0.0.0.0:8000'][SHELL_CONFIG.IN_DOCKER])
     ALLOWED_HOSTS: str                  = Field(default='*')
     CSRF_TRUSTED_ORIGINS: str           = Field(default=lambda c: 'http://localhost:8000,http://127.0.0.1:8000,http://0.0.0.0:8000,http://{}'.format(c.BIND_ADDR))
@@ -179,7 +225,7 @@ SEARCH_BACKEND_CONFIG = SearchBackendConfig()
 
 
 class ConfigPlugin(BasePlugin):
-    app_label: str = 'config'
+    app_label: str = 'CONFIG'
     verbose_name: str = 'Configuration'
 
     hooks: List[InstanceOf[BaseHook]] = [
@@ -190,6 +236,12 @@ class ConfigPlugin(BasePlugin):
         ARCHIVING_CONFIG,
         SEARCH_BACKEND_CONFIG,
     ]
+    
+    # def register(self, settings, parent_plugin=None):
+    #     try:
+    #         super().register(settings, parent_plugin=parent_plugin)
+    #     except Exception as e:
+    #         print(f'[red][X] Error registering config plugin: {e}[/red]', file=sys.stderr)
 
 
 PLUGIN = ConfigPlugin()
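
A hedged usage sketch of the new ShellConfig/ServerConfig behavior (assumes the plugin is registered and that the module exposes SERVER_CONFIG alongside SHELL_CONFIG, following the GENERAL_CONFIG pattern visible above):

    from plugins_sys.config.apps import SHELL_CONFIG, SERVER_CONFIG

    print(SHELL_CONFIG.TERM_WIDTH)     # computed on access from the current terminal size
    print(SHELL_CONFIG.COMMIT_HASH)    # best-effort read of .git/HEAD; None outside a git checkout
    print(SERVER_CONFIG.SECRET_KEY)    # random 50-char key when not provided via config file or env

Since the generated SECRET_KEY is per-process, sessions and other signed values are invalidated on restart unless a fixed key is persisted in ArchiveBox.conf or the environment.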

+ 47 - 0
archivebox/plugins_sys/config/check_for_update.py

@@ -0,0 +1,47 @@
+# def get_versions_available_on_github(config):
+#     """
+#     returns a dictionary containing the ArchiveBox GitHub release info for
+#     the recommended upgrade version and the currently installed version
+#     """
+    
+#     # we only want to perform the (relatively expensive) check for new versions
+#     # when its most relevant, e.g. when the user runs a long-running command
+#     subcommand_run_by_user = sys.argv[3] if len(sys.argv) > 3 else 'help'
+#     long_running_commands = ('add', 'schedule', 'update', 'status', 'server')
+#     if subcommand_run_by_user not in long_running_commands:
+#         return None
+    
+#     github_releases_api = "https://api.github.com/repos/ArchiveBox/ArchiveBox/releases"
+#     response = requests.get(github_releases_api)
+#     if response.status_code != 200:
+#         stderr(f'[!] Warning: GitHub API call to check for new ArchiveBox version failed! (status={response.status_code})', color='lightyellow', config=config)
+#         return None
+#     all_releases = response.json()
+
+#     installed_version = parse_version_string(config['VERSION'])
+
+#     # find current version or nearest older version (to link to)
+#     current_version = None
+#     for idx, release in enumerate(all_releases):
+#         release_version = parse_version_string(release['tag_name'])
+#         if release_version <= installed_version:
+#             current_version = release
+#             break
+
+#     current_version = current_version or all_releases[-1]
+    
+#     # recommended version is whatever comes after current_version in the release list
+#     # (perhaps too conservative to only recommend upgrading one version at a time, but it's safest)
+#     try:
+#         recommended_version = all_releases[idx+1]
+#     except IndexError:
+#         recommended_version = None
+
+#     return {'recommended_version': recommended_version, 'current_version': current_version}
+
+# def can_upgrade(config):
+#     if config['VERSIONS_AVAILABLE'] and config['VERSIONS_AVAILABLE']['recommended_version']:
+#         recommended_version = parse_version_string(config['VERSIONS_AVAILABLE']['recommended_version']['tag_name'])
+#         current_version = parse_version_string(config['VERSIONS_AVAILABLE']['current_version']['tag_name'])
+#         return recommended_version > current_version
+#     return False

+ 1 - 0
archivebox/plugins_sys/config/constants.py

@@ -0,0 +1 @@
+from archivebox.constants import *

+ 4 - 7
archivebox/queues/settings.py

@@ -1,16 +1,13 @@
 from pathlib import Path
 
-from django.conf import settings
 
+import archivebox
+OUTPUT_DIR = archivebox.DATA_DIR
+LOGS_DIR = archivebox.CONSTANTS.LOGS_DIR
 
-OUTPUT_DIR = settings.CONFIG.OUTPUT_DIR
-LOGS_DIR = settings.CONFIG.LOGS_DIR
-
-TMP_DIR = OUTPUT_DIR / "tmp"
+TMP_DIR = archivebox.CONSTANTS.TMP_DIR
 
 Path.mkdir(TMP_DIR, exist_ok=True)
-
-
 CONFIG_FILE = TMP_DIR / "supervisord.conf"
 PID_FILE = TMP_DIR / "supervisord.pid"
 SOCK_FILE = TMP_DIR / "supervisord.sock"

+ 29 - 0
archivebox/system.py

@@ -4,6 +4,7 @@ __package__ = 'archivebox'
 import os
 import signal
 import shutil
+import getpass
 
 from json import dump
 from pathlib import Path
@@ -229,3 +230,31 @@ class suppress_output(object):
         if self.stderr:
             os.dup2(self.real_stderr, 2)
             os.close(self.null_stderr)
+
+
+def get_system_user() -> str:
+    # some host OSes are unable to provide a username (k3s, Windows), making this complicated
+    # uid 999 is especially problematic and breaks many of the lookup methods below
+    SYSTEM_USER = None
+    FALLBACK_USER_PLACEHOLDER = f'user_{os.getuid()}'
+
+    # Option 1
+    try:
+        import pwd
+        SYSTEM_USER = SYSTEM_USER or pwd.getpwuid(os.geteuid()).pw_name
+    except Exception:    # pwd is only available on POSIX systems; fall through on any failure
+        pass
+
+    # Option 2
+    try:
+        SYSTEM_USER = SYSTEM_USER or getpass.getuser()
+    except Exception:
+        pass
+
+    # Option 3
+    try:
+        SYSTEM_USER = SYSTEM_USER or os.getlogin()
+    except Exception:
+        pass
+
+    return SYSTEM_USER or FALLBACK_USER_PLACEHOLDER
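
A hedged usage sketch: this helper is intended as a safer drop-in for getpass.getuser() on hosts where the current uid has no passwd entry:

    from archivebox.system import get_system_user

    print(get_system_user())   # e.g. 'archivebox', or 'user_999' when no username can be resolved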