
move almost all config into new archivebox.CONSTANTS

Nick Sweeting · 1 year ago · parent commit bb65b2dbec
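
In practice, path and filename constants that used to be computed inside archivebox/config.py are now defined once on an immutable archivebox.CONSTANTS object (see the new archivebox/constants.py below), and config.py simply re-exports them. A rough usage sketch of the new access pattern, based only on attributes visible in this diff:

    # sketch: assumes the archivebox package from this commit is importable
    import archivebox
    from archivebox.constants import CONSTANTS

    print(archivebox.VERSION)          # detected in archivebox/__init__.py
    print(CONSTANTS.ARCHIVE_DIR)       # DATA_DIR / 'archive'
    print(CONSTANTS.CONFIG_FILE)       # DATA_DIR / 'ArchiveBox.conf'

    # the NamedTuple also behaves like a read-only mapping, which is how
    # config.py rebuilds its legacy dict: CONSTANTS = archivebox.CONSTANTS._asdict()
    legacy = CONSTANTS._asdict()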

archivebox/__init__.py  (+8 -5)

@@ -1,14 +1,15 @@
 __package__ = 'archivebox'
 
+
 # print('INSTALLING MONKEY PATCHES')
+from .monkey_patches import *                    # noqa
+# print('DONE INSTALLING MONKEY PATCHES')
 
-from .monkey_patches import *
 
 import os
-import importlib
+import importlib.metadata
 from pathlib import Path
 
-
 PACKAGE_DIR = Path(__file__).resolve().parent    # archivebox source code dir
 DATA_DIR = Path(os.curdir).resolve()             # archivebox user data dir
 
@@ -28,7 +29,9 @@ def _detect_installed_version():
 
     raise Exception('Failed to detect installed archivebox version!')
 
+VERSION = _detect_installed_version()
 
-__version__ = _detect_installed_version()
+__version__ = VERSION
 
-# print('DONE INSTALLING MONKEY PATCHES')
+
+from .constants import CONSTANTS

archivebox/config.py  (+57 -427)

@@ -26,10 +26,7 @@ import io
 import re
 import sys
 import json
-import inspect
-import getpass
 import shutil
-import requests
 import archivebox
 
 from hashlib import md5
@@ -38,7 +35,6 @@ from datetime import datetime, timezone
 from typing import Optional, Type, Tuple, Dict
 from subprocess import run, PIPE, DEVNULL, STDOUT, TimeoutExpired
 from configparser import ConfigParser
-import importlib.metadata
 
 from pydantic_pkgr import SemVer
 from rich.progress import Progress
@@ -49,7 +45,6 @@ from django.db.backends.sqlite3.base import Database as sqlite3
 
 
 from .config_stubs import (
     AttrDict,
-    SimpleConfigValueDict,
     ConfigValue,
     ConfigDict,
     ConfigDefaultValue,
@@ -61,7 +56,7 @@ from .misc.logging import (
     ANSI,
     COLOR_DICT,
     stderr,
-    hint,
+    hint,      # noqa
 )
 
 # print('STARTING CONFIG LOADING')
@@ -165,8 +160,8 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
         'MEDIA_MAX_SIZE':           {'type': str,   'default': '750m'},
 
         'USER_AGENT':               {'type': str,   'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)'},
-        'CURL_USER_AGENT':          {'type': str,   'default': lambda c: c['USER_AGENT'] + ' curl/{CURL_VERSION}'},
-        'WGET_USER_AGENT':          {'type': str,   'default': lambda c: c['USER_AGENT'] + ' wget/{WGET_VERSION}'},
+        'CURL_USER_AGENT':          {'type': str,   'default': lambda c: c['USER_AGENT']}, # + ' curl/{CURL_VERSION}'},
+        'WGET_USER_AGENT':          {'type': str,   'default': lambda c: c['USER_AGENT']}, #  + ' wget/{WGET_VERSION}'},
 
         'COOKIES_FILE':             {'type': str,   'default': None},
 
@@ -254,12 +249,12 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
         'CURL_BINARY':              {'type': str,   'default': 'curl'},
         'GIT_BINARY':               {'type': str,   'default': 'git'},
         'WGET_BINARY':              {'type': str,   'default': 'wget'},     # also can accept wget2
-        'SINGLEFILE_BINARY':        {'type': str,   'default': lambda c: bin_path('single-file')},
-        'READABILITY_BINARY':       {'type': str,   'default': lambda c: bin_path('readability-extractor')},
         'MERCURY_BINARY':           {'type': str,   'default': lambda c: bin_path('postlight-parser')},
-        'YOUTUBEDL_BINARY':         {'type': str,   'default': 'yt-dlp'},   # also can accept youtube-dl
         'NODE_BINARY':              {'type': str,   'default': 'node'},
-        'RIPGREP_BINARY':           {'type': str,   'default': 'rg'},
+        # 'YOUTUBEDL_BINARY':         {'type': str,   'default': 'yt-dlp'},   # also can accept youtube-dl
+        # 'SINGLEFILE_BINARY':        {'type': str,   'default': lambda c: bin_path('single-file')},
+        # 'READABILITY_BINARY':       {'type': str,   'default': lambda c: bin_path('readability-extractor')},
+        # 'RIPGREP_BINARY':           {'type': str,   'default': 'rg'},
 
         'POCKET_CONSUMER_KEY':      {'type': str,   'default': None},
         'POCKET_ACCESS_TOKENS':     {'type': dict,  'default': {}},
@@ -308,212 +303,16 @@ CONFIG_FILENAME = 'ArchiveBox.conf'
 
 
 
 
 
 
-STATICFILE_EXTENSIONS = {
-    # 99.999% of the time, URLs ending in these extensions are static files
-    # that can be downloaded as-is, not html pages that need to be rendered
-    'gif', 'jpeg', 'jpg', 'png', 'tif', 'tiff', 'wbmp', 'ico', 'jng', 'bmp',
-    'svg', 'svgz', 'webp', 'ps', 'eps', 'ai',
-    'mp3', 'mp4', 'm4a', 'mpeg', 'mpg', 'mkv', 'mov', 'webm', 'm4v',
-    'flv', 'wmv', 'avi', 'ogg', 'ts', 'm3u8',
-    'pdf', 'txt', 'rtf', 'rtfd', 'doc', 'docx', 'ppt', 'pptx', 'xls', 'xlsx',
-    'atom', 'rss', 'css', 'js', 'json',
-    'dmg', 'iso', 'img',
-    'rar', 'war', 'hqx', 'zip', 'gz', 'bz2', '7z',
-
-    # Less common extensions to consider adding later
-    # jar, swf, bin, com, exe, dll, deb
-    # ear, hqx, eot, wmlc, kml, kmz, cco, jardiff, jnlp, run, msi, msp, msm,
-    # pl pm, prc pdb, rar, rpm, sea, sit, tcl tk, der, pem, crt, xpi, xspf,
-    # ra, mng, asx, asf, 3gpp, 3gp, mid, midi, kar, jad, wml, htc, mml
-
-    # These are always treated as pages, not as static files, never add them:
-    # html, htm, shtml, xhtml, xml, aspx, php, cgi
-}
-
-# When initializing archivebox in a new directory, we check to make sure the dir is
-# actually empty so that we dont clobber someone's home directory or desktop by accident.
-# These files are exceptions to the is_empty check when we're trying to init a new dir,
-# as they could be from a previous archivebox version, system artifacts, dependencies, etc.
-ALLOWED_IN_OUTPUT_DIR = {
-    ".gitignore",
-    "lost+found",
-    ".DS_Store",
-    ".venv",
-    "venv",
-    "virtualenv",
-    ".virtualenv",
-    "node_modules",
-    "package.json",
-    "package-lock.json",
-    "yarn.lock",
-    "static",
-    "sonic",
-    "search.sqlite3",
-    CRONTABS_DIR_NAME,
-    ARCHIVE_DIR_NAME,
-    SOURCES_DIR_NAME,
-    LOGS_DIR_NAME,
-    CACHE_DIR_NAME,
-    LIB_DIR_NAME,
-    PERSONAS_DIR_NAME,
-    SQL_INDEX_FILENAME,
-    f"{SQL_INDEX_FILENAME}-wal",
-    f"{SQL_INDEX_FILENAME}-shm",
-    "queue.sqlite3",
-    "queue.sqlite3-wal",
-    "queue.sqlite3-shm",
-    JSON_INDEX_FILENAME,
-    HTML_INDEX_FILENAME,
-    ROBOTS_TXT_FILENAME,
-    FAVICON_FILENAME,
-    CONFIG_FILENAME,
-    f"{CONFIG_FILENAME}.bak",
-    "static_index.json",
-}
-
 
 
 ALLOWDENYLIST_REGEX_FLAGS: int = re.IGNORECASE | re.UNICODE | re.MULTILINE
 
 
 
 
-CONSTANTS = {
-    "PACKAGE_DIR_NAME":             {'default': lambda c: PACKAGE_DIR_NAME},
-    "LIB_DIR_NAME":                 {'default': lambda c: LIB_DIR_NAME},
-    "TEMPLATES_DIR_NAME":           {'default': lambda c: TEMPLATES_DIR_NAME},
-    "ARCHIVE_DIR_NAME":             {'default': lambda c: ARCHIVE_DIR_NAME},
-    "SOURCES_DIR_NAME":             {'default': lambda c: SOURCES_DIR_NAME},
-    "LOGS_DIR_NAME":                {'default': lambda c: LOGS_DIR_NAME},
-    "CACHE_DIR_NAME":               {'default': lambda c: CACHE_DIR_NAME},
-    "PERSONAS_DIR_NAME":            {'default': lambda c: PERSONAS_DIR_NAME},
-    "CRONTABS_DIR_NAME":            {'default': lambda c: CRONTABS_DIR_NAME},
-    "SQL_INDEX_FILENAME":           {'default': lambda c: SQL_INDEX_FILENAME},
-    "JSON_INDEX_FILENAME":          {'default': lambda c: JSON_INDEX_FILENAME},
-    "HTML_INDEX_FILENAME":          {'default': lambda c: HTML_INDEX_FILENAME},
-    "ROBOTS_TXT_FILENAME":          {'default': lambda c: ROBOTS_TXT_FILENAME},
-    "FAVICON_FILENAME":             {'default': lambda c: FAVICON_FILENAME},
-    "CONFIG_FILENAME":              {'default': lambda c: CONFIG_FILENAME},
-    "DEFAULT_CLI_COLORS":           {'default': lambda c: DEFAULT_CLI_COLORS},
-    "ANSI":                         {'default': lambda c: ANSI},
-    "COLOR_DICT":                   {'default': lambda c: COLOR_DICT},
-    "STATICFILE_EXTENSIONS":        {'default': lambda c: STATICFILE_EXTENSIONS},
-    "ALLOWED_IN_OUTPUT_DIR":        {'default': lambda c: ALLOWED_IN_OUTPUT_DIR},
-    # "ALLOWDENYLIST_REGEX_FLAGS":    {'default': lambda c: ALLOWDENYLIST_REGEX_FLAGS},
-}
+CONSTANTS = archivebox.CONSTANTS._asdict()
 
 
 ############################## Version Config ##################################
 
 
-def get_system_user() -> str:
-    # some host OS's are unable to provide a username (k3s, Windows), making this complicated
-    # uid 999 is especially problematic and breaks many attempts
-    SYSTEM_USER = None
-    FALLBACK_USER_PLACHOLDER = f'user_{os.getuid()}'
 
 
-    # Option 1
-    try:
-        import pwd
-        SYSTEM_USER = SYSTEM_USER or pwd.getpwuid(os.geteuid()).pw_name
-    except (ModuleNotFoundError, Exception):
-        pass
 
 
-    # Option 2
-    try:
-        SYSTEM_USER = SYSTEM_USER or getpass.getuser()
-    except Exception:
-        pass
-
-    # Option 3
-    try:
-        SYSTEM_USER = SYSTEM_USER or os.getlogin()
-    except Exception:
-        pass
-
-    return SYSTEM_USER or FALLBACK_USER_PLACHOLDER
-
-def get_version(config):
-    try:
-        return importlib.metadata.version(__package__ or 'archivebox')
-    except importlib.metadata.PackageNotFoundError:
-        try:
-            pyproject_config = (config['PACKAGE_DIR'] / 'pyproject.toml').read_text()
-            for line in pyproject_config:
-                if line.startswith('version = '):
-                    return line.split(' = ', 1)[-1].strip('"')
-        except FileNotFoundError:
-            # building docs, pyproject.toml is not available
-            return 'dev'
-
-    raise Exception('Failed to detect installed archivebox version!')
-
-def get_commit_hash(config) -> Optional[str]:
-    try:
-        git_dir = config['PACKAGE_DIR'] / '../.git'
-        ref = (git_dir / 'HEAD').read_text().strip().split(' ')[-1]
-        commit_hash = git_dir.joinpath(ref).read_text().strip()
-        return commit_hash
-    except Exception:
-        pass
-
-    try:
-        return list((config['PACKAGE_DIR'] / '../.git/refs/heads/').glob('*'))[0].read_text().strip()
-    except Exception:
-        pass
-    
-    return None
-
-def get_build_time(config) -> str:
-    if config['IN_DOCKER']:
-        docker_build_end_time = Path('/VERSION.txt').read_text().rsplit('BUILD_END_TIME=')[-1].split('\n', 1)[0]
-        return docker_build_end_time
-
-    src_last_modified_unix_timestamp = (config['PACKAGE_DIR'] / 'config.py').stat().st_mtime
-    return datetime.fromtimestamp(src_last_modified_unix_timestamp).strftime('%Y-%m-%d %H:%M:%S %s')
-
-def get_versions_available_on_github(config):
-    """
-    returns a dictionary containing the ArchiveBox GitHub release info for
-    the recommended upgrade version and the currently installed version
-    """
-    
-    # we only want to perform the (relatively expensive) check for new versions
-    # when its most relevant, e.g. when the user runs a long-running command
-    subcommand_run_by_user = sys.argv[3] if len(sys.argv) > 3 else 'help'
-    long_running_commands = ('add', 'schedule', 'update', 'status', 'server')
-    if subcommand_run_by_user not in long_running_commands:
-        return None
-    
-    github_releases_api = "https://api.github.com/repos/ArchiveBox/ArchiveBox/releases"
-    response = requests.get(github_releases_api)
-    if response.status_code != 200:
-        stderr(f'[!] Warning: GitHub API call to check for new ArchiveBox version failed! (status={response.status_code})', color='lightyellow', config=config)
-        return None
-    all_releases = response.json()
-
-    installed_version = parse_version_string(config['VERSION'])
-
-    # find current version or nearest older version (to link to)
-    current_version = None
-    for idx, release in enumerate(all_releases):
-        release_version = parse_version_string(release['tag_name'])
-        if release_version <= installed_version:
-            current_version = release
-            break
-
-    current_version = current_version or all_releases[-1]
-    
-    # recommended version is whatever comes after current_version in the release list
-    # (perhaps too conservative to only recommend upgrading one version at a time, but it's safest)
-    try:
-        recommended_version = all_releases[idx+1]
-    except IndexError:
-        recommended_version = None
-
-    return {'recommended_version': recommended_version, 'current_version': current_version}
-
-def can_upgrade(config):
-    if config['VERSIONS_AVAILABLE'] and config['VERSIONS_AVAILABLE']['recommended_version']:
-        recommended_version = parse_version_string(config['VERSIONS_AVAILABLE']['recommended_version']['tag_name'])
-        current_version = parse_version_string(config['VERSIONS_AVAILABLE']['current_version']['tag_name'])
-        return recommended_version > current_version
-    return False
 
 
 
 
 ############################## Derived Config ##################################
@@ -523,55 +322,25 @@ def can_upgrade(config):
 # These are derived/computed values calculated *after* all user-provided config values are ingested
 # they appear in `archivebox config` output and are intended to be read-only for the user
 DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
-    **CONSTANTS,
+    **{
+        key: {'default': lambda c: val}
+        for key, val in archivebox.CONSTANTS.items()
+    },
 
 
-    'TERM_WIDTH':               {'default': lambda c: lambda: shutil.get_terminal_size((100, 10)).columns},
-    'USER':                     {'default': lambda c: get_system_user()},
-    'ANSI':                     {'default': lambda c: DEFAULT_CLI_COLORS if c['USE_COLOR'] else AttrDict({k: '' for k in DEFAULT_CLI_COLORS.keys()})},
 
 
-    'PACKAGE_DIR':              {'default': lambda c: Path(__file__).resolve().parent},
+    'PACKAGE_DIR':              {'default': lambda c: archivebox.PACKAGE_DIR.resolve()},
     'TEMPLATES_DIR':            {'default': lambda c: c['PACKAGE_DIR'] / TEMPLATES_DIR_NAME},
     'CUSTOM_TEMPLATES_DIR':     {'default': lambda c: c['CUSTOM_TEMPLATES_DIR'] and Path(c['CUSTOM_TEMPLATES_DIR'])},
 
 
-    'OUTPUT_DIR':               {'default': lambda c: Path(c['OUTPUT_DIR']).resolve() if c['OUTPUT_DIR'] else Path(os.curdir).resolve()},
-    'ARCHIVE_DIR':              {'default': lambda c: c['OUTPUT_DIR'] / ARCHIVE_DIR_NAME},
-    'SOURCES_DIR':              {'default': lambda c: c['OUTPUT_DIR'] / SOURCES_DIR_NAME},
-    'LOGS_DIR':                 {'default': lambda c: c['OUTPUT_DIR'] / LOGS_DIR_NAME},
-    'CACHE_DIR':                {'default': lambda c: c['OUTPUT_DIR'] / CACHE_DIR_NAME},
-    'LIB_DIR':                  {'default': lambda c: c['OUTPUT_DIR'] / LIB_DIR_NAME},
-    'BIN_DIR':                  {'default': lambda c: c['OUTPUT_DIR'] / LIB_DIR_NAME / 'bin'},
-    'PERSONAS_DIR':             {'default': lambda c: c['OUTPUT_DIR'] / PERSONAS_DIR_NAME},
-    'CONFIG_FILE':              {'default': lambda c: Path(c['CONFIG_FILE']).resolve() if c['CONFIG_FILE'] else c['OUTPUT_DIR'] / CONFIG_FILENAME},
-    'COOKIES_FILE':             {'default': lambda c: c['COOKIES_FILE'] and Path(c['COOKIES_FILE']).resolve()},
 
 
     'URL_DENYLIST_PTN':         {'default': lambda c: c['URL_DENYLIST'] and re.compile(c['URL_DENYLIST'] or '', ALLOWDENYLIST_REGEX_FLAGS)},
     'URL_ALLOWLIST_PTN':        {'default': lambda c: c['URL_ALLOWLIST'] and re.compile(c['URL_ALLOWLIST'] or '', ALLOWDENYLIST_REGEX_FLAGS)},
     'DIR_OUTPUT_PERMISSIONS':   {'default': lambda c: c['OUTPUT_PERMISSIONS'].replace('6', '7').replace('4', '5')},  # exec is always needed to list directories
 
 
-    'ARCHIVEBOX_BINARY':        {'default': lambda c: sys.argv[0] or bin_path('archivebox')},
-    'NODE_BIN_PATH':            {'default': lambda c: str((Path(c["OUTPUT_DIR"]).absolute() / 'node_modules' / '.bin'))},
-
-    'VERSION':                  {'default': lambda c: get_version(c).split('+', 1)[0]},     # remove +editable from user-displayed version string
-    'COMMIT_HASH':              {'default': lambda c: get_commit_hash(c)},                  # short git commit hash of codebase HEAD commit
-    'BUILD_TIME':               {'default': lambda c: get_build_time(c)},                   # docker build completed time or python src last modified time
-    
-    'VERSIONS_AVAILABLE':       {'default': lambda c: False},             # get_versions_available_on_github(c)},
-    'CAN_UPGRADE':              {'default': lambda c: False},             # can_upgrade(c)},
-
-    'PYTHON_BINARY':            {'default': lambda c: sys.executable},
-    'PYTHON_VERSION':           {'default': lambda c: '{}.{}.{}'.format(*sys.version_info[:3])},
-
-    'DJANGO_BINARY':            {'default': lambda c: inspect.getfile(django)},
-    'DJANGO_VERSION':           {'default': lambda c: '{}.{}.{}'.format(*django.VERSION[:3])},
-    
-    'SQLITE_BINARY':            {'default': lambda c: inspect.getfile(sqlite3)},
-    'SQLITE_VERSION':           {'default': lambda c: sqlite3.version},
-    #'SQLITE_JOURNAL_MODE':      {'default': lambda c: 'wal'},         # set at runtime below, interesting if changed later but unused for now because its always expected to be wal
-    #'SQLITE_OPTIONS':           {'default': lambda c: ['JSON1']},     # set at runtime below
 
 
     'USE_CURL':                 {'default': lambda c: c['USE_CURL'] and (c['SAVE_FAVICON'] or c['SAVE_TITLE'] or c['SAVE_ARCHIVE_DOT_ORG'])},
     'CURL_VERSION':             {'default': lambda c: bin_version(c['CURL_BINARY']) if c['USE_CURL'] else None},
-    'CURL_USER_AGENT':          {'default': lambda c: c['CURL_USER_AGENT'].format(**c)},
+    # 'CURL_USER_AGENT':          {'default': lambda c: c['CURL_USER_AGENT'].format(**c)},
     'CURL_ARGS':                {'default': lambda c: c['CURL_ARGS'] or []},
     'CURL_EXTRA_ARGS':          {'default': lambda c: c['CURL_EXTRA_ARGS'] or []},
     'SAVE_FAVICON':             {'default': lambda c: c['USE_CURL'] and c['SAVE_FAVICON']},
@@ -580,23 +349,14 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
     'USE_WGET':                 {'default': lambda c: c['USE_WGET'] and (c['SAVE_WGET'] or c['SAVE_WARC'])},
     'WGET_VERSION':             {'default': lambda c: bin_version(c['WGET_BINARY']) if c['USE_WGET'] else None},
     'WGET_AUTO_COMPRESSION':    {'default': lambda c: wget_supports_compression(c) if c['USE_WGET'] else False},
-    'WGET_USER_AGENT':          {'default': lambda c: c['WGET_USER_AGENT'].format(**c)},
+    # 'WGET_USER_AGENT':          {'default': lambda c: c['WGET_USER_AGENT'].format(**c)},
     'SAVE_WGET':                {'default': lambda c: c['USE_WGET'] and c['SAVE_WGET']},
     'SAVE_WARC':                {'default': lambda c: c['USE_WGET'] and c['SAVE_WARC']},
     'WGET_ARGS':                {'default': lambda c: c['WGET_ARGS'] or []},
     'WGET_EXTRA_ARGS':          {'default': lambda c: c['WGET_EXTRA_ARGS'] or []},
 
-    # 'RIPGREP_VERSION':          {'default': lambda c: bin_version(c['RIPGREP_BINARY']) if c['USE_RIPGREP'] else None},
-
-    'USE_SINGLEFILE':           {'default': lambda c: c['USE_SINGLEFILE'] and c['SAVE_SINGLEFILE']},
-    'SINGLEFILE_VERSION':       {'default': lambda c: bin_version(c['SINGLEFILE_BINARY']) if c['USE_SINGLEFILE'] else None},
-    'SINGLEFILE_ARGS':          {'default': lambda c: c['SINGLEFILE_ARGS'] or []},
-    'SINGLEFILE_EXTRA_ARGS':    {'default': lambda c: c['SINGLEFILE_EXTRA_ARGS'] or []},
-
-    'USE_READABILITY':          {'default': lambda c: c['USE_READABILITY'] and c['SAVE_READABILITY']},
-    'READABILITY_VERSION':      {'default': lambda c: bin_version(c['READABILITY_BINARY']) if c['USE_READABILITY'] else None},
-
     'USE_MERCURY':              {'default': lambda c: c['USE_MERCURY'] and c['SAVE_MERCURY']},
+    'SAVE_MERCURY':             {'default': lambda c: c['USE_MERCURY'] and c['USE_NODE']},
     'MERCURY_VERSION':          {'default': lambda c: '1.0.0' if shutil.which(str(bin_path(c['MERCURY_BINARY']))) else None},  # mercury doesnt expose version info until this is merged https://github.com/postlight/parser/pull/750
     'MERCURY_ARGS':             {'default': lambda c: c['MERCURY_ARGS'] or []},
     'MERCURY_EXTRA_ARGS':       {'default': lambda c: c['MERCURY_EXTRA_ARGS'] or []},
@@ -605,21 +365,12 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
     'GIT_VERSION':              {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None},
     'SAVE_GIT':                 {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
 
-    'USE_YOUTUBEDL':            {'default': lambda c: c['USE_YOUTUBEDL'] and c['SAVE_MEDIA']},
-    'YOUTUBEDL_VERSION':        {'default': lambda c: bin_version(c['YOUTUBEDL_BINARY']) if c['USE_YOUTUBEDL'] else None},
-    'SAVE_MEDIA':               {'default': lambda c: c['USE_YOUTUBEDL'] and c['SAVE_MEDIA']},
-    'YOUTUBEDL_ARGS':           {'default': lambda c: c['YOUTUBEDL_ARGS'] or []},
-    'YOUTUBEDL_EXTRA_ARGS':     {'default': lambda c: c['YOUTUBEDL_EXTRA_ARGS'] or []},
-
-    'SAVE_READABILITY':         {'default': lambda c: c['USE_READABILITY'] and c['USE_NODE']},
-    'SAVE_MERCURY':             {'default': lambda c: c['USE_MERCURY'] and c['USE_NODE']},
-
-    'USE_NODE':                 {'default': lambda c: c['USE_NODE'] and (c['SAVE_READABILITY'] or c['SAVE_SINGLEFILE'] or c['SAVE_MERCURY'])},
+    'USE_NODE':                 {'default': lambda c: True},
     'NODE_VERSION':             {'default': lambda c: bin_version(c['NODE_BINARY']) if c['USE_NODE'] else None},
 
     'DEPENDENCIES':             {'default': lambda c: get_dependency_info(c)},
-    'CODE_LOCATIONS':           {'default': lambda c: get_code_locations(c)},
-    'DATA_LOCATIONS':           {'default': lambda c: get_data_locations(c)},
+    # 'CODE_LOCATIONS':           {'default': lambda c: get_code_locations(c)},
+    # 'DATA_LOCATIONS':           {'default': lambda c: get_data_locations(c)},
 
     'SAVE_ALLOWLIST_PTN':       {'default': lambda c: c['SAVE_ALLOWLIST'] and {re.compile(k, ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_ALLOWLIST'].items()}},
     'SAVE_DENYLIST_PTN':        {'default': lambda c: c['SAVE_DENYLIST'] and {re.compile(k, ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_DENYLIST'].items()}},
@@ -696,12 +447,10 @@ def load_config_val(key: str,
     raise Exception('Config values can only be str, bool, int, or json')
 
 
-def load_config_file(out_dir: str | None=None) -> Optional[ConfigDict]:
+def load_config_file(out_dir: str | None=archivebox.DATA_DIR) -> Optional[ConfigDict]:
     """load the ini-formatted config file from OUTPUT_DIR/Archivebox.conf"""
 
-    out_dir = out_dir or Path(os.getenv('OUTPUT_DIR', '.')).resolve()
-    assert out_dir and out_dir.is_dir()
-    config_path = Path(out_dir) / CONFIG_FILENAME
+    config_path = archivebox.CONSTANTS.CONFIG_FILE
     if config_path.exists():
         config_file = ConfigParser()
         config_file.optionxform = str
@@ -718,7 +467,7 @@ def load_config_file(out_dir: str | None=None) -> Optional[ConfigDict]:
     return None
 
 
-def write_config_file(config: Dict[str, str], out_dir: str | None=None) -> ConfigDict:
+def write_config_file(config: Dict[str, str], out_dir: str | None=archivebox.DATA_DIR) -> ConfigDict:
     """load the ini-formatted config file from OUTPUT_DIR/Archivebox.conf"""
 
     from .system import atomic_write
@@ -737,8 +486,7 @@ def write_config_file(config: Dict[str, str], out_dir: str | None=None) -> Confi
 
 
     """)
     """)
 
 
-    out_dir = out_dir or Path(os.getenv('OUTPUT_DIR', '.')).resolve()
-    config_path = Path(out_dir) /  CONFIG_FILENAME
+    config_path = archivebox.CONSTANTS.CONFIG_FILE
 
 
     if not config_path.exists():
     if not config_path.exists():
         atomic_write(config_path, CONFIG_HEADER)
         atomic_write(config_path, CONFIG_HEADER)
@@ -833,7 +581,7 @@ def load_config(defaults: ConfigDefaultDict,
             stderr('        https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration')
             stderr()
             # raise
-            raise SystemExit(2)
+            # raise SystemExit(2)
 
     return AttrDict(extended_config)
 
@@ -984,98 +732,6 @@ def wget_supports_compression(config):
     except (FileNotFoundError, OSError):
         return False
 
-def get_code_locations(config: ConfigDict) -> SimpleConfigValueDict:
-    return {
-        'PACKAGE_DIR': {
-            'path': (config['PACKAGE_DIR']).resolve(),
-            'enabled': True,
-            'is_valid': (config['PACKAGE_DIR'] / '__main__.py').exists(),
-        },
-        'TEMPLATES_DIR': {
-            'path': (config['TEMPLATES_DIR']).resolve(),
-            'enabled': True,
-            'is_valid': (config['TEMPLATES_DIR'] / 'static').exists(),
-        },
-        'LIB_DIR': {
-            'path': (config['LIB_DIR']).resolve(),
-            'enabled': True,
-            'is_valid': config['LIB_DIR'].is_dir(),
-        },
-        # 'NODE_MODULES_DIR': {
-        #     'path': ,
-        #     'enabled': ,
-        #     'is_valid': (...).exists(),
-        # },
-    }
-
-def get_data_locations(config: ConfigDict) -> ConfigValue:
-    return {
-        # OLD: migrating to personas
-        # 'CHROME_USER_DATA_DIR': {
-        #     'path': os.path.abspath(config['CHROME_USER_DATA_DIR']),
-        #     'enabled': config['USE_CHROME'] and config['CHROME_USER_DATA_DIR'],
-        #     'is_valid': False if config['CHROME_USER_DATA_DIR'] is None else (Path(config['CHROME_USER_DATA_DIR']) / 'Default').exists(),
-        # },
-        # 'COOKIES_FILE': {
-        #     'path': os.path.abspath(config['COOKIES_FILE']),
-        #     'enabled': config['USE_WGET'] and config['COOKIES_FILE'],
-        #     'is_valid': False if config['COOKIES_FILE'] is None else Path(config['COOKIES_FILE']).exists(),
-        # },
-        "OUTPUT_DIR": {
-            "path": config["OUTPUT_DIR"].resolve(),
-            "enabled": True,
-            "is_valid": (config["OUTPUT_DIR"] / SQL_INDEX_FILENAME).exists(),
-            "is_mount": os.path.ismount(config["OUTPUT_DIR"].resolve()),
-        },
-        "CONFIG_FILE": {
-            "path": config["CONFIG_FILE"].resolve(),
-            "enabled": True,
-            "is_valid": config["CONFIG_FILE"].exists(),
-        },
-        "SQL_INDEX": {
-            "path": (config["OUTPUT_DIR"] / SQL_INDEX_FILENAME).resolve(),
-            "enabled": True,
-            "is_valid": (config["OUTPUT_DIR"] / SQL_INDEX_FILENAME).exists(),
-            "is_mount": os.path.ismount((config["OUTPUT_DIR"] / SQL_INDEX_FILENAME).resolve()),
-        },
-        "ARCHIVE_DIR": {
-            "path": config["ARCHIVE_DIR"].resolve(),
-            "enabled": True,
-            "is_valid": config["ARCHIVE_DIR"].exists(),
-            "is_mount": os.path.ismount(config["ARCHIVE_DIR"].resolve()),
-        },
-        "SOURCES_DIR": {
-            "path": config["SOURCES_DIR"].resolve(),
-            "enabled": True,
-            "is_valid": config["SOURCES_DIR"].exists(),
-        },
-        "PERSONAS_DIR": {
-            "path": config["PERSONAS_DIR"].resolve(),
-            "enabled": True,
-            "is_valid": config["PERSONAS_DIR"].exists(),
-        },
-        "LOGS_DIR": {
-            "path": config["LOGS_DIR"].resolve(),
-            "enabled": True,
-            "is_valid": config["LOGS_DIR"].exists(),
-        },
-        "CACHE_DIR": {
-            "path": config["CACHE_DIR"].resolve(),
-            "enabled": True,
-            "is_valid": config["CACHE_DIR"].exists(),
-        },
-        "CUSTOM_TEMPLATES_DIR": {
-            "path": config["CUSTOM_TEMPLATES_DIR"] and Path(config["CUSTOM_TEMPLATES_DIR"]).resolve(),
-            "enabled": bool(config["CUSTOM_TEMPLATES_DIR"]),
-            "is_valid": config["CUSTOM_TEMPLATES_DIR"] and Path(config["CUSTOM_TEMPLATES_DIR"]).exists(),
-        },
-        # managed by bin/docker_entrypoint.sh and python-crontab:
-        # 'CRONTABS_DIR': {
-        #     'path': config['CRONTABS_DIR'].resolve(),
-        #     'enabled': True,
-        #     'is_valid': config['CRONTABS_DIR'].exists(),
-        # },
-    }
 
 
 def get_dependency_info(config: ConfigDict) -> ConfigValue:
     return {
@@ -1129,20 +785,6 @@ def get_dependency_info(config: ConfigDict) -> ConfigValue:
             'enabled': config['USE_NODE'],
             'is_valid': bool(config['NODE_VERSION']),
         },
-        'SINGLEFILE_BINARY': {
-            'path': bin_path(config['SINGLEFILE_BINARY']),
-            'version': config['SINGLEFILE_VERSION'],
-            'hash': bin_hash(config['SINGLEFILE_BINARY']),
-            'enabled': config['USE_SINGLEFILE'],
-            'is_valid': bool(config['SINGLEFILE_VERSION']),
-        },
-        'READABILITY_BINARY': {
-            'path': bin_path(config['READABILITY_BINARY']),
-            'version': config['READABILITY_VERSION'],
-            'hash': bin_hash(config['READABILITY_BINARY']),
-            'enabled': config['USE_READABILITY'],
-            'is_valid': bool(config['READABILITY_VERSION']),
-        },
         'MERCURY_BINARY': {
             'path': bin_path(config['MERCURY_BINARY']),
             'version': config['MERCURY_VERSION'],
@@ -1157,13 +799,27 @@ def get_dependency_info(config: ConfigDict) -> ConfigValue:
             'enabled': config['USE_GIT'],
             'is_valid': bool(config['GIT_VERSION']),
         },
-        'YOUTUBEDL_BINARY': {
-            'path': bin_path(config['YOUTUBEDL_BINARY']),
-            'version': config['YOUTUBEDL_VERSION'],
-            'hash': bin_hash(config['YOUTUBEDL_BINARY']),
-            'enabled': config['USE_YOUTUBEDL'],
-            'is_valid': bool(config['YOUTUBEDL_VERSION']),
-        },
+        # 'SINGLEFILE_BINARY': {
+        #     'path': bin_path(config['SINGLEFILE_BINARY']),
+        #     'version': config['SINGLEFILE_VERSION'],
+        #     'hash': bin_hash(config['SINGLEFILE_BINARY']),
+        #     'enabled': config['USE_SINGLEFILE'],
+        #     'is_valid': bool(config['SINGLEFILE_VERSION']),
+        # },
+        # 'READABILITY_BINARY': {
+        #     'path': bin_path(config['READABILITY_BINARY']),
+        #     'version': config['READABILITY_VERSION'],
+        #     'hash': bin_hash(config['READABILITY_BINARY']),
+        #     'enabled': config['USE_READABILITY'],
+        #     'is_valid': bool(config['READABILITY_VERSION']),
+        # },
+        # 'YOUTUBEDL_BINARY': {
+        #     'path': bin_path(config['YOUTUBEDL_BINARY']),
+        #     'version': config['YOUTUBEDL_VERSION'],
+        #     'hash': bin_hash(config['YOUTUBEDL_BINARY']),
+        #     'enabled': config['USE_YOUTUBEDL'],
+        #     'is_valid': bool(config['YOUTUBEDL_VERSION']),
+        # },
         # 'CHROME_BINARY': {
         #     'path': bin_path(config['CHROME_BINARY']),
         #     'version': config['CHROME_VERSION'],
@@ -1227,10 +883,6 @@ assert TIMEZONE == 'UTC', 'The server timezone should always be set to UTC'  # n
 os.environ["TZ"] = TIMEZONE                                                  # noqa: F821
 os.environ["TZ"] = TIMEZONE                                                  # noqa: F821
 os.umask(0o777 - int(DIR_OUTPUT_PERMISSIONS, base=8))                        # noqa: F821
 os.umask(0o777 - int(DIR_OUTPUT_PERMISSIONS, base=8))                        # noqa: F821
 
 
-# add ./node_modules/.bin to $PATH so we can use node scripts in extractors
-sys.path.append(CONFIG.NODE_BIN_PATH)
-
-
 ########################### Config Validity Checkers ###########################
 
 if not CONFIG.USE_COLOR:
@@ -1256,6 +908,7 @@ def bump_startup_progress_bar():
 
 
 def setup_django_minimal():
     sys.path.append(str(archivebox.PACKAGE_DIR))
+    os.environ.setdefault('OUTPUT_DIR', str(archivebox.DATA_DIR))
     os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings')
     django.setup()
 
@@ -1267,29 +920,18 @@ def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG,
     with Progress(transient=True, expand=True, console=CONSOLE) as INITIAL_STARTUP_PROGRESS:
         INITIAL_STARTUP_PROGRESS_TASK = INITIAL_STARTUP_PROGRESS.add_task("[green]Loading modules...", total=25)
 
-        output_dir = out_dir or Path(config['OUTPUT_DIR'])
+        output_dir = out_dir or archivebox.DATA_DIR
 
-        assert isinstance(output_dir, Path) and isinstance(config['PACKAGE_DIR'], Path)
+        assert isinstance(output_dir, Path) and isinstance(archivebox.PACKAGE_DIR, Path)
 
         bump_startup_progress_bar()
         try:
             from django.core.management import call_command
 
-            sys.path.append(str(config['PACKAGE_DIR']))
-            os.environ.setdefault('OUTPUT_DIR', str(output_dir))
-            assert (config['PACKAGE_DIR'] / 'core' / 'settings.py').exists(), 'settings.py was not found at archivebox/core/settings.py'
+            sys.path.append(str(archivebox.PACKAGE_DIR))
+            os.environ.setdefault('OUTPUT_DIR', str(archivebox.DATA_DIR))
+            os.environ.setdefault("ARCHIVEBOX_DATABASE_NAME", ":memory:")
             os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings')
-
-            # Check to make sure JSON extension is available in our Sqlite3 instance
-            try:
-                cursor = sqlite3.connect(':memory:').cursor()
-                cursor.execute('SELECT JSON(\'{"a": "b"}\')')
-            except sqlite3.OperationalError as exc:
-                stderr(f'[X] Your SQLite3 version is missing the required JSON1 extension: {exc}', color='red')
-                hint([
-                    'Upgrade your Python version or install the extension manually:',
-                    'https://code.djangoproject.com/wiki/JSON1Extension'
-                ])
                 
                 
             bump_startup_progress_bar()
 
@@ -1310,28 +952,16 @@ def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG,
             bump_startup_progress_bar()
 
             from django.conf import settings
+            
+            from plugins_sys.config.apps import SHELL_CONFIG
 
             # log startup message to the error log
             with open(settings.ERROR_LOG, "a", encoding='utf-8') as f:
                 command = ' '.join(sys.argv)
                 ts = datetime.now(timezone.utc).strftime('%Y-%m-%d__%H:%M:%S')
-                f.write(f"\n> {command}; TS={ts} VERSION={config['VERSION']} IN_DOCKER={config['IN_DOCKER']} IS_TTY={config['IS_TTY']}\n")
+                f.write(f"\n> {command}; TS={ts} VERSION={archivebox.VERSION} IN_DOCKER={SHELL_CONFIG.IN_DOCKER} IS_TTY={SHELL_CONFIG.IS_TTY}\n")
 
             if check_db:
-                # Enable WAL mode in sqlite3
-                from django.db import connection
-                with connection.cursor() as cursor:
-
-                    # Set Journal mode to WAL to allow for multiple writers
-                    current_mode = cursor.execute("PRAGMA journal_mode")
-                    if current_mode != 'wal':
-                        cursor.execute("PRAGMA journal_mode=wal;")
-
-                    # Set max blocking delay for concurrent writes and write sync mode
-                    # https://litestream.io/tips/#busy-timeout
-                    cursor.execute("PRAGMA busy_timeout = 5000;")
-                    cursor.execute("PRAGMA synchronous = NORMAL;")
-
                 # Create cache table in DB if needed
                 try:
                     from django.core.cache import cache
@@ -1348,9 +978,9 @@ def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG,
                 for conn in connections.all():
                     conn.close_if_unusable_or_obsolete()
 
-                sql_index_path = Path(output_dir) / SQL_INDEX_FILENAME
+                sql_index_path = archivebox.CONSTANTS.DATABASE_FILE
                 assert sql_index_path.exists(), (
-                    f'No database file {SQL_INDEX_FILENAME} found in: {config["OUTPUT_DIR"]} (Are you in an ArchiveBox collection directory?)')
+                    f'No database file {sql_index_path} found in: {archivebox.DATA_DIR} (Are you in an ArchiveBox collection directory?)')
 
                 bump_startup_progress_bar()
 
@@ -1363,7 +993,7 @@ def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG,
 
 
                     logfire.configure()
                     logfire.instrument_django(is_sql_commentor_enabled=True)
-                    logfire.info(f'Started ArchiveBox v{CONFIG.VERSION}', argv=sys.argv)
+                    logfire.info(f'Started ArchiveBox v{archivebox.VERSION}', argv=sys.argv)
 
         except KeyboardInterrupt:
             raise SystemExit(2)

archivebox/constants.py  (+249 -0)

@@ -0,0 +1,249 @@
+__package__ = 'archivebox'
+
+
+import os
+from types import MappingProxyType
+from typing import Set, Dict, NamedTuple, Tuple
+from pathlib import Path
+
+from benedict import benedict
+
+import archivebox
+
+from .misc.logging import DEFAULT_CLI_COLORS
+
+###################### Config ##########################
+
+class ConstantsConfig(NamedTuple):
+    
+    VERSION: str = archivebox.__version__
+    
+    DEFAULT_CLI_COLORS: Dict[str, str]        = DEFAULT_CLI_COLORS
+    DISABLED_CLI_COLORS: Dict[str, str]       = benedict({k: '' for k in DEFAULT_CLI_COLORS})
+    
+    PACKAGE_DIR: Path = archivebox.PACKAGE_DIR
+    PACKAGE_DIR_NAME: str = archivebox.PACKAGE_DIR.name
+    TEMPLATES_DIR_NAME: str = 'templates'
+    TEMPLATES_DIR: Path                 = archivebox.PACKAGE_DIR / TEMPLATES_DIR_NAME
+    STATIC_DIR: Path                    = TEMPLATES_DIR / 'static'
+    USER_PLUGINS_DIR_NAME: str          = 'user_plugins'
+    CUSTOM_TEMPLATES_DIR_NAME: str      = 'user_templates'
+    
+    DATA_DIR: Path = archivebox.DATA_DIR
+    ARCHIVE_DIR_NAME: str = 'archive'
+    SOURCES_DIR_NAME: str = 'sources'
+    PERSONAS_DIR_NAME: str = 'personas'
+    CRONTABS_DIR_NAME: str = 'crontabs'
+    CACHE_DIR_NAME: str = 'cache'
+    LOGS_DIR_NAME: str = 'logs'
+    LIB_DIR_NAME: str = 'lib'
+    TMP_DIR_NAME: str = 'tmp'
+    OUTPUT_DIR: Path                    = archivebox.DATA_DIR
+    ARCHIVE_DIR: Path                   = archivebox.DATA_DIR / ARCHIVE_DIR_NAME
+    SOURCES_DIR: Path                   = archivebox.DATA_DIR / SOURCES_DIR_NAME
+    PERSONAS_DIR: Path                  = archivebox.DATA_DIR / PERSONAS_DIR_NAME
+    CACHE_DIR: Path                     = archivebox.DATA_DIR / CACHE_DIR_NAME
+    LOGS_DIR: Path                      = archivebox.DATA_DIR / LOGS_DIR_NAME
+    LIB_DIR: Path                       = archivebox.DATA_DIR / LIB_DIR_NAME
+    TMP_DIR: Path                       = archivebox.DATA_DIR / TMP_DIR_NAME
+    CUSTOM_TEMPLATES_DIR: Path          = archivebox.DATA_DIR / CUSTOM_TEMPLATES_DIR_NAME
+    USER_PLUGINS_DIR: Path              = archivebox.DATA_DIR / USER_PLUGINS_DIR_NAME
+    
+    LIB_PIP_DIR: Path                   = LIB_DIR / 'pip'
+    LIB_NPM_DIR: Path                   = LIB_DIR / 'npm'
+    LIB_BROWSERS_DIR: Path              = LIB_DIR / 'browsers'
+    LIB_BIN_DIR: Path                   = LIB_DIR / 'bin'
+    BIN_DIR: Path                       = LIB_BIN_DIR
+    
+    CONFIG_FILENAME: str = 'ArchiveBox.conf'
+    SQL_INDEX_FILENAME: str = 'index.sqlite3'
+    
+    CONFIG_FILE: Path                   = archivebox.DATA_DIR / CONFIG_FILENAME
+    DATABASE_FILE: Path                 = archivebox.DATA_DIR / SQL_INDEX_FILENAME
+    QUEUE_DATABASE_FILE: Path           = archivebox.DATA_DIR / SQL_INDEX_FILENAME.replace('index.', 'queue.')
+    
+    JSON_INDEX_FILENAME: str = 'index.json'
+    HTML_INDEX_FILENAME: str = 'index.html'
+    ROBOTS_TXT_FILENAME: str = 'robots.txt'
+    FAVICON_FILENAME: str = 'favicon.ico'
+    
+    STATICFILE_EXTENSIONS: frozenset[str] = frozenset((
+        # 99.999% of the time, URLs ending in these extensions are static files
+        # that can be downloaded as-is, not html pages that need to be rendered
+        'gif', 'jpeg', 'jpg', 'png', 'tif', 'tiff', 'wbmp', 'ico', 'jng', 'bmp',
+        'svg', 'svgz', 'webp', 'ps', 'eps', 'ai',
+        'mp3', 'mp4', 'm4a', 'mpeg', 'mpg', 'mkv', 'mov', 'webm', 'm4v',
+        'flv', 'wmv', 'avi', 'ogg', 'ts', 'm3u8',
+        'pdf', 'txt', 'rtf', 'rtfd', 'doc', 'docx', 'ppt', 'pptx', 'xls', 'xlsx',
+        'atom', 'rss', 'css', 'js', 'json',
+        'dmg', 'iso', 'img',
+        'rar', 'war', 'hqx', 'zip', 'gz', 'bz2', '7z',
+
+        # Less common extensions to consider adding later
+        # jar, swf, bin, com, exe, dll, deb
+        # ear, hqx, eot, wmlc, kml, kmz, cco, jardiff, jnlp, run, msi, msp, msm,
+        # pl pm, prc pdb, rar, rpm, sea, sit, tcl tk, der, pem, crt, xpi, xspf,
+        # ra, mng, asx, asf, 3gpp, 3gp, mid, midi, kar, jad, wml, htc, mml
+
+        # These are always treated as pages, not as static files, never add them:
+        # html, htm, shtml, xhtml, xml, aspx, php, cgi
+    ))
+    
+    INGORED_PATHS: frozenset[str] = frozenset((
+        ".git",
+        ".svn",
+        ".DS_Store",
+        ".gitignore",
+        "lost+found",
+        ".DS_Store",
+        ".env",
+        "Dockerfile",
+    ))
+    PIP_RELATED_NAMES: frozenset[str] = frozenset((
+        ".venv",
+        "venv",
+        "virtualenv",
+        ".virtualenv",
+    ))
+    NPM_RELATED_NAMES: frozenset[str] = frozenset((
+        "node_modules",
+        "package.json",
+        "package-lock.json",
+        "yarn.lock",
+    ))
+    
+    DATA_DIR_NAMES: frozenset[str] = frozenset((
+        ARCHIVE_DIR_NAME,
+        SOURCES_DIR_NAME,
+        LOGS_DIR_NAME,
+        CACHE_DIR_NAME,
+        LIB_DIR_NAME,
+        PERSONAS_DIR_NAME,
+        CUSTOM_TEMPLATES_DIR_NAME,
+        USER_PLUGINS_DIR_NAME,
+    ))
+    DATA_DIRS: frozenset[Path] = frozenset(archivebox.DATA_DIR / dirname for dirname in DATA_DIR_NAMES)
+    DATA_FILE_NAMES: frozenset[str] = frozenset((
+        CONFIG_FILENAME,
+        SQL_INDEX_FILENAME,
+        f"{SQL_INDEX_FILENAME}-wal",
+        f"{SQL_INDEX_FILENAME}-shm",
+        "queue.sqlite3",
+        "queue.sqlite3-wal",
+        "queue.sqlite3-shm",
+        "search.sqlite3",
+        JSON_INDEX_FILENAME,
+        HTML_INDEX_FILENAME,
+        ROBOTS_TXT_FILENAME,
+        FAVICON_FILENAME,
+        CONFIG_FILENAME,
+        f"{CONFIG_FILENAME}.bak",
+        "static_index.json",
+    ))
+    
+    # When initializing archivebox in a new directory, we check to make sure the dir is
+    # actually empty so that we dont clobber someone's home directory or desktop by accident.
+    # These files are exceptions to the is_empty check when we're trying to init a new dir,
+    # as they could be from a previous archivebox version, system artifacts, dependencies, etc.
+    ALLOWED_IN_OUTPUT_DIR: frozenset[str] = frozenset((
+        *INGORED_PATHS,
+        *PIP_RELATED_NAMES,
+        *NPM_RELATED_NAMES,
+        *DATA_DIR_NAMES,
+        *DATA_FILE_NAMES,
+        "static",                # created by old static exports <v0.6.0
+        "sonic",                 # created by docker bind mount
+    ))
+    
+    CODE_LOCATIONS = MappingProxyType(benedict({
+        'PACKAGE_DIR': {
+            'path': (archivebox.PACKAGE_DIR).resolve(),
+            'enabled': True,
+            'is_valid': (archivebox.PACKAGE_DIR / '__main__.py').exists(),
+        },
+        'LIB_DIR': {
+            'path': LIB_DIR.resolve(),
+            'enabled': True,
+            'is_valid': LIB_DIR.is_dir(),
+        },
+        'RUNTIME_CONFIG': {
+            'path': TMP_DIR.resolve(),
+            'enabled': True,
+            'is_valid': TMP_DIR.is_dir(),
+        },
+        'TEMPLATES_DIR': {
+            'path': TEMPLATES_DIR.resolve(),
+            'enabled': True,
+            'is_valid': STATIC_DIR.exists(),
+        },
+        'CUSTOM_TEMPLATES_DIR': {
+            'path': CUSTOM_TEMPLATES_DIR.resolve(),
+            'enabled': True,
+            'is_valid': CUSTOM_TEMPLATES_DIR.is_dir(),
+        },
+    }))
+        
+    DATA_LOCATIONS = MappingProxyType(benedict({
+        "OUTPUT_DIR": {
+            "path": archivebox.DATA_DIR.resolve(),
+            "enabled": True,
+            "is_valid": DATABASE_FILE.exists(),
+            "is_mount": os.path.ismount(archivebox.DATA_DIR.resolve()),
+        },
+        "CONFIG_FILE": {
+            "path": CONFIG_FILE.resolve(),
+            "enabled": True,
+            "is_valid": CONFIG_FILE.exists(),
+        },
+        "SQL_INDEX": {
+            "path": DATABASE_FILE.resolve(),
+            "enabled": True,
+            "is_valid": DATABASE_FILE.exists(),
+            "is_mount": os.path.ismount(DATABASE_FILE.resolve()),
+        },
+        "QUEUE_DATABASE": {
+            "path": QUEUE_DATABASE_FILE.resolve(),
+            "enabled": True,
+            "is_valid": QUEUE_DATABASE_FILE.exists(),
+            "is_mount": os.path.ismount(QUEUE_DATABASE_FILE.resolve()),
+        },
+        "ARCHIVE_DIR": {
+            "path": ARCHIVE_DIR.resolve(),
+            "enabled": True,
+            "is_valid": ARCHIVE_DIR.exists(),
+            "is_mount": os.path.ismount(ARCHIVE_DIR.resolve()),
+        },
+        "SOURCES_DIR": {
+            "path": SOURCES_DIR.resolve(),
+            "enabled": True,
+            "is_valid": SOURCES_DIR.exists(),
+        },
+        "PERSONAS_DIR": {
+            "path": PERSONAS_DIR.resolve(),
+            "enabled": PERSONAS_DIR.exists(),
+            "is_valid": PERSONAS_DIR.exists(),
+        },
+        "LOGS_DIR": {
+            "path": LOGS_DIR.resolve(),
+            "enabled": True,
+            "is_valid": LOGS_DIR.is_dir(),
+        },
+        "CACHE_DIR": {
+            "path": CACHE_DIR.resolve(),
+            "enabled": True,
+            "is_valid": CACHE_DIR.is_dir(),
+        },
+    }))
+    
+    def items(self):
+        return self._asdict().items()
+    
+    def keys(self):
+        return self._asdict().keys()
+    
+    def values(self):
+        return self._asdict().values()
+
+
+CONSTANTS = ConstantsConfig()
+CONSTANTS_CONFIG = CONSTANTS
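
Because ConstantsConfig is a NamedTuple, the CONSTANTS singleton is computed once at import time and is effectively read-only, so it can be imported anywhere without pulling in the legacy config loader. A small illustration (a sketch; field names are taken from the class above):

    from archivebox.constants import CONSTANTS

    CONSTANTS.DATABASE_FILE                               # DATA_DIR / 'index.sqlite3'
    CONSTANTS.DATA_LOCATIONS['SQL_INDEX']['is_valid']     # True once the collection has been initialized

    try:
        CONSTANTS.DATA_DIR = '/tmp'                       # NamedTuple fields cannot be reassigned
    except AttributeError:
        pass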

archivebox/core/admin.py  (+3 -3)

@@ -2,7 +2,6 @@ __package__ = 'archivebox.core'
 
 
 import os
 
-import threading
 from pathlib import Path
 
 from django.contrib import admin, messages
@@ -19,6 +18,7 @@ from django.template import Template, RequestContext
 from django.conf import settings
 from django import forms
 
+import archivebox
 
 from signal_webhooks.admin import WebhookAdmin
 from signal_webhooks.utils import get_webhook_model
@@ -34,13 +34,13 @@ from queues.tasks import bg_archive_links, bg_archive_link, bg_add
 
 
 from index.html import snapshot_icons
 from logging_util import printable_filesize
-from main import add, remove
+from main import remove
 from extractors import archive_links
 
 
 CONFIG = settings.CONFIG
 
-GLOBAL_CONTEXT = {'VERSION': CONFIG.VERSION, 'VERSIONS_AVAILABLE': CONFIG.VERSIONS_AVAILABLE, 'CAN_UPGRADE': CONFIG.CAN_UPGRADE}
+GLOBAL_CONTEXT = {'VERSION': archivebox.VERSION, 'VERSIONS_AVAILABLE': [], 'CAN_UPGRADE': False}
 
 # Admin URLs
 # /admin/

archivebox/core/settings.py  (+31 -39)

@@ -2,36 +2,27 @@ __package__ = 'archivebox.core'
 
 
 import os
 import sys
-import re
-import logging
 import inspect
-import tempfile
-import archivebox
 
 from typing import Dict
 from pathlib import Path
 
-import django
+from benedict import benedict
 from django.utils.crypto import get_random_string
 
+import archivebox
+
 from ..config import CONFIG
-from ..config_stubs import AttrDict
-assert isinstance(CONFIG, AttrDict)
 
 IS_MIGRATING = 'makemigrations' in sys.argv[:3] or 'migrate' in sys.argv[:3]
 IS_TESTING = 'test' in sys.argv[:3] or 'PYTEST_CURRENT_TEST' in os.environ
 IS_SHELL = 'shell' in sys.argv[:3] or 'shell_plus' in sys.argv[:3]
 
 
+VERSION = archivebox.__version__
 PACKAGE_DIR = archivebox.PACKAGE_DIR
-assert PACKAGE_DIR == CONFIG.PACKAGE_DIR
-
 DATA_DIR = archivebox.DATA_DIR
-assert DATA_DIR == CONFIG.OUTPUT_DIR
-ARCHIVE_DIR = DATA_DIR / 'archive'
-assert ARCHIVE_DIR == CONFIG.ARCHIVE_DIR
-
-VERSION = archivebox.__version__
+ARCHIVE_DIR = archivebox.DATA_DIR / 'archive'
 
 ################################################################################
 ### ArchiveBox Plugin Settings
@@ -39,17 +30,16 @@ VERSION = archivebox.__version__
 
 
 
 
 def find_plugins_in_dir(plugins_dir: Path, prefix: str) -> Dict[str, Path]:
 def find_plugins_in_dir(plugins_dir: Path, prefix: str) -> Dict[str, Path]:
-    """{"plugins_pkg.pip": "/app/archivebox/plugins_pkg/pip", "user_plugins.other": "/data/user_plugins/other",...}"""
     return {
     return {
         f"{prefix}.{plugin_entrypoint.parent.name}": plugin_entrypoint.parent
         f"{prefix}.{plugin_entrypoint.parent.name}": plugin_entrypoint.parent
         for plugin_entrypoint in sorted(plugins_dir.glob("*/apps.py"))   # key=get_plugin_order  # Someday enforcing plugin import order may be required, but right now it's not needed
         for plugin_entrypoint in sorted(plugins_dir.glob("*/apps.py"))   # key=get_plugin_order  # Someday enforcing plugin import order may be required, but right now it's not needed
-    }
+    }   # "plugins_pkg.pip": "/app/archivebox/plugins_pkg/pip"
     
     
 PLUGIN_DIRS = {
 PLUGIN_DIRS = {
     'plugins_sys':          PACKAGE_DIR / 'plugins_sys',
     'plugins_sys':          PACKAGE_DIR / 'plugins_sys',
     'plugins_pkg':          PACKAGE_DIR / 'plugins_pkg',
     'plugins_pkg':          PACKAGE_DIR / 'plugins_pkg',
     'plugins_auth':         PACKAGE_DIR / 'plugins_auth',
     'plugins_auth':         PACKAGE_DIR / 'plugins_auth',
-    'plugins_search':         PACKAGE_DIR / 'plugins_search',
+    'plugins_search':       PACKAGE_DIR / 'plugins_search',
     'plugins_extractor':    PACKAGE_DIR / 'plugins_extractor',
     'plugins_extractor':    PACKAGE_DIR / 'plugins_extractor',
     'user_plugins':         DATA_DIR / 'user_plugins',
     'user_plugins':         DATA_DIR / 'user_plugins',
 }
 }
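Below, a small sketch (assumed directory layout, not code from this commit) of what find_plugins_in_dir() yields when called on the PLUGIN_DIRS above; the expected mapping is the one the removed docstring described:

from pathlib import Path

# assuming these entrypoints exist on disk:
#   /app/archivebox/plugins_pkg/pip/apps.py
#   /data/user_plugins/other/apps.py
found = {
    **find_plugins_in_dir(Path('/app/archivebox/plugins_pkg'), prefix='plugins_pkg'),
    **find_plugins_in_dir(Path('/data/user_plugins'), prefix='user_plugins'),
}
# found == {
#     'plugins_pkg.pip':    Path('/app/archivebox/plugins_pkg/pip'),
#     'user_plugins.other': Path('/data/user_plugins/other'),
# }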
@@ -59,17 +49,17 @@ for plugin_prefix, plugin_dir in PLUGIN_DIRS.items():
 
 
 
 
 ### Plugins Globals (filled by plugin_type.pluginname.apps.PluginName.register() after Django startup)
 ### Plugins Globals (filled by plugin_type.pluginname.apps.PluginName.register() after Django startup)
-PLUGINS = AttrDict({})
-HOOKS = AttrDict({})
+PLUGINS = benedict({})
+HOOKS = benedict({})
 
 
-# Created later by Hook.register(settings) when each Plugin.register(settings) is called
-# CONFIGS = AttrDict({})
-# BINPROVIDERS = AttrDict({})
-# BINARIES = AttrDict({})
-# EXTRACTORS = AttrDict({})
-# REPLAYERS = AttrDict({})
-# CHECKS = AttrDict({})
-# ADMINDATAVIEWS = AttrDict({})
+# Created later by Plugin.register(settings) -> Hook.register(settings):
+# CONFIGS = benedict({})
+# BINPROVIDERS = benedict({})
+# BINARIES = benedict({})
+# EXTRACTORS = benedict({})
+# REPLAYERS = benedict({})
+# CHECKS = benedict({})
+# ADMINDATAVIEWS = benedict({})
 
 
 
 
 ################################################################################
 ################################################################################
@@ -113,7 +103,7 @@ INSTALLED_APPS = [
     'api',                       # Django-Ninja-based Rest API interfaces, config, APIToken model, etc.
     'api',                       # Django-Ninja-based Rest API interfaces, config, APIToken model, etc.
 
 
     # ArchiveBox plugins
     # ArchiveBox plugins
-    *INSTALLED_PLUGINS.keys(),   # all plugin django-apps found in archivebox/*_plugins and data/user_plugins,
+    *INSTALLED_PLUGINS.keys(),   # all plugin django-apps found in archivebox/plugins_* and data/user_plugins,
     # plugin.register(settings) is called at import of each plugin (in the order they are listed here), then plugin.ready() is called at AppConfig.ready() time
     # plugin.register(settings) is called at import of each plugin (in the order they are listed here), then plugin.ready() is called at AppConfig.ready() time
 
 
     # 3rd-party apps from PyPI that need to be loaded last
     # 3rd-party apps from PyPI that need to be loaded last
@@ -164,7 +154,7 @@ if LDAP_CONFIG.LDAP_ENABLED:
 ################################################################################
 ################################################################################
 
 
 STATIC_URL = '/static/'
 STATIC_URL = '/static/'
-
+TEMPLATES_DIR_NAME = 'templates'
 STATICFILES_DIRS = [
 STATICFILES_DIRS = [
     *([str(CONFIG.CUSTOM_TEMPLATES_DIR / 'static')] if CONFIG.CUSTOM_TEMPLATES_DIR else []),
     *([str(CONFIG.CUSTOM_TEMPLATES_DIR / 'static')] if CONFIG.CUSTOM_TEMPLATES_DIR else []),
     *[
     *[
@@ -172,7 +162,7 @@ STATICFILES_DIRS = [
         for plugin_dir in PLUGIN_DIRS.values()
         for plugin_dir in PLUGIN_DIRS.values()
         if (plugin_dir / 'static').is_dir()
         if (plugin_dir / 'static').is_dir()
     ],
     ],
-    str(PACKAGE_DIR / CONFIG.TEMPLATES_DIR_NAME / 'static'),
+    str(PACKAGE_DIR / TEMPLATES_DIR_NAME / 'static'),
 ]
 ]
 
 
 TEMPLATE_DIRS = [
 TEMPLATE_DIRS = [
@@ -182,9 +172,9 @@ TEMPLATE_DIRS = [
         for plugin_dir in PLUGIN_DIRS.values()
         for plugin_dir in PLUGIN_DIRS.values()
         if (plugin_dir / 'templates').is_dir()
         if (plugin_dir / 'templates').is_dir()
     ],
     ],
-    str(PACKAGE_DIR / CONFIG.TEMPLATES_DIR_NAME / 'core'),
-    str(PACKAGE_DIR / CONFIG.TEMPLATES_DIR_NAME / 'admin'),
-    str(PACKAGE_DIR / CONFIG.TEMPLATES_DIR_NAME),
+    str(PACKAGE_DIR / TEMPLATES_DIR_NAME / 'core'),
+    str(PACKAGE_DIR / TEMPLATES_DIR_NAME / 'admin'),
+    str(PACKAGE_DIR / TEMPLATES_DIR_NAME),
 ]
 ]
 
 
 TEMPLATES = [
 TEMPLATES = [
@@ -208,13 +198,14 @@ TEMPLATES = [
 ### External Service Settings
 ### External Service Settings
 ################################################################################
 ################################################################################
 
 
+from ..plugins_sys.config.constants import CONSTANTS
 
 
-CACHE_DB_FILENAME = 'cache.sqlite3'
-CACHE_DB_PATH = CONFIG.CACHE_DIR / CACHE_DB_FILENAME
-CACHE_DB_TABLE = 'django_cache'
+# CACHE_DB_FILENAME = 'cache.sqlite3'
+# CACHE_DB_PATH = CONSTANTS.CACHE_DIR / CACHE_DB_FILENAME
+# CACHE_DB_TABLE = 'django_cache'
 
 
-DATABASE_FILE = DATA_DIR / CONFIG.SQL_INDEX_FILENAME
-DATABASE_NAME = os.environ.get("ARCHIVEBOX_DATABASE_NAME", str(DATABASE_FILE))
+DATABASE_FILE = DATA_DIR / CONSTANTS.SQL_INDEX_FILENAME
+DATABASE_NAME = os.environ.get("ARCHIVEBOX_DATABASE_NAME", str(CONSTANTS.DATABASE_FILE))
 
 
 QUEUE_DATABASE_NAME = DATABASE_NAME.replace('index.sqlite3', 'queue.sqlite3')
 QUEUE_DATABASE_NAME = DATABASE_NAME.replace('index.sqlite3', 'queue.sqlite3')
 
 
@@ -222,6 +213,7 @@ SQLITE_CONNECTION_OPTIONS = {
     "TIME_ZONE": CONFIG.TIMEZONE,
     "TIME_ZONE": CONFIG.TIMEZONE,
     "OPTIONS": {
     "OPTIONS": {
         # https://gcollazo.com/optimal-sqlite-settings-for-django/
         # https://gcollazo.com/optimal-sqlite-settings-for-django/
+        # https://litestream.io/tips/#busy-timeout
         "timeout": 5,
         "timeout": 5,
         "check_same_thread": False,
         "check_same_thread": False,
         "transaction_mode": "IMMEDIATE",
         "transaction_mode": "IMMEDIATE",
@@ -345,7 +337,7 @@ STORAGES = {
         "BACKEND": "django.core.files.storage.FileSystemStorage",
         "BACKEND": "django.core.files.storage.FileSystemStorage",
         "OPTIONS": {
         "OPTIONS": {
             "base_url": "/archive/",
             "base_url": "/archive/",
-            "location": CONFIG.ARCHIVE_DIR,
+            "location": ARCHIVE_DIR,
         },
         },
     },
     },
     # "personas": {
     # "personas": {

+ 3 - 2
archivebox/extractors/__init__.py

@@ -14,7 +14,6 @@ from ..config import (
     SAVE_ALLOWLIST_PTN,
     SAVE_ALLOWLIST_PTN,
     SAVE_DENYLIST_PTN,
     SAVE_DENYLIST_PTN,
 )
 )
-from ..core.settings import ERROR_LOG
 from ..index.schema import ArchiveResult, Link
 from ..index.schema import ArchiveResult, Link
 from ..index.sql import write_link_to_sql_index
 from ..index.sql import write_link_to_sql_index
 from ..index import (
 from ..index import (
@@ -109,6 +108,8 @@ def ignore_methods(to_ignore: List[str]) -> Iterable[str]:
 def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None, created_by_id: int | None=None) -> Link:
 def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None, created_by_id: int | None=None) -> Link:
     """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp"""
     """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp"""
 
 
+    from django.conf import settings
+
     from ..search import write_search_index
     from ..search import write_search_index
 
 
     # TODO: Remove when the input is changed to be a snapshot. Suboptimal approach.
     # TODO: Remove when the input is changed to be a snapshot. Suboptimal approach.
@@ -169,7 +170,7 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
                     stats['skipped'] += 1
                     stats['skipped'] += 1
             except Exception as e:
             except Exception as e:
                 # https://github.com/ArchiveBox/ArchiveBox/issues/984#issuecomment-1150541627
                 # https://github.com/ArchiveBox/ArchiveBox/issues/984#issuecomment-1150541627
-                with open(ERROR_LOG, "a", encoding='utf-8') as f:
+                with open(settings.ERROR_LOG, "a", encoding='utf-8') as f:
                     command = ' '.join(sys.argv)
                     command = ' '.join(sys.argv)
                     ts = datetime.now(timezone.utc).strftime('%Y-%m-%d__%H:%M:%S')
                     ts = datetime.now(timezone.utc).strftime('%Y-%m-%d__%H:%M:%S')
                     f.write(("\n" + 'Exception in archive_methods.save_{}(Link(url={})) command={}; ts={}'.format(
                     f.write(("\n" + 'Exception in archive_methods.save_{}(Link(url={})) command={}; ts={}'.format(

+ 3 - 2
archivebox/extractors/htmltotext.py

@@ -1,5 +1,7 @@
 __package__ = 'archivebox.extractors'
 __package__ = 'archivebox.extractors'
 
 
+import archivebox
+
 from html.parser import HTMLParser
 from html.parser import HTMLParser
 import io
 import io
 from pathlib import Path
 from pathlib import Path
@@ -8,7 +10,6 @@ from typing import Optional
 from ..config import (
 from ..config import (
     SAVE_HTMLTOTEXT,
     SAVE_HTMLTOTEXT,
     TIMEOUT,
     TIMEOUT,
-    VERSION,
 )
 )
 from ..index.schema import Link, ArchiveResult, ArchiveError
 from ..index.schema import Link, ArchiveResult, ArchiveError
 from ..logging_util import TimedProgress
 from ..logging_util import TimedProgress
@@ -153,7 +154,7 @@ def save_htmltotext(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
     return ArchiveResult(
     return ArchiveResult(
         cmd=cmd,
         cmd=cmd,
         pwd=str(out_dir),
         pwd=str(out_dir),
-        cmd_version=VERSION,
+        cmd_version=archivebox.__version__,
         output=output,
         output=output,
         status=status,
         status=status,
         index_texts=[extracted_text] if extracted_text else [],
         index_texts=[extracted_text] if extracted_text else [],

+ 27 - 29
archivebox/extractors/readability.py

@@ -8,17 +8,7 @@ import json
 
 
 from ..index.schema import Link, ArchiveResult, ArchiveError
 from ..index.schema import Link, ArchiveResult, ArchiveError
 from ..system import run, atomic_write
 from ..system import run, atomic_write
-from ..util import (
-    enforce_types,
-    is_static_file,
-)
-from ..config import (
-    TIMEOUT,
-    CURL_BINARY,
-    SAVE_READABILITY,
-    DEPENDENCIES,
-    READABILITY_VERSION,
-)
+from ..util import enforce_types, is_static_file
 from ..logging_util import TimedProgress
 from ..logging_util import TimedProgress
 from .title import get_html
 from .title import get_html
 
 
@@ -31,22 +21,29 @@ def get_embed_path(archiveresult=None):
 
 
 @enforce_types
 @enforce_types
 def should_save_readability(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
 def should_save_readability(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
+    from plugins_extractor.readability.apps import READABILITY_CONFIG
+    
     if is_static_file(link.url):
     if is_static_file(link.url):
         return False
         return False
 
 
-    out_dir = out_dir or Path(link.link_dir)
-    if not overwrite and (out_dir / get_output_path()).exists():
+    output_subdir = (Path(out_dir or link.link_dir) / get_output_path())
+    if not overwrite and output_subdir.exists():
         return False
         return False
 
 
-    return SAVE_READABILITY
+    return READABILITY_CONFIG.SAVE_READABILITY
 
 
 
 
 @enforce_types
 @enforce_types
-def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=0) -> ArchiveResult:
     """download reader friendly version using @mozilla/readability"""
     """download reader friendly version using @mozilla/readability"""
-
-    out_dir = Path(out_dir or link.link_dir)
-    output_folder = out_dir.absolute() / get_output_path()
+    
+    from plugins_extractor.readability.apps import READABILITY_CONFIG, READABILITY_BINARY
+    
+    READABILITY_BIN = READABILITY_BINARY.load()
+    assert READABILITY_BIN.abspath and READABILITY_BIN.version
+
+    timeout = timeout or READABILITY_CONFIG.READABILITY_TIMEOUT
+    output_subdir = Path(out_dir or link.link_dir).absolute() / get_output_path()
     output = get_output_path()
     output = get_output_path()
 
 
     # Readability Docs: https://github.com/mozilla/readability
     # Readability Docs: https://github.com/mozilla/readability
@@ -54,13 +51,14 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
     status = 'succeeded'
     status = 'succeeded'
     # fake command to show the user so they have something to try debugging if get_html fails
     # fake command to show the user so they have something to try debugging if get_html fails
     cmd = [
     cmd = [
-        CURL_BINARY,
-        link.url
+        str(READABILITY_BIN.abspath),
+        '{dom,singlefile}.html',
+        link.url,
     ]
     ]
     readability_content = None
     readability_content = None
     timer = TimedProgress(timeout, prefix='      ')
     timer = TimedProgress(timeout, prefix='      ')
     try:
     try:
-        document = get_html(link, out_dir)
+        document = get_html(link, Path(out_dir or link.link_dir))
         temp_doc = NamedTemporaryFile(delete=False)
         temp_doc = NamedTemporaryFile(delete=False)
         temp_doc.write(document.encode("utf-8"))
         temp_doc.write(document.encode("utf-8"))
         temp_doc.close()
         temp_doc.close()
@@ -69,26 +67,26 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
             raise ArchiveError('Readability could not find HTML to parse for article text')
             raise ArchiveError('Readability could not find HTML to parse for article text')
 
 
         cmd = [
         cmd = [
-            DEPENDENCIES['READABILITY_BINARY']['path'],
+            str(READABILITY_BIN.abspath),
             temp_doc.name,
             temp_doc.name,
             link.url,
             link.url,
         ]
         ]
-        result = run(cmd, cwd=out_dir, timeout=timeout)
+        result = run(cmd, cwd=out_dir, timeout=timeout, text=True)
         try:
         try:
             result_json = json.loads(result.stdout)
             result_json = json.loads(result.stdout)
             assert result_json and 'content' in result_json, 'Readability output is not valid JSON'
             assert result_json and 'content' in result_json, 'Readability output is not valid JSON'
         except json.JSONDecodeError:
         except json.JSONDecodeError:
             raise ArchiveError('Readability was not able to archive the page (invalid JSON)', result.stdout + result.stderr)
             raise ArchiveError('Readability was not able to archive the page (invalid JSON)', result.stdout + result.stderr)
 
 
-        output_folder.mkdir(exist_ok=True)
+        output_subdir.mkdir(exist_ok=True)
         readability_content = result_json.pop("textContent") 
         readability_content = result_json.pop("textContent") 
-        atomic_write(str(output_folder / "content.html"), result_json.pop("content"))
-        atomic_write(str(output_folder / "content.txt"), readability_content)
-        atomic_write(str(output_folder / "article.json"), result_json)
+        atomic_write(str(output_subdir / "content.html"), result_json.pop("content"))
+        atomic_write(str(output_subdir / "content.txt"), readability_content)
+        atomic_write(str(output_subdir / "article.json"), result_json)
 
 
         output_tail = [
         output_tail = [
             line.strip()
             line.strip()
-            for line in (result.stdout + result.stderr).decode().rsplit('\n', 5)[-5:]
+            for line in (result.stdout + result.stderr).rsplit('\n', 5)[-5:]
             if line.strip()
             if line.strip()
         ]
         ]
         hints = (
         hints = (
@@ -111,7 +109,7 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
     return ArchiveResult(
     return ArchiveResult(
         cmd=cmd,
         cmd=cmd,
         pwd=str(out_dir),
         pwd=str(out_dir),
-        cmd_version=READABILITY_VERSION,
+        cmd_version=str(READABILITY_BIN.version),
         output=output,
         output=output,
         status=status,
         status=status,
         index_texts=[readability_content] if readability_content else [],
         index_texts=[readability_content] if readability_content else [],
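A condensed sketch of the plugin-binary pattern this extractor switches to (the import names are the ones used above; the invocation is hypothetical): load the Binary once, then read its abspath/version instead of the old DEPENDENCIES/READABILITY_VERSION globals.

from plugins_extractor.readability.apps import READABILITY_CONFIG, READABILITY_BINARY

readability = READABILITY_BINARY.load()            # resolves abspath + version via pydantic_pkgr
assert readability.abspath and readability.version

if READABILITY_CONFIG.SAVE_READABILITY:
    # hypothetical invocation mirroring the cmd built in save_readability() above
    cmd = [str(readability.abspath), '/tmp/page.html', 'https://example.com']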

+ 33 - 34
archivebox/index/__init__.py

@@ -11,20 +11,19 @@ from contextlib import contextmanager
 from urllib.parse import urlparse
 from urllib.parse import urlparse
 from django.db.models import QuerySet, Q
 from django.db.models import QuerySet, Q
 
 
+
+import archivebox
+
 from ..util import (
 from ..util import (
     scheme,
     scheme,
     enforce_types,
     enforce_types,
     ExtendedEncoder,
     ExtendedEncoder,
 )
 )
+from ..misc.logging import stderr
 from ..config import (
 from ..config import (
-    ARCHIVE_DIR_NAME,
-    SQL_INDEX_FILENAME,
-    JSON_INDEX_FILENAME,
-    OUTPUT_DIR,
     TIMEOUT,
     TIMEOUT,
     URL_DENYLIST_PTN,
     URL_DENYLIST_PTN,
     URL_ALLOWLIST_PTN,
     URL_ALLOWLIST_PTN,
-    stderr,
     OUTPUT_PERMISSIONS
     OUTPUT_PERMISSIONS
 )
 )
 from ..logging_util import (
 from ..logging_util import (
@@ -224,28 +223,28 @@ def timed_index_update(out_path: Path):
 
 
 
 
 @enforce_types
 @enforce_types
-def write_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR, created_by_id: int | None=None) -> None:
+def write_main_index(links: List[Link], out_dir: Path=archivebox.DATA_DIR, created_by_id: int | None=None) -> None:
     """Writes links to sqlite3 file for a given list of links"""
     """Writes links to sqlite3 file for a given list of links"""
 
 
     log_indexing_process_started(len(links))
     log_indexing_process_started(len(links))
 
 
     try:
     try:
-        with timed_index_update(out_dir / SQL_INDEX_FILENAME):
+        with timed_index_update(archivebox.CONSTANTS.DATABASE_FILE):
             write_sql_main_index(links, out_dir=out_dir, created_by_id=created_by_id)
             write_sql_main_index(links, out_dir=out_dir, created_by_id=created_by_id)
-            os.chmod(out_dir / SQL_INDEX_FILENAME, int(OUTPUT_PERMISSIONS, base=8)) # set here because we don't write it with atomic writes
+            os.chmod(archivebox.CONSTANTS.DATABASE_FILE, int(OUTPUT_PERMISSIONS, base=8)) # set here because we don't write it with atomic writes
 
 
     except (KeyboardInterrupt, SystemExit):
     except (KeyboardInterrupt, SystemExit):
         stderr('[!] Warning: Still writing index to disk...', color='lightyellow')
         stderr('[!] Warning: Still writing index to disk...', color='lightyellow')
         stderr('    Run archivebox init to fix any inconsistencies from an ungraceful exit.')
         stderr('    Run archivebox init to fix any inconsistencies from an ungraceful exit.')
-        with timed_index_update(out_dir / SQL_INDEX_FILENAME):
+        with timed_index_update(archivebox.CONSTANTS.DATABASE_FILE):
             write_sql_main_index(links, out_dir=out_dir, created_by_id=created_by_id)
             write_sql_main_index(links, out_dir=out_dir, created_by_id=created_by_id)
-            os.chmod(out_dir / SQL_INDEX_FILENAME, int(OUTPUT_PERMISSIONS, base=8)) # set here because we don't write it with atomic writes
+            os.chmod(archivebox.CONSTANTS.DATABASE_FILE, int(OUTPUT_PERMISSIONS, base=8)) # set here because we don't write it with atomic writes
         raise SystemExit(0)
         raise SystemExit(0)
 
 
     log_indexing_process_finished()
     log_indexing_process_finished()
 
 
 @enforce_types
 @enforce_types
-def load_main_index(out_dir: Path=OUTPUT_DIR, warn: bool=True) -> List[Link]:
+def load_main_index(out_dir: Path=archivebox.DATA_DIR, warn: bool=True) -> List[Link]:
     """parse and load existing index with any new links from import_path merged in"""
     """parse and load existing index with any new links from import_path merged in"""
     from core.models import Snapshot
     from core.models import Snapshot
     try:
     try:
@@ -255,8 +254,8 @@ def load_main_index(out_dir: Path=OUTPUT_DIR, warn: bool=True) -> List[Link]:
         raise SystemExit(0)
         raise SystemExit(0)
 
 
 @enforce_types
 @enforce_types
-def load_main_index_meta(out_dir: Path=OUTPUT_DIR) -> Optional[dict]:
-    index_path = out_dir / JSON_INDEX_FILENAME
+def load_main_index_meta(out_dir: Path=archivebox.DATA_DIR) -> Optional[dict]:
+    index_path = out_dir / archivebox.CONSTANTS.JSON_INDEX_FILENAME
     if index_path.exists():
     if index_path.exists():
         with open(index_path, 'r', encoding='utf-8') as f:
         with open(index_path, 'r', encoding='utf-8') as f:
             meta_dict = pyjson.load(f)
             meta_dict = pyjson.load(f)
@@ -407,7 +406,7 @@ def snapshot_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type
         return search_filter(snapshots, filter_patterns, filter_type)
         return search_filter(snapshots, filter_patterns, filter_type)
 
 
 
 
-def get_indexed_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_indexed_folders(snapshots, out_dir: Path=archivebox.DATA_DIR) -> Dict[str, Optional[Link]]:
     """indexed links without checking archive status or data directory validity"""
     """indexed links without checking archive status or data directory validity"""
     links = (snapshot.as_link() for snapshot in snapshots.iterator(chunk_size=500))
     links = (snapshot.as_link() for snapshot in snapshots.iterator(chunk_size=500))
     return {
     return {
@@ -415,7 +414,7 @@ def get_indexed_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Option
         for link in links
         for link in links
     }
     }
 
 
-def get_archived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_archived_folders(snapshots, out_dir: Path=archivebox.DATA_DIR) -> Dict[str, Optional[Link]]:
     """indexed links that are archived with a valid data directory"""
     """indexed links that are archived with a valid data directory"""
     links = (snapshot.as_link() for snapshot in snapshots.iterator(chunk_size=500))
     links = (snapshot.as_link() for snapshot in snapshots.iterator(chunk_size=500))
     return {
     return {
@@ -423,7 +422,7 @@ def get_archived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optio
         for link in filter(is_archived, links)
         for link in filter(is_archived, links)
     }
     }
 
 
-def get_unarchived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_unarchived_folders(snapshots, out_dir: Path=archivebox.DATA_DIR) -> Dict[str, Optional[Link]]:
     """indexed links that are unarchived with no data directory or an empty data directory"""
     """indexed links that are unarchived with no data directory or an empty data directory"""
     links = (snapshot.as_link() for snapshot in snapshots.iterator(chunk_size=500))
     links = (snapshot.as_link() for snapshot in snapshots.iterator(chunk_size=500))
     return {
     return {
@@ -431,12 +430,12 @@ def get_unarchived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Opt
         for link in filter(is_unarchived, links)
         for link in filter(is_unarchived, links)
     }
     }
 
 
-def get_present_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_present_folders(snapshots, out_dir: Path=archivebox.DATA_DIR) -> Dict[str, Optional[Link]]:
     """dirs that actually exist in the archive/ folder"""
     """dirs that actually exist in the archive/ folder"""
 
 
     all_folders = {}
     all_folders = {}
 
 
-    for entry in (out_dir / ARCHIVE_DIR_NAME).iterdir():
+    for entry in (out_dir / archivebox.CONSTANTS.ARCHIVE_DIR_NAME).iterdir():
         if entry.is_dir():
         if entry.is_dir():
             link = None
             link = None
             try:
             try:
@@ -448,7 +447,7 @@ def get_present_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Option
 
 
     return all_folders
     return all_folders
 
 
-def get_valid_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_valid_folders(snapshots, out_dir: Path=archivebox.DATA_DIR) -> Dict[str, Optional[Link]]:
     """dirs with a valid index matched to the main index and archived content"""
     """dirs with a valid index matched to the main index and archived content"""
     links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator(chunk_size=500)]
     links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator(chunk_size=500)]
     return {
     return {
@@ -456,16 +455,16 @@ def get_valid_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional
         for link in filter(is_valid, links)
         for link in filter(is_valid, links)
     }
     }
 
 
-def get_invalid_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_invalid_folders(snapshots, out_dir: Path=archivebox.DATA_DIR) -> Dict[str, Optional[Link]]:
     """dirs that are invalid for any reason: corrupted/duplicate/orphaned/unrecognized"""
     """dirs that are invalid for any reason: corrupted/duplicate/orphaned/unrecognized"""
-    duplicate = get_duplicate_folders(snapshots, out_dir=OUTPUT_DIR)
-    orphaned = get_orphaned_folders(snapshots, out_dir=OUTPUT_DIR)
-    corrupted = get_corrupted_folders(snapshots, out_dir=OUTPUT_DIR)
-    unrecognized = get_unrecognized_folders(snapshots, out_dir=OUTPUT_DIR)
+    duplicate = get_duplicate_folders(snapshots, out_dir=out_dir)
+    orphaned = get_orphaned_folders(snapshots, out_dir=out_dir)
+    corrupted = get_corrupted_folders(snapshots, out_dir=out_dir)
+    unrecognized = get_unrecognized_folders(snapshots, out_dir=out_dir)
     return {**duplicate, **orphaned, **corrupted, **unrecognized}
     return {**duplicate, **orphaned, **corrupted, **unrecognized}
 
 
 
 
-def get_duplicate_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_duplicate_folders(snapshots, out_dir: Path=archivebox.DATA_DIR) -> Dict[str, Optional[Link]]:
     """dirs that conflict with other directories that have the same link URL or timestamp"""
     """dirs that conflict with other directories that have the same link URL or timestamp"""
     by_url = {}
     by_url = {}
     by_timestamp = {}
     by_timestamp = {}
@@ -473,7 +472,7 @@ def get_duplicate_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Opti
 
 
     data_folders = (
     data_folders = (
         str(entry)
         str(entry)
-        for entry in (Path(out_dir) / ARCHIVE_DIR_NAME).iterdir()
+        for entry in archivebox.CONSTANTS.ARCHIVE_DIR.iterdir()
             if entry.is_dir() and not snapshots.filter(timestamp=entry.name).exists()
             if entry.is_dir() and not snapshots.filter(timestamp=entry.name).exists()
     )
     )
 
 
@@ -499,11 +498,11 @@ def get_duplicate_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Opti
                 duplicate_folders[path] = link
                 duplicate_folders[path] = link
     return duplicate_folders
     return duplicate_folders
 
 
-def get_orphaned_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_orphaned_folders(snapshots, out_dir: Path=archivebox.DATA_DIR) -> Dict[str, Optional[Link]]:
     """dirs that contain a valid index but aren't listed in the main index"""
     """dirs that contain a valid index but aren't listed in the main index"""
     orphaned_folders = {}
     orphaned_folders = {}
 
 
-    for entry in (Path(out_dir) / ARCHIVE_DIR_NAME).iterdir():
+    for entry in archivebox.CONSTANTS.ARCHIVE_DIR.iterdir():
         if entry.is_dir():
         if entry.is_dir():
             link = None
             link = None
             try:
             try:
@@ -517,7 +516,7 @@ def get_orphaned_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optio
 
 
     return orphaned_folders
     return orphaned_folders
 
 
-def get_corrupted_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_corrupted_folders(snapshots, out_dir: Path=archivebox.DATA_DIR) -> Dict[str, Optional[Link]]:
     """dirs that don't contain a valid index and aren't listed in the main index"""
     """dirs that don't contain a valid index and aren't listed in the main index"""
     corrupted = {}
     corrupted = {}
     for snapshot in snapshots.iterator(chunk_size=500):
     for snapshot in snapshots.iterator(chunk_size=500):
@@ -526,11 +525,11 @@ def get_corrupted_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Opti
             corrupted[link.link_dir] = link
             corrupted[link.link_dir] = link
     return corrupted
     return corrupted
 
 
-def get_unrecognized_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_unrecognized_folders(snapshots, out_dir: Path=archivebox.DATA_DIR) -> Dict[str, Optional[Link]]:
     """dirs that don't contain recognizable archive data and aren't listed in the main index"""
     """dirs that don't contain recognizable archive data and aren't listed in the main index"""
     unrecognized_folders: Dict[str, Optional[Link]] = {}
     unrecognized_folders: Dict[str, Optional[Link]] = {}
 
 
-    for entry in (Path(out_dir) / ARCHIVE_DIR_NAME).iterdir():
+    for entry in (Path(out_dir) / archivebox.CONSTANTS.ARCHIVE_DIR_NAME).iterdir():
         if entry.is_dir():
         if entry.is_dir():
             index_exists = (entry / "index.json").exists()
             index_exists = (entry / "index.json").exists()
             link = None
             link = None
@@ -595,10 +594,10 @@ def is_unarchived(link: Link) -> bool:
     return not link.is_archived
     return not link.is_archived
 
 
 
 
-def fix_invalid_folder_locations(out_dir: Path=OUTPUT_DIR) -> Tuple[List[str], List[str]]:
+def fix_invalid_folder_locations(out_dir: Path=archivebox.DATA_DIR) -> Tuple[List[str], List[str]]:
     fixed = []
     fixed = []
     cant_fix = []
     cant_fix = []
-    for entry in os.scandir(out_dir / ARCHIVE_DIR_NAME):
+    for entry in os.scandir(out_dir / archivebox.CONSTANTS.ARCHIVE_DIR_NAME):
         if entry.is_dir(follow_symlinks=True):
         if entry.is_dir(follow_symlinks=True):
             if (Path(entry.path) / 'index.json').exists():
             if (Path(entry.path) / 'index.json').exists():
                 try:
                 try:
@@ -609,7 +608,7 @@ def fix_invalid_folder_locations(out_dir: Path=OUTPUT_DIR) -> Tuple[List[str], L
                     continue
                     continue
 
 
                 if not entry.path.endswith(f'/{link.timestamp}'):
                 if not entry.path.endswith(f'/{link.timestamp}'):
-                    dest = out_dir / ARCHIVE_DIR_NAME / link.timestamp
+                    dest = out_dir / archivebox.CONSTANTS.ARCHIVE_DIR_NAME / link.timestamp
                     if dest.exists():
                     if dest.exists():
                         cant_fix.append(entry.path)
                         cant_fix.append(entry.path)
                     else:
                     else:
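The folder-status helpers above all share the same traversal; a minimal sketch (illustrative helper, not part of the diff) of walking CONSTANTS.ARCHIVE_DIR the way get_present_folders()/get_orphaned_folders() now do:

import archivebox

def list_snapshot_dirs() -> list[str]:
    # timestamp-named snapshot folders under data/archive/ (assumes the dir exists)
    return sorted(
        entry.name
        for entry in archivebox.CONSTANTS.ARCHIVE_DIR.iterdir()
        if entry.is_dir()
    )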

+ 13 - 11
archivebox/index/html.py

@@ -1,11 +1,12 @@
 __package__ = 'archivebox.index'
 __package__ = 'archivebox.index'
 
 
+import archivebox
 from pathlib import Path
 from pathlib import Path
 from datetime import datetime, timezone
 from datetime import datetime, timezone
 from collections import defaultdict
 from collections import defaultdict
 from typing import List, Optional, Iterator, Mapping
 from typing import List, Optional, Iterator, Mapping
 
 
-from django.utils.html import format_html, mark_safe
+from django.utils.html import format_html, mark_safe   # type: ignore
 from django.core.cache import cache
 from django.core.cache import cache
 
 
 from .schema import Link
 from .schema import Link
@@ -19,10 +20,6 @@ from ..util import (
     urldecode,
     urldecode,
 )
 )
 from ..config import (
 from ..config import (
-    OUTPUT_DIR,
-    VERSION,
-    FOOTER_INFO,
-    HTML_INDEX_FILENAME,
     SAVE_ARCHIVE_DOT_ORG,
     SAVE_ARCHIVE_DOT_ORG,
     PREVIEW_ORIGINALS,
     PREVIEW_ORIGINALS,
 )
 )
@@ -36,10 +33,12 @@ TITLE_LOADING_MSG = 'Not yet archived...'
 ### Main Links Index
 ### Main Links Index
 
 
 @enforce_types
 @enforce_types
-def parse_html_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[str]:
+def parse_html_main_index(out_dir: Path=archivebox.DATA_DIR) -> Iterator[str]:
     """parse an archive index html file and return the list of urls"""
     """parse an archive index html file and return the list of urls"""
 
 
-    index_path = Path(out_dir) / HTML_INDEX_FILENAME
+    from plugins_sys.config.constants import CONSTANTS
+
+    index_path = Path(out_dir) / CONSTANTS.HTML_INDEX_FILENAME
     if index_path.exists():
     if index_path.exists():
         with open(index_path, 'r', encoding='utf-8') as f:
         with open(index_path, 'r', encoding='utf-8') as f:
             for line in f:
             for line in f:
@@ -59,14 +58,16 @@ def generate_index_from_links(links: List[Link], with_headers: bool):
 def main_index_template(links: List[Link], template: str=MAIN_INDEX_TEMPLATE) -> str:
 def main_index_template(links: List[Link], template: str=MAIN_INDEX_TEMPLATE) -> str:
     """render the template for the entire main index"""
     """render the template for the entire main index"""
 
 
+    from plugins_sys.config.apps import SHELL_CONFIG, SERVER_CONFIG
+
     return render_django_template(template, {
     return render_django_template(template, {
-        'version': VERSION,
-        'git_sha': VERSION,  # not used anymore, but kept for backwards compatibility
+        'version': archivebox.VERSION,
+        'git_sha': SHELL_CONFIG.COMMIT_HASH or archivebox.VERSION,
         'num_links': str(len(links)),
         'num_links': str(len(links)),
         'date_updated': datetime.now(timezone.utc).strftime('%Y-%m-%d'),
         'date_updated': datetime.now(timezone.utc).strftime('%Y-%m-%d'),
         'time_updated': datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M'),
         'time_updated': datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M'),
         'links': [link._asdict(extended=True) for link in links],
         'links': [link._asdict(extended=True) for link in links],
-        'FOOTER_INFO': FOOTER_INFO,
+        'FOOTER_INFO': SERVER_CONFIG.FOOTER_INFO,
     })
     })
 
 
 
 
@@ -74,10 +75,11 @@ def main_index_template(links: List[Link], template: str=MAIN_INDEX_TEMPLATE) ->
 
 
 @enforce_types
 @enforce_types
 def write_html_link_details(link: Link, out_dir: Optional[str]=None) -> None:
 def write_html_link_details(link: Link, out_dir: Optional[str]=None) -> None:
+    from plugins_sys.config.constants import CONSTANTS
     out_dir = out_dir or link.link_dir
     out_dir = out_dir or link.link_dir
 
 
     rendered_html = link_details_template(link)
     rendered_html = link_details_template(link)
-    atomic_write(str(Path(out_dir) / HTML_INDEX_FILENAME), rendered_html)
+    atomic_write(str(Path(out_dir) / CONSTANTS.HTML_INDEX_FILENAME), rendered_html)
 
 
 
 
 @enforce_types
 @enforce_types

+ 40 - 34
archivebox/index/json.py

@@ -8,38 +8,36 @@ from pathlib import Path
 from datetime import datetime, timezone
 from datetime import datetime, timezone
 from typing import List, Optional, Iterator, Any, Union
 from typing import List, Optional, Iterator, Any, Union
 
 
+import archivebox
+
 from .schema import Link
 from .schema import Link
 from ..system import atomic_write
 from ..system import atomic_write
 from ..util import enforce_types
 from ..util import enforce_types
-from ..config import (
-    VERSION,
-    OUTPUT_DIR,
-    FOOTER_INFO,
-    DEPENDENCIES,
-    JSON_INDEX_FILENAME,
-    ARCHIVE_DIR_NAME,
-    ANSI
-)
-
-
-MAIN_INDEX_HEADER = {
-    'info': 'This is an index of site data archived by ArchiveBox: The self-hosted web archive.',
-    'schema': 'archivebox.index.json',
-    'copyright_info': FOOTER_INFO,
-    'meta': {
-        'project': 'ArchiveBox',
-        'version': VERSION,
-        'git_sha': VERSION,  # not used anymore, but kept for backwards compatibility
-        'website': 'https://ArchiveBox.io',
-        'docs': 'https://github.com/ArchiveBox/ArchiveBox/wiki',
-        'source': 'https://github.com/ArchiveBox/ArchiveBox',
-        'issues': 'https://github.com/ArchiveBox/ArchiveBox/issues',
-        'dependencies': DEPENDENCIES,
-    },
-}
+
+
 
 
 @enforce_types
 @enforce_types
 def generate_json_index_from_links(links: List[Link], with_headers: bool):
 def generate_json_index_from_links(links: List[Link], with_headers: bool):
+    from django.conf import settings
+    from plugins_sys.config.apps import SERVER_CONFIG
+    
+    MAIN_INDEX_HEADER = {
+        'info': 'This is an index of site data archived by ArchiveBox: The self-hosted web archive.',
+        'schema': 'archivebox.index.json',
+        'copyright_info': SERVER_CONFIG.FOOTER_INFO,
+        'meta': {
+            'project': 'ArchiveBox',
+            'version': archivebox.VERSION,
+            'git_sha': archivebox.VERSION,  # not used anymore, but kept for backwards compatibility
+            'website': 'https://ArchiveBox.io',
+            'docs': 'https://github.com/ArchiveBox/ArchiveBox/wiki',
+            'source': 'https://github.com/ArchiveBox/ArchiveBox',
+            'issues': 'https://github.com/ArchiveBox/ArchiveBox/issues',
+            'dependencies': settings.BINARIES.to_dict(),
+        },
+    }
+    
+    
     if with_headers:
     if with_headers:
         output = {
         output = {
             **MAIN_INDEX_HEADER,
             **MAIN_INDEX_HEADER,
@@ -54,10 +52,12 @@ def generate_json_index_from_links(links: List[Link], with_headers: bool):
 
 
 
 
 @enforce_types
 @enforce_types
-def parse_json_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[Link]:
+def parse_json_main_index(out_dir: Path=archivebox.DATA_DIR) -> Iterator[Link]:
     """parse an archive index json file and return the list of links"""
     """parse an archive index json file and return the list of links"""
 
 
-    index_path = Path(out_dir) / JSON_INDEX_FILENAME
+    from plugins_sys.config.constants import CONSTANTS
+
+    index_path = Path(out_dir) / CONSTANTS.JSON_INDEX_FILENAME
     if index_path.exists():
     if index_path.exists():
         with open(index_path, 'r', encoding='utf-8') as f:
         with open(index_path, 'r', encoding='utf-8') as f:
             try:
             try:
@@ -77,14 +77,14 @@ def parse_json_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[Link]:
                     yield Link.from_json(link_json)
                     yield Link.from_json(link_json)
                 except KeyError:
                 except KeyError:
                     try:
                     try:
-                        detail_index_path = Path(OUTPUT_DIR) / ARCHIVE_DIR_NAME / link_json['timestamp']
+                        detail_index_path = CONSTANTS.ARCHIVE_DIR / link_json['timestamp']
                         yield parse_json_link_details(str(detail_index_path))
                         yield parse_json_link_details(str(detail_index_path))
                     except KeyError: 
                     except KeyError: 
                         # as a last effort, try to guess the missing values out of existing ones
                         # as a last effort, try to guess the missing values out of existing ones
                         try:
                         try:
                             yield Link.from_json(link_json, guess=True)
                             yield Link.from_json(link_json, guess=True)
                         except KeyError:
                         except KeyError:
-                            print("    {lightyellow}! Failed to load the index.json from {}".format(detail_index_path, **ANSI))
+                            # print("    {lightyellow}! Failed to load the index.json from {}".format(detail_index_path, **ANSI))
                             continue
                             continue
     return ()
     return ()
 
 
@@ -94,15 +94,19 @@ def parse_json_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[Link]:
 def write_json_link_details(link: Link, out_dir: Optional[str]=None) -> None:
 def write_json_link_details(link: Link, out_dir: Optional[str]=None) -> None:
     """write a json file with some info about the link"""
     """write a json file with some info about the link"""
     
     
+    from plugins_sys.config.constants import CONSTANTS
+    
     out_dir = out_dir or link.link_dir
     out_dir = out_dir or link.link_dir
-    path = Path(out_dir) / JSON_INDEX_FILENAME
+    path = Path(out_dir) / CONSTANTS.JSON_INDEX_FILENAME
     atomic_write(str(path), link._asdict(extended=True))
     atomic_write(str(path), link._asdict(extended=True))
 
 
 
 
 @enforce_types
 @enforce_types
-def parse_json_link_details(out_dir: Union[Path, str], guess: Optional[bool]=False) -> Optional[Link]:
+def parse_json_link_details(out_dir: Union[Path, str], guess: bool=False) -> Optional[Link]:
     """load the json link index from a given directory"""
     """load the json link index from a given directory"""
-    existing_index = Path(out_dir) / JSON_INDEX_FILENAME
+    from plugins_sys.config.constants import CONSTANTS
+    
+    existing_index = Path(out_dir) / CONSTANTS.JSON_INDEX_FILENAME
     if existing_index.exists():
     if existing_index.exists():
         with open(existing_index, 'r', encoding='utf-8') as f:
         with open(existing_index, 'r', encoding='utf-8') as f:
             try:
             try:
@@ -117,7 +121,9 @@ def parse_json_link_details(out_dir: Union[Path, str], guess: Optional[bool]=Fal
 def parse_json_links_details(out_dir: Union[Path, str]) -> Iterator[Link]:
 def parse_json_links_details(out_dir: Union[Path, str]) -> Iterator[Link]:
     """read through all the archive data folders and return the parsed links"""
     """read through all the archive data folders and return the parsed links"""
 
 
-    for entry in os.scandir(Path(out_dir) / ARCHIVE_DIR_NAME):
+    from plugins_sys.config.constants import CONSTANTS
+
+    for entry in os.scandir(CONSTANTS.ARCHIVE_DIR):
         if entry.is_dir(follow_symlinks=True):
         if entry.is_dir(follow_symlinks=True):
             if (Path(entry.path) / 'index.json').exists():
             if (Path(entry.path) / 'index.json').exists():
                 try:
                 try:

+ 29 - 31
archivebox/logging_util.py

@@ -4,8 +4,11 @@ import re
 import os
 import os
 import sys
 import sys
 import stat
 import stat
+import shutil
 import time
 import time
 import argparse
 import argparse
+import archivebox
+
 from math import log
 from math import log
 from multiprocessing import Process
 from multiprocessing import Process
 from pathlib import Path
 from pathlib import Path
@@ -22,18 +25,7 @@ from rich.panel import Panel
 
 
 from .system import get_dir_size
 from .system import get_dir_size
 from .util import enforce_types
 from .util import enforce_types
-from .config import (
-    ConfigDict,
-    OUTPUT_DIR,
-    VERSION,
-    ANSI,
-    IS_TTY,
-    IN_DOCKER,
-    TERM_WIDTH,
-    SHOW_PROGRESS,
-    SOURCES_DIR_NAME,
-    stderr,
-)
+from .misc.logging import ANSI, stderr
 
 
 @dataclass
 @dataclass
 class RuntimeStats:
 class RuntimeStats:
@@ -102,7 +94,7 @@ def reject_stdin(caller: str, stdin: Optional[IO]=sys.stdin) -> None:
     if not stdin:
     if not stdin:
         return None
         return None
 
 
-    if IN_DOCKER:
+    if os.environ.get('IN_DOCKER') in ('1', 'true', 'True', 'TRUE', 'yes'):
         # when TTY is disabled in docker we cant tell if stdin is being piped in or not
         # when TTY is disabled in docker we cant tell if stdin is being piped in or not
         # if we try to read stdin when its not piped we will hang indefinitely waiting for it
         # if we try to read stdin when its not piped we will hang indefinitely waiting for it
         return None
         return None
@@ -141,9 +133,14 @@ class TimedProgress:
 
 
     def __init__(self, seconds, prefix=''):
     def __init__(self, seconds, prefix=''):
 
 
-        self.SHOW_PROGRESS = SHOW_PROGRESS
+        from plugins_sys.config.apps import SHELL_CONFIG
+
+        self.SHOW_PROGRESS = SHELL_CONFIG.SHOW_PROGRESS
+        self.ANSI = SHELL_CONFIG.ANSI
+        self.TERM_WIDTH = lambda: shutil.get_terminal_size().columns      # lambda so it live-updates when terminal is resized
+        
         if self.SHOW_PROGRESS:
         if self.SHOW_PROGRESS:
-            self.p = Process(target=progress_bar, args=(seconds, prefix))
+            self.p = Process(target=progress_bar, args=(seconds, prefix, self.ANSI))
             self.p.start()
             self.p.start()
 
 
         self.stats = {'start_ts': datetime.now(timezone.utc), 'end_ts': None}
         self.stats = {'start_ts': datetime.now(timezone.utc), 'end_ts': None}
@@ -172,7 +169,7 @@ class TimedProgress:
 
 
                 # clear whole terminal line
                 # clear whole terminal line
                 try:
                 try:
-                    sys.stdout.write('\r{}{}\r'.format((' ' * TERM_WIDTH()), ANSI['reset']))
+                    sys.stdout.write('\r{}{}\r'.format((' ' * self.TERM_WIDTH()), self.ANSI['reset']))
                 except (IOError, BrokenPipeError):
                 except (IOError, BrokenPipeError):
                     # ignore when the parent proc has stopped listening to our stdout
                     # ignore when the parent proc has stopped listening to our stdout
                     pass
                     pass
@@ -181,9 +178,10 @@ class TimedProgress:
 
 
 
 
 @enforce_types
 @enforce_types
-def progress_bar(seconds: int, prefix: str='') -> None:
+def progress_bar(seconds: int, prefix: str='', ANSI: Dict[str, str]=ANSI) -> None:
     """show timer in the form of progress bar, with percentage and seconds remaining"""
     """show timer in the form of progress bar, with percentage and seconds remaining"""
-    chunk = '█' if (sys.stdout or sys.__stdout__).encoding.upper() == 'UTF-8' else '#'
+    output_buf = (sys.stdout or sys.__stdout__ or sys.stderr or sys.__stderr__)
+    chunk = '█' if output_buf and output_buf.encoding.upper() == 'UTF-8' else '#'
     last_width = TERM_WIDTH()
     last_width = TERM_WIDTH()
     chunks = last_width - len(prefix) - 20  # number of progress chunks to show (aka max bar width)
     chunks = last_width - len(prefix) - 20  # number of progress chunks to show (aka max bar width)
     try:
     try:
@@ -236,18 +234,15 @@ def log_cli_command(subcommand: str, subcommand_args: List[str], stdin: Optional
     args = ' '.join(subcommand_args)
     args = ' '.join(subcommand_args)
     version_msg = '[dark_magenta]\\[i] [{now}] ArchiveBox v{VERSION}: [/dark_magenta][green4]archivebox [green3]{subcommand}[green2] {args}[/green2]'.format(
     version_msg = '[dark_magenta]\\[i] [{now}] ArchiveBox v{VERSION}: [/dark_magenta][green4]archivebox [green3]{subcommand}[green2] {args}[/green2]'.format(
         now=datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S'),
         now=datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S'),
-        VERSION=VERSION,
+        VERSION=archivebox.__version__,
         subcommand=subcommand,
         subcommand=subcommand,
         args=args,
         args=args,
     )
     )
     # stderr()
     # stderr()
     # stderr('[bright_black]    > {pwd}[/]'.format(pwd=pwd, **ANSI))
     # stderr('[bright_black]    > {pwd}[/]'.format(pwd=pwd, **ANSI))
     # stderr()
     # stderr()
-    if SHOW_PROGRESS:
-        print(Panel(version_msg), file=sys.stderr)
-    else:
-        print(version_msg, file=sys.stderr)
-
+    print(Panel(version_msg), file=sys.stderr)
+    
 ### Parsing Stage
 ### Parsing Stage
 
 
 
 
@@ -261,7 +256,8 @@ def log_importing_started(urls: Union[str, List[str]], depth: int, index_only: b
     ))
     ))
 
 
 def log_source_saved(source_file: str):
 def log_source_saved(source_file: str):
-    print('    > Saved verbatim input to {}/{}'.format(SOURCES_DIR_NAME, source_file.rsplit('/', 1)[-1]))
+    from plugins_sys.config.constants import CONSTANTS
+    print('    > Saved verbatim input to {}/{}'.format(CONSTANTS.SOURCES_DIR_NAME, source_file.rsplit('/', 1)[-1]))
 
 
 def log_parsing_finished(num_parsed: int, parser_name: str):
 def log_parsing_finished(num_parsed: int, parser_name: str):
     _LAST_RUN_STATS.parse_end_ts = datetime.now(timezone.utc)
     _LAST_RUN_STATS.parse_end_ts = datetime.now(timezone.utc)
@@ -293,12 +289,14 @@ def log_indexing_process_finished():
 
 
 
 
 def log_indexing_started(out_path: str):
 def log_indexing_started(out_path: str):
-    if IS_TTY:
-        sys.stdout.write(f'    > ./{Path(out_path).relative_to(OUTPUT_DIR)}')
+    from plugins_sys.config.apps import SHELL_CONFIG
+    
+    if SHELL_CONFIG.IS_TTY:
+        sys.stdout.write(f'    > ./{Path(out_path).relative_to(archivebox.DATA_DIR)}')
 
 
 
 
 def log_indexing_finished(out_path: str):
 def log_indexing_finished(out_path: str):
-    print(f'\r    √ ./{Path(out_path).relative_to(OUTPUT_DIR)}')
+    print(f'\r    √ ./{Path(out_path).relative_to(archivebox.DATA_DIR)}')
 
 
 
 
 ### Archiving Stage
 ### Archiving Stage
@@ -447,7 +445,7 @@ def log_archive_method_finished(result: "ArchiveResult"):
             )
             )
 
 
         docker_hints = ()
         docker_hints = ()
-        if IN_DOCKER:
+        if os.environ.get('IN_DOCKER') in ('1', 'true', 'True', 'TRUE', 'yes'):
             docker_hints = (
             docker_hints = (
                 '  docker run -it -v $PWD/data:/data archivebox/archivebox /bin/bash',
                 '  docker run -it -v $PWD/data:/data archivebox/archivebox /bin/bash',
             )
             )
@@ -534,7 +532,7 @@ def log_shell_welcome_msg():
 ### Helpers
 ### Helpers
 
 
 @enforce_types
 @enforce_types
-def pretty_path(path: Union[Path, str], pwd: Union[Path, str]=OUTPUT_DIR) -> str:
+def pretty_path(path: Union[Path, str], pwd: Union[Path, str]=archivebox.DATA_DIR) -> str:
     """convert paths like .../ArchiveBox/archivebox/../output/abc into output/abc"""
     """convert paths like .../ArchiveBox/archivebox/../output/abc into output/abc"""
     pwd = str(Path(pwd))  # .resolve()
     pwd = str(Path(pwd))  # .resolve()
     path = str(path)
     path = str(path)
@@ -577,7 +575,7 @@ def printable_folders(folders: Dict[str, Optional["Link"]],
 
 
 
 
 @enforce_types
 @enforce_types
-def printable_config(config: ConfigDict, prefix: str='') -> str:
+def printable_config(config: dict, prefix: str='') -> str:
     return f'\n{prefix}'.join(
     return f'\n{prefix}'.join(
         f'{key}={val}'
         f'{key}={val}'
         for key, val in config.items()
         for key, val in config.items()
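Two of the substitutions above are easy to miss; a tiny sketch (illustrative, not from the commit) of the replacements for the old TERM_WIDTH and IN_DOCKER config values:

import os
import shutil

# re-evaluated on every call, so it tracks live terminal resizes
TERM_WIDTH = lambda: shutil.get_terminal_size().columns

# plain environment check instead of the old config.IN_DOCKER constant
IN_DOCKER = os.environ.get('IN_DOCKER') in ('1', 'true', 'True', 'TRUE', 'yes')

print(f'width={TERM_WIDTH()} in_docker={IN_DOCKER}')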

+ 50 - 63
archivebox/main.py

@@ -6,6 +6,8 @@ import shutil
 import platform
 import platform
 import archivebox
 import archivebox
 
 
+CONSTANTS = archivebox.CONSTANTS
+
 from typing import Dict, List, Optional, Iterable, IO, Union
 from typing import Dict, List, Optional, Iterable, IO, Union
 from pathlib import Path
 from pathlib import Path
 from datetime import date, datetime
 from datetime import date, datetime
@@ -66,47 +68,25 @@ from .index.html import (
 )
 )
 from .index.csv import links_to_csv
 from .index.csv import links_to_csv
 from .extractors import archive_links, archive_link, ignore_methods
 from .extractors import archive_links, archive_link, ignore_methods
-from .misc.logging import stderr, hint
+from .misc.logging import stderr, hint, ANSI
 from .misc.checks import check_data_folder, check_dependencies
 from .misc.checks import check_data_folder, check_dependencies
 from .config import (
 from .config import (
     setup_django_minimal,
     setup_django_minimal,
     ConfigDict,
     ConfigDict,
-    ANSI,
     IS_TTY,
     IS_TTY,
     DEBUG,
     DEBUG,
     IN_DOCKER,
     IN_DOCKER,
     IN_QEMU,
     IN_QEMU,
     PUID,
     PUID,
     PGID,
     PGID,
-    USER,
     TIMEZONE,
     TIMEZONE,
-    ENFORCE_ATOMIC_WRITES,
-    OUTPUT_PERMISSIONS,
     ONLY_NEW,
     ONLY_NEW,
-    OUTPUT_DIR,
-    SOURCES_DIR,
-    ARCHIVE_DIR,
-    LOGS_DIR,
-    PACKAGE_DIR,
-    CONFIG_FILE,
-    ARCHIVE_DIR_NAME,
     JSON_INDEX_FILENAME,
     JSON_INDEX_FILENAME,
     HTML_INDEX_FILENAME,
     HTML_INDEX_FILENAME,
     SQL_INDEX_FILENAME,
     SQL_INDEX_FILENAME,
-    ALLOWED_IN_OUTPUT_DIR,
     LDAP,
     LDAP,
     write_config_file,
     write_config_file,
-    VERSION,
-    COMMIT_HASH,
-    BUILD_TIME,
-    CODE_LOCATIONS,
-    DATA_LOCATIONS,
     DEPENDENCIES,
     DEPENDENCIES,
-    YOUTUBEDL_BINARY,
-    YOUTUBEDL_VERSION,
-    SINGLEFILE_VERSION,
-    READABILITY_VERSION,
-    MERCURY_VERSION,
     load_all_config,
     load_all_config,
     CONFIG,
     CONFIG,
     USER_CONFIG,
     USER_CONFIG,
@@ -114,7 +94,6 @@ from .config import (
     setup_django,
     setup_django,
 )
 )
 from .logging_util import (
 from .logging_util import (
-    TERM_WIDTH,
     TimedProgress,
     TimedProgress,
     log_importing_started,
     log_importing_started,
     log_crawl_started,
     log_crawl_started,
@@ -129,9 +108,14 @@ from .logging_util import (
     printable_dependency_version,
     printable_dependency_version,
 )
 )
 
 
+VERSION = archivebox.VERSION
+PACKAGE_DIR = archivebox.PACKAGE_DIR
+OUTPUT_DIR = archivebox.DATA_DIR
+ARCHIVE_DIR = archivebox.DATA_DIR / 'archive'
+
 
 
 @enforce_types
 @enforce_types
-def help(out_dir: Path=OUTPUT_DIR) -> None:
+def help(out_dir: Path=archivebox.DATA_DIR) -> None:
     """Print the ArchiveBox help message and usage"""
     """Print the ArchiveBox help message and usage"""
 
 
     all_subcommands = CLI_SUBCOMMANDS
     all_subcommands = CLI_SUBCOMMANDS
@@ -207,7 +191,7 @@ def version(quiet: bool=False,
     """Print the ArchiveBox version and dependency information"""
     """Print the ArchiveBox version and dependency information"""
     
     
     setup_django_minimal()
     setup_django_minimal()
-    from plugins_sys.config.apps import SEARCH_BACKEND_CONFIG, STORAGE_CONFIG, SHELL_CONFIG
+    from plugins_sys.config.apps import SEARCH_BACKEND_CONFIG, STORAGE_CONFIG, SHELL_CONFIG, CONSTANTS
     from plugins_auth.ldap.apps import LDAP_CONFIG
     from plugins_auth.ldap.apps import LDAP_CONFIG
     from django.conf import settings
     from django.conf import settings
     
     
@@ -223,8 +207,8 @@ def version(quiet: bool=False,
         p = platform.uname()
         p = platform.uname()
         print(
         print(
             'ArchiveBox v{}'.format(archivebox.__version__),
             'ArchiveBox v{}'.format(archivebox.__version__),
-            f'COMMIT_HASH={COMMIT_HASH[:7] if COMMIT_HASH else "unknown"}',
-            f'BUILD_TIME={BUILD_TIME}',
+            f'COMMIT_HASH={SHELL_CONFIG.COMMIT_HASH[:7] if SHELL_CONFIG.COMMIT_HASH else "unknown"}',
+            f'BUILD_TIME={SHELL_CONFIG.BUILD_TIME}',
         )
         )
         print(
         print(
             f'IN_DOCKER={IN_DOCKER}',
             f'IN_DOCKER={IN_DOCKER}',
@@ -234,7 +218,7 @@ def version(quiet: bool=False,
             f'PLATFORM={platform.platform()}',
             f'PLATFORM={platform.platform()}',
             f'PYTHON={sys.implementation.name.title()}',
             f'PYTHON={sys.implementation.name.title()}',
         )
         )
-        OUTPUT_IS_REMOTE_FS = DATA_LOCATIONS['OUTPUT_DIR']['is_mount'] or DATA_LOCATIONS['ARCHIVE_DIR']['is_mount']
+        OUTPUT_IS_REMOTE_FS = CONSTANTS.DATA_LOCATIONS['OUTPUT_DIR']['is_mount'] or CONSTANTS.DATA_LOCATIONS['ARCHIVE_DIR']['is_mount']
         print(
         print(
             f'FS_ATOMIC={STORAGE_CONFIG.ENFORCE_ATOMIC_WRITES}',
             f'FS_ATOMIC={STORAGE_CONFIG.ENFORCE_ATOMIC_WRITES}',
             f'FS_REMOTE={OUTPUT_IS_REMOTE_FS}',
             f'FS_REMOTE={OUTPUT_IS_REMOTE_FS}',
@@ -268,17 +252,18 @@ def version(quiet: bool=False,
             except Exception as e:
             except Exception as e:
                 err = e
                 err = e
                 loaded_bin = binary
                 loaded_bin = binary
             print('', '√' if loaded_bin.is_valid else 'X', '', loaded_bin.name.ljust(21), str(loaded_bin.version).ljust(15), loaded_bin.abspath or str(err))
             print('', '√' if loaded_bin.is_valid else 'X', '', loaded_bin.name.ljust(21), str(loaded_bin.version).ljust(15), loaded_bin.abspath or str(err))
    
    
         print()
         print()
         print('{white}[i] Source-code locations:{reset}'.format(**ANSI))
         print('{white}[i] Source-code locations:{reset}'.format(**ANSI))
-        for name, path in CODE_LOCATIONS.items():
+        for name, path in CONSTANTS.CODE_LOCATIONS.items():
             print(printable_folder_status(name, path))
             print(printable_folder_status(name, path))
 
 
         print()
         print()
-        if DATA_LOCATIONS['OUTPUT_DIR']['is_valid']:
+        if CONSTANTS.DATABASE_FILE.exists() or CONSTANTS.ARCHIVE_DIR.exists() or CONSTANTS.CONFIG_FILE.exists():
             print('{white}[i] Data locations:{reset}'.format(**ANSI))
             print('{white}[i] Data locations:{reset}'.format(**ANSI))
-            for name, path in DATA_LOCATIONS.items():
+            for name, path in CONSTANTS.DATA_LOCATIONS.items():
                 print(printable_folder_status(name, path))
                 print(printable_folder_status(name, path))
         else:
         else:
             print()
             print()
@@ -303,19 +288,19 @@ def run(subcommand: str,
 
 
 
 
 @enforce_types
 @enforce_types
-def init(force: bool=False, quick: bool=False, setup: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
+def init(force: bool=False, quick: bool=False, setup: bool=False, out_dir: Path=archivebox.DATA_DIR) -> None:
     """Initialize a new ArchiveBox collection in the current directory"""
     """Initialize a new ArchiveBox collection in the current directory"""
     
     
     from core.models import Snapshot
     from core.models import Snapshot
 
 
     out_dir.mkdir(exist_ok=True)
     out_dir.mkdir(exist_ok=True)
-    is_empty = not len(set(os.listdir(out_dir)) - ALLOWED_IN_OUTPUT_DIR)
+    is_empty = not len(set(os.listdir(out_dir)) - CONSTANTS.ALLOWED_IN_OUTPUT_DIR)
 
 
-    if (out_dir / JSON_INDEX_FILENAME).exists():
+    if (out_dir / archivebox.CONSTANTS.JSON_INDEX_FILENAME).exists():
         stderr("[!] This folder contains a JSON index. It is deprecated, and will no longer be kept up to date automatically.", color="lightyellow")
         stderr("[!] This folder contains a JSON index. It is deprecated, and will no longer be kept up to date automatically.", color="lightyellow")
         stderr("    You can run `archivebox list --json --with-headers > static_index.json` to manually generate it.", color="lightyellow")
         stderr("    You can run `archivebox list --json --with-headers > static_index.json` to manually generate it.", color="lightyellow")
 
 
-    existing_index = (out_dir / SQL_INDEX_FILENAME).exists()
+    existing_index = archivebox.CONSTANTS.DATABASE_FILE.exists()
 
 
     if is_empty and not existing_index:
     if is_empty and not existing_index:
         print('{green}[+] Initializing a new ArchiveBox v{} collection...{reset}'.format(VERSION, **ANSI))
         print('{green}[+] Initializing a new ArchiveBox v{} collection...{reset}'.format(VERSION, **ANSI))
@@ -344,25 +329,24 @@ def init(force: bool=False, quick: bool=False, setup: bool=False, out_dir: Path=
     else:
     else:
         print('\n{green}[+] Building archive folder structure...{reset}'.format(**ANSI))
         print('\n{green}[+] Building archive folder structure...{reset}'.format(**ANSI))
     
     
-    print(f'    + ./{ARCHIVE_DIR.relative_to(OUTPUT_DIR)}, ./{SOURCES_DIR.relative_to(OUTPUT_DIR)}, ./{LOGS_DIR.relative_to(OUTPUT_DIR)}...')
-    Path(SOURCES_DIR).mkdir(exist_ok=True)
-    Path(ARCHIVE_DIR).mkdir(exist_ok=True)
-    Path(LOGS_DIR).mkdir(exist_ok=True)
-    print(f'    + ./{CONFIG_FILE.relative_to(OUTPUT_DIR)}...')
+    print(f'    + ./{CONSTANTS.ARCHIVE_DIR.relative_to(OUTPUT_DIR)}, ./{CONSTANTS.SOURCES_DIR.relative_to(OUTPUT_DIR)}, ./{CONSTANTS.LOGS_DIR.relative_to(OUTPUT_DIR)}...')
+    Path(CONSTANTS.SOURCES_DIR).mkdir(exist_ok=True)
+    Path(CONSTANTS.ARCHIVE_DIR).mkdir(exist_ok=True)
+    Path(CONSTANTS.LOGS_DIR).mkdir(exist_ok=True)
+    print(f'    + ./{CONSTANTS.CONFIG_FILE.relative_to(OUTPUT_DIR)}...')
     write_config_file({}, out_dir=out_dir)
     write_config_file({}, out_dir=out_dir)
 
 
-    if (out_dir / SQL_INDEX_FILENAME).exists():
+    if CONSTANTS.DATABASE_FILE.exists():
         print('\n{green}[*] Verifying main SQL index and running any migrations needed...{reset}'.format(**ANSI))
         print('\n{green}[*] Verifying main SQL index and running any migrations needed...{reset}'.format(**ANSI))
     else:
     else:
         print('\n{green}[+] Building main SQL index and running initial migrations...{reset}'.format(**ANSI))
         print('\n{green}[+] Building main SQL index and running initial migrations...{reset}'.format(**ANSI))
     
     
-    DATABASE_FILE = out_dir / SQL_INDEX_FILENAME
     for migration_line in apply_migrations(out_dir):
     for migration_line in apply_migrations(out_dir):
         print(f'    {migration_line}')
         print(f'    {migration_line}')
 
 
-    assert DATABASE_FILE.exists()
+    assert CONSTANTS.DATABASE_FILE.exists()
     print()
     print()
-    print(f'    √ ./{DATABASE_FILE.relative_to(OUTPUT_DIR)}')
+    print(f'    √ ./{CONSTANTS.DATABASE_FILE.relative_to(OUTPUT_DIR)}')
     
     
     # from django.contrib.auth.models import User
     # from django.contrib.auth.models import User
     # if IS_TTY and not User.objects.filter(is_superuser=True).exists():
     # if IS_TTY and not User.objects.filter(is_superuser=True).exists():
@@ -477,7 +461,7 @@ def status(out_dir: Path=OUTPUT_DIR) -> None:
     check_data_folder(CONFIG)
     check_data_folder(CONFIG)
 
 
     from core.models import Snapshot
     from core.models import Snapshot
-    from django.contrib.auth import get_user_model
+    from django.contrib.auth import get_user_model
+    from plugins_sys.config.apps import SHELL_CONFIG
     User = get_user_model()
     User = get_user_model()
 
 
     print('{green}[*] Scanning archive main index...{reset}'.format(**ANSI))
     print('{green}[*] Scanning archive main index...{reset}'.format(**ANSI))
@@ -491,7 +475,7 @@ def status(out_dir: Path=OUTPUT_DIR) -> None:
     num_sql_links = links.count()
     num_sql_links = links.count()
     num_link_details = sum(1 for link in parse_json_links_details(out_dir=out_dir))
     num_link_details = sum(1 for link in parse_json_links_details(out_dir=out_dir))
     print(f'    > SQL Main Index: {num_sql_links} links'.ljust(36), f'(found in {SQL_INDEX_FILENAME})')
     print(f'    > SQL Main Index: {num_sql_links} links'.ljust(36), f'(found in {SQL_INDEX_FILENAME})')
-    print(f'    > JSON Link Details: {num_link_details} links'.ljust(36), f'(found in {ARCHIVE_DIR_NAME}/*/index.json)')
+    print(f'    > JSON Link Details: {num_link_details} links'.ljust(36), f'(found in {ARCHIVE_DIR.name}/*/index.json)')
     print()
     print()
     print('{green}[*] Scanning archive data directories...{reset}'.format(**ANSI))
     print('{green}[*] Scanning archive data directories...{reset}'.format(**ANSI))
     print(ANSI['lightyellow'], f'   {ARCHIVE_DIR}/*', ANSI['reset'])
     print(ANSI['lightyellow'], f'   {ARCHIVE_DIR}/*', ANSI['reset'])
@@ -539,7 +523,7 @@ def status(out_dir: Path=OUTPUT_DIR) -> None:
     
     
     print()
     print()
     print('{green}[*] Scanning recent archive changes and user logins:{reset}'.format(**ANSI))
     print('{green}[*] Scanning recent archive changes and user logins:{reset}'.format(**ANSI))
-    print(ANSI['lightyellow'], f'   {LOGS_DIR}/*', ANSI['reset'])
+    print(ANSI['lightyellow'], f'   {CONSTANTS.LOGS_DIR}/*', ANSI['reset'])
     users = get_admins().values_list('username', flat=True)
     users = get_admins().values_list('username', flat=True)
     print(f'    UI users {len(users)}: {", ".join(users)}')
     print(f'    UI users {len(users)}: {", ".join(users)}')
     last_login = User.objects.order_by('last_login').last()
     last_login = User.objects.order_by('last_login').last()
@@ -564,7 +548,7 @@ def status(out_dir: Path=OUTPUT_DIR) -> None:
                 f'   > {str(snapshot.downloaded_at)[:16]} '
                 f'   > {str(snapshot.downloaded_at)[:16]} '
                 f'[{snapshot.num_outputs} {("X", "√")[snapshot.is_archived]} {printable_filesize(snapshot.archive_size)}] '
                 f'[{snapshot.num_outputs} {("X", "√")[snapshot.is_archived]} {printable_filesize(snapshot.archive_size)}] '
                 f'"{snapshot.title}": {snapshot.url}'
                 f'"{snapshot.title}": {snapshot.url}'
-            )[:TERM_WIDTH()],
+            )[:SHELL_CONFIG.TERM_WIDTH],
             ANSI['reset'],
             ANSI['reset'],
         )
         )
     print(ANSI['black'], '   ...', ANSI['reset'])
     print(ANSI['black'], '   ...', ANSI['reset'])
@@ -976,7 +960,7 @@ def setup(out_dir: Path=OUTPUT_DIR) -> None:
 
 
     from rich import print
     from rich import print
 
 
-    if not (out_dir / ARCHIVE_DIR_NAME).exists():
+    if not ARCHIVE_DIR.exists():
         run_subcommand('init', stdin=None, pwd=out_dir)
         run_subcommand('init', stdin=None, pwd=out_dir)
 
 
     setup_django(out_dir=out_dir, check_db=True)
     setup_django(out_dir=out_dir, check_db=True)
@@ -992,9 +976,13 @@ def setup(out_dir: Path=OUTPUT_DIR) -> None:
     from plugins_extractor.singlefile.apps import SINGLEFILE_BINARY
     from plugins_extractor.singlefile.apps import SINGLEFILE_BINARY
     print(SINGLEFILE_BINARY.load_or_install().model_dump(exclude={'binproviders_supported', 'loaded_binprovider', 'provider_overrides', 'loaded_abspaths', 'bin_dir', 'loaded_respath'}))
     print(SINGLEFILE_BINARY.load_or_install().model_dump(exclude={'binproviders_supported', 'loaded_binprovider', 'provider_overrides', 'loaded_abspaths', 'bin_dir', 'loaded_respath'}))
     
     
+    from plugins_extractor.readability.apps import READABILITY_BINARY
+    print(READABILITY_BINARY.load_or_install().model_dump(exclude={'binproviders_supported', 'loaded_binprovider', 'provider_overrides', 'loaded_abspaths', 'bin_dir', 'loaded_respath'}))
+    
+    
     from plugins_pkg.npm.apps import npm
     from plugins_pkg.npm.apps import npm
 
 
-    print(npm.load_or_install('readability-extractor', overrides={'packages': lambda: ['github:ArchiveBox/readability-extractor']}).model_dump(exclude={'binproviders_supported', 'loaded_binprovider', 'provider_overrides', 'loaded_abspaths', 'bin_dir', 'loaded_respath'}))
+    # TODO: move these to their own plugin binaries
     print(npm.load_or_install('postlight-parser',      overrides={'packages': lambda: ['@postlight/parser@^2.2.3'], 'version': lambda: '2.2.3'}).model_dump(exclude={'binproviders_supported', 'loaded_binprovider', 'provider_overrides', 'loaded_abspaths'}))
     print(npm.load_or_install('postlight-parser',      overrides={'packages': lambda: ['@postlight/parser@^2.2.3'], 'version': lambda: '2.2.3'}).model_dump(exclude={'binproviders_supported', 'loaded_binprovider', 'provider_overrides', 'loaded_abspaths'}))
 
 
     from django.contrib.auth import get_user_model
     from django.contrib.auth import get_user_model
@@ -1020,7 +1008,6 @@ def config(config_options_str: Optional[str]=None,
     """Get and set your ArchiveBox project configuration values"""
     """Get and set your ArchiveBox project configuration values"""
 
 
     check_data_folder(CONFIG)
     check_data_folder(CONFIG)
-
     if config_options and config_options_str:
     if config_options and config_options_str:
         stderr(
         stderr(
             '[X] You should either pass config values as an arguments '
             '[X] You should either pass config values as an arguments '
@@ -1096,7 +1083,6 @@ def config(config_options_str: Optional[str]=None,
     elif reset:
     elif reset:
         stderr('[X] This command is not implemented yet.', color='red')
         stderr('[X] This command is not implemented yet.', color='red')
         stderr('    Please manually remove the relevant lines from your config file:')
         stderr('    Please manually remove the relevant lines from your config file:')
-        stderr(f'        {CONFIG_FILE}')
         raise SystemExit(2)
         raise SystemExit(2)
     else:
     else:
         stderr('[X] You must pass either --get or --set, or no arguments to get the whole config.', color='red')
         stderr('[X] You must pass either --get or --set, or no arguments to get the whole config.', color='red')
@@ -1125,8 +1111,9 @@ def schedule(add: bool=False,
     check_data_folder(CONFIG)
     check_data_folder(CONFIG)
     setup_django_minimal()
     setup_django_minimal()
     from plugins_pkg.pip.apps import ARCHIVEBOX_BINARY
     from plugins_pkg.pip.apps import ARCHIVEBOX_BINARY
+    from plugins_sys.config.apps import SHELL_CONFIG, CONSTANTS
 
 
-    Path(LOGS_DIR).mkdir(exist_ok=True)
+    Path(CONSTANTS.LOGS_DIR).mkdir(exist_ok=True)
 
 
     cron = CronTab(user=True)
     cron = CronTab(user=True)
     cron = dedupe_cron_jobs(cron)
     cron = dedupe_cron_jobs(cron)
@@ -1155,7 +1142,7 @@ def schedule(add: bool=False,
                 f'"{import_path}"',
                 f'"{import_path}"',
             ] if import_path else ['update']),
             ] if import_path else ['update']),
             '>>',
             '>>',
-            quoted(Path(LOGS_DIR) / 'schedule.log'),
+            quoted(Path(CONSTANTS.LOGS_DIR) / 'schedule.log'),
             '2>&1',
             '2>&1',
 
 
         ]
         ]
@@ -1167,7 +1154,7 @@ def schedule(add: bool=False,
         elif CronSlices.is_valid(every):
         elif CronSlices.is_valid(every):
             new_job.setall(every)
             new_job.setall(every)
         else:
         else:
-            stderr('{red}[X] Got invalid timeperiod for cron task.{reset}'.format(**ANSI))
+            stderr('{red}[X] Got invalid timeperiod for cron task.{reset}'.format(**SHELL_CONFIG.ANSI))
             stderr('    It must be one of minute/hour/day/month')
             stderr('    It must be one of minute/hour/day/month')
             stderr('    or a quoted cron-format schedule like:')
             stderr('    or a quoted cron-format schedule like:')
             stderr('        archivebox init --every=day --depth=1 https://example.com/some/rss/feed.xml')
             stderr('        archivebox init --every=day --depth=1 https://example.com/some/rss/feed.xml')
@@ -1181,11 +1168,11 @@ def schedule(add: bool=False,
         existing_jobs = list(cron.find_comment(CRON_COMMENT))
         existing_jobs = list(cron.find_comment(CRON_COMMENT))
 
 
         print()
         print()
-        print('{green}[√] Scheduled new ArchiveBox cron job for user: {} ({} jobs are active).{reset}'.format(USER, len(existing_jobs), **ANSI))
+        print('{green}[√] Scheduled new ArchiveBox cron job for user: {} ({} jobs are active).{reset}'.format(SHELL_CONFIG.USER, len(existing_jobs), **SHELL_CONFIG.ANSI))
         print('\n'.join(f'  > {cmd}' if str(cmd) == str(new_job) else f'    {cmd}' for cmd in existing_jobs))
         print('\n'.join(f'  > {cmd}' if str(cmd) == str(new_job) else f'    {cmd}' for cmd in existing_jobs))
         if total_runs > 60 and not quiet:
         if total_runs > 60 and not quiet:
             stderr()
             stderr()
-            stderr('{lightyellow}[!] With the current cron config, ArchiveBox is estimated to run >{} times per year.{reset}'.format(total_runs, **ANSI))
+            stderr('{lightyellow}[!] With the current cron config, ArchiveBox is estimated to run >{} times per year.{reset}'.format(total_runs, **SHELL_CONFIG.ANSI))
             stderr('    Congrats on being an enthusiastic internet archiver! 👌')
             stderr('    Congrats on being an enthusiastic internet archiver! 👌')
             stderr()
             stderr()
             stderr('    Make sure you have enough storage space available to hold all the data.')
             stderr('    Make sure you have enough storage space available to hold all the data.')
@@ -1195,7 +1182,7 @@ def schedule(add: bool=False,
         if existing_jobs:
         if existing_jobs:
             print('\n'.join(str(cmd) for cmd in existing_jobs))
             print('\n'.join(str(cmd) for cmd in existing_jobs))
         else:
         else:
-            stderr('{red}[X] There are no ArchiveBox cron jobs scheduled for your user ({}).{reset}'.format(USER, **ANSI))
+            stderr('{red}[X] There are no ArchiveBox cron jobs scheduled for your user ({}).{reset}'.format(SHELL_CONFIG.USER, **SHELL_CONFIG.ANSI))
             stderr('    To schedule a new job, run:')
             stderr('    To schedule a new job, run:')
             stderr('        archivebox schedule --every=[timeperiod] --depth=1 https://example.com/some/rss/feed.xml')
             stderr('        archivebox schedule --every=[timeperiod] --depth=1 https://example.com/some/rss/feed.xml')
         raise SystemExit(0)
         raise SystemExit(0)
@@ -1206,11 +1193,11 @@ def schedule(add: bool=False,
 
 
     if foreground or run_all:
     if foreground or run_all:
         if not existing_jobs:
         if not existing_jobs:
-            stderr('{red}[X] You must schedule some jobs first before running in foreground mode.{reset}'.format(**ANSI))
+            stderr('{red}[X] You must schedule some jobs first before running in foreground mode.{reset}'.format(**SHELL_CONFIG.ANSI))
             stderr('    archivebox schedule --every=hour --depth=1 https://example.com/some/rss/feed.xml')
             stderr('    archivebox schedule --every=hour --depth=1 https://example.com/some/rss/feed.xml')
             raise SystemExit(1)
             raise SystemExit(1)
 
 
-        print('{green}[*] Running {} ArchiveBox jobs in foreground task scheduler...{reset}'.format(len(existing_jobs), **ANSI))
+        print('{green}[*] Running {} ArchiveBox jobs in foreground task scheduler...{reset}'.format(len(existing_jobs), **SHELL_CONFIG.ANSI))
         if run_all:
         if run_all:
             try:
             try:
                 for job in existing_jobs:
                 for job in existing_jobs:
@@ -1220,7 +1207,7 @@ def schedule(add: bool=False,
                     job.run()
                     job.run()
                     sys.stdout.write(f'\r    √ {job.command.split("/archivebox ")[-1]}\n')
                     sys.stdout.write(f'\r    √ {job.command.split("/archivebox ")[-1]}\n')
             except KeyboardInterrupt:
             except KeyboardInterrupt:
-                print('\n{green}[√] Stopped.{reset}'.format(**ANSI))
+                print('\n{green}[√] Stopped.{reset}'.format(**SHELL_CONFIG.ANSI))
                 raise SystemExit(1)
                 raise SystemExit(1)
 
 
         if foreground:
         if foreground:
@@ -1230,7 +1217,7 @@ def schedule(add: bool=False,
                 for result in cron.run_scheduler():
                 for result in cron.run_scheduler():
                     print(result)
                     print(result)
             except KeyboardInterrupt:
             except KeyboardInterrupt:
-                print('\n{green}[√] Stopped.{reset}'.format(**ANSI))
+                print('\n{green}[√] Stopped.{reset}'.format(**SHELL_CONFIG.ANSI))
                 raise SystemExit(1)
                 raise SystemExit(1)
 
 
     # if CAN_UPGRADE:
     # if CAN_UPGRADE:
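For reference, the pattern that replaces the old module-level config imports throughout main.py is a plain attribute lookup on the installed package. A minimal sketch (illustrative only; the attribute names are the ones already used in the hunks above):

    import archivebox

    OUTPUT_DIR = archivebox.DATA_DIR                    # user data dir (collection root)
    ARCHIVE_DIR = archivebox.CONSTANTS.ARCHIVE_DIR      # ./archive
    LOGS_DIR = archivebox.CONSTANTS.LOGS_DIR            # ./logs

    if archivebox.CONSTANTS.DATABASE_FILE.exists():
        print(f'Found index: {archivebox.CONSTANTS.DATABASE_FILE.relative_to(OUTPUT_DIR)}')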

+ 34 - 30
archivebox/misc/checks.py

@@ -5,51 +5,55 @@ __package__ = 'archivebox.misc'
 from benedict import benedict
 from benedict import benedict
 from pathlib import Path
 from pathlib import Path
 
 
-from .logging import stderr, hint
+import archivebox
+
+from .logging import stderr, hint, ANSI
 
 
 
 
 def check_dependencies(config: benedict, show_help: bool=True) -> None:
 def check_dependencies(config: benedict, show_help: bool=True) -> None:
-    invalid_dependencies = [
-        (name, info) for name, info in config['DEPENDENCIES'].items()
-        if info['enabled'] and not info['is_valid']
-    ]
-    if invalid_dependencies and show_help:
-        stderr(f'[!] Warning: Missing {len(invalid_dependencies)} recommended dependencies', color='lightyellow')
-        for dependency, info in invalid_dependencies:
-            stderr(
-                '    ! {}: {} ({})'.format(
-                    dependency,
-                    info['path'] or 'unable to find binary',
-                    info['version'] or 'unable to detect version',
-                )
-            )
-            if dependency in ('YOUTUBEDL_BINARY', 'CHROME_BINARY', 'SINGLEFILE_BINARY', 'READABILITY_BINARY', 'MERCURY_BINARY'):
-                hint(('To install all packages automatically run: archivebox setup',
-                    f'or to disable it and silence this warning: archivebox config --set SAVE_{dependency.rsplit("_", 1)[0]}=False',
-                    ''), prefix='      ')
-        stderr('')
+    # don't do this on startup anymore, it's too slow
+    pass
+    # invalid_dependencies = [
+    #     (name, binary) for name, binary in settings.BINARIES.items()
+    #     if not binary.is_valid
+    # ]
+    # if invalid_dependencies and show_help:
+    #     stderr(f'[!] Warning: Missing {len(invalid_dependencies)} recommended dependencies', color='lightyellow')
+    #     for dependency, info in invalid_dependencies:
+    #         stderr(
+    #             '    ! {}: {} ({})'.format(
+    #                 dependency,
+    #                 info['path'] or 'unable to find binary',
+    #                 info['version'] or 'unable to detect version',
+    #             )
+    #         )
+    #         if dependency in ('YOUTUBEDL_BINARY', 'CHROME_BINARY', 'SINGLEFILE_BINARY', 'READABILITY_BINARY', 'MERCURY_BINARY'):
+    #             hint(('To install all packages automatically run: archivebox setup',
+    #                 f'or to disable it and silence this warning: archivebox config --set SAVE_{dependency.rsplit("_", 1)[0]}=False',
+    #                 ''), prefix='      ')
+    #     stderr('')
 
 
 
 
 
 
 def check_data_folder(config: benedict) -> None:
 def check_data_folder(config: benedict) -> None:
-    output_dir = config['OUTPUT_DIR']
+    output_dir = archivebox.DATA_DIR
 
 
-    archive_dir_exists = (Path(output_dir) / 'archive').exists()
+    archive_dir_exists = (archivebox.CONSTANTS.ARCHIVE_DIR).exists()
     if not archive_dir_exists:
     if not archive_dir_exists:
         stderr('[X] No archivebox index found in the current directory.', color='red')
         stderr('[X] No archivebox index found in the current directory.', color='red')
         stderr(f'    {output_dir}', color='lightyellow')
         stderr(f'    {output_dir}', color='lightyellow')
         stderr()
         stderr()
-        stderr('    {lightred}Hint{reset}: Are you running archivebox in the right folder?'.format(**config['ANSI']))
+        stderr('    {lightred}Hint{reset}: Are you running archivebox in the right folder?'.format(**ANSI))
         stderr('        cd path/to/your/archive/folder')
         stderr('        cd path/to/your/archive/folder')
         stderr('        archivebox [command]')
         stderr('        archivebox [command]')
         stderr()
         stderr()
-        stderr('    {lightred}Hint{reset}: To create a new archive collection or import existing data in this folder, run:'.format(**config['ANSI']))
+        stderr('    {lightred}Hint{reset}: To create a new archive collection or import existing data in this folder, run:'.format(**ANSI))
         stderr('        archivebox init')
         stderr('        archivebox init')
         raise SystemExit(2)
         raise SystemExit(2)
 
 
 
 
 def check_migrations(config: benedict):
 def check_migrations(config: benedict):
-    output_dir = config['OUTPUT_DIR']
+    output_dir = archivebox.DATA_DIR
     
     
     from ..index.sql import list_migrations
     from ..index.sql import list_migrations
 
 
@@ -63,8 +67,8 @@ def check_migrations(config: benedict):
         stderr('        archivebox init')
         stderr('        archivebox init')
         raise SystemExit(3)
         raise SystemExit(3)
 
 
-    (Path(output_dir) / config['SOURCES_DIR_NAME']).mkdir(exist_ok=True)
-    (Path(output_dir) / config['LOGS_DIR_NAME']).mkdir(exist_ok=True)
-    (Path(output_dir) / config['CACHE_DIR_NAME']).mkdir(exist_ok=True)
-    (Path(output_dir) / config['LIB_DIR_NAME'] / 'bin').mkdir(exist_ok=True, parents=True)
-    (Path(output_dir) / config['PERSONAS_DIR_NAME'] / 'Default').mkdir(exist_ok=True, parents=True)
+    archivebox.CONSTANTS.SOURCES_DIR.mkdir(exist_ok=True)
+    archivebox.CONSTANTS.LOGS_DIR.mkdir(exist_ok=True)
+    archivebox.CONSTANTS.CACHE_DIR.mkdir(exist_ok=True)
+    (archivebox.CONSTANTS.LIB_DIR / 'bin').mkdir(exist_ok=True, parents=True)
+    (archivebox.CONSTANTS.PERSONAS_DIR / 'Default').mkdir(exist_ok=True, parents=True)
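Because check_data_folder() and check_migrations() now resolve every path from archivebox.CONSTANTS, the benedict passed in no longer needs OUTPUT_DIR or *_DIR_NAME keys. A hedged usage sketch (the empty-config call is illustrative and assumes no other keys are read; it is not taken from the diff):

    from benedict import benedict
    from archivebox.misc.checks import check_data_folder, check_migrations

    # path resolution comes entirely from archivebox.CONSTANTS now,
    # so an empty config is enough for the folder checks
    check_data_folder(benedict({}))
    check_migrations(benedict({}))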

+ 3 - 5
archivebox/misc/logging.py

@@ -8,8 +8,6 @@ from collections import defaultdict
 from benedict import benedict
 from benedict import benedict
 from rich.console import Console
 from rich.console import Console
 
 
-from ..config_stubs import ConfigDict
-
 # SETUP RICH CONSOLE / TTY detection / COLOR / PROGRESS BARS
 # SETUP RICH CONSOLE / TTY detection / COLOR / PROGRESS BARS
 CONSOLE = Console()
 CONSOLE = Console()
 IS_TTY = CONSOLE.is_interactive
 IS_TTY = CONSOLE.is_interactive
@@ -43,7 +41,7 @@ COLOR_DICT = defaultdict(lambda: [(0, 0, 0), (0, 0, 0)], {
 })
 })
 
 
 # Logging Helpers
 # Logging Helpers
-def stdout(*args, color: Optional[str]=None, prefix: str='', config: Optional[ConfigDict]=None) -> None:
+def stdout(*args, color: Optional[str]=None, prefix: str='', config: Optional[benedict]=None) -> None:
     ansi = DEFAULT_CLI_COLORS if (config or {}).get('USE_COLOR') else ANSI
     ansi = DEFAULT_CLI_COLORS if (config or {}).get('USE_COLOR') else ANSI
 
 
     if color:
     if color:
@@ -53,7 +51,7 @@ def stdout(*args, color: Optional[str]=None, prefix: str='', config: Optional[Co
 
 
     sys.stdout.write(prefix + ''.join(strs))
     sys.stdout.write(prefix + ''.join(strs))
 
 
-def stderr(*args, color: Optional[str]=None, prefix: str='', config: Optional[ConfigDict]=None) -> None:
+def stderr(*args, color: Optional[str]=None, prefix: str='', config: Optional[benedict]=None) -> None:
     ansi = DEFAULT_CLI_COLORS if (config or {}).get('USE_COLOR') else ANSI
     ansi = DEFAULT_CLI_COLORS if (config or {}).get('USE_COLOR') else ANSI
 
 
     if color:
     if color:
@@ -63,7 +61,7 @@ def stderr(*args, color: Optional[str]=None, prefix: str='', config: Optional[Co
 
 
     sys.stderr.write(prefix + ''.join(strs))
     sys.stderr.write(prefix + ''.join(strs))
 
 
-def hint(text: Union[Tuple[str, ...], List[str], str], prefix='    ', config: Optional[ConfigDict]=None) -> None:
+def hint(text: Union[Tuple[str, ...], List[str], str], prefix='    ', config: Optional[benedict]=None) -> None:
     ansi = DEFAULT_CLI_COLORS if (config or {}).get('USE_COLOR') else ANSI
     ansi = DEFAULT_CLI_COLORS if (config or {}).get('USE_COLOR') else ANSI
 
 
     if isinstance(text, str):
     if isinstance(text, str):
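The logging helpers now accept a plain benedict (or nothing) for their optional config argument instead of the removed ConfigDict stub; only USE_COLOR is consulted. A small usage sketch:

    from benedict import benedict
    from archivebox.misc.logging import stderr, hint

    stderr('[!] Disk is nearly full', color='lightyellow', config=benedict({'USE_COLOR': True}))
    hint(('Free up space or move the collection to a larger drive.',))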

+ 2 - 3
archivebox/parsers/pocket_api.py

@@ -2,25 +2,24 @@ __package__ = 'archivebox.parsers'
 
 
 
 
 import re
 import re
+import archivebox
 
 
 from typing import IO, Iterable, Optional
 from typing import IO, Iterable, Optional
 from configparser import ConfigParser
 from configparser import ConfigParser
 
 
-from pathlib import Path
 from pocket import Pocket
 from pocket import Pocket
 
 
 from ..index.schema import Link
 from ..index.schema import Link
 from ..util import enforce_types
 from ..util import enforce_types
 from ..system import atomic_write
 from ..system import atomic_write
 from ..config import (
 from ..config import (
-    SOURCES_DIR,
     POCKET_CONSUMER_KEY,
     POCKET_CONSUMER_KEY,
     POCKET_ACCESS_TOKENS,
     POCKET_ACCESS_TOKENS,
 )
 )
 
 
 
 
 COUNT_PER_PAGE = 500
 COUNT_PER_PAGE = 500
-API_DB_PATH = Path(SOURCES_DIR) / 'pocket_api.db'
+API_DB_PATH = archivebox.DATA_DIR / 'sources' / 'pocket_api.db'
 
 
 # search for broken protocols that sometimes come from the Pocket API
 # search for broken protocols that sometimes come from the Pocket API
 _BROKEN_PROTOCOL_RE = re.compile('^(http[s]?)(:/(?!/))')
 _BROKEN_PROTOCOL_RE = re.compile('^(http[s]?)(:/(?!/))')
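API_DB_PATH now points at DATA_DIR/sources directly, and _BROKEN_PROTOCOL_RE exists to repair the single-slash protocols the Pocket API occasionally returns. A quick standalone sketch of that repair (the sub() call is illustrative, not copied from the parser):

    import re

    _BROKEN_PROTOCOL_RE = re.compile('^(http[s]?)(:/(?!/))')

    url = 'https:/example.com/article'
    fixed = _BROKEN_PROTOCOL_RE.sub(r'\1://', url)
    assert fixed == 'https://example.com/article'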

+ 3 - 7
archivebox/parsers/readwise_reader_api.py

@@ -3,23 +3,19 @@ __package__ = "archivebox.parsers"
 
 
 import re
 import re
 import requests
 import requests
+import archivebox
 from datetime import datetime
 from datetime import datetime
 
 
 from typing import IO, Iterable, Optional
 from typing import IO, Iterable, Optional
 from configparser import ConfigParser
 from configparser import ConfigParser
 
 
-from pathlib import Path
-
 from ..index.schema import Link
 from ..index.schema import Link
 from ..util import enforce_types
 from ..util import enforce_types
 from ..system import atomic_write
 from ..system import atomic_write
-from ..config import (
-    SOURCES_DIR,
-    READWISE_READER_TOKENS,
-)
+from ..config import READWISE_READER_TOKENS
 
 
 
 
-API_DB_PATH = Path(SOURCES_DIR) / "readwise_reader_api.db"
+API_DB_PATH = archivebox.DATA_DIR / "sources" / "readwise_reader_api.db"
 
 
 
 
 class ReadwiseReaderAPI:
 class ReadwiseReaderAPI:

+ 8 - 4
archivebox/plugantic/base_binary.py

@@ -17,6 +17,8 @@ from pydantic_pkgr import (
 
 
 from django.conf import settings
 from django.conf import settings
 
 
+import archivebox
+
 from .base_hook import BaseHook, HookType
 from .base_hook import BaseHook, HookType
 
 
 
 
@@ -64,7 +66,9 @@ class BaseBinary(BaseHook, Binary):
         super().register(settings, parent_plugin=parent_plugin)
         super().register(settings, parent_plugin=parent_plugin)
 
 
     @staticmethod
     @staticmethod
-    def symlink_to_lib(binary, bin_dir=settings.CONFIG.BIN_DIR) -> None:
+    def symlink_to_lib(binary, bin_dir=None) -> None:
+        bin_dir = bin_dir or archivebox.CONSTANTS.LIB_BIN_DIR
+        
         if not (binary.abspath and binary.abspath.exists()):
         if not (binary.abspath and binary.abspath.exists()):
             return
             return
         
         
@@ -77,19 +81,19 @@ class BaseBinary(BaseHook, Binary):
     @validate_call
     @validate_call
     def load(self, **kwargs) -> Self:
     def load(self, **kwargs) -> Self:
         binary = super().load(**kwargs)
         binary = super().load(**kwargs)
-        self.symlink_to_lib(binary=binary, bin_dir=settings.CONFIG.BIN_DIR)
+        self.symlink_to_lib(binary=binary, bin_dir=archivebox.CONSTANTS.LIB_BIN_DIR)
         return binary
         return binary
     
     
     @validate_call
     @validate_call
     def install(self, **kwargs) -> Self:
     def install(self, **kwargs) -> Self:
         binary = super().install(**kwargs)
         binary = super().install(**kwargs)
-        self.symlink_to_lib(binary=binary, bin_dir=settings.CONFIG.BIN_DIR)
+        self.symlink_to_lib(binary=binary, bin_dir=archivebox.CONSTANTS.LIB_BIN_DIR)
         return binary
         return binary
     
     
     @validate_call
     @validate_call
     def load_or_install(self, **kwargs) -> Self:
     def load_or_install(self, **kwargs) -> Self:
         binary = super().load_or_install(**kwargs)
         binary = super().load_or_install(**kwargs)
-        self.symlink_to_lib(binary=binary, bin_dir=settings.CONFIG.BIN_DIR)
+        self.symlink_to_lib(binary=binary, bin_dir=archivebox.CONSTANTS.LIB_BIN_DIR)
         return binary
         return binary
     
     
     @property
     @property
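symlink_to_lib() no longer reads settings.CONFIG.BIN_DIR at import time; the default is resolved lazily from archivebox.CONSTANTS.LIB_BIN_DIR when the method runs. A minimal sketch of the intent (only the signature and default come from the hunk above; the symlinking body is illustrative):

    import archivebox

    def symlink_to_lib(binary, bin_dir=None):
        bin_dir = bin_dir or archivebox.CONSTANTS.LIB_BIN_DIR   # lazy default, safe at import time
        if not (binary.abspath and binary.abspath.exists()):
            return
        bin_dir.mkdir(parents=True, exist_ok=True)
        link = bin_dir / binary.name
        if not link.exists():
            link.symlink_to(binary.abspath)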

+ 29 - 14
archivebox/plugantic/base_configset.py

@@ -123,6 +123,10 @@ class ArchiveBoxBaseConfig(BaseSettings):
         validate_return=True,
         validate_return=True,
         revalidate_instances="always",
         revalidate_instances="always",
     )
     )
+    
+    load_from_defaults: ClassVar[bool] = True
+    load_from_configfile: ClassVar[bool] = True
+    load_from_environment: ClassVar[bool] = True
 
 
     @classmethod
     @classmethod
     def settings_customise_sources(
     def settings_customise_sources(
@@ -140,20 +144,22 @@ class ArchiveBoxBaseConfig(BaseSettings):
         
         
         # import ipdb; ipdb.set_trace()
         # import ipdb; ipdb.set_trace()
         
         
+        precedence_order = {}
+        
         # if ArchiveBox.conf does not exist yet, return defaults -> env order
         # if ArchiveBox.conf does not exist yet, return defaults -> env order
         if not ARCHIVEBOX_CONFIG_FILE.is_file():
         if not ARCHIVEBOX_CONFIG_FILE.is_file():
-            return (
-                init_settings,
-                env_settings,
-            )
+            precedence_order = {
+                'defaults': init_settings,
+                'environment': env_settings,
+            }
         
         
         # if ArchiveBox.conf exists and is in TOML format, return default -> TOML -> env order
         # if ArchiveBox.conf exists and is in TOML format, return default -> TOML -> env order
         try:
         try:
-            return (
-                init_settings,
-                FlatTomlConfigSettingsSource(settings_cls, toml_file=ARCHIVEBOX_CONFIG_FILE),
-                env_settings,
-            )
+            precedence_order = precedence_order or {
+                'defaults': init_settings,
+                'configfile': FlatTomlConfigSettingsSource(settings_cls, toml_file=ARCHIVEBOX_CONFIG_FILE),
+                'environment': env_settings,
+            }
         except Exception as err:
         except Exception as err:
             if err.__class__.__name__ != "TOMLDecodeError":
             if err.__class__.__name__ != "TOMLDecodeError":
                 raise
                 raise
@@ -165,11 +171,20 @@ class ArchiveBoxBaseConfig(BaseSettings):
             new_toml = ini_to_toml.convert(original_ini)
             new_toml = ini_to_toml.convert(original_ini)
             ARCHIVEBOX_CONFIG_FILE.write_text(new_toml)
             ARCHIVEBOX_CONFIG_FILE.write_text(new_toml)
 
 
-            return (
-                init_settings,
-                FlatTomlConfigSettingsSource(settings_cls, toml_file=ARCHIVEBOX_CONFIG_FILE),
-                env_settings,
-            )
+            precedence_order = {
+                'defaults': init_settings,
+                'configfile': FlatTomlConfigSettingsSource(settings_cls, toml_file=ARCHIVEBOX_CONFIG_FILE),
+                'environment': env_settings,
+            }
+            
+        if not cls.load_from_environment:
+            precedence_order.pop('environment')
+        if not cls.load_from_configfile:
+            precedence_order.pop('configfile')
+        if not cls.load_from_defaults:
+            precedence_order.pop('defaults')
+
+        return tuple(precedence_order.values())
 
 
     @model_validator(mode="after")
     @model_validator(mode="after")
     def fill_defaults(self):
     def fill_defaults(self):
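settings_customise_sources() now assembles a named precedence map (defaults -> configfile -> environment), and the new load_from_* class flags let a subclass drop any of the three sources. A hedged sketch of a subclass that ignores environment variables (the class and field names are hypothetical):

    from typing import ClassVar
    from plugantic.base_configset import ArchiveBoxBaseConfig

    class FileOnlyConfig(ArchiveBoxBaseConfig):
        # hypothetical subclass: read defaults and ArchiveBox.conf, skip env vars
        load_from_environment: ClassVar[bool] = False

        EXAMPLE_OPTION: str = 'some-default'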

+ 57 - 57
archivebox/plugantic/management/commands/pkg.py

@@ -1,72 +1,72 @@
-__package__ = 'archivebox.plugantic.management.commands'
+# __package__ = 'archivebox.plugantic.management.commands'
 
 
-from django.core.management.base import BaseCommand
-from django.conf import settings
+# from django.core.management.base import BaseCommand
+# from django.conf import settings
 
 
-from pydantic_pkgr import Binary, BinProvider, BrewProvider, EnvProvider, SemVer
-from pydantic_pkgr.binprovider import bin_abspath
+# from pydantic_pkgr import Binary, BinProvider, BrewProvider, EnvProvider, SemVer
+# from pydantic_pkgr.binprovider import bin_abspath
 
 
-from ....config import NODE_BIN_PATH, bin_path
-from ...base_binary import env
+# from ....config import bin_path
+# from ...base_binary import env
 
 
 
 
-class Command(BaseCommand):
-    def handle(self, *args, method, **options):
-        method(*args, **options)
+# class Command(BaseCommand):
+#     def handle(self, *args, method, **options):
+#         method(*args, **options)
 
 
-    def add_arguments(self, parser):
-        subparsers = parser.add_subparsers(title="sub-commands", required=True)
+#     def add_arguments(self, parser):
+#         subparsers = parser.add_subparsers(title="sub-commands", required=True)
 
 
-        list_parser = subparsers.add_parser("list", help="List archivebox runtime dependencies.")
-        list_parser.set_defaults(method=self.list)
+#         list_parser = subparsers.add_parser("list", help="List archivebox runtime dependencies.")
+#         list_parser.set_defaults(method=self.list)
 
 
-        install_parser = subparsers.add_parser("install", help="Install archivebox runtime dependencies.")
-        install_parser.add_argument("--update", action="store_true", help="Update dependencies to latest versions.")
-        install_parser.add_argument("package_names", nargs="+", type=str)
-        install_parser.set_defaults(method=self.install)
+#         install_parser = subparsers.add_parser("install", help="Install archivebox runtime dependencies.")
+#         install_parser.add_argument("--update", action="store_true", help="Update dependencies to latest versions.")
+#         install_parser.add_argument("package_names", nargs="+", type=str)
+#         install_parser.set_defaults(method=self.install)
 
 
-    def list(self, *args, **options):
-        self.stdout.write('################# PLUGINS ####################')
-        for plugin in settings.PLUGINS.values():
-            self.stdout.write(f'{plugin.name}:')
-            for binary in plugin.binaries:
-                try:
-                    binary = binary.load()
-                except Exception as e:
-                    # import ipdb; ipdb.set_trace()
-                    raise
-                self.stdout.write(f'    {binary.name.ljust(14)} {str(binary.version).ljust(11)} {binary.binprovider.INSTALLER_BIN.ljust(5)}  {binary.abspath}')
+#     def list(self, *args, **options):
+#         self.stdout.write('################# PLUGINS ####################')
+#         for plugin in settings.PLUGINS.values():
+#             self.stdout.write(f'{plugin.name}:')
+#             for binary in plugin.binaries:
+#                 try:
+#                     binary = binary.load()
+#                 except Exception as e:
+#                     # import ipdb; ipdb.set_trace()
+#                     raise
+#                 self.stdout.write(f'    {binary.name.ljust(14)} {str(binary.version).ljust(11)} {binary.binprovider.INSTALLER_BIN.ljust(5)}  {binary.abspath}')
 
 
-        self.stdout.write('\n################# LEGACY ####################')
-        for bin_key, dependency in settings.CONFIG.DEPENDENCIES.items():
-            bin_name = settings.CONFIG[bin_key]
+#         self.stdout.write('\n################# LEGACY ####################')
+#         for bin_key, dependency in settings.CONFIG.DEPENDENCIES.items():
+#             bin_name = settings.CONFIG[bin_key]
 
 
-            self.stdout.write(f'{bin_key}:     {bin_name}')
+#             self.stdout.write(f'{bin_key}:     {bin_name}')
 
 
-            # binary = Binary(name=package_name, providers=[env])
-            # print(binary)
+#             # binary = Binary(name=package_name, providers=[env])
+#             # print(binary)
 
 
-            # try:
-            #     loaded_bin = binary.load()
-            #     self.stdout.write(
-            #         self.style.SUCCESS(f'Successfully loaded {package_name}:') + str(loaded_bin)
-            #     )
-            # except Exception as e:
-            #     self.stderr.write(
-            #         self.style.ERROR(f"Error loading {package_name}: {e}")
-            #     )
+#             # try:
+#             #     loaded_bin = binary.load()
+#             #     self.stdout.write(
+#             #         self.style.SUCCESS(f'Successfully loaded {package_name}:') + str(loaded_bin)
+#             #     )
+#             # except Exception as e:
+#             #     self.stderr.write(
+#             #         self.style.ERROR(f"Error loading {package_name}: {e}")
+#             #     )
 
 
-    def install(self, *args, bright, **options):
-        for package_name in options["package_names"]:
-            binary = Binary(name=package_name, providers=[env])
-            print(binary)
+#     def install(self, *args, bright, **options):
+#         for package_name in options["package_names"]:
+#             binary = Binary(name=package_name, providers=[env])
+#             print(binary)
 
 
-            try:
-                loaded_bin = binary.load()
-                self.stdout.write(
-                    self.style.SUCCESS(f'Successfully loaded {package_name}:') + str(loaded_bin)
-                )
-            except Exception as e:
-                self.stderr.write(
-                    self.style.ERROR(f"Error loading {package_name}: {e}")
-                )
+#             try:
+#                 loaded_bin = binary.load()
+#                 self.stdout.write(
+#                     self.style.SUCCESS(f'Successfully loaded {package_name}:') + str(loaded_bin)
+#                 )
+#             except Exception as e:
+#                 self.stderr.write(
+#                     self.style.ERROR(f"Error loading {package_name}: {e}")
+#                 )

+ 3 - 1
archivebox/plugins_extractor/chrome/apps.py

@@ -18,6 +18,8 @@ from pydantic_pkgr import (
     bin_abspath,
     bin_abspath,
 )
 )
 
 
+import archivebox
+
 # Depends on other Django apps:
 # Depends on other Django apps:
 from plugantic.base_plugin import BasePlugin
 from plugantic.base_plugin import BasePlugin
 from plugantic.base_configset import BaseConfigSet, ConfigSectionName
 from plugantic.base_configset import BaseConfigSet, ConfigSectionName
@@ -215,7 +217,7 @@ class ChromeBinary(BaseBinary):
     }
     }
 
 
     @staticmethod
     @staticmethod
-    def symlink_to_lib(binary, bin_dir=settings.CONFIG.BIN_DIR) -> None:
+    def symlink_to_lib(binary, bin_dir=archivebox.CONSTANTS.LIB_BIN_DIR) -> None:
         if not (binary.abspath and binary.abspath.exists()):
         if not (binary.abspath and binary.abspath.exists()):
             return
             return
         
         

+ 103 - 0
archivebox/plugins_extractor/readability/apps.py

@@ -0,0 +1,103 @@
+__package__ = 'archivebox.plugins_extractor.readability'
+
+from pathlib import Path
+from typing import List, Dict, Optional, ClassVar
+# from typing_extensions import Self
+
+from django.conf import settings
+
+# Depends on other PyPI/vendor packages:
+from pydantic import InstanceOf, Field, validate_call
+from pydantic_pkgr import BinProvider, BinProviderName, ProviderLookupDict, BinName, ShallowBinary
+
+# Depends on other Django apps:
+from plugantic.base_plugin import BasePlugin
+from plugantic.base_configset import BaseConfigSet, ConfigSectionName
+from plugantic.base_binary import BaseBinary, env
+from plugantic.base_extractor import BaseExtractor
+from plugantic.base_hook import BaseHook
+
+# Depends on Other Plugins:
+from plugins_sys.config.apps import ARCHIVING_CONFIG
+from plugins_pkg.npm.apps import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER
+
+###################### Config ##########################
+
+class ReadabilityConfig(BaseConfigSet):
+    section: ClassVar[ConfigSectionName] = 'ARCHIVING_CONFIG'
+
+    SAVE_READABILITY: bool = Field(default=True, alias='USE_READABILITY')
+
+    READABILITY_TIMEOUT: int                 = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
+
+    READABILITY_BINARY: str = Field(default='readability-extractor')
+    # READABILITY_EXTRA_ARGS: List[str] = []                                # readability-extractor doesn't take any extra args
+
+
+READABILITY_CONFIG = ReadabilityConfig()
+
+
+READABILITY_PACKAGE_NAME = 'github:ArchiveBox/readability-extractor'
+
+class ReadabilityBinary(BaseBinary):
+    name: BinName = READABILITY_CONFIG.READABILITY_BINARY
+    binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env]
+
+    provider_overrides: Dict[BinProviderName, ProviderLookupDict] = {
+        LIB_NPM_BINPROVIDER.name: {"packages": lambda: [READABILITY_PACKAGE_NAME]},
+        SYS_NPM_BINPROVIDER.name: {"packages": lambda: []},    # prevent modifying system global npm packages
+    }
+    
+    @validate_call
+    def install(self, binprovider_name: Optional[BinProviderName]=None) -> ShallowBinary:
+        # force install to only use lib/npm provider, we never want to modify global NPM packages
+        return BaseBinary.install(self, binprovider_name=binprovider_name or LIB_NPM_BINPROVIDER.name)
+    
+    @validate_call
+    def load_or_install(self, binprovider_name: Optional[BinProviderName] = None) -> ShallowBinary:
+        # force install to only use lib/npm provider, we never want to modify global NPM packages
+        try:
+            return self.load()
+        except Exception:
+            return BaseBinary.install(self, binprovider_name=binprovider_name or LIB_NPM_BINPROVIDER.name)
+
+
+
+
+READABILITY_BINARY = ReadabilityBinary()
+
+
+class ReadabilityExtractor(BaseExtractor):
+    name: str = 'readability'
+    binary: BinName = READABILITY_BINARY.name
+
+    def get_output_path(self, snapshot) -> Path:
+        return Path(snapshot.link_dir) / 'readability' / 'content.html'
+
+
+READABILITY_EXTRACTOR = ReadabilityExtractor()
+
+# class ReadabilityQueue(BaseQueue):
+#     name: str = 'readability'
+    
+#     binaries: List[InstanceOf[BaseBinary]] = [READABILITY_BINARY]
+
+# READABILITY_QUEUE = ReadabilityQueue()
+
+class ReadabilityPlugin(BasePlugin):
+    app_label: str = 'readability'
+    verbose_name: str = 'Readability'
+
+    hooks: List[InstanceOf[BaseHook]] = [
+        READABILITY_CONFIG,
+        READABILITY_BINARY,
+        READABILITY_EXTRACTOR,
+        # READABILITY_QUEUE,
+    ]
+
+
+
+PLUGIN = ReadabilityPlugin()
+PLUGIN.register(settings)
+DJANGO_APP = PLUGIN.AppConfig

+ 8 - 4
archivebox/plugins_extractor/singlefile/apps.py

@@ -34,7 +34,7 @@ class SinglefileConfig(BaseConfigSet):
     SINGLEFILE_CHECK_SSL_VALIDITY: bool     = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
     SINGLEFILE_CHECK_SSL_VALIDITY: bool     = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
     SINGLEFILE_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE)
     SINGLEFILE_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE)
 
 
-    SINGLEFILE_BINARY: str = Field(default='wget')
+    SINGLEFILE_BINARY: str = Field(default='single-file')
     SINGLEFILE_EXTRA_ARGS: List[str] = []
     SINGLEFILE_EXTRA_ARGS: List[str] = []
 
 
 
 
@@ -46,17 +46,21 @@ SINGLEFILE_MAX_VERSION = '1.1.60'
 
 
 
 
 class SinglefileBinary(BaseBinary):
 class SinglefileBinary(BaseBinary):
-    name: BinName = 'single-file'
+    name: BinName = SINGLEFILE_CONFIG.SINGLEFILE_BINARY
     binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env]
     binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env]
 
 
     provider_overrides: Dict[BinProviderName, ProviderLookupDict] = {
     provider_overrides: Dict[BinProviderName, ProviderLookupDict] = {
         env.name: {
         env.name: {
             'abspath': lambda:
             'abspath': lambda:
-                bin_abspath('single-file', PATH=env.PATH) or bin_abspath('single-file-node.js', PATH=env.PATH),
+                bin_abspath(SINGLEFILE_CONFIG.SINGLEFILE_BINARY, PATH=env.PATH)
+                or bin_abspath('single-file', PATH=env.PATH)
+                or bin_abspath('single-file-node.js', PATH=env.PATH),
         },
         },
         LIB_NPM_BINPROVIDER.name: {
         LIB_NPM_BINPROVIDER.name: {
             "abspath": lambda:
             "abspath": lambda:
-                bin_abspath("single-file", PATH=LIB_NPM_BINPROVIDER.PATH) or bin_abspath("single-file-node.js", PATH=LIB_NPM_BINPROVIDER.PATH),
+                bin_abspath(SINGLEFILE_CONFIG.SINGLEFILE_BINARY, PATH=LIB_NPM_BINPROVIDER.PATH)
+                or bin_abspath("single-file", PATH=LIB_NPM_BINPROVIDER.PATH)
+                or bin_abspath("single-file-node.js", PATH=LIB_NPM_BINPROVIDER.PATH),
             "packages": lambda:
             "packages": lambda:
                 [f"single-file-cli@>={SINGLEFILE_MIN_VERSION} <{SINGLEFILE_MAX_VERSION}"],
                 [f"single-file-cli@>={SINGLEFILE_MIN_VERSION} <{SINGLEFILE_MAX_VERSION}"],
         },
         },

+ 14 - 6
archivebox/plugins_pkg/npm/apps.py

@@ -1,10 +1,13 @@
 __package__ = 'archivebox.plugins_pkg.npm'
 __package__ = 'archivebox.plugins_pkg.npm'
 
 
+import archivebox
+
 from pathlib import Path
 from pathlib import Path
 from typing import List, Optional
 from typing import List, Optional
 
 
 from django.conf import settings
 from django.conf import settings
-from pydantic import InstanceOf
+
+from pydantic import InstanceOf, model_validator
 
 
 from pydantic_pkgr import BinProvider, NpmProvider, BinName, PATHStr, BinProviderName
 from pydantic_pkgr import BinProvider, NpmProvider, BinName, PATHStr, BinProviderName
 
 
@@ -14,8 +17,6 @@ from plugantic.base_binary import BaseBinary, BaseBinProvider, env, apt, brew
 from plugantic.base_hook import BaseHook
 from plugantic.base_hook import BaseHook
 
 
 
 
-from ...config import CONFIG
-
 ###################### Config ##########################
 ###################### Config ##########################
 
 
 
 
@@ -35,17 +36,24 @@ DEFAULT_GLOBAL_CONFIG = {
 NPM_CONFIG = NpmDependencyConfigs(**DEFAULT_GLOBAL_CONFIG)
 NPM_CONFIG = NpmDependencyConfigs(**DEFAULT_GLOBAL_CONFIG)
 
 
 
 
+OLD_NODE_BIN_PATH = archivebox.DATA_DIR / 'node_modules' / '.bin'
+NEW_NODE_BIN_PATH = archivebox.CONSTANTS.LIB_NPM_DIR / 'node_modules' / '.bin'
+
 class SystemNpmProvider(NpmProvider, BaseBinProvider):
 class SystemNpmProvider(NpmProvider, BaseBinProvider):
     name: BinProviderName = "sys_npm"
     name: BinProviderName = "sys_npm"
-    PATH: PATHStr = str(CONFIG.NODE_BIN_PATH)
     
     
     npm_prefix: Optional[Path] = None
     npm_prefix: Optional[Path] = None
 
 
 class LibNpmProvider(NpmProvider, BaseBinProvider):
 class LibNpmProvider(NpmProvider, BaseBinProvider):
     name: BinProviderName = "lib_npm"
     name: BinProviderName = "lib_npm"
-    PATH: PATHStr = str(CONFIG.NODE_BIN_PATH)
+    PATH: PATHStr = str(OLD_NODE_BIN_PATH)
+    
+    npm_prefix: Optional[Path] = archivebox.CONSTANTS.LIB_NPM_DIR
     
     
-    npm_prefix: Optional[Path] = settings.CONFIG.LIB_DIR / 'npm'
+    @model_validator(mode='after')
+    def validate_path(self):
+        assert self.npm_prefix == NEW_NODE_BIN_PATH.parent.parent
+        return self
 
 
 
 
 SYS_NPM_BINPROVIDER = SystemNpmProvider()
 SYS_NPM_BINPROVIDER = SystemNpmProvider()

+ 20 - 3
archivebox/plugins_pkg/pip/apps.py

@@ -1,13 +1,14 @@
+__package__ = 'archivebox.plugins_pkg.pip'
+
 import os
 import os
 import sys
 import sys
 import inspect
 import inspect
 import archivebox
 import archivebox
 from pathlib import Path
 from pathlib import Path
 from typing import List, Dict, Optional, ClassVar
 from typing import List, Dict, Optional, ClassVar
-from pydantic import InstanceOf, Field
+from pydantic import InstanceOf, Field, model_validator
 
 
 import django
 import django
-
 from django.db.backends.sqlite3.base import Database as django_sqlite3     # type: ignore[import-type]
 from django.db.backends.sqlite3.base import Database as django_sqlite3     # type: ignore[import-type]
 from django.core.checks import Error, Tags
 from django.core.checks import Error, Tags
 from django.conf import settings
 from django.conf import settings
@@ -19,6 +20,8 @@ from plugantic.base_check import BaseCheck
 from plugantic.base_binary import BaseBinary, BaseBinProvider, env, apt, brew
 from plugantic.base_binary import BaseBinary, BaseBinProvider, env, apt, brew
 from plugantic.base_hook import BaseHook
 from plugantic.base_hook import BaseHook
 
 
+from ...misc.logging import hint
+
 
 
 ###################### Config ##########################
 ###################### Config ##########################
 
 
@@ -66,7 +69,7 @@ class LibPipBinProvider(PipProvider, BaseBinProvider):
     name: BinProviderName = "lib_pip"
     name: BinProviderName = "lib_pip"
     INSTALLER_BIN: BinName = "pip"
     INSTALLER_BIN: BinName = "pip"
     
     
-    pip_venv: Optional[Path] = settings.CONFIG.OUTPUT_DIR / 'lib' / 'pip' / 'venv'
+    pip_venv: Optional[Path] = archivebox.CONSTANTS.LIB_PIP_DIR / 'venv'
 
 
 SYS_PIP_BINPROVIDER = SystemPipBinProvider()
 SYS_PIP_BINPROVIDER = SystemPipBinProvider()
 PIPX_PIP_BINPROVIDER = SystemPipxBinProvider()
 PIPX_PIP_BINPROVIDER = SystemPipxBinProvider()
@@ -117,6 +120,20 @@ class SqliteBinary(BaseBinary):
             "version": lambda: SemVer(django_sqlite3.version),
             "version": lambda: SemVer(django_sqlite3.version),
         },
         },
     }
     }
+    
+    @model_validator(mode='after')
+    def validate_json_extension_is_available(self):
+        # Check to make sure JSON extension is available in our Sqlite3 instance
+        try:
+            cursor = django_sqlite3.connect(':memory:').cursor()
+            cursor.execute('SELECT JSON(\'{"a": "b"}\')')
+        except django_sqlite3.OperationalError as exc:
+            print(f'[red][X] Your SQLite3 version is missing the required JSON1 extension: {exc}[/red]')
+            hint([
+                'Upgrade your Python version or install the extension manually:',
+                'https://code.djangoproject.com/wiki/JSON1Extension'
+            ])
+        return self
 
 
 SQLITE_BINARY = SqliteBinary()
 SQLITE_BINARY = SqliteBinary()
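SqliteBinary gains a model validator that probes for the JSON1 extension at startup. The same probe can be run standalone against the stdlib sqlite3 module; this is a generic sketch, not the plugin code (which goes through Django's bundled sqlite wrapper):

    import sqlite3

    def sqlite_has_json1() -> bool:
        try:
            cursor = sqlite3.connect(':memory:').cursor()
            cursor.execute('SELECT JSON(\'{"a": "b"}\')')
            return True
        except sqlite3.OperationalError:
            return False

    print('JSON1 extension available:', sqlite_has_json1())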
 
 

+ 7 - 7
archivebox/plugins_pkg/playwright/apps.py

@@ -19,6 +19,8 @@ from pydantic_pkgr import (
     DEFAULT_ENV_PATH,
     DEFAULT_ENV_PATH,
 )
 )
 
 
+import archivebox
+
 # Depends on other Django apps:
 # Depends on other Django apps:
 from plugantic.base_plugin import BasePlugin
 from plugantic.base_plugin import BasePlugin
 from plugantic.base_configset import BaseConfigSet
 from plugantic.base_configset import BaseConfigSet
@@ -42,12 +44,10 @@ class PlaywrightConfigs(BaseConfigSet):
     # PLAYWRIGHT_DEFAULT_ARGS: List[str] = ['--timeout={TIMEOUT-10}']
     # PLAYWRIGHT_DEFAULT_ARGS: List[str] = ['--timeout={TIMEOUT-10}']
     pass
     pass
 
 
-DEFAULT_GLOBAL_CONFIG = {
-}
 
 
-PLAYWRIGHT_CONFIG = PlaywrightConfigs(**DEFAULT_GLOBAL_CONFIG)
+PLAYWRIGHT_CONFIG = PlaywrightConfigs()
 
 
-LIB_DIR_BROWSERS = settings.CONFIG.OUTPUT_DIR / "lib" / "browsers"
+LIB_DIR_BROWSERS = archivebox.CONSTANTS.LIB_BROWSERS_DIR
 
 
 
 
 
 
@@ -65,12 +65,12 @@ class PlaywrightBinProvider(BaseBinProvider):
     name: BinProviderName = "playwright"
     name: BinProviderName = "playwright"
     INSTALLER_BIN: BinName = PLAYWRIGHT_BINARY.name
     INSTALLER_BIN: BinName = PLAYWRIGHT_BINARY.name
 
 
-    PATH: PATHStr = f"{settings.CONFIG.BIN_DIR}:{DEFAULT_ENV_PATH}"
+    PATH: PATHStr = f"{archivebox.CONSTANTS.LIB_BIN_DIR}:{DEFAULT_ENV_PATH}"
 
 
     puppeteer_browsers_dir: Optional[Path] = (
     puppeteer_browsers_dir: Optional[Path] = (
-        Path("~/Library/Caches/ms-playwright").expanduser()
+        Path("~/Library/Caches/ms-playwright").expanduser()      # macos playwright cache dir
         if OPERATING_SYSTEM == "darwin" else
         if OPERATING_SYSTEM == "darwin" else
-        Path("~/.cache/ms-playwright").expanduser()
+        Path("~/.cache/ms-playwright").expanduser()              # linux playwright cache dir
     )
     )
     puppeteer_install_args: List[str] = ["install"]  # --with-deps
     puppeteer_install_args: List[str] = ["install"]  # --with-deps
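
The two expanduser() defaults above pick the standard ms-playwright cache location per OS; a standalone sketch of the same decision (assuming OPERATING_SYSTEM is derived from platform.system()):

    import platform
    from pathlib import Path

    def default_playwright_cache_dir() -> Path:
        # macOS keeps the ms-playwright cache under ~/Library/Caches, Linux under ~/.cache
        if platform.system().lower() == 'darwin':
            return Path('~/Library/Caches/ms-playwright').expanduser()
        return Path('~/.cache/ms-playwright').expanduser()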
 
 

+ 7 - 7
archivebox/plugins_pkg/puppeteer/apps.py

@@ -16,6 +16,8 @@ from pydantic_pkgr import (
     HostBinPath,
     HostBinPath,
 )
 )
 
 
+import archivebox
+
 # Depends on other Django apps:
 # Depends on other Django apps:
 from plugantic.base_plugin import BasePlugin
 from plugantic.base_plugin import BasePlugin
 from plugantic.base_configset import BaseConfigSet
 from plugantic.base_configset import BaseConfigSet
@@ -40,12 +42,10 @@ class PuppeteerConfigs(BaseConfigSet):
     # PUPPETEER_DEFAULT_ARGS: List[str] = ['--timeout={TIMEOUT-10}']
     # PUPPETEER_DEFAULT_ARGS: List[str] = ['--timeout={TIMEOUT-10}']
     pass
     pass
 
 
-DEFAULT_GLOBAL_CONFIG = {
-}
 
 
-PUPPETEER_CONFIG = PuppeteerConfigs(**DEFAULT_GLOBAL_CONFIG)
+PUPPETEER_CONFIG = PuppeteerConfigs()
 
 
-LIB_DIR_BROWSERS = settings.CONFIG.OUTPUT_DIR / "lib" / "browsers"
+LIB_DIR_BROWSERS = archivebox.CONSTANTS.LIB_BROWSERS_DIR
 
 
 
 
 class PuppeteerBinary(BaseBinary):
 class PuppeteerBinary(BaseBinary):
@@ -60,8 +60,8 @@ PUPPETEER_BINARY = PuppeteerBinary()
 class PuppeteerBinProvider(BaseBinProvider):
 class PuppeteerBinProvider(BaseBinProvider):
     name: BinProviderName = "puppeteer"
     name: BinProviderName = "puppeteer"
     INSTALLER_BIN: BinName = "npx"
     INSTALLER_BIN: BinName = "npx"
-    
-    PATH: PATHStr = str(settings.CONFIG.BIN_DIR)
+
+    PATH: PATHStr = str(archivebox.CONSTANTS.LIB_BIN_DIR)
 
 
     puppeteer_browsers_dir: Optional[Path] = LIB_DIR_BROWSERS
     puppeteer_browsers_dir: Optional[Path] = LIB_DIR_BROWSERS
     puppeteer_install_args: List[str] = ["@puppeteer/browsers", "install", "--path", str(LIB_DIR_BROWSERS)]
     puppeteer_install_args: List[str] = ["@puppeteer/browsers", "install", "--path", str(LIB_DIR_BROWSERS)]
@@ -140,7 +140,7 @@ PUPPETEER_BINPROVIDER = PuppeteerBinProvider()
 
 
 # ALTERNATIVE INSTALL METHOD using Ansible:
 # ALTERNATIVE INSTALL METHOD using Ansible:
 # install_playbook = self.plugin_dir / 'install_puppeteer.yml'
 # install_playbook = self.plugin_dir / 'install_puppeteer.yml'
-# chrome_bin = run_playbook(install_playbook, data_dir=settings.CONFIG.OUTPUT_DIR, quiet=quiet).BINARIES.chrome
+# chrome_bin = run_playbook(install_playbook, data_dir=archivebox.DATA_DIR, quiet=quiet).BINARIES.chrome
 # return self.__class__.model_validate(
 # return self.__class__.model_validate(
 #     {
 #     {
 #         **self.model_dump(),
 #         **self.model_dump(),
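
Roughly, the provider fields above describe an install invocation like the following sketch (how pydantic_pkgr appends the browser name is an assumption here, and the path is illustrative):

    from pathlib import Path

    lib_browsers_dir = Path('data/lib/browsers')   # stand-in for archivebox.CONSTANTS.LIB_BROWSERS_DIR
    # INSTALLER_BIN + puppeteer_install_args, plus a browser name such as 'chrome' (assumed)
    cmd = ['npx', '@puppeteer/browsers', 'install', '--path', str(lib_browsers_dir), 'chrome']
    print(' '.join(cmd))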

+ 58 - 6
archivebox/plugins_sys/config/apps.py

@@ -1,18 +1,24 @@
+__package__ = 'archivebox.plugins_sys.config'
 import os
 import os
 import sys
 import sys
+import shutil
 import platform
 import platform
+import archivebox
 
 
-from typing import List, ClassVar
+from typing import List, ClassVar, Dict, Optional
+from datetime import datetime
 from pathlib import Path
 from pathlib import Path
-from pydantic import InstanceOf, Field, field_validator, model_validator
+from pydantic import InstanceOf, Field, field_validator, model_validator, computed_field
+from benedict import benedict
 from rich import print
 from rich import print
 
 
 from django.conf import settings
 from django.conf import settings
-
+from django.utils.crypto import get_random_string
 from plugantic.base_plugin import BasePlugin
 from plugantic.base_plugin import BasePlugin
 from plugantic.base_configset import BaseConfigSet, ConfigSectionName
 from plugantic.base_configset import BaseConfigSet, ConfigSectionName
 from plugantic.base_hook import BaseHook
 from plugantic.base_hook import BaseHook
 
 
+from .constants import CONSTANTS, CONSTANTS_CONFIG
 
 
 ###################### Config ##########################
 ###################### Config ##########################
 
 
@@ -24,17 +30,57 @@ class ShellConfig(BaseConfigSet):
     
     
     IS_TTY: bool                        = Field(default=sys.stdout.isatty())
     IS_TTY: bool                        = Field(default=sys.stdout.isatty())
     USE_COLOR: bool                     = Field(default=lambda c: c.IS_TTY)
     USE_COLOR: bool                     = Field(default=lambda c: c.IS_TTY)
-    SHOW_PROGRESS: bool                 = Field(default=lambda c: (c.IS_TTY and platform.system() != 'darwin'))  # progress bars are buggy on mac, disable for now
+    SHOW_PROGRESS: bool                 = Field(default=lambda c: c.IS_TTY)
     
     
     IN_DOCKER: bool                     = Field(default=False)
     IN_DOCKER: bool                     = Field(default=False)
     IN_QEMU: bool                       = Field(default=False)
     IN_QEMU: bool                       = Field(default=False)
     
     
+    USER: str                           = Field(default=Path('~').expanduser().resolve().name)
     PUID: int                           = Field(default=os.getuid())
     PUID: int                           = Field(default=os.getuid())
     PGID: int                           = Field(default=os.getgid())
     PGID: int                           = Field(default=os.getgid())
     
     
     PYTHON_ENCODING: str                = Field(default=(sys.__stdout__ or sys.stdout or sys.__stderr__ or sys.stderr).encoding.upper().replace('UTF8', 'UTF-8'))
     PYTHON_ENCODING: str                = Field(default=(sys.__stdout__ or sys.stdout or sys.__stderr__ or sys.stderr).encoding.upper().replace('UTF8', 'UTF-8'))
 
 
+    ANSI: Dict[str, str]                = Field(default=lambda c: CONSTANTS.DEFAULT_CLI_COLORS if c.USE_COLOR else CONSTANTS.DISABLED_CLI_COLORS)
+
+    VERSIONS_AVAILABLE: bool = False             # .check_for_update.get_versions_available_on_github(c)
+    CAN_UPGRADE: bool = False                    # .check_for_update.can_upgrade(c)
+
     
     
+    @computed_field
+    @property
+    def TERM_WIDTH(self) -> int:
+        return shutil.get_terminal_size((100, 10)).columns
+    
+    @computed_field
+    @property
+    def COMMIT_HASH(self) -> Optional[str]:
+        try:
+            git_dir = archivebox.PACKAGE_DIR / '../.git'
+            ref = (git_dir / 'HEAD').read_text().strip().split(' ')[-1]
+            commit_hash = git_dir.joinpath(ref).read_text().strip()
+            return commit_hash
+        except Exception:
+            pass
+    
+        try:
+            return list((archivebox.PACKAGE_DIR / '../.git/refs/heads/').glob('*'))[0].read_text().strip()
+        except Exception:
+            pass
+        
+        return None
+    
+    @computed_field
+    @property
+    def BUILD_TIME(self) -> str:
+        if self.IN_DOCKER:
+            docker_build_end_time = Path('/VERSION.txt').read_text().rsplit('BUILD_END_TIME=')[-1].split('\n', 1)[0]
+            return docker_build_end_time
+    
+        src_last_modified_unix_timestamp = (archivebox.PACKAGE_DIR / 'config.py').stat().st_mtime
+        return datetime.fromtimestamp(src_last_modified_unix_timestamp).strftime('%Y-%m-%d %H:%M:%S %s')
+    
+
     @model_validator(mode='after')
     @model_validator(mode='after')
     def validate_not_running_as_root(self):
     def validate_not_running_as_root(self):
         attempted_command = ' '.join(sys.argv[:3])
         attempted_command = ' '.join(sys.argv[:3])
@@ -92,7 +138,7 @@ GENERAL_CONFIG = GeneralConfig()
 class ServerConfig(BaseConfigSet):
 class ServerConfig(BaseConfigSet):
     section: ClassVar[ConfigSectionName] = 'SERVER_CONFIG'
     section: ClassVar[ConfigSectionName] = 'SERVER_CONFIG'
 
 
-    SECRET_KEY: str                     = Field(default=None)
+    SECRET_KEY: str                     = Field(default=lambda: get_random_string(50, 'abcdefghijklmnopqrstuvwxyz0123456789_'))
     BIND_ADDR: str                      = Field(default=lambda: ['127.0.0.1:8000', '0.0.0.0:8000'][SHELL_CONFIG.IN_DOCKER])
     BIND_ADDR: str                      = Field(default=lambda: ['127.0.0.1:8000', '0.0.0.0:8000'][SHELL_CONFIG.IN_DOCKER])
     ALLOWED_HOSTS: str                  = Field(default='*')
     ALLOWED_HOSTS: str                  = Field(default='*')
     CSRF_TRUSTED_ORIGINS: str           = Field(default=lambda c: 'http://localhost:8000,http://127.0.0.1:8000,http://0.0.0.0:8000,http://{}'.format(c.BIND_ADDR))
     CSRF_TRUSTED_ORIGINS: str           = Field(default=lambda c: 'http://localhost:8000,http://127.0.0.1:8000,http://0.0.0.0:8000,http://{}'.format(c.BIND_ADDR))
@@ -179,7 +225,7 @@ SEARCH_BACKEND_CONFIG = SearchBackendConfig()
 
 
 
 
 class ConfigPlugin(BasePlugin):
 class ConfigPlugin(BasePlugin):
-    app_label: str = 'config'
+    app_label: str = 'CONFIG'
     verbose_name: str = 'Configuration'
     verbose_name: str = 'Configuration'
 
 
     hooks: List[InstanceOf[BaseHook]] = [
     hooks: List[InstanceOf[BaseHook]] = [
@@ -190,6 +236,12 @@ class ConfigPlugin(BasePlugin):
         ARCHIVING_CONFIG,
         ARCHIVING_CONFIG,
         SEARCH_BACKEND_CONFIG,
         SEARCH_BACKEND_CONFIG,
     ]
     ]
+    
+    # def register(self, settings, parent_plugin=None):
+    #     try:
+    #         super().register(settings, parent_plugin=parent_plugin)
+    #     except Exception as e:
+    #         print(f'[red][X] Error registering config plugin: {e}[/red]', file=sys.stderr)
 
 
 
 
 PLUGIN = ConfigPlugin()
 PLUGIN = ConfigPlugin()
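
The COMMIT_HASH property above resolves the hash by following .git/HEAD; the same lookup can be written as a standalone helper (a sketch, assuming the package sits inside a git checkout):

    from pathlib import Path
    from typing import Optional

    def read_git_commit_hash(package_dir: Path) -> Optional[str]:
        # Follow .git/HEAD -> refs/heads/<branch> -> commit hash; returns None
        # when not running from a git checkout (e.g. a pip or docker install).
        git_dir = package_dir / '../.git'
        try:
            ref = (git_dir / 'HEAD').read_text().strip().split(' ')[-1]
            return git_dir.joinpath(ref).read_text().strip()
        except Exception:
            return None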

+ 47 - 0
archivebox/plugins_sys/config/check_for_update.py

@@ -0,0 +1,47 @@
+# def get_versions_available_on_github(config):
+#     """
+#     returns a dictionary containing the ArchiveBox GitHub release info for
+#     the recommended upgrade version and the currently installed version
+#     """
+    
+#     # we only want to perform the (relatively expensive) check for new versions
+#     # when it's most relevant, e.g. when the user runs a long-running command
+#     subcommand_run_by_user = sys.argv[3] if len(sys.argv) > 3 else 'help'
+#     long_running_commands = ('add', 'schedule', 'update', 'status', 'server')
+#     if subcommand_run_by_user not in long_running_commands:
+#         return None
+    
+#     github_releases_api = "https://api.github.com/repos/ArchiveBox/ArchiveBox/releases"
+#     response = requests.get(github_releases_api)
+#     if response.status_code != 200:
+#         stderr(f'[!] Warning: GitHub API call to check for new ArchiveBox version failed! (status={response.status_code})', color='lightyellow', config=config)
+#         return None
+#     all_releases = response.json()
+
+#     installed_version = parse_version_string(config['VERSION'])
+
+#     # find current version or nearest older version (to link to)
+#     current_version = None
+#     for idx, release in enumerate(all_releases):
+#         release_version = parse_version_string(release['tag_name'])
+#         if release_version <= installed_version:
+#             current_version = release
+#             break
+
+#     current_version = current_version or all_releases[-1]
+    
+#     # recommended version is whatever comes after current_version in the release list
+#     # (perhaps too conservative to only recommend upgrading one version at a time, but it's safest)
+#     try:
+#         recommended_version = all_releases[idx+1]
+#     except IndexError:
+#         recommended_version = None
+
+#     return {'recommended_version': recommended_version, 'current_version': current_version}
+
+# def can_upgrade(config):
+#     if config['VERSIONS_AVAILABLE'] and config['VERSIONS_AVAILABLE']['recommended_version']:
+#         recommended_version = parse_version_string(config['VERSIONS_AVAILABLE']['recommended_version']['tag_name'])
+#         current_version = parse_version_string(config['VERSIONS_AVAILABLE']['current_version']['tag_name'])
+#         return recommended_version > current_version
+#     return False
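
The commented-out helpers above compare release tags numerically via parse_version_string; a hypothetical stand-in for that helper (not the actual implementation) could look like:

    def parse_version_string(version: str) -> tuple:
        # Turn a release tag like 'v0.7.2' into a comparable (0, 7, 2) tuple.
        base = version.split('+')[0].lstrip('v')
        return tuple(int(part) for part in base.split('.')[:3])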

+ 1 - 0
archivebox/plugins_sys/config/constants.py

@@ -0,0 +1 @@
+from archivebox.constants import *

+ 4 - 7
archivebox/queues/settings.py

@@ -1,16 +1,13 @@
 from pathlib import Path
 from pathlib import Path
 
 
-from django.conf import settings
 
 
+import archivebox
+OUTPUT_DIR = archivebox.DATA_DIR
+LOGS_DIR = archivebox.CONSTANTS.LOGS_DIR
 
 
-OUTPUT_DIR = settings.CONFIG.OUTPUT_DIR
-LOGS_DIR = settings.CONFIG.LOGS_DIR
-
-TMP_DIR = OUTPUT_DIR / "tmp"
+TMP_DIR = archivebox.CONSTANTS.TMP_DIR
 
 
 Path.mkdir(TMP_DIR, exist_ok=True)
 Path.mkdir(TMP_DIR, exist_ok=True)
-
-
 CONFIG_FILE = TMP_DIR / "supervisord.conf"
 CONFIG_FILE = TMP_DIR / "supervisord.conf"
 PID_FILE = TMP_DIR / "supervisord.pid"
 PID_FILE = TMP_DIR / "supervisord.pid"
 SOCK_FILE = TMP_DIR / "supervisord.sock"
 SOCK_FILE = TMP_DIR / "supervisord.sock"

+ 29 - 0
archivebox/system.py

@@ -4,6 +4,7 @@ __package__ = 'archivebox'
 import os
 import os
 import signal
 import signal
 import shutil
 import shutil
+import getpass
 
 
 from json import dump
 from json import dump
 from pathlib import Path
 from pathlib import Path
@@ -229,3 +230,31 @@ class suppress_output(object):
         if self.stderr:
         if self.stderr:
             os.dup2(self.real_stderr, 2)
             os.dup2(self.real_stderr, 2)
             os.close(self.null_stderr)
             os.close(self.null_stderr)
+
+
+def get_system_user() -> str:
+    # some host OSes are unable to provide a username (k3s, Windows), making this complicated
+    # uid 999 in particular is problematic and breaks several of the approaches below
+    SYSTEM_USER = None
+    FALLBACK_USER_PLACEHOLDER = f'user_{os.getuid()}'
+
+    # Option 1
+    try:
+        import pwd
+        SYSTEM_USER = SYSTEM_USER or pwd.getpwuid(os.geteuid()).pw_name
+    except Exception:  # the pwd module is unavailable on Windows
+        pass
+
+    # Option 2
+    try:
+        SYSTEM_USER = SYSTEM_USER or getpass.getuser()
+    except Exception:
+        pass
+
+    # Option 3
+    try:
+        SYSTEM_USER = SYSTEM_USER or os.getlogin()
+    except Exception:
+        pass
+
+    return SYSTEM_USER or FALLBACK_USER_PLACEHOLDER
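
Example usage of the new helper (the fallback behavior on hosts without a resolvable username is the point):

    from archivebox.system import get_system_user

    # On a normal host this resolves to the login name; on hosts that cannot map
    # the uid to a name (k3s, some container setups with uid 999) it returns the
    # synthetic 'user_<uid>' placeholder instead of raising.
    print(get_system_user())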