Explorar o código

improve config loading of TMP_DIR, LIB_DIR, move to separate files

Nick Sweeting hai 1 ano
pai
achega
cf1ea8f80f
Modificáronse 49 ficheiros con 767 adicións e 527 borrados
  1. 2 12
      Dockerfile
  2. 13 14
      archivebox/__init__.py
  3. 3 2
      archivebox/api/v1_api.py
  4. 1 1
      archivebox/api/v1_cli.py
  5. 15 8
      archivebox/cli/__init__.py
  6. 2 1
      archivebox/cli/archivebox_add.py
  7. 2 1
      archivebox/cli/archivebox_server.py
  8. 6 24
      archivebox/config/__init__.py
  9. 1 1
      archivebox/config/apps.py
  10. 0 47
      archivebox/config/check_for_update.py
  11. 7 64
      archivebox/config/common.py
  12. 0 115
      archivebox/config/config_stubs.py
  13. 113 121
      archivebox/config/constants.py
  14. 39 25
      archivebox/config/legacy.py
  15. 152 0
      archivebox/config/paths.py
  16. 70 0
      archivebox/config/permissions.py
  17. 121 0
      archivebox/config/version.py
  18. 1 1
      archivebox/core/middleware.py
  19. 2 1
      archivebox/core/settings.py
  20. 2 1
      archivebox/core/views.py
  21. 2 1
      archivebox/extractors/htmltotext.py
  22. 4 2
      archivebox/index/__init__.py
  23. 4 2
      archivebox/index/html.py
  24. 2 1
      archivebox/index/json.py
  25. 2 1
      archivebox/index/sql.py
  26. 2 1
      archivebox/logging_util.py
  27. 93 23
      archivebox/main.py
  28. 62 19
      archivebox/misc/checks.py
  29. 2 1
      archivebox/misc/logging.py
  30. 1 2
      archivebox/misc/system.py
  31. 5 5
      archivebox/misc/util.py
  32. 2 1
      archivebox/parsers/__init__.py
  33. 2 1
      archivebox/plugins_extractor/chrome/apps.py
  34. 1 1
      archivebox/plugins_extractor/curl/apps.py
  35. 1 1
      archivebox/plugins_extractor/git/apps.py
  36. 3 3
      archivebox/plugins_extractor/mercury/apps.py
  37. 1 1
      archivebox/plugins_extractor/readability/apps.py
  38. 3 3
      archivebox/plugins_extractor/singlefile/apps.py
  39. 1 1
      archivebox/plugins_extractor/wget/apps.py
  40. 1 1
      archivebox/plugins_extractor/ytdlp/apps.py
  41. 2 1
      archivebox/plugins_search/ripgrep/apps.py
  42. 1 1
      archivebox/plugins_search/sonic/apps.py
  43. 1 1
      archivebox/plugins_search/sqlite/apps.py
  44. 4 0
      archivebox/queues/supervisor_util.py
  45. 1 1
      archivebox/search/__init__.py
  46. 5 6
      bin/docker_entrypoint.sh
  47. 2 2
      pyproject.toml
  48. 4 4
      tests/test_init.py
  49. 1 1
      uv.lock

+ 2 - 12
Dockerfile

@@ -287,22 +287,12 @@ WORKDIR "$DATA_DIR"
 RUN openssl rand -hex 16 > /etc/machine-id \
 RUN openssl rand -hex 16 > /etc/machine-id \
     && chown -R "$DEFAULT_PUID:$DEFAULT_PGID" "/tmp"
     && chown -R "$DEFAULT_PUID:$DEFAULT_PGID" "/tmp"
 ENV IN_DOCKER=True \
 ENV IN_DOCKER=True \
-    SYSTEM_LIB_DIR=/app/lib \
-    SYSTEM_TMP_DIR=/tmp \
+    SYSTEM_LIB_DIR=/usr/share/archivebox \
+    SYSTEM_TMP_DIR=/tmp/archivebox \
     GOOGLE_API_KEY=no \
     GOOGLE_API_KEY=no \
     GOOGLE_DEFAULT_CLIENT_ID=no \
     GOOGLE_DEFAULT_CLIENT_ID=no \
     GOOGLE_DEFAULT_CLIENT_SECRET=no \
     GOOGLE_DEFAULT_CLIENT_SECRET=no \
     ALLOWED_HOSTS=*
     ALLOWED_HOSTS=*
-    ## No need to set explicitly, these values will be autodetected by archivebox in docker:
-    # WGET_BINARY="wget" \
-    # YOUTUBEDL_BINARY="yt-dlp" \
-    # CHROME_BINARY="/usr/bin/chromium-browser" \
-    # USE_SINGLEFILE=True \
-    # SINGLEFILE_BINARY="$NODE_MODULES/.bin/single-file" \
-    # USE_READABILITY=True \
-    # READABILITY_BINARY="$NODE_MODULES/.bin/readability-extractor" \
-    # USE_MERCURY=True \
-    # MERCURY_BINARY="$NODE_MODULES/.bin/postlight-parser"
 
 
 # Print version for nice docker finish summary
 # Print version for nice docker finish summary
 RUN (echo -e "\n\n[√] Finished Docker build succesfully. Saving build summary in: /VERSION.txt" \
 RUN (echo -e "\n\n[√] Finished Docker build succesfully. Saving build summary in: /VERSION.txt" \

+ 13 - 14
archivebox/__init__.py

@@ -13,7 +13,7 @@ __package__ = 'archivebox'
 
 
 import os
 import os
 import sys
 import sys
-import tempfile
+
 from pathlib import Path
 from pathlib import Path
 
 
 ASCII_LOGO = """
 ASCII_LOGO = """
@@ -25,37 +25,36 @@ ASCII_LOGO = """
 ╚═╝  ╚═╝╚═╝  ╚═╝ ╚═════╝╚═╝  ╚═╝╚═╝  ╚═══╝  ╚══════╝ ╚═════╝  ╚═════╝ ╚═╝  ╚═╝
 ╚═╝  ╚═╝╚═╝  ╚═╝ ╚═════╝╚═╝  ╚═╝╚═╝  ╚═══╝  ╚══════╝ ╚═════╝  ╚═════╝ ╚═╝  ╚═╝
 """
 """
 
 
-SYSTEM_TMP_DIR = Path(tempfile.gettempdir()) / 'archivebox'
-SYSTEM_TMP_DIR.mkdir(parents=True, exist_ok=True)
-os.environ['SYSTEM_TMP_DIR'] = str(SYSTEM_TMP_DIR)
-os.environ['DJANGO_SETTINGS_MODULE'] = 'core.settings'
+# detect ArchiveBox user's UID/GID based on data dir ownership
+from archivebox.config.permissions import drop_privileges                 # noqa
+drop_privileges()
 
 
-# if we are outside a data dir, cd into an ephemeral tmp dir so that
-# we can run version/help without polluting cwd with an index.sqlite3
-if len(sys.argv) > 1 and sys.argv[1] in ('version', 'help'):
-    current_dir = Path(os.getcwd()).resolve()
-    if not (current_dir / 'index.sqlite3').exists():
-        os.chdir(SYSTEM_TMP_DIR)
+from archivebox.misc.checks import check_not_root, check_io_encoding      # noqa
+check_not_root()
+check_io_encoding()
 
 
 # make sure PACKAGE_DIR is in sys.path so we can import all subfolders
 # make sure PACKAGE_DIR is in sys.path so we can import all subfolders
 # without necessarily waiting for django to load them thorugh INSTALLED_APPS
 # without necessarily waiting for django to load them thorugh INSTALLED_APPS
 PACKAGE_DIR = Path(__file__).resolve().parent
 PACKAGE_DIR = Path(__file__).resolve().parent
 if str(PACKAGE_DIR) not in sys.path:
 if str(PACKAGE_DIR) not in sys.path:
     sys.path.append(str(PACKAGE_DIR))
     sys.path.append(str(PACKAGE_DIR))
+os.environ['DJANGO_SETTINGS_MODULE'] = 'core.settings'
 
 
 
 
 # print('INSTALLING MONKEY PATCHES')
 # print('INSTALLING MONKEY PATCHES')
-from .monkey_patches import *                    # noqa
+from archivebox.monkey_patches import *                    # noqa
 # print('DONE INSTALLING MONKEY PATCHES')
 # print('DONE INSTALLING MONKEY PATCHES')
 
 
 
 
 # print('LOADING VENDORED LIBRARIES')
 # print('LOADING VENDORED LIBRARIES')
-from .vendor import load_vendored_libs           # noqa
+from archivebox.vendor import load_vendored_libs           # noqa
 load_vendored_libs()
 load_vendored_libs()
 # print('DONE LOADING VENDORED LIBRARIES')
 # print('DONE LOADING VENDORED LIBRARIES')
 
 
 
 
-from .config.constants import CONSTANTS, DATA_DIR, PACKAGE_DIR, ARCHIVE_DIR, VERSION  # noqa
+from archivebox.config.constants import CONSTANTS                         # noqa
+from archivebox.config.paths import PACKAGE_DIR, DATA_DIR, ARCHIVE_DIR    # noqa
+from archivebox.config.version import VERSION                             # noqa
 
 
 __version__ = VERSION
 __version__ = VERSION
 __author__ = 'Nick Sweeting'
 __author__ = 'Nick Sweeting'

+ 3 - 2
archivebox/api/v1_api.py

@@ -12,12 +12,13 @@ from ninja import NinjaAPI, Swagger
 
 
 # TODO: explore adding https://eadwincode.github.io/django-ninja-extra/
 # TODO: explore adding https://eadwincode.github.io/django-ninja-extra/
 
 
-from archivebox.config import SHELL_CONFIG, VERSION
+from archivebox.config import VERSION
+from archivebox.config.version import get_COMMIT_HASH
 
 
 from api.auth import API_AUTH_METHODS
 from api.auth import API_AUTH_METHODS
 
 
 
 
-COMMIT_HASH = SHELL_CONFIG.COMMIT_HASH or 'unknown'
+COMMIT_HASH = get_COMMIT_HASH() or 'unknown'
 
 
 html_description=f'''
 html_description=f'''
 <h3>Welcome to your ArchiveBox server's REST API <code>[v1 ALPHA]</code> homepage!</h3>
 <h3>Welcome to your ArchiveBox server's REST API <code>[v1 ALPHA]</code> homepage!</h3>

+ 1 - 1
archivebox/api/v1_cli.py

@@ -13,7 +13,7 @@ from ..main import (
     schedule,
     schedule,
 )
 )
 from archivebox.misc.util import ansi_to_html
 from archivebox.misc.util import ansi_to_html
-from archivebox.config import ARCHIVING_CONFIG
+from archivebox.config.common import ARCHIVING_CONFIG
 
 
 
 
 from .auth import API_AUTH_METHODS
 from .auth import API_AUTH_METHODS

+ 15 - 8
archivebox/cli/__init__.py

@@ -1,6 +1,7 @@
 __package__ = 'archivebox.cli'
 __package__ = 'archivebox.cli'
 __command__ = 'archivebox'
 __command__ = 'archivebox'
 
 
+import os
 import sys
 import sys
 import argparse
 import argparse
 import threading
 import threading
@@ -25,6 +26,10 @@ if len(sys.argv) > 1 and sys.argv[1] == 'setup':
     print(':warning: [bold red]DEPRECATED[/bold red] `archivebox setup` is deprecated, use `archivebox install` instead')
     print(':warning: [bold red]DEPRECATED[/bold red] `archivebox setup` is deprecated, use `archivebox install` instead')
     sys.argv[1] = 'install'
     sys.argv[1] = 'install'
 
 
+if '--debug' in sys.argv:
+    os.environ['DEBUG'] = 'True'
+    sys.argv.remove('--debug')
+
 
 
 # def list_subcommands() -> Dict[str, str]:
 # def list_subcommands() -> Dict[str, str]:
 #     """find and import all valid archivebox_<subcommand>.py files in CLI_DIR"""
 #     """find and import all valid archivebox_<subcommand>.py files in CLI_DIR"""
@@ -50,8 +55,8 @@ SUBCOMMAND_MODULES = {
     
     
     'init': 'archivebox_init',
     'init': 'archivebox_init',
     'install': 'archivebox_install',
     'install': 'archivebox_install',
+    ##############################################
     'config': 'archivebox_config',
     'config': 'archivebox_config',
-    
     'add': 'archivebox_add',
     'add': 'archivebox_add',
     'remove': 'archivebox_remove',
     'remove': 'archivebox_remove',
     'update': 'archivebox_update',
     'update': 'archivebox_update',
@@ -63,7 +68,7 @@ SUBCOMMAND_MODULES = {
     'shell': 'archivebox_shell',
     'shell': 'archivebox_shell',
     'manage': 'archivebox_manage',
     'manage': 'archivebox_manage',
 
 
-    'oneshot': 'archivebox_oneshot',
+    # 'oneshot': 'archivebox_oneshot',
 }
 }
 
 
 # every imported command module must have these properties in order to be valid
 # every imported command module must have these properties in order to be valid
@@ -102,11 +107,11 @@ CLI_SUBCOMMANDS = LazySubcommands()
 
 
 # these common commands will appear sorted before any others for ease-of-use
 # these common commands will appear sorted before any others for ease-of-use
 meta_cmds = ('help', 'version')                               # dont require valid data folder at all
 meta_cmds = ('help', 'version')                               # dont require valid data folder at all
-main_cmds = ('init', 'config', 'setup', 'install')            # dont require existing db present
-archive_cmds = ('add', 'remove', 'update', 'list', 'status')  # require existing db present
+setup_cmds = ('init', 'setup', 'install')                      # require valid data folder, but dont require DB present in it yet
+archive_cmds = ('add', 'remove', 'update', 'list', 'status', 'schedule', 'server', 'shell', 'manage')  # require valid data folder + existing db present
 fake_db = ("oneshot",)                                        # use fake in-memory db
 fake_db = ("oneshot",)                                        # use fake in-memory db
 
 
-display_first = (*meta_cmds, *main_cmds, *archive_cmds)
+display_first = (*meta_cmds, *setup_cmds, *archive_cmds)
 
 
 
 
 IGNORED_BG_THREADS = ('MainThread', 'ThreadPoolExecutor', 'IPythonHistorySavingThread', 'Scheduler')  # threads we dont have to wait for before exiting
 IGNORED_BG_THREADS = ('MainThread', 'ThreadPoolExecutor', 'IPythonHistorySavingThread', 'Scheduler')  # threads we dont have to wait for before exiting
@@ -157,14 +162,16 @@ def run_subcommand(subcommand: str,
     from archivebox.config.legacy import setup_django
     from archivebox.config.legacy import setup_django
     
     
     # print('DATA_DIR is', DATA_DIR)
     # print('DATA_DIR is', DATA_DIR)
-    # print('pwd is', os.getcwd())
+    # print('pwd is', os.getcwd())    
 
 
     cmd_requires_db = subcommand in archive_cmds
     cmd_requires_db = subcommand in archive_cmds
     init_pending = '--init' in subcommand_args or '--quick-init' in subcommand_args
     init_pending = '--init' in subcommand_args or '--quick-init' in subcommand_args
 
 
-    setup_django(in_memory_db=subcommand in fake_db, check_db=cmd_requires_db and not init_pending)
+    check_db = cmd_requires_db and not init_pending
+
+    setup_django(in_memory_db=subcommand in fake_db, check_db=check_db)
 
 
-    if subcommand not in meta_cmds:
+    if subcommand in archive_cmds:
         if cmd_requires_db:
         if cmd_requires_db:
             check_migrations()
             check_migrations()
 
 

+ 2 - 1
archivebox/cli/archivebox_add.py

@@ -9,7 +9,8 @@ import argparse
 from typing import List, Optional, IO
 from typing import List, Optional, IO
 
 
 from archivebox.misc.util import docstring
 from archivebox.misc.util import docstring
-from archivebox.config import DATA_DIR, ARCHIVING_CONFIG
+from archivebox.config import DATA_DIR
+from archivebox.config.common import ARCHIVING_CONFIG
 
 
 from ..main import add
 from ..main import add
 from ..parsers import PARSERS
 from ..parsers import PARSERS

+ 2 - 1
archivebox/cli/archivebox_server.py

@@ -9,7 +9,8 @@ from pathlib import Path
 from typing import Optional, List, IO
 from typing import Optional, List, IO
 
 
 from archivebox.misc.util import docstring
 from archivebox.misc.util import docstring
-from archivebox.config import DATA_DIR, SERVER_CONFIG
+from archivebox.config import DATA_DIR
+from archivebox.config.common import SERVER_CONFIG
 from ..logging_util import SmartFormatter, reject_stdin
 from ..logging_util import SmartFormatter, reject_stdin
 from ..main import server
 from ..main import server
 
 

+ 6 - 24
archivebox/config/__init__.py

@@ -1,27 +1,9 @@
 __package__ = 'archivebox.config'
 __package__ = 'archivebox.config'
 
 
-from .constants import CONSTANTS, CONSTANTS_CONFIG, PACKAGE_DIR, DATA_DIR, ARCHIVE_DIR, VERSION
-from .defaults import (
-    SHELL_CONFIG,
-    STORAGE_CONFIG,
-    GENERAL_CONFIG,
-    SERVER_CONFIG,
-    ARCHIVING_CONFIG,
-    SEARCH_BACKEND_CONFIG,
+from .paths import (
+    PACKAGE_DIR,                                    # noqa
+    DATA_DIR,                                       # noqa
+    ARCHIVE_DIR,                                    # noqa
 )
 )
-
-
-__all__ = [
-    'CONSTANTS',
-    'PACKAGE_DIR',
-    'DATA_DIR',
-    'ARCHIVE_DIR',
-    'VERSION',
-    'SHELL_CONFIG',
-    'STORAGE_CONFIG',
-    'GENERAL_CONFIG',
-    'SERVER_CONFIG',
-    'ARCHIVING_CONFIG',
-    'SEARCH_BACKEND_CONFIG',
-    'CONSTANTS_CONFIG',
-]
+from .constants import CONSTANTS, CONSTANTS_CONFIG  # noqa
+from .version import VERSION                        # noqa

+ 1 - 1
archivebox/config/apps.py

@@ -8,7 +8,7 @@ from abx.archivebox.base_hook import BaseHook
 
 
 
 
 from .constants import CONSTANTS, CONSTANTS_CONFIG, PACKAGE_DIR, DATA_DIR, ARCHIVE_DIR      # noqa
 from .constants import CONSTANTS, CONSTANTS_CONFIG, PACKAGE_DIR, DATA_DIR, ARCHIVE_DIR      # noqa
-from .defaults import (
+from .common import (
     ShellConfig,                    # noqa: F401
     ShellConfig,                    # noqa: F401
     StorageConfig,                  # noqa: F401
     StorageConfig,                  # noqa: F401
     GeneralConfig,                  # noqa: F401
     GeneralConfig,                  # noqa: F401

+ 0 - 47
archivebox/config/check_for_update.py

@@ -1,47 +0,0 @@
-# def get_versions_available_on_github(config):
-#     """
-#     returns a dictionary containing the ArchiveBox GitHub release info for
-#     the recommended upgrade version and the currently installed version
-#     """
-    
-#     # we only want to perform the (relatively expensive) check for new versions
-#     # when its most relevant, e.g. when the user runs a long-running command
-#     subcommand_run_by_user = sys.argv[3] if len(sys.argv) > 3 else 'help'
-#     long_running_commands = ('add', 'schedule', 'update', 'status', 'server')
-#     if subcommand_run_by_user not in long_running_commands:
-#         return None
-    
-#     github_releases_api = "https://api.github.com/repos/ArchiveBox/ArchiveBox/releases"
-#     response = requests.get(github_releases_api)
-#     if response.status_code != 200:
-#         stderr(f'[!] Warning: GitHub API call to check for new ArchiveBox version failed! (status={response.status_code})', color='lightyellow', config=config)
-#         return None
-#     all_releases = response.json()
-
-#     installed_version = parse_version_string(config['VERSION'])
-
-#     # find current version or nearest older version (to link to)
-#     current_version = None
-#     for idx, release in enumerate(all_releases):
-#         release_version = parse_version_string(release['tag_name'])
-#         if release_version <= installed_version:
-#             current_version = release
-#             break
-
-#     current_version = current_version or all_releases[-1]
-    
-#     # recommended version is whatever comes after current_version in the release list
-#     # (perhaps too conservative to only recommend upgrading one version at a time, but it's safest)
-#     try:
-#         recommended_version = all_releases[idx+1]
-#     except IndexError:
-#         recommended_version = None
-
-#     return {'recommended_version': recommended_version, 'current_version': current_version}
-
-# def can_upgrade(config):
-#     if config['VERSIONS_AVAILABLE'] and config['VERSIONS_AVAILABLE']['recommended_version']:
-#         recommended_version = parse_version_string(config['VERSIONS_AVAILABLE']['recommended_version']['tag_name'])
-#         current_version = parse_version_string(config['VERSIONS_AVAILABLE']['current_version']['tag_name'])
-#         return recommended_version > current_version
-#     return False

+ 7 - 64
archivebox/config/defaults.py → archivebox/config/common.py

@@ -1,21 +1,21 @@
 __package__ = 'archivebox.config'
 __package__ = 'archivebox.config'
 
 
-import os
 import sys
 import sys
 import shutil
 import shutil
 
 
 from typing import Dict, Optional
 from typing import Dict, Optional
-from datetime import datetime
 from pathlib import Path
 from pathlib import Path
 
 
 from rich import print
 from rich import print
-from pydantic import Field, field_validator, model_validator, computed_field
+from pydantic import Field, field_validator, computed_field
 from django.utils.crypto import get_random_string
 from django.utils.crypto import get_random_string
 
 
 from abx.archivebox.base_configset import BaseConfigSet
 from abx.archivebox.base_configset import BaseConfigSet
 
 
 
 
-from .constants import CONSTANTS, PACKAGE_DIR
+from .constants import CONSTANTS
+from .version import get_COMMIT_HASH, get_BUILD_TIME
+from .permissions import IN_DOCKER
 
 
 ###################### Config ##########################
 ###################### Config ##########################
 
 
@@ -27,14 +27,8 @@ class ShellConfig(BaseConfigSet):
     USE_COLOR: bool                     = Field(default=lambda c: c.IS_TTY)
     USE_COLOR: bool                     = Field(default=lambda c: c.IS_TTY)
     SHOW_PROGRESS: bool                 = Field(default=lambda c: c.IS_TTY)
     SHOW_PROGRESS: bool                 = Field(default=lambda c: c.IS_TTY)
     
     
-    IN_DOCKER: bool                     = Field(default=False)
+    IN_DOCKER: bool                     = Field(default=IN_DOCKER)
     IN_QEMU: bool                       = Field(default=False)
     IN_QEMU: bool                       = Field(default=False)
-    
-    USER: str                           = Field(default=Path('~').expanduser().resolve().name)
-    PUID: int                           = Field(default=os.getuid())
-    PGID: int                           = Field(default=os.getgid())
-    
-    PYTHON_ENCODING: str                = Field(default=(sys.__stdout__ or sys.stdout or sys.__stderr__ or sys.stderr).encoding.upper().replace('UTF8', 'UTF-8'))
 
 
     ANSI: Dict[str, str]                = Field(default=lambda c: CONSTANTS.DEFAULT_CLI_COLORS if c.USE_COLOR else CONSTANTS.DISABLED_CLI_COLORS)
     ANSI: Dict[str, str]                = Field(default=lambda c: CONSTANTS.DEFAULT_CLI_COLORS if c.USE_COLOR else CONSTANTS.DISABLED_CLI_COLORS)
 
 
@@ -52,63 +46,12 @@ class ShellConfig(BaseConfigSet):
     @computed_field
     @computed_field
     @property
     @property
     def COMMIT_HASH(self) -> Optional[str]:
     def COMMIT_HASH(self) -> Optional[str]:
-        try:
-            git_dir = PACKAGE_DIR / '../.git'
-            ref = (git_dir / 'HEAD').read_text().strip().split(' ')[-1]
-            commit_hash = git_dir.joinpath(ref).read_text().strip()
-            return commit_hash
-        except Exception:
-            pass
-    
-        try:
-            return list((PACKAGE_DIR / '../.git/refs/heads/').glob('*'))[0].read_text().strip()
-        except Exception:
-            pass
-        
-        return None
+        return get_COMMIT_HASH()
     
     
     @computed_field
     @computed_field
     @property
     @property
     def BUILD_TIME(self) -> str:
     def BUILD_TIME(self) -> str:
-        if self.IN_DOCKER:
-            docker_build_end_time = Path('/VERSION.txt').read_text().rsplit('BUILD_END_TIME=')[-1].split('\n', 1)[0]
-            return docker_build_end_time
-    
-        src_last_modified_unix_timestamp = (PACKAGE_DIR / 'README.md').stat().st_mtime
-        return datetime.fromtimestamp(src_last_modified_unix_timestamp).strftime('%Y-%m-%d %H:%M:%S %s')
-    
-
-    @model_validator(mode='after')
-    def validate_not_running_as_root(self):
-        attempted_command = ' '.join(sys.argv[:3])
-        if self.PUID == 0 and attempted_command not in ('setup', 'install'):
-            # stderr('[!] ArchiveBox should never be run as root!', color='red')
-            # stderr('    For more information, see the security overview documentation:')
-            # stderr('        https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#do-not-run-as-root')
-            print('[red][!] ArchiveBox should never be run as root![/red]', file=sys.stderr)
-            print('    For more information, see the security overview documentation:', file=sys.stderr)
-            print('        https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#do-not-run-as-root', file=sys.stderr)
-            
-            if self.IN_DOCKER:
-                print('[red][!] When using Docker, you must run commands with [green]docker run[/green] instead of [yellow3]docker exec[/yellow3], e.g.:', file=sys.stderr)
-                print('        docker compose run archivebox {attempted_command}', file=sys.stderr)
-                print(f'        docker run -it -v $PWD/data:/data archivebox/archivebox {attempted_command}', file=sys.stderr)
-                print('        or:', file=sys.stderr)
-                print(f'        docker compose exec --user=archivebox archivebox /bin/bash -c "archivebox {attempted_command}"', file=sys.stderr)
-                print(f'        docker exec -it --user=archivebox <container id> /bin/bash -c "archivebox {attempted_command}"', file=sys.stderr)
-            raise SystemExit(2)
-        
-        # check python locale
-        if self.PYTHON_ENCODING != 'UTF-8':
-            print(f'[red][X] Your system is running python3 scripts with a bad locale setting: {self.PYTHON_ENCODING} (it should be UTF-8).[/red]', file=sys.stderr)
-            print('    To fix it, add the line "export PYTHONIOENCODING=UTF-8" to your ~/.bashrc file (without quotes)', file=sys.stderr)
-            print('    Or if you\'re using ubuntu/debian, run "dpkg-reconfigure locales"', file=sys.stderr)
-            print('')
-            print('    Confirm that it\'s fixed by opening a new shell and running:', file=sys.stderr)
-            print('        python3 -c "import sys; print(sys.stdout.encoding)"   # should output UTF-8', file=sys.stderr)
-            raise SystemExit(2)
-        
-        return self
+        return get_BUILD_TIME()
 
 
 SHELL_CONFIG = ShellConfig()
 SHELL_CONFIG = ShellConfig()
 
 

+ 0 - 115
archivebox/config/config_stubs.py

@@ -1,115 +0,0 @@
-from pathlib import Path
-from typing import Optional, Dict, Union, Tuple, Callable, Pattern, Type, Any, List
-from mypy_extensions import TypedDict
-
-from benedict import benedict
-
-SimpleConfigValue = Union[str, bool, int, None, Pattern, Dict[str, Any]]
-SimpleConfigValueDict = Dict[str, SimpleConfigValue]
-SimpleConfigValueGetter = Callable[[], SimpleConfigValue]
-ConfigValue = Union[SimpleConfigValue, SimpleConfigValueDict, SimpleConfigValueGetter]
-
-
-
-class BaseConfig(TypedDict):
-    pass
-
-class ConfigDict(BaseConfig, benedict, total=False):
-    """
-    # Regenerate by pasting this quine into `archivebox shell` 🥚
-    from archivebox.config import ConfigDict, CONFIG_DEFAULTS
-    print('class ConfigDict(BaseConfig, total=False):')
-    print('    ' + '"'*3 + ConfigDict.__doc__ + '"'*3)
-    for section, configs in CONFIG_DEFAULTS.items():
-        for key, attrs in configs.items():
-            Type, default = attrs['type'], attrs['default']
-            if default is None:
-                print(f'    {key}: Optional[{Type.__name__}]')
-            else:
-                print(f'    {key}: {Type.__name__}')
-        print()
-    """
-
-    IS_TTY: bool
-    USE_COLOR: bool
-    SHOW_PROGRESS: bool
-    IN_DOCKER: bool
-
-    PACKAGE_DIR: Path
-    CONFIG_FILE: Path
-    ONLY_NEW: bool
-    TIMEOUT: int
-    MEDIA_TIMEOUT: int
-    OUTPUT_PERMISSIONS: str
-    RESTRICT_FILE_NAMES: str
-    URL_DENYLIST: str
-
-    SECRET_KEY: Optional[str]
-    BIND_ADDR: str
-    ALLOWED_HOSTS: str
-    DEBUG: bool
-    PUBLIC_INDEX: bool
-    PUBLIC_SNAPSHOTS: bool
-    FOOTER_INFO: str
-
-    SAVE_TITLE: bool
-    SAVE_FAVICON: bool
-    SAVE_WGET: bool
-    SAVE_WGET_REQUISITES: bool
-    SAVE_SINGLEFILE: bool
-    SAVE_READABILITY: bool
-    SAVE_MERCURY: bool
-    SAVE_PDF: bool
-    SAVE_SCREENSHOT: bool
-    SAVE_DOM: bool
-    SAVE_WARC: bool
-    SAVE_GIT: bool
-    SAVE_MEDIA: bool
-    SAVE_ARCHIVE_DOT_ORG: bool
-
-    RESOLUTION: str
-    GIT_DOMAINS: str
-    CHECK_SSL_VALIDITY: bool
-    CURL_USER_AGENT: str
-    WGET_USER_AGENT: str
-    CHROME_USER_AGENT: str
-    COOKIES_FILE: Union[str, Path, None]
-    CHROME_USER_DATA_DIR: Union[str, Path, None]
-    CHROME_TIMEOUT: int
-    CHROME_HEADLESS: bool
-    CHROME_SANDBOX: bool
-
-    USE_CURL: bool
-    USE_WGET: bool
-    USE_SINGLEFILE: bool
-    USE_READABILITY: bool
-    USE_MERCURY: bool
-    USE_GIT: bool
-    USE_CHROME: bool
-    USE_YOUTUBEDL: bool
-    CURL_BINARY: str
-    GIT_BINARY: str
-    WGET_BINARY: str
-    SINGLEFILE_BINARY: str
-    READABILITY_BINARY: str
-    MERCURY_BINARY: str
-    YOUTUBEDL_BINARY: str
-    CHROME_BINARY: Optional[str]
-
-    YOUTUBEDL_ARGS: List[str]
-    WGET_ARGS: List[str]
-    CURL_ARGS: List[str]
-    GIT_ARGS: List[str]
-    TAG_SEPARATOR_PATTERN: str
-
-
-ConfigDefaultValueGetter = Callable[[ConfigDict], ConfigValue]
-ConfigDefaultValue = Union[ConfigValue, ConfigDefaultValueGetter]
-
-ConfigDefault = TypedDict('ConfigDefault', {
-    'default': ConfigDefaultValue,
-    'type': Optional[Type],
-    'aliases': Optional[Tuple[str, ...]],
-}, total=False)
-
-ConfigDefaultDict = Dict[str, ConfigDefault]

+ 113 - 121
archivebox/config/constants.py

@@ -1,118 +1,115 @@
 __package__ = 'archivebox.config'
 __package__ = 'archivebox.config'
 
 
-
 import os
 import os
 import re
 import re
 import platform
 import platform
-import tempfile
 
 
 from typing import Dict
 from typing import Dict
 from pathlib import Path
 from pathlib import Path
-import importlib.metadata
 from collections.abc import Mapping
 from collections.abc import Mapping
 
 
 from benedict import benedict
 from benedict import benedict
 
 
 from ..misc.logging import DEFAULT_CLI_COLORS
 from ..misc.logging import DEFAULT_CLI_COLORS
 
 
-###################### Config ##########################
-
-PACKAGE_DIR: Path = Path(__file__).resolve().parent.parent    # archivebox source code dir
-DATA_DIR: Path = Path(os.getcwd()).resolve()                    # archivebox user data dir
-ARCHIVE_DIR: Path = DATA_DIR / 'archive'                      # archivebox snapshot data dir
-
-def _detect_installed_version(PACKAGE_DIR: Path):
-    """Autodetect the installed archivebox version by using pip package metadata, pyproject.toml file, or package.json file"""
-    try:
-        # if in production install, use pip-installed package metadata
-        return importlib.metadata.version(__package__ or 'archivebox').strip()
-    except importlib.metadata.PackageNotFoundError:
-        pass
-
-    try:
-        # if in dev Git repo dir, use pyproject.toml file
-        pyproject_config = (PACKAGE_DIR.parent / 'pyproject.toml').read_text().split('\n')
-        for line in pyproject_config:
-            if line.startswith('version = '):
-                return line.split(' = ', 1)[-1].strip('"').strip()
-    except FileNotFoundError:
-        # building docs, pyproject.toml is not available
-        pass
-
-    # raise Exception('Failed to detect installed archivebox version!')
-    return 'dev'
-
-VERSION: str = _detect_installed_version(PACKAGE_DIR)
-
+from .paths import (
+    PACKAGE_DIR,
+    DATA_DIR,
+    ARCHIVE_DIR,
+    get_collection_id,
+    get_LIB_DIR,
+    get_TMP_DIR,
+)
+from .permissions import (
+    IS_ROOT,
+    IN_DOCKER,
+    RUNNING_AS_UID,
+    RUNNING_AS_GID,
+    DEFAULT_PUID,
+    DEFAULT_PGID,
+    ARCHIVEBOX_USER,
+    ARCHIVEBOX_GROUP,
+)
+from .version import detect_installed_version
 
 
+###################### Config ##########################
 
 
 
 
 class ConstantsDict(Mapping):
 class ConstantsDict(Mapping):
-    IN_DOCKER = os.environ.get('IN_DOCKER', False) in ('1', 'true', 'True', 'yes')
-    OS = platform.system().lower()      # darwin, linux, etc.
-    ARCH = platform.machine().lower()   # arm64, x86_64, etc.
-    LIB_DIR_SCOPE = f'{ARCH}-{OS}' + ('-docker' if IN_DOCKER else '')
-
-    PACKAGE_DIR: Path = PACKAGE_DIR     # archivebox source code dir
-    DATA_DIR: Path = DATA_DIR           # archivebox user data dir
-    ARCHIVE_DIR: Path = ARCHIVE_DIR     # archivebox snapshot data dir
-    VERSION: str = VERSION
+    PACKAGE_DIR: Path                   = PACKAGE_DIR
+    DATA_DIR: Path                      = DATA_DIR
+    ARCHIVE_DIR: Path                   = ARCHIVE_DIR
+    COLLECTION_ID: str                  = get_collection_id(DATA_DIR)
+    
+    # Host system
+    VERSION: str                        = detect_installed_version(PACKAGE_DIR)
+    OS: str                             = platform.system().lower()    # darwin, linux, etc.
+    ARCH: str                           = platform.machine().lower()   # arm64, x86_64, aarch64, etc.
+    IN_DOCKER: bool                     = IN_DOCKER
     
     
+    # Permissions
+    IS_ROOT: bool                       = IS_ROOT
+    ARCHIVEBOX_USER: int                = ARCHIVEBOX_USER
+    ARCHIVEBOX_GROUP: int               = ARCHIVEBOX_GROUP
+    RUNNING_AS_UID: int                 = RUNNING_AS_UID
+    RUNNING_AS_GID: int                 = RUNNING_AS_GID
+    DEFAULT_PUID: int                   = DEFAULT_PUID
+    DEFAULT_PGID: int                   = DEFAULT_PGID
+    
+    # Source code dirs
     PACKAGE_DIR_NAME: str               = PACKAGE_DIR.name
     PACKAGE_DIR_NAME: str               = PACKAGE_DIR.name
     TEMPLATES_DIR_NAME: str             = 'templates'
     TEMPLATES_DIR_NAME: str             = 'templates'
     TEMPLATES_DIR: Path                 = PACKAGE_DIR / TEMPLATES_DIR_NAME
     TEMPLATES_DIR: Path                 = PACKAGE_DIR / TEMPLATES_DIR_NAME
-    STATIC_DIR: Path                    = TEMPLATES_DIR / 'static'
+    STATIC_DIR_NAME: str                = 'static'
+    STATIC_DIR: Path                    = TEMPLATES_DIR / STATIC_DIR_NAME
+
+    # Data dirs
+    ARCHIVE_DIR_NAME: str               = 'archive'
+    SOURCES_DIR_NAME: str               = 'sources'
+    PERSONAS_DIR_NAME: str              = 'personas'
+    CRONTABS_DIR_NAME: str              = 'crontabs'
+    CACHE_DIR_NAME: str                 = 'cache'
+    LOGS_DIR_NAME: str                  = 'logs'
     USER_PLUGINS_DIR_NAME: str          = 'user_plugins'
     USER_PLUGINS_DIR_NAME: str          = 'user_plugins'
     CUSTOM_TEMPLATES_DIR_NAME: str      = 'user_templates'
     CUSTOM_TEMPLATES_DIR_NAME: str      = 'user_templates'
-
-    ARCHIVE_DIR_NAME: str = 'archive'
-    SOURCES_DIR_NAME: str = 'sources'
-    PERSONAS_DIR_NAME: str = 'personas'
-    CRONTABS_DIR_NAME: str = 'crontabs'
-    CACHE_DIR_NAME: str = 'cache'
-    LOGS_DIR_NAME: str = 'logs'
-    LIB_DIR_NAME: str = 'lib'
-    TMP_DIR_NAME: str = 'tmp'
-
-    SYSTEM_TMP_DIR: Path                = Path(os.environ['SYSTEM_TMP_DIR']) if 'SYSTEM_TMP_DIR' in os.environ else (Path(tempfile.gettempdir()) / 'archivebox')
-    # DATA_DIR_TMP_DIR: Path              = DATA_DIR / TMP_DIR_NAME / machineid.hashed_id('archivebox')[:16]   # cant be used because of socket path length restrictions that break too often if the data dir is in some deep subdir: socket.error reported AF_UNIX path too long
-    SYSTEM_LIB_DIR: Path                = Path(os.environ['SYSTEM_LIB_DIR']) if 'SYSTEM_LIB_DIR' in os.environ else (PACKAGE_DIR / LIB_DIR_NAME)
-    DATA_DIR_LIB_DIR: Path              = DATA_DIR / LIB_DIR_NAME / LIB_DIR_SCOPE
-
     ARCHIVE_DIR: Path                   = DATA_DIR / ARCHIVE_DIR_NAME
     ARCHIVE_DIR: Path                   = DATA_DIR / ARCHIVE_DIR_NAME
     SOURCES_DIR: Path                   = DATA_DIR / SOURCES_DIR_NAME
     SOURCES_DIR: Path                   = DATA_DIR / SOURCES_DIR_NAME
     PERSONAS_DIR: Path                  = DATA_DIR / PERSONAS_DIR_NAME
     PERSONAS_DIR: Path                  = DATA_DIR / PERSONAS_DIR_NAME
-    CACHE_DIR: Path                     = DATA_DIR / CACHE_DIR_NAME
     LOGS_DIR: Path                      = DATA_DIR / LOGS_DIR_NAME
     LOGS_DIR: Path                      = DATA_DIR / LOGS_DIR_NAME
-    LIB_DIR: Path                       = SYSTEM_LIB_DIR if IN_DOCKER else DATA_DIR_LIB_DIR  # e.g. /app/lib or ./data/lib/arm64-darwin-docker
-    TMP_DIR: Path                       = SYSTEM_TMP_DIR
+    CACHE_DIR: Path                     = DATA_DIR / CACHE_DIR_NAME
     CUSTOM_TEMPLATES_DIR: Path          = DATA_DIR / CUSTOM_TEMPLATES_DIR_NAME
     CUSTOM_TEMPLATES_DIR: Path          = DATA_DIR / CUSTOM_TEMPLATES_DIR_NAME
     USER_PLUGINS_DIR: Path              = DATA_DIR / USER_PLUGINS_DIR_NAME
     USER_PLUGINS_DIR: Path              = DATA_DIR / USER_PLUGINS_DIR_NAME
 
 
+    # Data dir files
+    CONFIG_FILENAME: str                = 'ArchiveBox.conf'
+    SQL_INDEX_FILENAME: str             = 'index.sqlite3'
+    QUEUE_DATABASE_FILENAME: str        = 'queue.sqlite3'
+    CONFIG_FILE: Path                   = DATA_DIR / CONFIG_FILENAME
+    DATABASE_FILE: Path                 = DATA_DIR / SQL_INDEX_FILENAME
+    QUEUE_DATABASE_FILE: Path           = DATA_DIR / QUEUE_DATABASE_FILENAME
+    
+    JSON_INDEX_FILENAME: str            = 'index.json'
+    HTML_INDEX_FILENAME: str            = 'index.html'
+    ROBOTS_TXT_FILENAME: str            = 'robots.txt'
+    FAVICON_FILENAME: str               = 'favicon.ico'
+    
+    # Runtime dirs
+    TMP_DIR_NAME: str                   = 'tmp'
+    TMP_DIR: Path                       = get_TMP_DIR()
+    LIB_DIR_NAME: str                   = 'lib'
+    LIB_DIR: Path                       = get_LIB_DIR()
     LIB_PIP_DIR: Path                   = LIB_DIR / 'pip'
     LIB_PIP_DIR: Path                   = LIB_DIR / 'pip'
     LIB_NPM_DIR: Path                   = LIB_DIR / 'npm'
     LIB_NPM_DIR: Path                   = LIB_DIR / 'npm'
     LIB_BROWSERS_DIR: Path              = LIB_DIR / 'browsers'
     LIB_BROWSERS_DIR: Path              = LIB_DIR / 'browsers'
     LIB_BIN_DIR: Path                   = LIB_DIR / 'bin'
     LIB_BIN_DIR: Path                   = LIB_DIR / 'bin'
     BIN_DIR: Path                       = LIB_BIN_DIR
     BIN_DIR: Path                       = LIB_BIN_DIR
 
 
-    CONFIG_FILENAME: str = 'ArchiveBox.conf'
-    SQL_INDEX_FILENAME: str = 'index.sqlite3'
-    QUEUE_DATABASE_FILENAME: str = 'queue.sqlite3'
-
-    CONFIG_FILE: Path                   = DATA_DIR / CONFIG_FILENAME
-    DATABASE_FILE: Path                 = DATA_DIR / SQL_INDEX_FILENAME
-    QUEUE_DATABASE_FILE: Path           = DATA_DIR / QUEUE_DATABASE_FILENAME
-
-    JSON_INDEX_FILENAME: str = 'index.json'
-    HTML_INDEX_FILENAME: str = 'index.html'
-    ROBOTS_TXT_FILENAME: str = 'robots.txt'
-    FAVICON_FILENAME: str = 'favicon.ico'
+    # Config constants
+    TIMEZONE: str                       = 'UTC'
+    DEFAULT_CLI_COLORS: Dict[str, str]  = DEFAULT_CLI_COLORS
+    DISABLED_CLI_COLORS: Dict[str, str] = benedict({k: '' for k in DEFAULT_CLI_COLORS})
 
 
-    TIMEZONE: str                             = 'UTC'
-    DEFAULT_CLI_COLORS: Dict[str, str]        = DEFAULT_CLI_COLORS
-    DISABLED_CLI_COLORS: Dict[str, str]       = benedict({k: '' for k in DEFAULT_CLI_COLORS})
-
-    ALLOWDENYLIST_REGEX_FLAGS: int = re.IGNORECASE | re.UNICODE | re.MULTILINE
+    ALLOWDENYLIST_REGEX_FLAGS: int      = re.IGNORECASE | re.UNICODE | re.MULTILINE
 
 
     STATICFILE_EXTENSIONS: frozenset[str] = frozenset((
     STATICFILE_EXTENSIONS: frozenset[str] = frozenset((
         # 99.999% of the time, URLs ending in these extensions are static files
         # 99.999% of the time, URLs ending in these extensions are static files
@@ -136,17 +133,6 @@ class ConstantsDict(Mapping):
         # html, htm, shtml, xhtml, xml, aspx, php, cgi
         # html, htm, shtml, xhtml, xml, aspx, php, cgi
     ))
     ))
 
 
-    INGORED_PATHS: frozenset[str] = frozenset((
-        ".git",
-        ".svn",
-        ".DS_Store",
-        ".gitignore",
-        "lost+found",
-        ".DS_Store",
-        ".env",
-        "Dockerfile",
-        ".ArchiveBox.conf.bak",
-    ))
     PIP_RELATED_NAMES: frozenset[str] = frozenset((
     PIP_RELATED_NAMES: frozenset[str] = frozenset((
         ".venv",
         ".venv",
         "venv",
         "venv",
@@ -160,7 +146,15 @@ class ConstantsDict(Mapping):
         "yarn.lock",
         "yarn.lock",
     ))
     ))
 
 
-    DATA_DIR_NAMES: frozenset[str] = frozenset((
+    # When initializing archivebox in a new directory, we check to make sure the dir is
+    # actually empty so that we dont clobber someone's home directory or desktop by accident.
+    # These files are exceptions to the is_empty check when we're trying to init a new dir,
+    # as they could be from a previous archivebox version, system artifacts, dependencies, etc.
+    ALLOWED_IN_DATA_DIR: frozenset[str] = frozenset((
+        *PIP_RELATED_NAMES,
+        *NPM_RELATED_NAMES,
+        
+        ### Dirs:
         ARCHIVE_DIR_NAME,
         ARCHIVE_DIR_NAME,
         SOURCES_DIR_NAME,
         SOURCES_DIR_NAME,
         LOGS_DIR_NAME,
         LOGS_DIR_NAME,
@@ -171,9 +165,12 @@ class ConstantsDict(Mapping):
         CUSTOM_TEMPLATES_DIR_NAME,
         CUSTOM_TEMPLATES_DIR_NAME,
         USER_PLUGINS_DIR_NAME,
         USER_PLUGINS_DIR_NAME,
         CRONTABS_DIR_NAME,
         CRONTABS_DIR_NAME,
-    ))
-    DATA_DIRS: frozenset[Path] = frozenset(DATA_DIR / dirname for dirname in DATA_DIR_NAMES)
-    DATA_FILE_NAMES: frozenset[str] = frozenset((
+        "static",                # created by old static exports <v0.6.0
+        "sonic",                 # created by docker bind mount / sonic FTS process
+        ".git",
+        ".svn",
+        
+        ### Files:
         CONFIG_FILENAME,
         CONFIG_FILENAME,
         SQL_INDEX_FILENAME,
         SQL_INDEX_FILENAME,
         f"{SQL_INDEX_FILENAME}-wal",
         f"{SQL_INDEX_FILENAME}-wal",
@@ -188,43 +185,37 @@ class ConstantsDict(Mapping):
         FAVICON_FILENAME,
         FAVICON_FILENAME,
         CONFIG_FILENAME,
         CONFIG_FILENAME,
         f"{CONFIG_FILENAME}.bak",
         f"{CONFIG_FILENAME}.bak",
+        f".{CONFIG_FILENAME}.bak",
         "static_index.json",
         "static_index.json",
-    ))
-
-    # When initializing archivebox in a new directory, we check to make sure the dir is
-    # actually empty so that we dont clobber someone's home directory or desktop by accident.
-    # These files are exceptions to the is_empty check when we're trying to init a new dir,
-    # as they could be from a previous archivebox version, system artifacts, dependencies, etc.
-    ALLOWED_IN_DATA_DIR: frozenset[str] = frozenset((
-        *INGORED_PATHS,
-        *PIP_RELATED_NAMES,
-        *NPM_RELATED_NAMES,
-        *DATA_DIR_NAMES,
-        *DATA_FILE_NAMES,
-        "static",                # created by old static exports <v0.6.0
-        "sonic",                 # created by docker bind mount
+        ".DS_Store",
+        ".gitignore",
+        "lost+found",
+        ".DS_Store",
+        ".env",
+        ".collection_id",
+        "Dockerfile",
     ))
     ))
 
 
     CODE_LOCATIONS = benedict({
     CODE_LOCATIONS = benedict({
         'PACKAGE_DIR': {
         'PACKAGE_DIR': {
             'path': (PACKAGE_DIR).resolve(),
             'path': (PACKAGE_DIR).resolve(),
             'enabled': True,
             'enabled': True,
-            'is_valid': (PACKAGE_DIR / '__main__.py').exists(),
+            'is_valid': (PACKAGE_DIR / '__main__.py').exists(),                                                                            # read + list
         },
         },
         'TEMPLATES_DIR': {
         'TEMPLATES_DIR': {
             'path': TEMPLATES_DIR.resolve(),
             'path': TEMPLATES_DIR.resolve(),
             'enabled': True,
             'enabled': True,
-            'is_valid': STATIC_DIR.exists(),
+            'is_valid': STATIC_DIR.exists() and os.access(STATIC_DIR, os.R_OK) and os.access(STATIC_DIR, os.X_OK),                         # read + list
         },
         },
         'LIB_DIR': {
         'LIB_DIR': {
             'path': LIB_DIR.resolve(),
             'path': LIB_DIR.resolve(),
             'enabled': True,
             'enabled': True,
-            'is_valid': LIB_DIR.is_dir(),
+            'is_valid': LIB_DIR.is_dir() and os.access(LIB_DIR, os.R_OK) and os.access(LIB_DIR, os.X_OK) and os.access(LIB_DIR, os.W_OK),  # read + write
         },
         },
         'TMP_DIR': {
         'TMP_DIR': {
             'path': TMP_DIR.resolve(),
             'path': TMP_DIR.resolve(),
             'enabled': True,
             'enabled': True,
-            'is_valid': TMP_DIR.is_dir(),
+            'is_valid': TMP_DIR.is_dir() and os.access(TMP_DIR, os.R_OK) and os.access(TMP_DIR, os.X_OK) and os.access(TMP_DIR, os.W_OK),  # read + write
         },
         },
     })
     })
         
         
@@ -232,61 +223,61 @@ class ConstantsDict(Mapping):
         "DATA_DIR": {
         "DATA_DIR": {
             "path": DATA_DIR.resolve(),
             "path": DATA_DIR.resolve(),
             "enabled": True,
             "enabled": True,
-            "is_valid": DATABASE_FILE.exists(),
+            "is_valid": DATABASE_FILE.exists() and os.access(DATA_DIR, os.R_OK) and os.access(DATA_DIR, os.W_OK) and os.access(DATA_DIR, os.X_OK),
             "is_mount": os.path.ismount(DATA_DIR.resolve()),
             "is_mount": os.path.ismount(DATA_DIR.resolve()),
         },
         },
         "CONFIG_FILE": {
         "CONFIG_FILE": {
             "path": CONFIG_FILE.resolve(),
             "path": CONFIG_FILE.resolve(),
             "enabled": True,
             "enabled": True,
-            "is_valid": CONFIG_FILE.exists(),
+            "is_valid": CONFIG_FILE.exists() and os.access(CONFIG_FILE, os.W_OK),
         },
         },
         "SQL_INDEX": {
         "SQL_INDEX": {
             "path": DATABASE_FILE.resolve(),
             "path": DATABASE_FILE.resolve(),
             "enabled": True,
             "enabled": True,
-            "is_valid": DATABASE_FILE.exists(),
+            "is_valid": DATABASE_FILE.exists() and os.access(DATABASE_FILE, os.R_OK) and os.access(DATABASE_FILE, os.W_OK),
             "is_mount": os.path.ismount(DATABASE_FILE.resolve()),
             "is_mount": os.path.ismount(DATABASE_FILE.resolve()),
         },
         },
         "QUEUE_DATABASE": {
         "QUEUE_DATABASE": {
             "path": QUEUE_DATABASE_FILE.resolve(),
             "path": QUEUE_DATABASE_FILE.resolve(),
             "enabled": True,
             "enabled": True,
-            "is_valid": QUEUE_DATABASE_FILE.exists(),
+            "is_valid": QUEUE_DATABASE_FILE.exists() and os.access(QUEUE_DATABASE_FILE, os.R_OK) and os.access(QUEUE_DATABASE_FILE, os.W_OK),
             "is_mount": os.path.ismount(QUEUE_DATABASE_FILE.resolve()),
             "is_mount": os.path.ismount(QUEUE_DATABASE_FILE.resolve()),
         },
         },
         "ARCHIVE_DIR": {
         "ARCHIVE_DIR": {
             "path": ARCHIVE_DIR.resolve(),
             "path": ARCHIVE_DIR.resolve(),
             "enabled": True,
             "enabled": True,
-            "is_valid": ARCHIVE_DIR.exists(),
+            "is_valid": ARCHIVE_DIR.exists() and os.access(ARCHIVE_DIR, os.R_OK) and os.access(ARCHIVE_DIR, os.W_OK) and os.access(ARCHIVE_DIR, os.X_OK),
             "is_mount": os.path.ismount(ARCHIVE_DIR.resolve()),
             "is_mount": os.path.ismount(ARCHIVE_DIR.resolve()),
         },
         },
         "SOURCES_DIR": {
         "SOURCES_DIR": {
             "path": SOURCES_DIR.resolve(),
             "path": SOURCES_DIR.resolve(),
             "enabled": True,
             "enabled": True,
-            "is_valid": SOURCES_DIR.exists(),
+            "is_valid": SOURCES_DIR.exists() and os.access(SOURCES_DIR, os.R_OK) and os.access(SOURCES_DIR, os.W_OK) and os.access(SOURCES_DIR, os.X_OK),
         },
         },
         "LOGS_DIR": {
         "LOGS_DIR": {
             "path": LOGS_DIR.resolve(),
             "path": LOGS_DIR.resolve(),
             "enabled": True,
             "enabled": True,
-            "is_valid": LOGS_DIR.is_dir(),
+            "is_valid": LOGS_DIR.is_dir() and os.access(LOGS_DIR, os.R_OK) and os.access(LOGS_DIR, os.W_OK) and os.access(LOGS_DIR, os.X_OK),        # read + write
         },
         },
         # "CACHE_DIR": {
         # "CACHE_DIR": {
         #     "path": CACHE_DIR.resolve(),
         #     "path": CACHE_DIR.resolve(),
         #     "enabled": True,
         #     "enabled": True,
-        #     "is_valid": CACHE_DIR.is_dir(),
+        #     "is_valid": CACHE_DIR.is_dir() and os.access(CACHE_DIR, os.R_OK) and os.access(CACHE_DIR, os.W_OK) and os.access(CACHE_DIR, os.X_OK),  # read + write
         # },
         # },
         "PERSONAS_DIR": {
         "PERSONAS_DIR": {
             "path": PERSONAS_DIR.resolve(),
             "path": PERSONAS_DIR.resolve(),
             "enabled": PERSONAS_DIR.exists(),
             "enabled": PERSONAS_DIR.exists(),
-            "is_valid": PERSONAS_DIR.is_dir(),
+            "is_valid": PERSONAS_DIR.is_dir() and os.access(PERSONAS_DIR, os.R_OK) and os.access(PERSONAS_DIR, os.W_OK) and os.access(PERSONAS_DIR, os.X_OK), # read + write
         },
         },
         'CUSTOM_TEMPLATES_DIR': {
         'CUSTOM_TEMPLATES_DIR': {
             'path': CUSTOM_TEMPLATES_DIR.resolve(),
             'path': CUSTOM_TEMPLATES_DIR.resolve(),
             'enabled': CUSTOM_TEMPLATES_DIR.exists(),
             'enabled': CUSTOM_TEMPLATES_DIR.exists(),
-            'is_valid': CUSTOM_TEMPLATES_DIR.is_dir(),
+            'is_valid': CUSTOM_TEMPLATES_DIR.is_dir() and os.access(CUSTOM_TEMPLATES_DIR, os.R_OK) and os.access(CUSTOM_TEMPLATES_DIR, os.X_OK),       # read
         },
         },
         'USER_PLUGINS_DIR': {
         'USER_PLUGINS_DIR': {
             'path': USER_PLUGINS_DIR.resolve(),
             'path': USER_PLUGINS_DIR.resolve(),
             'enabled': USER_PLUGINS_DIR.exists(),
             'enabled': USER_PLUGINS_DIR.exists(),
-            'is_valid': USER_PLUGINS_DIR.is_dir(),
+            'is_valid': USER_PLUGINS_DIR.is_dir() and os.access(USER_PLUGINS_DIR, os.R_OK) and os.access(USER_PLUGINS_DIR, os.X_OK),                   # read
         },
         },
     })
     })
 
 
@@ -314,5 +305,6 @@ globals().update(CONSTANTS)
 
 
 
 
 # these need to always exist as we need them to run almost everything
 # these need to always exist as we need them to run almost everything
+# TODO: figure out a better time to make these than import-time
 CONSTANTS.LIB_DIR.mkdir(parents=True, exist_ok=True)
 CONSTANTS.LIB_DIR.mkdir(parents=True, exist_ok=True)
 CONSTANTS.TMP_DIR.mkdir(parents=True, exist_ok=True)
 CONSTANTS.TMP_DIR.mkdir(parents=True, exist_ok=True)

+ 39 - 25
archivebox/config/legacy.py

@@ -22,41 +22,34 @@ Documentation:
 __package__ = 'archivebox.config'
 __package__ = 'archivebox.config'
 
 
 import os
 import os
-import io
 import re
 import re
 import sys
 import sys
 import json
 import json
 import shutil
 import shutil
 
 
-from hashlib import md5
 from pathlib import Path
 from pathlib import Path
 from datetime import datetime, timezone
 from datetime import datetime, timezone
-from typing import Optional, Type, Tuple, Dict
-from subprocess import run, PIPE, DEVNULL, STDOUT, TimeoutExpired
+from typing import Optional, Type, Tuple, Dict, Any
+from subprocess import run, DEVNULL
 from configparser import ConfigParser
 from configparser import ConfigParser
 
 
 from rich.progress import Progress
 from rich.progress import Progress
 from rich.console import Console
 from rich.console import Console
 from benedict import benedict
 from benedict import benedict
-from pydantic_pkgr import SemVer
 
 
 import django
 import django
 from django.db.backends.sqlite3.base import Database as sqlite3
 from django.db.backends.sqlite3.base import Database as sqlite3
 
 
 
 
-from .constants import CONSTANTS, TIMEZONE
+from .constants import CONSTANTS
 from .constants import *
 from .constants import *
-from .config_stubs import (
-    ConfigValue,
-    ConfigDefaultValue,
-    ConfigDefaultDict,
-)
+
 from ..misc.logging import (
 from ..misc.logging import (
     stderr,
     stderr,
     hint,      # noqa
     hint,      # noqa
 )
 )
 
 
-from .defaults import SHELL_CONFIG, GENERAL_CONFIG, ARCHIVING_CONFIG, SERVER_CONFIG, SEARCH_BACKEND_CONFIG, STORAGE_CONFIG
+from .common import SHELL_CONFIG, GENERAL_CONFIG, ARCHIVING_CONFIG, SERVER_CONFIG, SEARCH_BACKEND_CONFIG, STORAGE_CONFIG
 from archivebox.plugins_auth.ldap.apps import LDAP_CONFIG
 from archivebox.plugins_auth.ldap.apps import LDAP_CONFIG
 from archivebox.plugins_extractor.favicon.apps import FAVICON_CONFIG
 from archivebox.plugins_extractor.favicon.apps import FAVICON_CONFIG
 from archivebox.plugins_extractor.wget.apps import WGET_CONFIG
 from archivebox.plugins_extractor.wget.apps import WGET_CONFIG
@@ -67,7 +60,7 @@ LDAP = LDAP_CONFIG.LDAP_ENABLED
 
 
 ############################### Config Schema ##################################
 ############################### Config Schema ##################################
 
 
-CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
+CONFIG_SCHEMA: Dict[str, Dict[str, Any]] = {
     'SHELL_CONFIG': SHELL_CONFIG.as_legacy_config_schema(),
     'SHELL_CONFIG': SHELL_CONFIG.as_legacy_config_schema(),
 
 
     'SERVER_CONFIG': SERVER_CONFIG.as_legacy_config_schema(),
     'SERVER_CONFIG': SERVER_CONFIG.as_legacy_config_schema(),
@@ -194,7 +187,7 @@ def get_real_name(key: str) -> str:
 
 
 # These are derived/computed values calculated *after* all user-provided config values are ingested
 # These are derived/computed values calculated *after* all user-provided config values are ingested
 # they appear in `archivebox config` output and are intended to be read-only for the user
 # they appear in `archivebox config` output and are intended to be read-only for the user
-DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
+DYNAMIC_CONFIG_SCHEMA: Dict[str, Any] = {
     'URL_DENYLIST_PTN':         {'default': lambda c: c['URL_DENYLIST'] and re.compile(c['URL_DENYLIST'] or '', CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS)},
     'URL_DENYLIST_PTN':         {'default': lambda c: c['URL_DENYLIST'] and re.compile(c['URL_DENYLIST'] or '', CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS)},
     'URL_ALLOWLIST_PTN':        {'default': lambda c: c['URL_ALLOWLIST'] and re.compile(c['URL_ALLOWLIST'] or '', CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS)},
     'URL_ALLOWLIST_PTN':        {'default': lambda c: c['URL_ALLOWLIST'] and re.compile(c['URL_ALLOWLIST'] or '', CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS)},
 
 
@@ -209,12 +202,12 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
 
 
 
 
 def load_config_val(key: str,
 def load_config_val(key: str,
-                    default: ConfigDefaultValue=None,
+                    default: Any=None,
                     type: Optional[Type]=None,
                     type: Optional[Type]=None,
                     aliases: Optional[Tuple[str, ...]]=None,
                     aliases: Optional[Tuple[str, ...]]=None,
                     config: Optional[benedict]=None,
                     config: Optional[benedict]=None,
                     env_vars: Optional[os._Environ]=None,
                     env_vars: Optional[os._Environ]=None,
-                    config_file_vars: Optional[Dict[str, str]]=None) -> ConfigValue:
+                    config_file_vars: Optional[Dict[str, str]]=None) -> Any:
     """parse bool, int, and str key=value pairs from env"""
     """parse bool, int, and str key=value pairs from env"""
 
 
     assert isinstance(config, dict)
     assert isinstance(config, dict)
@@ -372,7 +365,7 @@ def write_config_file(config: Dict[str, str], out_dir: str | None=CONSTANTS.DATA
 
 
 
 
 
 
-def load_config(defaults: ConfigDefaultDict,
+def load_config(defaults: Dict[str, Any],
                 config: Optional[benedict]=None,
                 config: Optional[benedict]=None,
                 out_dir: Optional[str]=None,
                 out_dir: Optional[str]=None,
                 env_vars: Optional[os._Environ]=None,
                 env_vars: Optional[os._Environ]=None,
@@ -505,7 +498,7 @@ def load_all_config():
 # add all final config values in CONFIG to globals in this file
 # add all final config values in CONFIG to globals in this file
 CONFIG: benedict = load_all_config()
 CONFIG: benedict = load_all_config()
 globals().update(CONFIG)
 globals().update(CONFIG)
-# this lets us do:  from .config import DEBUG, MEDIA_TIMEOUT, ...
+
 
 
 # print("FINISHED LOADING CONFIG USING SCHEMAS + FILE + ENV")
 # print("FINISHED LOADING CONFIG USING SCHEMAS + FILE + ENV")
 
 
@@ -521,8 +514,8 @@ globals().update(CONFIG)
 
 
 
 
 # Set timezone to UTC and umask to OUTPUT_PERMISSIONS
 # Set timezone to UTC and umask to OUTPUT_PERMISSIONS
-assert TIMEZONE == 'UTC', f'The server timezone should always be set to UTC (got {TIMEZONE})'  # noqa: F821
-os.environ["TZ"] = TIMEZONE                                                  # noqa: F821
+assert CONSTANTS.TIMEZONE == 'UTC', f'The server timezone should always be set to UTC (got {CONSTANTS.TIMEZONE})'  # noqa: F821
+os.environ["TZ"] = CONSTANTS.TIMEZONE                                                  # noqa: F821
 os.umask(0o777 - int(STORAGE_CONFIG.DIR_OUTPUT_PERMISSIONS, base=8))                        # noqa: F821
 os.umask(0o777 - int(STORAGE_CONFIG.DIR_OUTPUT_PERMISSIONS, base=8))                        # noqa: F821
 
 
 ########################### Config Validity Checkers ###########################
 ########################### Config Validity Checkers ###########################
@@ -533,7 +526,8 @@ if not SHELL_CONFIG.SHOW_PROGRESS:
     os.environ['TERM'] = 'dumb'
     os.environ['TERM'] = 'dumb'
 
 
 # recreate rich console obj based on new config values
 # recreate rich console obj based on new config values
-CONSOLE = Console()
+STDOUT = CONSOLE = Console()
+STDERR = Console(stderr=True)
 from ..misc import logging
 from ..misc import logging
 logging.CONSOLE = CONSOLE
 logging.CONSOLE = CONSOLE
 
 
@@ -541,11 +535,11 @@ logging.CONSOLE = CONSOLE
 INITIAL_STARTUP_PROGRESS = None
 INITIAL_STARTUP_PROGRESS = None
 INITIAL_STARTUP_PROGRESS_TASK = 0
 INITIAL_STARTUP_PROGRESS_TASK = 0
 
 
-def bump_startup_progress_bar():
+def bump_startup_progress_bar(advance=1):
     global INITIAL_STARTUP_PROGRESS
     global INITIAL_STARTUP_PROGRESS
     global INITIAL_STARTUP_PROGRESS_TASK
     global INITIAL_STARTUP_PROGRESS_TASK
     if INITIAL_STARTUP_PROGRESS:
     if INITIAL_STARTUP_PROGRESS:
-        INITIAL_STARTUP_PROGRESS.update(INITIAL_STARTUP_PROGRESS_TASK, advance=1)   # type: ignore
+        INITIAL_STARTUP_PROGRESS.update(INITIAL_STARTUP_PROGRESS_TASK, advance=advance)   # type: ignore
 
 
 
 
 def setup_django_minimal():
 def setup_django_minimal():
@@ -559,6 +553,8 @@ DJANGO_SET_UP = False
 
 
 
 
 def setup_django(out_dir: Path | None=None, check_db=False, config: benedict=CONFIG, in_memory_db=False) -> None:
 def setup_django(out_dir: Path | None=None, check_db=False, config: benedict=CONFIG, in_memory_db=False) -> None:
+    from rich.panel import Panel
+    
     global INITIAL_STARTUP_PROGRESS
     global INITIAL_STARTUP_PROGRESS
     global INITIAL_STARTUP_PROGRESS_TASK
     global INITIAL_STARTUP_PROGRESS_TASK
     global DJANGO_SET_UP
     global DJANGO_SET_UP
@@ -568,7 +564,7 @@ def setup_django(out_dir: Path | None=None, check_db=False, config: benedict=CON
         # TODO: figure out why CLI entrypoints with init_pending are running this twice sometimes
         # TODO: figure out why CLI entrypoints with init_pending are running this twice sometimes
         return
         return
 
 
-    with Progress(transient=True, expand=True, console=CONSOLE) as INITIAL_STARTUP_PROGRESS:
+    with Progress(transient=True, expand=True, console=STDERR) as INITIAL_STARTUP_PROGRESS:
         INITIAL_STARTUP_PROGRESS_TASK = INITIAL_STARTUP_PROGRESS.add_task("[green]Loading modules...", total=25)
         INITIAL_STARTUP_PROGRESS_TASK = INITIAL_STARTUP_PROGRESS.add_task("[green]Loading modules...", total=25)
 
 
         output_dir = out_dir or CONSTANTS.DATA_DIR
         output_dir = out_dir or CONSTANTS.DATA_DIR
@@ -595,7 +591,14 @@ def setup_django(out_dir: Path | None=None, check_db=False, config: benedict=CON
             else:
             else:
                 # Otherwise use default sqlite3 file-based database and initialize django
                 # Otherwise use default sqlite3 file-based database and initialize django
                 # without running migrations automatically (user runs them manually by calling init)
                 # without running migrations automatically (user runs them manually by calling init)
-                django.setup()
+                try:
+                    django.setup()
+                except Exception as e:
+                    bump_startup_progress_bar(advance=1000)
+                    STDERR.print()
+                    STDERR.print(Panel(f'\n[red]{e.__class__.__name__}[/red]: [yellow]{e}[/yellow]\nPlease check your config and [blue]DATA_DIR[/blue] permissions.\n', title='\n\n[red][X] Error while trying to load database!', subtitle='[grey53]NO WRITES CAN BE PERFORMED[/grey53]', expand=False, style='bold red'))
+                    STDERR.print()
+                    return
             
             
             bump_startup_progress_bar()
             bump_startup_progress_bar()
 
 
@@ -608,6 +611,17 @@ def setup_django(out_dir: Path | None=None, check_db=False, config: benedict=CON
                 f.write(f"\n> {command}; TS={ts} VERSION={CONSTANTS.VERSION} IN_DOCKER={SHELL_CONFIG.IN_DOCKER} IS_TTY={SHELL_CONFIG.IS_TTY}\n")
                 f.write(f"\n> {command}; TS={ts} VERSION={CONSTANTS.VERSION} IN_DOCKER={SHELL_CONFIG.IN_DOCKER} IS_TTY={SHELL_CONFIG.IS_TTY}\n")
 
 
             if check_db:
             if check_db:
+                # make sure the data dir is owned by a non-root user
+                if CONSTANTS.DATA_DIR.stat().st_uid == 0:
+                    STDERR.print('[red][X] Error: ArchiveBox DATA_DIR cannot be owned by root![/red]')
+                    STDERR.print(f'    {CONSTANTS.DATA_DIR}')
+                    STDERR.print()
+                    STDERR.print('[violet]Hint:[/violet] Are you running archivebox in the right folder? (and as a non-root user?)')
+                    STDERR.print('    cd path/to/your/archive/data')
+                    STDERR.print('    archivebox [command]')
+                    STDERR.print()
+                    raise SystemExit(9)
+                
                 # Create cache table in DB if needed
                 # Create cache table in DB if needed
                 try:
                 try:
                     from django.core.cache import cache
                     from django.core.cache import cache

+ 152 - 0
archivebox/config/paths.py

@@ -0,0 +1,152 @@
+__package__ = 'archivebox.config'
+
+import os
+import tempfile
+import hashlib
+from pathlib import Path
+
+from functools import cache
+from platformdirs import PlatformDirs
+
+from .permissions import SudoPermission, IS_ROOT, ARCHIVEBOX_USER, ARCHIVEBOX_GROUP
+
+#############################################################################################
+
+PACKAGE_DIR: Path = Path(__file__).resolve().parent.parent    # archivebox source code dir
+DATA_DIR: Path = Path(os.getcwd()).resolve()                  # archivebox user data dir
+ARCHIVE_DIR: Path = DATA_DIR / 'archive'                      # archivebox snapshot data dir
+
+#############################################################################################
+
@cache
def get_collection_id(DATA_DIR=DATA_DIR):
    """Get a short, stable, unique ID for the current collection.

    The ID is read from DATA_DIR/.collection_id if present and non-empty;
    otherwise it is derived deterministically from the sha256 of the resolved
    DATA_DIR path and persisted back to that file on a best-effort basis
    (write failures are ignored so read-only data dirs still get a stable ID).
    """
    collection_id_file = DATA_DIR / '.collection_id'

    try:
        # FileNotFoundError and PermissionError are subclasses of OSError,
        # so a single except OSError covers all the expected failure modes
        cached_id = collection_id_file.read_text().strip()
        if cached_id:
            # guard against an empty/corrupt marker file: fall through and re-derive
            return cached_id
    except OSError:
        pass

    # derive an 8-char hex ID from the absolute data dir path
    hash_key = str(DATA_DIR.resolve()).encode()
    collection_id = hashlib.sha256(hash_key).hexdigest()[:8]
    try:
        collection_id_file.write_text(collection_id)
    except OSError:
        pass    # best-effort persistence only
    return collection_id
+
+
def dir_is_writable(dir_path: Path, uid: int | None = None, gid: int | None = None, fallback=True) -> bool:
    """Check if a given directory is writable by a specific user and group.

    Writes and removes a small probe file inside dir_path while impersonating
    the given uid via SudoPermission (fallback=True: if we are unable to become
    the provided uid, try the check as the current user instead of failing).

    Returns True if the probe file could be created and deleted, False otherwise.
    """
    current_uid, current_gid = os.geteuid(), os.getegid()
    # NOTE: "uid or current_uid" would silently replace an explicit uid=0 (root)
    # with the current user, because 0 is falsy -- compare against None instead
    uid = current_uid if uid is None else uid
    gid = current_gid if gid is None else gid

    test_file = dir_path / '.permissions_test'
    try:
        with SudoPermission(uid=uid, fallback=fallback):
            test_file.write_text(f'Checking if PUID={uid} PGID={gid} can write to dir')
            test_file.unlink()
            return True
    except OSError:
        # IOError and PermissionError are both aliases/subclasses of OSError;
        # any failure to create/remove the probe file means the dir is not writable for us
        pass

    return False
+
+
+
@cache
def get_LIB_DIR():
    """
    Get the shared dir where ArchiveBox stores its installed libraries/binaries.
    - should be shared with other collections on the same host
    - must be scoped by CPU architecture, OS family, and archivebox version
    - should not be shared with other hosts/archivebox versions
    - must be writable by any archivebox user
    - should be persistent across reboots
    - can be on a docker bind mount but probably shouldnt be
    - ok to have a long path (doesnt contain SOCKETS)
    """
    # deferred import to avoid a circular import at .config package load time
    from .version import detect_installed_version

    # version-scoped platform dirs, e.g. <site data dir>/archivebox/<version>
    HOST_DIRS = PlatformDirs(appname='archivebox', appauthor='ArchiveBox', version=detect_installed_version(), opinion=True, ensure_exists=False)

    if 'SYSTEM_LIB_DIR' in os.environ:
        # explicit user-provided override always wins
        lib_dir = Path(os.environ['SYSTEM_LIB_DIR'])
    else:
        with SudoPermission(uid=ARCHIVEBOX_USER, fallback=True):
            lib_dir = HOST_DIRS.site_data_path

    # Docker: /usr/local/share/archivebox/0.8.5
    # Ubuntu: /usr/local/share/archivebox/0.8.5
    # macOS: /Library/Application Support/archivebox
    try:
        with SudoPermission(uid=0, fallback=True):
            lib_dir.mkdir(parents=True, exist_ok=True)
    except PermissionError:
        # our user cannot create the system-wide dir: fall back to a per-user data dir
        lib_dir = HOST_DIRS.user_data_path
        lib_dir.mkdir(parents=True, exist_ok=True)

    if not dir_is_writable(lib_dir):
        if IS_ROOT:
            # make sure lib dir is owned by the archivebox user, not root
            with SudoPermission(uid=0):
                os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{lib_dir}"')
        else:
            # non-root and not writable: nothing more we can do, bail out loudly
            raise PermissionError(f'SYSTEM_LIB_DIR {lib_dir} is not writable by archivebox user {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}')

    return lib_dir
+    
@cache
def get_TMP_DIR():
    """
    Get the short-lived runtime dir where this collection's sockets/pidfiles live.
    - must NOT be inside DATA_DIR / inside a docker volume bind mount
    - must NOT have a long PATH (UNIX socket path length restrictions)
    - must NOT be shared with other collections/hosts
    - must be writable by archivebox user & root
    - must be cleared on every boot / not persisted
    - must be cleared on every archivebox version upgrade
    """
    # deferred import to avoid a circular import at .config package load time
    from .version import detect_installed_version

    # version-scoped platform dirs, e.g. <runtime dir>/archivebox/<version>
    HOST_DIRS = PlatformDirs(appname='archivebox', appauthor='ArchiveBox', version=detect_installed_version(), opinion=True, ensure_exists=False)

    # print('DATA_DIR OWNED BY:', ARCHIVEBOX_USER, ARCHIVEBOX_GROUP)
    # print('RUNNING AS:', self.PUID, self.PGID)

    if 'SYSTEM_TMP_DIR' in os.environ:
        # explicit override: still scope it by collection id so multiple collections dont collide
        run_dir = Path(os.environ['SYSTEM_TMP_DIR']).resolve() / get_collection_id(DATA_DIR=DATA_DIR)
        with SudoPermission(uid=0, fallback=True):
            run_dir.mkdir(parents=True, exist_ok=True)
        if not dir_is_writable(run_dir):
            if IS_ROOT:
                with SudoPermission(uid=0, fallback=False):
                    os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{run_dir}"')
            else:
                raise PermissionError(f'SYSTEM_TMP_DIR {run_dir} is not writable by archivebox user {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}')
        # unix domain socket paths must fit in sun_path (~108 chars); 95 leaves headroom for filenames
        assert len(str(run_dir / 'supervisord.conf')) < 95, 'SYSTEM_TMP_DIR path is too long, please set SYSTEM_TMP_DIR env variable to a shorter path (unfortunately unix requires socket paths be < 108 chars)'
        return run_dir

    run_dir = (HOST_DIRS.site_runtime_path / get_collection_id(DATA_DIR=DATA_DIR)).resolve()
    try:
        # check the socket-path length budget for the platform runtime dir
        assert len(str(run_dir)) + len('/supervisord.sock') < 95
    except AssertionError:
        # too long (common on macOS): fall back to the system tmpdir instead
        run_dir = Path(tempfile.gettempdir()).resolve() / 'archivebox' / get_collection_id(DATA_DIR=DATA_DIR)
        assert len(str(run_dir)) + len('/supervisord.sock') < 95, 'SYSTEM_TMP_DIR path is too long, please set SYSTEM_TMP_DIR env variable to a shorter path (unfortunately unix requires socket paths be < 108 chars)'

    with SudoPermission(uid=0, fallback=True):
        run_dir.mkdir(parents=True, exist_ok=True)

    if not dir_is_writable(run_dir):
        if IS_ROOT:
            # make sure the runtime dir is owned by the archivebox user, not root
            with SudoPermission(uid=0):
                os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{run_dir}"')
        else:
            raise PermissionError(f'SYSTEM_TMP_DIR {run_dir} is not writable by archivebox user {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}')

    # Docker: /tmp/archivebox/0.8.5/abc324235
    # Ubuntu: /tmp/archivebox/0.8.5/abc324235
    # macOS: /var/folders/qy/6tpfrpx100j1t4l312nz683m0000gn/T/archivebox/0.8.5/abc324235
    return run_dir
+

+ 70 - 0
archivebox/config/permissions.py

@@ -0,0 +1,70 @@
__package__ = 'archivebox.config'

import os
from pathlib import Path
from contextlib import contextmanager

#############################################################################################

# The data dir is always the process's current working directory at import time.
DATA_DIR = Path(os.getcwd())

# Ownership of the data dir determines which UID/GID ArchiveBox should run as.
DATA_DIR_STAT           = Path(DATA_DIR).stat()
DATA_DIR_UID            = DATA_DIR_STAT.st_uid
DATA_DIR_GID            = DATA_DIR_STAT.st_gid
# 911 is the conventional fallback PUID/PGID (used when running as root with no other hint).
DEFAULT_PUID            = 911
DEFAULT_PGID            = 911
RUNNING_AS_UID          = os.getuid()
RUNNING_AS_GID          = os.getgid()
EUID                    = os.geteuid()
EGID                    = os.getegid()
# NOTE(review): derived from the home directory's basename, not from the passwd db — may differ from the login name; confirm this is intended
USER: str               = Path('~').expanduser().resolve().name

IS_ROOT = RUNNING_AS_UID == 0
IN_DOCKER = os.environ.get('IN_DOCKER', False) in ('1', 'true', 'True', 'TRUE', 'yes')

# PUID/PGID resolution order: explicit env var > data dir owner > current user > 911.
# (the `or` chains skip a 0 value, so root ownership never becomes the PUID/PGID)
os.environ.setdefault('PUID', str(DATA_DIR_UID or RUNNING_AS_UID or DEFAULT_PUID))
os.environ.setdefault('PGID', str(DATA_DIR_GID or RUNNING_AS_GID or DEFAULT_PGID))

# The effective identity ArchiveBox should drop to / chown new files to.
ARCHIVEBOX_USER = int(os.environ['PUID'])
ARCHIVEBOX_GROUP = int(os.environ['PGID'])
+
+#############################################################################################
+
def drop_privileges():
    """If running as root, drop privileges to the user that owns the data dir (or PUID, or default=911)"""

    # nothing to do unless we actually started as root
    if os.getuid() != 0:
        return

    # already running with the target effective UID
    if os.geteuid() == ARCHIVEBOX_USER:
        return

    # switch effective UID to the data-dir owner / provided PUID;
    # code that needs root again later (e.g. installing dependencies)
    # should re-elevate via the SudoPermission() context manager
    os.seteuid(ARCHIVEBOX_USER)
+
+
@contextmanager
def SudoPermission(uid=0, fallback=False):
    """Attempt to run code with sudo permissions for a given user (or root).

    Temporarily switches the process's *effective* UID to ``uid`` for the
    duration of the ``with`` block, then reverts to the owner of DATA_DIR.

    Args:
        uid: effective UID to switch to (0 = root).
        fallback: if True, permission failures (both elevating and reverting)
            are swallowed and the block runs with whatever privileges we have;
            if False, a PermissionError is raised instead.

    Raises:
        PermissionError: when elevation or reversion fails and fallback=False.
    """
    
    if os.geteuid() == uid:
        # no need to change effective UID, we are already that user
        yield
        return

    try:
        # change our effective UID to the given UID
        os.seteuid(uid)
    except PermissionError as err:
        if not fallback:
            raise PermissionError(f'Not enough permissions to run code as uid={uid}, please retry with sudo') from err
    try:
        # yield back to the caller so they can run code inside context as root
        yield
    finally:
        # then set effective UID back to DATA_DIR owner
        # (re-stat DATA_DIR here: its owner may have been chown'ed inside the block)
        DATA_DIR_OWNER = DATA_DIR.stat().st_uid
        try:
            os.seteuid(DATA_DIR_OWNER)
        except PermissionError as err:
            # NOTE(review): with fallback=True a failed revert leaves the process
            # elevated silently — confirm callers are aware of this trade-off
            if not fallback:
                raise PermissionError(f'Failed to revert uid={uid} back to {DATA_DIR_OWNER} after running code with sudo') from err
+

+ 121 - 0
archivebox/config/version.py

@@ -0,0 +1,121 @@
+__package__ = 'archivebox.config'
+
+import os
+import importlib.metadata
+
+from pathlib import Path
+from functools import cache
+from datetime import datetime
+from typing import Optional
+
+#############################################################################################
+
# truthy string values accepted for the IN_DOCKER env var set by the Docker image
IN_DOCKER = os.environ.get('IN_DOCKER', False) in ('1', 'true', 'True', 'TRUE', 'yes')

PACKAGE_DIR: Path = Path(__file__).resolve().parent.parent    # archivebox source code dir
DATA_DIR: Path = Path(os.getcwd()).resolve()                  # archivebox user data dir
ARCHIVE_DIR: Path = DATA_DIR / 'archive'                      # archivebox snapshot data dir
+
+#############################################################################################
+
+
@cache
def detect_installed_version(PACKAGE_DIR: Optional[Path]=None):
    """Autodetect the installed archivebox version.

    Resolution order:
      1. pip package metadata (production installs)
      2. the ``version = "..."`` line in <repo>/pyproject.toml (dev/editable installs)
      3. the literal string ``'dev'`` as a last resort (e.g. while building docs)

    Args:
        PACKAGE_DIR: the archivebox source dir; defaults to this package's dir
            (resolved lazily so callers/tests can override it).

    Returns:
        The version string, or 'dev' if it cannot be detected.
    """
    PACKAGE_DIR = PACKAGE_DIR or Path(__file__).resolve().parent.parent

    try:
        # if in production install, use pip-installed package metadata
        return importlib.metadata.version('archivebox').strip()
    except importlib.metadata.PackageNotFoundError:
        pass

    try:
        # if in dev Git repo dir, use pyproject.toml file
        pyproject_config = (PACKAGE_DIR.parent / 'pyproject.toml').read_text().split('\n')
        for line in pyproject_config:
            if line.startswith('version = '):
                # strip surrounding whitespace BEFORE the quotes, so a trailing
                # space on the line doesn't shield the closing quote from strip('"')
                return line.split(' = ', 1)[-1].strip().strip('"')
    except FileNotFoundError:
        # building docs, pyproject.toml is not available
        pass

    # raise Exception('Failed to detect installed archivebox version!')
    return 'dev'
+
+
@cache
def get_COMMIT_HASH() -> Optional[str]:
    """Return the git commit hash of the current source checkout, or None when unavailable."""
    try:
        # follow HEAD to the ref file it points at and read the hash from it
        git_dir = PACKAGE_DIR / '../.git'
        head_ref = (git_dir / 'HEAD').read_text().strip().split(' ')[-1]
        return git_dir.joinpath(head_ref).read_text().strip()
    except Exception:
        pass

    try:
        # otherwise fall back to the first branch ref file found under refs/heads/
        return next((PACKAGE_DIR / '../.git/refs/heads/').glob('*')).read_text().strip()
    except Exception:
        pass

    return None
+    
@cache
def get_BUILD_TIME() -> str:
    """Return a build timestamp string: the Docker image build time when in Docker,
    otherwise the last-modified time of the bundled source as a proxy."""
    if IN_DOCKER:
        # /VERSION.txt is written during the Docker image build and contains a BUILD_END_TIME=... line
        docker_build_end_time = Path('/VERSION.txt').read_text().rsplit('BUILD_END_TIME=')[-1].split('\n', 1)[0]
        return docker_build_end_time

    # outside Docker: use the mtime of the package's README.md as the "build" time
    src_last_modified_unix_timestamp = (PACKAGE_DIR / 'README.md').stat().st_mtime
    # NOTE(review): '%s' (epoch seconds) is a glibc/BSD strftime extension, not portable to Windows — confirm target platforms
    return datetime.fromtimestamp(src_last_modified_unix_timestamp).strftime('%Y-%m-%d %H:%M:%S %s')
+
+
+# def get_versions_available_on_github(config):
+#     """
+#     returns a dictionary containing the ArchiveBox GitHub release info for
+#     the recommended upgrade version and the currently installed version
+#     """
+    
+#     # we only want to perform the (relatively expensive) check for new versions
+#     # when its most relevant, e.g. when the user runs a long-running command
+#     subcommand_run_by_user = sys.argv[3] if len(sys.argv) > 3 else 'help'
+#     long_running_commands = ('add', 'schedule', 'update', 'status', 'server')
+#     if subcommand_run_by_user not in long_running_commands:
+#         return None
+    
+#     github_releases_api = "https://api.github.com/repos/ArchiveBox/ArchiveBox/releases"
+#     response = requests.get(github_releases_api)
+#     if response.status_code != 200:
+#         stderr(f'[!] Warning: GitHub API call to check for new ArchiveBox version failed! (status={response.status_code})', color='lightyellow', config=config)
+#         return None
+#     all_releases = response.json()
+
+#     installed_version = parse_version_string(config['VERSION'])
+
+#     # find current version or nearest older version (to link to)
+#     current_version = None
+#     for idx, release in enumerate(all_releases):
+#         release_version = parse_version_string(release['tag_name'])
+#         if release_version <= installed_version:
+#             current_version = release
+#             break
+
+#     current_version = current_version or all_releases[-1]
+    
+#     # recommended version is whatever comes after current_version in the release list
+#     # (perhaps too conservative to only recommend upgrading one version at a time, but it's safest)
+#     try:
+#         recommended_version = all_releases[idx+1]
+#     except IndexError:
+#         recommended_version = None
+
+#     return {'recommended_version': recommended_version, 'current_version': current_version}
+
+# def can_upgrade(config):
+#     if config['VERSIONS_AVAILABLE'] and config['VERSIONS_AVAILABLE']['recommended_version']:
+#         recommended_version = parse_version_string(config['VERSIONS_AVAILABLE']['recommended_version']['tag_name'])
+#         current_version = parse_version_string(config['VERSIONS_AVAILABLE']['current_version']['tag_name'])
+#         return recommended_version > current_version
+#     return False
+
+
# resolved once at import time; other modules should import VERSION from here rather than re-detecting
VERSION: str = detect_installed_version()

+ 1 - 1
archivebox/core/middleware.py

@@ -5,7 +5,7 @@ from django.utils import timezone
 from django.contrib.auth.middleware import RemoteUserMiddleware
 from django.contrib.auth.middleware import RemoteUserMiddleware
 from django.core.exceptions import ImproperlyConfigured
 from django.core.exceptions import ImproperlyConfigured
 
 
-from archivebox.config import SERVER_CONFIG
+from archivebox.config.common import SERVER_CONFIG
 
 
 
 
 def detect_timezone(request, activate: bool=True):
 def detect_timezone(request, activate: bool=True):

+ 2 - 1
archivebox/core/settings.py

@@ -13,7 +13,8 @@ import abx.archivebox
 import abx.archivebox.use
 import abx.archivebox.use
 import abx.django.use
 import abx.django.use
 
 
-from archivebox.config import VERSION, DATA_DIR, PACKAGE_DIR, ARCHIVE_DIR, CONSTANTS, SHELL_CONFIG, SERVER_CONFIG      # noqa
+from archivebox.config import DATA_DIR, PACKAGE_DIR, ARCHIVE_DIR, CONSTANTS
+from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG      # noqa
 
 
 IS_MIGRATING = 'makemigrations' in sys.argv[:3] or 'migrate' in sys.argv[:3]
 IS_MIGRATING = 'makemigrations' in sys.argv[:3] or 'migrate' in sys.argv[:3]
 IS_TESTING = 'test' in sys.argv[:3] or 'PYTEST_CURRENT_TEST' in os.environ
 IS_TESTING = 'test' in sys.argv[:3] or 'PYTEST_CURRENT_TEST' in os.environ

+ 2 - 1
archivebox/core/views.py

@@ -27,7 +27,8 @@ from core.admin import result_url
 
 
 from queues.tasks import bg_add
 from queues.tasks import bg_add
 
 
-from archivebox.config import CONSTANTS_CONFIG, DATA_DIR, VERSION, SHELL_CONFIG, SERVER_CONFIG
+from archivebox.config import CONSTANTS_CONFIG, DATA_DIR, VERSION
+from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG
 from archivebox.misc.util import base_url, htmlencode, ts_to_date_str
 from archivebox.misc.util import base_url, htmlencode, ts_to_date_str
 
 
 from .serve_static import serve_static_with_byterange_support
 from .serve_static import serve_static_with_byterange_support

+ 2 - 1
archivebox/extractors/htmltotext.py

@@ -5,7 +5,8 @@ import io
 from pathlib import Path
 from pathlib import Path
 from typing import Optional
 from typing import Optional
 
 
-from archivebox.config import VERSION, ARCHIVING_CONFIG
+from archivebox.config import VERSION
+from archivebox.config.common import ARCHIVING_CONFIG
 from archivebox.config.legacy import SAVE_HTMLTOTEXT
 from archivebox.config.legacy import SAVE_HTMLTOTEXT
 from archivebox.misc.system import atomic_write
 from archivebox.misc.system import atomic_write
 from archivebox.misc.util import enforce_types, is_static_file
 from archivebox.misc.util import enforce_types, is_static_file

+ 4 - 2
archivebox/index/__init__.py

@@ -12,9 +12,11 @@ from urllib.parse import urlparse
 from django.db.models import QuerySet, Q
 from django.db.models import QuerySet, Q
 
 
 
 
-from archivebox.config import DATA_DIR, CONSTANTS, ARCHIVING_CONFIG, STORAGE_CONFIG, SEARCH_BACKEND_CONFIG
-from archivebox.misc.util import scheme, enforce_types, ExtendedEncoder
 from archivebox.misc.logging import stderr
 from archivebox.misc.logging import stderr
+from archivebox.misc.util import scheme, enforce_types, ExtendedEncoder
+
+from archivebox.config import DATA_DIR, CONSTANTS
+from archivebox.config.common import ARCHIVING_CONFIG, STORAGE_CONFIG, SEARCH_BACKEND_CONFIG
 from archivebox.config.legacy import URL_DENYLIST_PTN, URL_ALLOWLIST_PTN
 from archivebox.config.legacy import URL_DENYLIST_PTN, URL_ALLOWLIST_PTN
 
 
 from ..logging_util import (
 from ..logging_util import (

+ 4 - 2
archivebox/index/html.py

@@ -16,7 +16,9 @@ from archivebox.misc.util import (
     htmlencode,
     htmlencode,
     urldecode,
     urldecode,
 )
 )
-from archivebox.config import CONSTANTS, DATA_DIR, VERSION, SHELL_CONFIG, SERVER_CONFIG
+from archivebox.config import CONSTANTS, DATA_DIR, VERSION
+from archivebox.config.common import SERVER_CONFIG
+from archivebox.config.version import get_COMMIT_HASH
 from archivebox.plugins_extractor.archivedotorg.apps import ARCHIVEDOTORG_CONFIG
 from archivebox.plugins_extractor.archivedotorg.apps import ARCHIVEDOTORG_CONFIG
 
 
 from .schema import Link
 from .schema import Link
@@ -56,7 +58,7 @@ def main_index_template(links: List[Link], template: str=MAIN_INDEX_TEMPLATE) ->
 
 
     return render_django_template(template, {
     return render_django_template(template, {
         'version': VERSION,
         'version': VERSION,
-        'git_sha': SHELL_CONFIG.COMMIT_HASH or VERSION,
+        'git_sha': get_COMMIT_HASH() or VERSION,
         'num_links': str(len(links)),
         'num_links': str(len(links)),
         'date_updated': datetime.now(timezone.utc).strftime('%Y-%m-%d'),
         'date_updated': datetime.now(timezone.utc).strftime('%Y-%m-%d'),
         'time_updated': datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M'),
         'time_updated': datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M'),

+ 2 - 1
archivebox/index/json.py

@@ -8,7 +8,8 @@ from pathlib import Path
 from datetime import datetime, timezone
 from datetime import datetime, timezone
 from typing import List, Optional, Iterator, Any, Union
 from typing import List, Optional, Iterator, Any, Union
 
 
-from archivebox.config import VERSION, DATA_DIR, CONSTANTS, SERVER_CONFIG, SHELL_CONFIG
+from archivebox.config import VERSION, DATA_DIR, CONSTANTS
+from archivebox.config.common import SERVER_CONFIG, SHELL_CONFIG
 
 
 from .schema import Link
 from .schema import Link
 from archivebox.misc.system import atomic_write
 from archivebox.misc.system import atomic_write

+ 2 - 1
archivebox/index/sql.py

@@ -9,7 +9,8 @@ from django.db.models import QuerySet
 from django.db import transaction
 from django.db import transaction
 
 
 from archivebox.misc.util import enforce_types, parse_date
 from archivebox.misc.util import enforce_types, parse_date
-from archivebox.config import DATA_DIR, GENERAL_CONFIG
+from archivebox.config import DATA_DIR
+from archivebox.config.common import GENERAL_CONFIG
 
 
 from .schema import Link
 from .schema import Link
 
 

+ 2 - 1
archivebox/logging_util.py

@@ -22,7 +22,8 @@ from rich.panel import Panel
 from rich_argparse import RichHelpFormatter
 from rich_argparse import RichHelpFormatter
 from django.core.management.base import DjangoHelpFormatter
 from django.core.management.base import DjangoHelpFormatter
 
 
-from archivebox.config import CONSTANTS, DATA_DIR, VERSION, SHELL_CONFIG
+from archivebox.config import CONSTANTS, DATA_DIR, VERSION
+from archivebox.config.common import SHELL_CONFIG
 from archivebox.misc.system import get_dir_size
 from archivebox.misc.system import get_dir_size
 from archivebox.misc.util import enforce_types
 from archivebox.misc.util import enforce_types
 from archivebox.misc.logging import ANSI, stderr
 from archivebox.misc.logging import ANSI, stderr

+ 93 - 23
archivebox/main.py

@@ -14,13 +14,15 @@ from crontab import CronTab, CronSlices
 from django.db.models import QuerySet
 from django.db.models import QuerySet
 from django.utils import timezone
 from django.utils import timezone
 
 
-from archivebox.config import CONSTANTS, VERSION, DATA_DIR, ARCHIVE_DIR, SHELL_CONFIG, SEARCH_BACKEND_CONFIG, STORAGE_CONFIG, SERVER_CONFIG, ARCHIVING_CONFIG
+from archivebox.config import CONSTANTS, VERSION, DATA_DIR, ARCHIVE_DIR
+from archivebox.config.common import SHELL_CONFIG, SEARCH_BACKEND_CONFIG, STORAGE_CONFIG, SERVER_CONFIG, ARCHIVING_CONFIG
+from archivebox.config.permissions import SudoPermission, IN_DOCKER
 from .cli import (
 from .cli import (
     CLI_SUBCOMMANDS,
     CLI_SUBCOMMANDS,
     run_subcommand,
     run_subcommand,
     display_first,
     display_first,
     meta_cmds,
     meta_cmds,
-    main_cmds,
+    setup_cmds,
     archive_cmds,
     archive_cmds,
 )
 )
 from .parsers import (
 from .parsers import (
@@ -101,7 +103,7 @@ def help(out_dir: Path=DATA_DIR) -> None:
     ) + '\n\n    ' + '\n    '.join(
     ) + '\n\n    ' + '\n    '.join(
         f'[green]{cmd.ljust(20)}[/green] {func.__doc__}'
         f'[green]{cmd.ljust(20)}[/green] {func.__doc__}'
         for cmd, func in all_subcommands.items()
         for cmd, func in all_subcommands.items()
-        if cmd in main_cmds
+        if cmd in setup_cmds
     ) + '\n\n    ' + '\n    '.join(
     ) + '\n\n    ' + '\n    '.join(
         f'[green]{cmd.ljust(20)}[/green] {func.__doc__}'
         f'[green]{cmd.ljust(20)}[/green] {func.__doc__}'
         for cmd, func in all_subcommands.items()
         for cmd, func in all_subcommands.items()
@@ -119,10 +121,10 @@ def help(out_dir: Path=DATA_DIR) -> None:
 
 
     [grey53]# using Docker:[/grey53]
     [grey53]# using Docker:[/grey53]
     [blue]docker run[/blue] -v [light_slate_blue]$PWD:/data[/light_slate_blue] [grey53]-p 8000:8000[/grey53] -it [dark_green]archivebox/archivebox[/dark_green] [green]\\[command][/green] [green3][...args][/green3] [violet][--help][/violet] [grey53][--version][/grey53]
     [blue]docker run[/blue] -v [light_slate_blue]$PWD:/data[/light_slate_blue] [grey53]-p 8000:8000[/grey53] -it [dark_green]archivebox/archivebox[/dark_green] [green]\\[command][/green] [green3][...args][/green3] [violet][--help][/violet] [grey53][--version][/grey53]
-''' if SHELL_CONFIG.IN_DOCKER else ''
-    DOCKER_DOCS = '\n    [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#usage]https://github.com/ArchiveBox/ArchiveBox/wiki/Docker[/link]' if SHELL_CONFIG.IN_DOCKER else ''
-    DOCKER_OUTSIDE_HINT = "\n    [grey53]# outside of Docker:[/grey53]" if SHELL_CONFIG.IN_DOCKER else ''
-    DOCKER_CMD_PREFIX = "[blue]docker ... [/blue]" if SHELL_CONFIG.IN_DOCKER else ''
+''' if IN_DOCKER else ''
+    DOCKER_DOCS = '\n    [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#usage]https://github.com/ArchiveBox/ArchiveBox/wiki/Docker[/link]' if IN_DOCKER else ''
+    DOCKER_OUTSIDE_HINT = "\n    [grey53]# outside of Docker:[/grey53]" if IN_DOCKER else ''
+    DOCKER_CMD_PREFIX = "[blue]docker ... [/blue]" if IN_DOCKER else ''
 
 
     print(f'''{DOCKER_USAGE}
     print(f'''{DOCKER_USAGE}
 [deep_sky_blue4]Usage:[/deep_sky_blue4]{DOCKER_OUTSIDE_HINT}
 [deep_sky_blue4]Usage:[/deep_sky_blue4]{DOCKER_OUTSIDE_HINT}
@@ -158,7 +160,7 @@ def help(out_dir: Path=DATA_DIR) -> None:
         print(Panel(EXAMPLE_USAGE, expand=False, border_style='grey53', title='[green3]:white_check_mark: A collection [light_slate_blue]DATA DIR[/light_slate_blue] is currently active[/green3]', subtitle='Commands run inside this dir will only apply to this collection.'))
         print(Panel(EXAMPLE_USAGE, expand=False, border_style='grey53', title='[green3]:white_check_mark: A collection [light_slate_blue]DATA DIR[/light_slate_blue] is currently active[/green3]', subtitle='Commands run inside this dir will only apply to this collection.'))
     else:
     else:
         DATA_SETUP_HELP = '\n'
         DATA_SETUP_HELP = '\n'
-        if SHELL_CONFIG.IN_DOCKER:
+        if IN_DOCKER:
             DATA_SETUP_HELP += '[violet]Hint:[/violet] When using Docker, you need to mount a volume to use as your data dir:\n'
             DATA_SETUP_HELP += '[violet]Hint:[/violet] When using Docker, you need to mount a volume to use as your data dir:\n'
             DATA_SETUP_HELP += '    docker run [violet]-v /some/path/data:/data[/violet] archivebox/archivebox ...\n\n'
             DATA_SETUP_HELP += '    docker run [violet]-v /some/path/data:/data[/violet] archivebox/archivebox ...\n\n'
         DATA_SETUP_HELP += 'To load an [dark_blue]existing[/dark_blue] collection:\n'
         DATA_SETUP_HELP += 'To load an [dark_blue]existing[/dark_blue] collection:\n'
@@ -190,6 +192,8 @@ def version(quiet: bool=False,
     
     
     from plugins_auth.ldap.apps import LDAP_CONFIG
     from plugins_auth.ldap.apps import LDAP_CONFIG
     from django.conf import settings
     from django.conf import settings
+    from archivebox.config.version import get_COMMIT_HASH, get_BUILD_TIME
+    from archivebox.config.permissions import ARCHIVEBOX_USER, ARCHIVEBOX_GROUP, RUNNING_AS_UID, RUNNING_AS_GID
 
 
     # 0.7.1
     # 0.7.1
     # ArchiveBox v0.7.1+editable COMMIT_HASH=951bba5 BUILD_TIME=2023-12-17 16:46:05 1702860365
     # ArchiveBox v0.7.1+editable COMMIT_HASH=951bba5 BUILD_TIME=2023-12-17 16:46:05 1702860365
@@ -198,13 +202,14 @@ def version(quiet: bool=False,
     # DEBUG=False IS_TTY=True TZ=UTC SEARCH_BACKEND=ripgrep LDAP=False
     # DEBUG=False IS_TTY=True TZ=UTC SEARCH_BACKEND=ripgrep LDAP=False
     
     
     p = platform.uname()
     p = platform.uname()
+    COMMIT_HASH = get_COMMIT_HASH()
     prnt(
     prnt(
         '[dark_green]ArchiveBox[/dark_green] [dark_goldenrod]v{}[/dark_goldenrod]'.format(CONSTANTS.VERSION),
         '[dark_green]ArchiveBox[/dark_green] [dark_goldenrod]v{}[/dark_goldenrod]'.format(CONSTANTS.VERSION),
-        f'COMMIT_HASH={SHELL_CONFIG.COMMIT_HASH[:7] if SHELL_CONFIG.COMMIT_HASH else "unknown"}',
-        f'BUILD_TIME={SHELL_CONFIG.BUILD_TIME}',
+        f'COMMIT_HASH={COMMIT_HASH[:7] if COMMIT_HASH else "unknown"}',
+        f'BUILD_TIME={get_BUILD_TIME()}',
     )
     )
     prnt(
     prnt(
-        f'IN_DOCKER={SHELL_CONFIG.IN_DOCKER}',
+        f'IN_DOCKER={IN_DOCKER}',
         f'IN_QEMU={SHELL_CONFIG.IN_QEMU}',
         f'IN_QEMU={SHELL_CONFIG.IN_QEMU}',
         f'ARCH={p.machine}',
         f'ARCH={p.machine}',
         f'OS={p.system}',
         f'OS={p.system}',
@@ -212,11 +217,13 @@ def version(quiet: bool=False,
         f'PYTHON={sys.implementation.name.title()}',
         f'PYTHON={sys.implementation.name.title()}',
     )
     )
     OUTPUT_IS_REMOTE_FS = CONSTANTS.DATA_LOCATIONS.DATA_DIR.is_mount or CONSTANTS.DATA_LOCATIONS.ARCHIVE_DIR.is_mount
     OUTPUT_IS_REMOTE_FS = CONSTANTS.DATA_LOCATIONS.DATA_DIR.is_mount or CONSTANTS.DATA_LOCATIONS.ARCHIVE_DIR.is_mount
+    DATA_DIR_STAT = CONSTANTS.DATA_DIR.stat()
     prnt(
     prnt(
+        f'EUID={os.geteuid()} UID={RUNNING_AS_UID} PUID={ARCHIVEBOX_USER} FS_UID={DATA_DIR_STAT.st_uid}',
+        f'EGID={os.getegid()} GID={RUNNING_AS_GID} PGID={ARCHIVEBOX_GROUP} FS_GID={DATA_DIR_STAT.st_gid}',
+        f'FS_PERMS={STORAGE_CONFIG.OUTPUT_PERMISSIONS}',
         f'FS_ATOMIC={STORAGE_CONFIG.ENFORCE_ATOMIC_WRITES}',
         f'FS_ATOMIC={STORAGE_CONFIG.ENFORCE_ATOMIC_WRITES}',
         f'FS_REMOTE={OUTPUT_IS_REMOTE_FS}',
         f'FS_REMOTE={OUTPUT_IS_REMOTE_FS}',
-        f'FS_USER={SHELL_CONFIG.PUID}:{SHELL_CONFIG.PGID}',
-        f'FS_PERMS={STORAGE_CONFIG.OUTPUT_PERMISSIONS}',
     )
     )
     prnt(
     prnt(
         f'DEBUG={SHELL_CONFIG.DEBUG}',
         f'DEBUG={SHELL_CONFIG.DEBUG}',
@@ -261,8 +268,36 @@ def version(quiet: bool=False,
     else:
     else:
         prnt()
         prnt()
         prnt('[red][i] Data locations:[/red] (not in a data directory)')
         prnt('[red][i] Data locations:[/red] (not in a data directory)')
-
+        
     prnt()
     prnt()
+    
+    from archivebox.config.permissions import ARCHIVEBOX_USER, ARCHIVEBOX_GROUP, DEFAULT_PUID, DEFAULT_PGID, IS_ROOT, USER
+    
+    data_dir_stat = Path(DATA_DIR).stat()
+    data_dir_uid, data_dir_gid = data_dir_stat.st_uid, data_dir_stat.st_gid
+    data_owned_by_root = data_dir_uid == 0 or data_dir_gid == 0
+    
+    data_owned_by_default_user = data_dir_uid == DEFAULT_PUID or data_dir_gid == DEFAULT_PGID
+    data_owner_doesnt_match = (data_dir_uid != ARCHIVEBOX_USER and data_dir_gid != ARCHIVEBOX_GROUP) and not IS_ROOT
+    data_not_writable = not (os.access(DATA_DIR, os.W_OK) and os.access(CONSTANTS.LIB_DIR, os.W_OK) and os.access(CONSTANTS.TMP_DIR, os.W_OK))
+    if data_owned_by_root:
+        prnt('[yellow]:warning: Warning: ArchiveBox [blue]DATA_DIR[/blue] is currently owned by [red]root[/red], ArchiveBox will refuse to run![/yellow]')
+    elif data_owner_doesnt_match or data_not_writable:
+        prnt(f'[yellow]:warning: Warning: ArchiveBox [blue]DATA_DIR[/blue] is currently owned by [red]{data_dir_uid}:{data_dir_gid}[/red], but ArchiveBox user is [blue]{ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}[/blue] ({USER})! (ArchiveBox may not be able to write to the data dir)[/yellow]')
+    else:
+        prnt(f':information: [blue]DATA_DIR[/blue] is currently owned by [blue]{data_dir_uid}:{data_dir_gid}[/blue] (PUID:PGID)')
+        
+    if data_owned_by_root or data_owner_doesnt_match or data_owned_by_default_user or data_not_writable:
+        prnt(f'[violet]Hint:[/violet] If you encounter permissions errors, change [red]{data_dir_uid}[/red]:{data_dir_gid} (PUID:PGID) to match the user that will run ArchiveBox, e.g.:')
+        prnt(f'    [grey53]sudo[/grey53] chown -R [blue]{DEFAULT_PUID}:{DEFAULT_PGID}[/blue] {DATA_DIR.resolve()}')
+        prnt(f'    [grey53]sudo[/grey53] chown -R [blue]{DEFAULT_PUID}:{DEFAULT_PGID}[/blue] {CONSTANTS.LIB_DIR.resolve()}')
+        prnt(f'    [grey53]sudo[/grey53] chown -R [blue]{DEFAULT_PUID}:{DEFAULT_PGID}[/blue] {CONSTANTS.TMP_DIR.resolve()}')
+        prnt()
+        prnt('[blue]More info:[/blue]')
+        prnt('    [link=https://github.com/ArchiveBox/ArchiveBox#storage-requirements]https://github.com/ArchiveBox/ArchiveBox#storage-requirements[/link]')
+        prnt('    [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#permissions]https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#permissions[/link]')
+        prnt('    [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#puid--pgid]https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#puid--pgid[/link]')
+        prnt('    [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting#filesystem-doesnt-support-fsync-eg-network-mounts]https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting#filesystem-doesnt-support-fsync-eg-network-mounts[/link]')
 
 
 
 
 @enforce_types
 @enforce_types
@@ -948,23 +983,56 @@ def list_folders(links: List[Link],
 @enforce_types
 @enforce_types
 def install(out_dir: Path=DATA_DIR) -> None:
 def install(out_dir: Path=DATA_DIR) -> None:
     """Automatically install all ArchiveBox dependencies and extras"""
     """Automatically install all ArchiveBox dependencies and extras"""
+    
+    # if running as root:
+    #    - run init to create index + lib dir
+    #    - chown -R 911 DATA_DIR
+    #    - install all binaries as root
+    #    - chown -R 911 LIB_DIR
+    # else:
+    #    - run init to create index + lib dir as current user
+    #    - install all binaries as current user
+    #    - recommend user re-run with sudo if any deps need to be installed as root
 
 
     from rich import print
     from rich import print
     from django.conf import settings
     from django.conf import settings
+    
+    from archivebox import CONSTANTS
+    from archivebox.config.permissions import IS_ROOT, ARCHIVEBOX_USER, ARCHIVEBOX_GROUP
 
 
     if not ARCHIVE_DIR.exists():
     if not ARCHIVE_DIR.exists():
-        run_subcommand('init', stdin=None, pwd=out_dir)
-
-    stderr('\n[+] Installing ArchiveBox dependencies automatically...', color='green')
+        run_subcommand('init', stdin=None, pwd=out_dir)  # must init full index because we need a db to store InstalledBinary entries in
 
 
+    print('\n[green][+] Installing ArchiveBox dependencies automatically...[/green]')
+    
+    # we never want the data dir to be owned by root, detect owner of existing owner of DATA_DIR to try and guess desired non-root UID
+    if IS_ROOT:
+        # if we have sudo/root permissions, take advantage of them just while installing dependencies
+        print()
+        print('[yellow]:warning:  Using [red]root[/red] privileges only to install dependencies that need it, all other operations should be done as a [blue]non-root[/blue] user.[/yellow]')
+        print(f'    DATA_DIR, LIB_DIR, and TMP_DIR will be owned by [blue]{ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}[/blue].')
+        print()
+        
     for binary in reversed(list(settings.BINARIES.values())):
     for binary in reversed(list(settings.BINARIES.values())):
         providers = ' [grey53]or[/grey53] '.join(provider.name for provider in binary.binproviders_supported)
         providers = ' [grey53]or[/grey53] '.join(provider.name for provider in binary.binproviders_supported)
         print(f'[+] Locating / Installing [yellow]{binary.name}[/yellow] using [red]{providers}[/red]...')
         print(f'[+] Locating / Installing [yellow]{binary.name}[/yellow] using [red]{providers}[/red]...')
         try:
         try:
             print(binary.load_or_install(fresh=True).model_dump(exclude={'provider_overrides', 'bin_dir', 'hook_type'}))
             print(binary.load_or_install(fresh=True).model_dump(exclude={'provider_overrides', 'bin_dir', 'hook_type'}))
+            if IS_ROOT:
+                with SudoPermission(uid=0):
+                    os.system(f'chown -R {ARCHIVEBOX_USER} "{CONSTANTS.LIB_DIR.resolve()}"')
         except Exception as e:
         except Exception as e:
-            print(f'[X] Failed to install {binary.name}: {e}')
-
+            if IS_ROOT:
+                print(f'[yellow]:warning:  Retrying {binary.name} installation with [red]sudo[/red]...[/yellow]')
+                with SudoPermission(uid=0):
+                    try:
+                        print(binary.load_or_install(fresh=True).model_dump(exclude={'provider_overrides', 'bin_dir', 'hook_type'}))
+                        os.system(f'chown -R {ARCHIVEBOX_USER} "{CONSTANTS.LIB_DIR.resolve()}"')
+                    except Exception as e:
+                        print(f'[red]:cross_mark: Failed to install {binary.name} as root: {e}[/red]')
+            else:
+                print(f'[red]:cross_mark: Failed to install {binary.name} as user {ARCHIVEBOX_USER}: {e}[/red]')
+                
 
 
     from django.contrib.auth import get_user_model
     from django.contrib.auth import get_user_model
     User = get_user_model()
     User = get_user_model()
@@ -974,12 +1042,13 @@ def install(out_dir: Path=DATA_DIR) -> None:
         stderr('    archivebox manage createsuperuser')
         stderr('    archivebox manage createsuperuser')
         # run_subcommand('manage', subcommand_args=['createsuperuser'], pwd=out_dir)
         # run_subcommand('manage', subcommand_args=['createsuperuser'], pwd=out_dir)
     
     
-    stderr('\n[√] Set up ArchiveBox and its dependencies successfully.', color='green')
+    print('\n[green][√] Set up ArchiveBox and its dependencies successfully.[/green]\n', file=sys.stderr)
     
     
     from plugins_pkg.pip.apps import ARCHIVEBOX_BINARY
     from plugins_pkg.pip.apps import ARCHIVEBOX_BINARY
     
     
     run_shell([ARCHIVEBOX_BINARY.load().abspath, 'version'], capture_output=False, cwd=out_dir)
     run_shell([ARCHIVEBOX_BINARY.load().abspath, 'version'], capture_output=False, cwd=out_dir)
 
 
+
 # backwards-compatibility:
 # backwards-compatibility:
 setup = install
 setup = install
 
 
@@ -1100,6 +1169,7 @@ def schedule(add: bool=False,
     
     
     check_data_folder()
     check_data_folder()
     from archivebox.plugins_pkg.pip.apps import ARCHIVEBOX_BINARY
     from archivebox.plugins_pkg.pip.apps import ARCHIVEBOX_BINARY
+    from archivebox.config.permissions import USER
 
 
     Path(CONSTANTS.LOGS_DIR).mkdir(exist_ok=True)
     Path(CONSTANTS.LOGS_DIR).mkdir(exist_ok=True)
 
 
@@ -1156,7 +1226,7 @@ def schedule(add: bool=False,
         existing_jobs = list(cron.find_comment(CRON_COMMENT))
         existing_jobs = list(cron.find_comment(CRON_COMMENT))
 
 
         print()
         print()
-        print('{green}[√] Scheduled new ArchiveBox cron job for user: {} ({} jobs are active).{reset}'.format(SHELL_CONFIG.USER, len(existing_jobs), **SHELL_CONFIG.ANSI))
+        print('{green}[√] Scheduled new ArchiveBox cron job for user: {} ({} jobs are active).{reset}'.format(USER, len(existing_jobs), **SHELL_CONFIG.ANSI))
         print('\n'.join(f'  > {cmd}' if str(cmd) == str(new_job) else f'    {cmd}' for cmd in existing_jobs))
         print('\n'.join(f'  > {cmd}' if str(cmd) == str(new_job) else f'    {cmd}' for cmd in existing_jobs))
         if total_runs > 60 and not quiet:
         if total_runs > 60 and not quiet:
             stderr()
             stderr()
@@ -1170,7 +1240,7 @@ def schedule(add: bool=False,
         if existing_jobs:
         if existing_jobs:
             print('\n'.join(str(cmd) for cmd in existing_jobs))
             print('\n'.join(str(cmd) for cmd in existing_jobs))
         else:
         else:
-            stderr('{red}[X] There are no ArchiveBox cron jobs scheduled for your user ({}).{reset}'.format(SHELL_CONFIG.USER, **SHELL_CONFIG.ANSI))
+            stderr('{red}[X] There are no ArchiveBox cron jobs scheduled for your user ({}).{reset}'.format(USER, **SHELL_CONFIG.ANSI))
             stderr('    To schedule a new job, run:')
             stderr('    To schedule a new job, run:')
             stderr('        archivebox schedule --every=[timeperiod] --depth=1 https://example.com/some/rss/feed.xml')
             stderr('        archivebox schedule --every=[timeperiod] --depth=1 https://example.com/some/rss/feed.xml')
         raise SystemExit(0)
         raise SystemExit(0)
@@ -1294,7 +1364,7 @@ def manage(args: Optional[List[str]]=None, out_dir: Path=DATA_DIR) -> None:
     check_data_folder()
     check_data_folder()
     from django.core.management import execute_from_command_line
     from django.core.management import execute_from_command_line
 
 
-    if (args and "createsuperuser" in args) and (SHELL_CONFIG.IN_DOCKER and not SHELL_CONFIG.IS_TTY):
+    if (args and "createsuperuser" in args) and (IN_DOCKER and not SHELL_CONFIG.IS_TTY):
         stderr('[!] Warning: you need to pass -it to use interactive commands in docker', color='lightyellow')
         stderr('[!] Warning: you need to pass -it to use interactive commands in docker', color='lightyellow')
         stderr('    docker run -it archivebox manage {}'.format(' '.join(args or ['...'])), color='lightyellow')
         stderr('    docker run -it archivebox manage {}'.format(' '.join(args or ['...'])), color='lightyellow')
         stderr('')
         stderr('')

+ 62 - 19
archivebox/misc/checks.py

@@ -1,37 +1,44 @@
 __package__ = 'archivebox.misc'
 __package__ = 'archivebox.misc'
 
 
-from archivebox.config import DATA_DIR, ARCHIVE_DIR, CONSTANTS, SHELL_CONFIG
+import sys
+from rich import print
 
 
-from .logging import stderr
+# DO NOT ADD ANY TOP-LEVEL IMPORTS HERE
+# this file is imported by archivebox/__init__.py
+# and any imports here will be imported by EVERYTHING else
+# so this file should only be used for pure python checks
+# that don't need to import other parts of ArchiveBox
 
 
 
 
 def check_data_folder() -> None:
 def check_data_folder() -> None:
-
+    from archivebox import DATA_DIR, ARCHIVE_DIR
+    
     archive_dir_exists = ARCHIVE_DIR.exists()
     archive_dir_exists = ARCHIVE_DIR.exists()
     if not archive_dir_exists:
     if not archive_dir_exists:
-        stderr('[X] No archivebox index found in the current directory.', color='red')
-        stderr(f'    {DATA_DIR}', color='lightyellow')
-        stderr()
-        stderr('    {lightred}Hint{reset}: Are you running archivebox in the right folder?'.format(**SHELL_CONFIG.ANSI))
-        stderr('        cd path/to/your/archive/folder')
-        stderr('        archivebox [command]')
-        stderr()
-        stderr('    {lightred}Hint{reset}: To create a new archive collection or import existing data in this folder, run:'.format(**SHELL_CONFIG.ANSI))
-        stderr('        archivebox init')
+        print('[red][X] No archivebox index found in the current directory.[/red]', file=sys.stderr)
+        print(f'    {DATA_DIR}', file=sys.stderr)
+        print(file=sys.stderr)
+        print('    [violet]Hint[/violet]: Are you running archivebox in the right folder?', file=sys.stderr)
+        print('        cd path/to/your/archive/folder', file=sys.stderr)
+        print('        archivebox [command]', file=sys.stderr)
+        print(file=sys.stderr)
+        print('    [violet]Hint[/violet]: To create a new archive collection or import existing data in this folder, run:', file=sys.stderr)
+        print('        archivebox init', file=sys.stderr)
         raise SystemExit(2)
         raise SystemExit(2)
-
-
+    
+    
 def check_migrations():
 def check_migrations():
+    from archivebox import DATA_DIR, CONSTANTS
     from ..index.sql import list_migrations
     from ..index.sql import list_migrations
 
 
     pending_migrations = [name for status, name in list_migrations() if not status]
     pending_migrations = [name for status, name in list_migrations() if not status]
 
 
     if pending_migrations:
     if pending_migrations:
-        stderr('[X] This collection was created with an older version of ArchiveBox and must be upgraded first.', color='lightyellow')
-        stderr(f'    {DATA_DIR}')
-        stderr()
-        stderr(f'    To upgrade it to the latest version and apply the {len(pending_migrations)} pending migrations, run:')
-        stderr('        archivebox init')
+        print('[red][X] This collection was created with an older version of ArchiveBox and must be upgraded first.[/red]')
+        print(f'    {DATA_DIR}', file=sys.stderr)
+        print(file=sys.stderr)
+        print(f'    [violet]Hint:[/violet] To upgrade it to the latest version and apply the {len(pending_migrations)} pending migrations, run:', file=sys.stderr)
+        print('        archivebox init', file=sys.stderr)
         raise SystemExit(3)
         raise SystemExit(3)
 
 
     CONSTANTS.SOURCES_DIR.mkdir(exist_ok=True)
     CONSTANTS.SOURCES_DIR.mkdir(exist_ok=True)
@@ -39,3 +46,39 @@ def check_migrations():
     # CONSTANTS.CACHE_DIR.mkdir(exist_ok=True)
     # CONSTANTS.CACHE_DIR.mkdir(exist_ok=True)
     (CONSTANTS.LIB_DIR / 'bin').mkdir(exist_ok=True, parents=True)
     (CONSTANTS.LIB_DIR / 'bin').mkdir(exist_ok=True, parents=True)
     (CONSTANTS.PERSONAS_DIR / 'Default').mkdir(exist_ok=True, parents=True)
     (CONSTANTS.PERSONAS_DIR / 'Default').mkdir(exist_ok=True, parents=True)
+
+
+def check_io_encoding():
+    PYTHON_ENCODING = (sys.__stdout__ or sys.stdout or sys.__stderr__ or sys.stderr).encoding.upper().replace('UTF8', 'UTF-8')
+            
+    if PYTHON_ENCODING != 'UTF-8':
+        print(f'[red][X] Your system is running python3 scripts with a bad locale setting: {PYTHON_ENCODING} (it should be UTF-8).[/red]', file=sys.stderr)
+        print('    To fix it, add the line "export PYTHONIOENCODING=UTF-8" to your ~/.bashrc file (without quotes)', file=sys.stderr)
+        print('    Or if you\'re using ubuntu/debian, run "dpkg-reconfigure locales"', file=sys.stderr)
+        print('')
+        print('    Confirm that it\'s fixed by opening a new shell and running:', file=sys.stderr)
+        print('        python3 -c "import sys; print(sys.stdout.encoding)"   # should output UTF-8', file=sys.stderr)
+        raise SystemExit(2)
+
+
+def check_not_root():
+    from archivebox.config.permissions import IS_ROOT, IN_DOCKER
+    
+    attempted_command = ' '.join(sys.argv[1:]) if len(sys.argv) > 1 else ''
+    is_getting_help = '-h' in sys.argv or '--help' in sys.argv or 'help' in sys.argv[:2]
+    is_getting_version = '--version' in sys.argv or 'version' in sys.argv[:2]
+    is_installing = 'setup' in sys.argv[:2] or 'install' in sys.argv[:2]
+    
+    if IS_ROOT and not (is_getting_help or is_getting_version or is_installing):
+        print('[red][!] ArchiveBox should never be run as root![/red]', file=sys.stderr)
+        print('    For more information, see the security overview documentation:', file=sys.stderr)
+        print('        https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#do-not-run-as-root', file=sys.stderr)
+        
+        if IN_DOCKER:
+            print('[red][!] When using Docker, you must run commands with [green]docker run[/green] instead of [yellow3]docker exec[/yellow3], e.g.:', file=sys.stderr)
+            print('        docker compose run archivebox {attempted_command}', file=sys.stderr)
+            print(f'        docker run -it -v $PWD/data:/data archivebox/archivebox {attempted_command}', file=sys.stderr)
+            print('        or:', file=sys.stderr)
+            print(f'        docker compose exec --user=archivebox archivebox /bin/bash -c "archivebox {attempted_command}"', file=sys.stderr)
+            print(f'        docker exec -it --user=archivebox <container id> /bin/bash -c "archivebox {attempted_command}"', file=sys.stderr)
+        raise SystemExit(2)

+ 2 - 1
archivebox/misc/logging.py

@@ -13,6 +13,7 @@ from rich.highlighter import Highlighter
 
 
 # SETUP RICH CONSOLE / TTY detection / COLOR / PROGRESS BARS
 # SETUP RICH CONSOLE / TTY detection / COLOR / PROGRESS BARS
 CONSOLE = Console()
 CONSOLE = Console()
+STDERR = Console(stderr=True)
 IS_TTY = CONSOLE.is_interactive
 IS_TTY = CONSOLE.is_interactive
 
 
 
 
@@ -51,7 +52,7 @@ COLOR_DICT = defaultdict(lambda: [(0, 0, 0), (0, 0, 0)], {
     '37': [(255, 255, 255), (255, 255, 255)],
     '37': [(255, 255, 255), (255, 255, 255)],
 })
 })
 
 
-# Logging Helpers
+# Logging Helpers (DEPRECATED, use rich.print instead going forward)
 def stdout(*args, color: Optional[str]=None, prefix: str='', config: Optional[benedict]=None) -> None:
 def stdout(*args, color: Optional[str]=None, prefix: str='', config: Optional[benedict]=None) -> None:
     ansi = DEFAULT_CLI_COLORS if (config or {}).get('USE_COLOR') else ANSI
     ansi = DEFAULT_CLI_COLORS if (config or {}).get('USE_COLOR') else ANSI
 
 

+ 1 - 2
archivebox/misc/system.py

@@ -4,7 +4,6 @@ __package__ = 'archivebox.misc'
 import os
 import os
 import signal
 import signal
 import shutil
 import shutil
-import getpass
 
 
 from json import dump
 from json import dump
 from pathlib import Path
 from pathlib import Path
@@ -14,7 +13,7 @@ from subprocess import _mswindows, PIPE, Popen, CalledProcessError, CompletedPro
 from crontab import CronTab
 from crontab import CronTab
 from atomicwrites import atomic_write as lib_atomic_write
 from atomicwrites import atomic_write as lib_atomic_write
 
 
-from archivebox.config import STORAGE_CONFIG
+from archivebox.config.common import STORAGE_CONFIG
 from archivebox.misc.util import enforce_types, ExtendedEncoder
 from archivebox.misc.util import enforce_types, ExtendedEncoder
 
 
 
 

+ 5 - 5
archivebox/misc/util.py

@@ -1,4 +1,4 @@
-__package__ = 'archivebox'
+__package__ = 'archivebox.misc'
 
 
 import re
 import re
 import requests
 import requests
@@ -25,10 +25,10 @@ except ImportError:
     detect_encoding = lambda rawdata: "utf-8"
     detect_encoding = lambda rawdata: "utf-8"
 
 
 
 
-from archivebox.config.constants import STATICFILE_EXTENSIONS
-from archivebox.config import ARCHIVING_CONFIG
+from archivebox.config import CONSTANTS
+from archivebox.config.common import ARCHIVING_CONFIG
 
 
-from .misc.logging import COLOR_DICT
+from .logging import COLOR_DICT
 
 
 
 
 ### Parsing Helpers
 ### Parsing Helpers
@@ -120,7 +120,7 @@ def find_all_urls(urls_str: str):
 
 
 def is_static_file(url: str):
 def is_static_file(url: str):
     # TODO: the proper way is with MIME type detection + ext, not only extension
     # TODO: the proper way is with MIME type detection + ext, not only extension
-    return extension(url).lower() in STATICFILE_EXTENSIONS
+    return extension(url).lower() in CONSTANTS.STATICFILE_EXTENSIONS
 
 
 
 
 def enforce_types(func):
 def enforce_types(func):

+ 2 - 1
archivebox/parsers/__init__.py

@@ -13,7 +13,8 @@ from typing import IO, Tuple, List, Optional
 from datetime import datetime, timezone
 from datetime import datetime, timezone
 from pathlib import Path 
 from pathlib import Path 
 
 
-from archivebox.config import DATA_DIR, CONSTANTS, SHELL_CONFIG, ARCHIVING_CONFIG
+from archivebox.config import DATA_DIR, CONSTANTS
+from archivebox.config.common import SHELL_CONFIG, ARCHIVING_CONFIG
 from archivebox.misc.system import atomic_write
 from archivebox.misc.system import atomic_write
 from archivebox.misc.logging import stderr, hint
 from archivebox.misc.logging import stderr, hint
 from archivebox.misc.util import (
 from archivebox.misc.util import (

+ 2 - 1
archivebox/plugins_extractor/chrome/apps.py

@@ -25,7 +25,8 @@ from abx.archivebox.base_binary import BaseBinary, env
 from abx.archivebox.base_hook import BaseHook
 from abx.archivebox.base_hook import BaseHook
 
 
 # Depends on Other Plugins:
 # Depends on Other Plugins:
-from archivebox.config import CONSTANTS, ARCHIVING_CONFIG, SHELL_CONFIG
+from archivebox.config import CONSTANTS
+from archivebox.config.common import ARCHIVING_CONFIG, SHELL_CONFIG
 from plugins_pkg.puppeteer.apps import PUPPETEER_BINPROVIDER
 from plugins_pkg.puppeteer.apps import PUPPETEER_BINPROVIDER
 from plugins_pkg.playwright.apps import PLAYWRIGHT_BINPROVIDER
 from plugins_pkg.playwright.apps import PLAYWRIGHT_BINPROVIDER
 
 

+ 1 - 1
archivebox/plugins_extractor/curl/apps.py

@@ -11,7 +11,7 @@ from abx.archivebox.base_configset import BaseConfigSet
 from abx.archivebox.base_binary import BaseBinary, env, apt, brew
 from abx.archivebox.base_binary import BaseBinary, env, apt, brew
 # from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
 # from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
 
 
-from archivebox.config import ARCHIVING_CONFIG
+from archivebox.config.common import ARCHIVING_CONFIG
 from archivebox.plugins_extractor.favicon.apps import FAVICON_CONFIG
 from archivebox.plugins_extractor.favicon.apps import FAVICON_CONFIG
 from archivebox.plugins_extractor.archivedotorg.apps import ARCHIVEDOTORG_CONFIG
 from archivebox.plugins_extractor.archivedotorg.apps import ARCHIVEDOTORG_CONFIG
 
 

+ 1 - 1
archivebox/plugins_extractor/git/apps.py

@@ -11,7 +11,7 @@ from abx.archivebox.base_configset import BaseConfigSet
 from abx.archivebox.base_binary import BaseBinary, env, apt, brew
 from abx.archivebox.base_binary import BaseBinary, env, apt, brew
 from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
 from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
 
 
-from archivebox.config import ARCHIVING_CONFIG
+from archivebox.config.common import ARCHIVING_CONFIG
 
 
 
 
 class GitConfig(BaseConfigSet):
 class GitConfig(BaseConfigSet):

+ 3 - 3
archivebox/plugins_extractor/mercury/apps.py

@@ -5,14 +5,14 @@ from pathlib import Path
 from subprocess import run
 from subprocess import run
 
 
 from pydantic import InstanceOf, Field
 from pydantic import InstanceOf, Field
-from pydantic_pkgr import BinProvider, BinName, bin_abspath
+from pydantic_pkgr import BinProvider, BinName, BinProviderName, ProviderLookupDict, bin_abspath
 
 
 from abx.archivebox.base_plugin import BasePlugin, BaseHook
 from abx.archivebox.base_plugin import BasePlugin, BaseHook
 from abx.archivebox.base_configset import BaseConfigSet
 from abx.archivebox.base_configset import BaseConfigSet
-from abx.archivebox.base_binary import BaseBinary, BinProviderName,ProviderLookupDict, env
+from abx.archivebox.base_binary import BaseBinary, env
 from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
 from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
 
 
-from archivebox.config import ARCHIVING_CONFIG, STORAGE_CONFIG
+from archivebox.config.common import ARCHIVING_CONFIG, STORAGE_CONFIG
 from archivebox.plugins_pkg.npm.apps import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER
 from archivebox.plugins_pkg.npm.apps import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER
 
 
 class MercuryConfig(BaseConfigSet):
 class MercuryConfig(BaseConfigSet):

+ 1 - 1
archivebox/plugins_extractor/readability/apps.py

@@ -16,7 +16,7 @@ from abx.archivebox.base_extractor import BaseExtractor
 from abx.archivebox.base_hook import BaseHook
 from abx.archivebox.base_hook import BaseHook
 
 
 # Depends on Other Plugins:
 # Depends on Other Plugins:
-from archivebox.config import ARCHIVING_CONFIG
+from archivebox.config.common import ARCHIVING_CONFIG
 from plugins_pkg.npm.apps import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER
 from plugins_pkg.npm.apps import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER
 
 
 ###################### Config ##########################
 ###################### Config ##########################

+ 3 - 3
archivebox/plugins_extractor/singlefile/apps.py

@@ -1,11 +1,11 @@
 __package__ = 'archivebox.plugins_extractor.singlefile'
 __package__ = 'archivebox.plugins_extractor.singlefile'
 
 
 from pathlib import Path
 from pathlib import Path
-from typing import List, Dict, Optional, ClassVar
+from typing import List, Dict, Optional
 # from typing_extensions import Self
 # from typing_extensions import Self
 
 
 # Depends on other PyPI/vendor packages:
 # Depends on other PyPI/vendor packages:
-from pydantic import InstanceOf, Field, validate_call
+from pydantic import InstanceOf, Field
 from pydantic_pkgr import BinProvider, BinProviderName, ProviderLookupDict, BinName, bin_abspath, ShallowBinary
 from pydantic_pkgr import BinProvider, BinProviderName, ProviderLookupDict, BinName, bin_abspath, ShallowBinary
 
 
 # Depends on other Django apps:
 # Depends on other Django apps:
@@ -17,7 +17,7 @@ from abx.archivebox.base_queue import BaseQueue
 from abx.archivebox.base_hook import BaseHook
 from abx.archivebox.base_hook import BaseHook
 
 
 # Depends on Other Plugins:
 # Depends on Other Plugins:
-from archivebox.config import ARCHIVING_CONFIG
+from archivebox.config.common import ARCHIVING_CONFIG
 from plugins_pkg.npm.apps import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER
 from plugins_pkg.npm.apps import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER
 
 
 ###################### Config ##########################
 ###################### Config ##########################

+ 1 - 1
archivebox/plugins_extractor/wget/apps.py

@@ -14,7 +14,7 @@ from abx.archivebox.base_configset import BaseConfigSet
 from abx.archivebox.base_binary import BaseBinary, env, apt, brew
 from abx.archivebox.base_binary import BaseBinary, env, apt, brew
 from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
 from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
 
 
-from archivebox.config import ARCHIVING_CONFIG, STORAGE_CONFIG
+from archivebox.config.common import ARCHIVING_CONFIG, STORAGE_CONFIG
 from .wget_util import wget_output_path
 from .wget_util import wget_output_path
 
 
 
 

+ 1 - 1
archivebox/plugins_extractor/ytdlp/apps.py

@@ -11,7 +11,7 @@ from abx.archivebox.base_configset import BaseConfigSet
 from abx.archivebox.base_binary import BaseBinary, env, apt, brew
 from abx.archivebox.base_binary import BaseBinary, env, apt, brew
 from abx.archivebox.base_hook import BaseHook
 from abx.archivebox.base_hook import BaseHook
 
 
-from archivebox.config import ARCHIVING_CONFIG
+from archivebox.config.common import ARCHIVING_CONFIG
 from plugins_pkg.pip.apps import pip
 from plugins_pkg.pip.apps import pip
 
 
 ###################### Config ##########################
 ###################### Config ##########################

+ 2 - 1
archivebox/plugins_search/ripgrep/apps.py

@@ -18,7 +18,8 @@ from abx.archivebox.base_hook import BaseHook
 from abx.archivebox.base_searchbackend import BaseSearchBackend
 from abx.archivebox.base_searchbackend import BaseSearchBackend
 
 
 # Depends on Other Plugins:
 # Depends on Other Plugins:
-from archivebox.config import CONSTANTS, SEARCH_BACKEND_CONFIG
+from archivebox.config import CONSTANTS
+from archivebox.config.common import SEARCH_BACKEND_CONFIG
 
 
 ###################### Config ##########################
 ###################### Config ##########################
 
 

+ 1 - 1
archivebox/plugins_search/sonic/apps.py

@@ -15,7 +15,7 @@ from abx.archivebox.base_hook import BaseHook
 from abx.archivebox.base_searchbackend import BaseSearchBackend
 from abx.archivebox.base_searchbackend import BaseSearchBackend
 
 
 # Depends on Other Plugins:
 # Depends on Other Plugins:
-from archivebox.config import SEARCH_BACKEND_CONFIG
+from archivebox.config.common import SEARCH_BACKEND_CONFIG
 
 
 SONIC_LIB = None
 SONIC_LIB = None
 try:
 try:

+ 1 - 1
archivebox/plugins_search/sqlite/apps.py

@@ -17,7 +17,7 @@ from abx.archivebox.base_hook import BaseHook
 from abx.archivebox.base_searchbackend import BaseSearchBackend
 from abx.archivebox.base_searchbackend import BaseSearchBackend
 
 
 # Depends on Other Plugins:
 # Depends on Other Plugins:
-from archivebox.config import SEARCH_BACKEND_CONFIG
+from archivebox.config.common import SEARCH_BACKEND_CONFIG
 
 
 
 
 
 

+ 4 - 0
archivebox/queues/supervisor_util.py

@@ -1,5 +1,6 @@
 __package__ = 'archivebox.queues'
 __package__ = 'archivebox.queues'
 
 
+import os
 import time
 import time
 import signal
 import signal
 import psutil
 import psutil
@@ -12,6 +13,8 @@ from typing import Dict, cast
 from supervisor.xmlrpc import SupervisorTransport
 from supervisor.xmlrpc import SupervisorTransport
 from xmlrpc.client import ServerProxy
 from xmlrpc.client import ServerProxy
 
 
+from archivebox.config.permissions import ARCHIVEBOX_USER
+
 from .settings import SUPERVISORD_CONFIG_FILE, DATA_DIR, PID_FILE, SOCK_FILE, LOG_FILE, WORKERS_DIR, TMP_DIR, LOGS_DIR
 from .settings import SUPERVISORD_CONFIG_FILE, DATA_DIR, PID_FILE, SOCK_FILE, LOG_FILE, WORKERS_DIR, TMP_DIR, LOGS_DIR
 
 
 from typing import Iterator
 from typing import Iterator
@@ -42,6 +45,7 @@ childlogdir = {LOGS_DIR}
 directory = {DATA_DIR}
 directory = {DATA_DIR}
 strip_ansi = true
 strip_ansi = true
 nocleanup = true
 nocleanup = true
+user = {ARCHIVEBOX_USER}
 
 
 [unix_http_server]
 [unix_http_server]
 file = {TMP_DIR}/{SOCK_FILE.name}
 file = {TMP_DIR}/{SOCK_FILE.name}

+ 1 - 1
archivebox/search/__init__.py

@@ -11,7 +11,7 @@ import abx.archivebox.use
 from archivebox.index.schema import Link
 from archivebox.index.schema import Link
 from archivebox.misc.util import enforce_types
 from archivebox.misc.util import enforce_types
 from archivebox.misc.logging import stderr
 from archivebox.misc.logging import stderr
-from archivebox.config import SEARCH_BACKEND_CONFIG
+from archivebox.config.common import SEARCH_BACKEND_CONFIG
 
 
 
 
 def log_index_started(url):
 def log_index_started(url):

+ 5 - 6
bin/docker_entrypoint.sh

@@ -110,12 +110,11 @@ if [[ -d "$PLAYWRIGHT_BROWSERS_PATH/.links" ]]; then
     chown -h $PUID:$PGID "$PLAYWRIGHT_BROWSERS_PATH"/.links/*
     chown -h $PUID:$PGID "$PLAYWRIGHT_BROWSERS_PATH"/.links/*
 fi
 fi
 
 
-# also chown tmp dir
-mkdir -p /tmp/archivebox
-chmod 777 /tmp
-chown $PUID:$PGID /tmp/archivebox
-mkdir -p /app/lib
-chown $PUID:$PGID /app/lib /app/lib/*
+# also chown tmp dir and lib dir
+mkdir -p "$SYSTEM_TMP_DIR"
+chown $PUID:$PGID "$SYSTEM_TMP_DIR"
+mkdir -p "$SYSTEM_LIB_DIR"
+chown $PUID:$PGID "$SYSTEM_LIB_DIR" "$SYSTEM_LIB_DIR"/*
 
 
 # (this check is written in blood in 2023, QEMU silently breaks things in ways that are not obvious)
 # (this check is written in blood in 2023, QEMU silently breaks things in ways that are not obvious)
 export IN_QEMU="$(pmap 1 | grep qemu >/dev/null && echo 'True' || echo 'False')"
 export IN_QEMU="$(pmap 1 | grep qemu >/dev/null && echo 'True' || echo 'False')"

+ 2 - 2
pyproject.toml

@@ -1,6 +1,6 @@
 [project]
 [project]
 name = "archivebox"
 name = "archivebox"
-version = "0.8.5rc2"
+version = "0.8.5rc3"
 requires-python = ">=3.10"
 requires-python = ">=3.10"
 description = "Self-hosted internet archiving solution."
 description = "Self-hosted internet archiving solution."
 authors = [{name = "Nick Sweeting", email = "[email protected]"}]
 authors = [{name = "Nick Sweeting", email = "[email protected]"}]
@@ -77,6 +77,7 @@ dependencies = [
     "atomicwrites==1.4.1",
     "atomicwrites==1.4.1",
     "django-taggit==1.3.0",
     "django-taggit==1.3.0",
     "base32-crockford==0.3.0",
     "base32-crockford==0.3.0",
+    "platformdirs>=4.3.6",
     # "pocket@git+https://github.com/tapanpandita/[email protected]",
     # "pocket@git+https://github.com/tapanpandita/[email protected]",
     # "pydantic-pkgr>=0.4.7",
     # "pydantic-pkgr>=0.4.7",
     ############# Plugin Dependencies ################
     ############# Plugin Dependencies ################
@@ -133,7 +134,6 @@ dev-dependencies = [
     "django-autotyping>=0.5.1",
     "django-autotyping>=0.5.1",
 ]
 ]
 
 
-
 [build-system]
 [build-system]
 requires = ["pdm-backend"]
 requires = ["pdm-backend"]
 build-backend = "pdm.backend"
 build-backend = "pdm.backend"

+ 4 - 4
tests/test_init.py

@@ -7,11 +7,11 @@ from pathlib import Path
 import json, shutil
 import json, shutil
 import sqlite3
 import sqlite3
 
 
-from archivebox.config import OUTPUT_PERMISSIONS
+from archivebox.config.common import STORAGE_CONFIG
 
 
 from .fixtures import *
 from .fixtures import *
 
 
-DIR_PERMISSIONS = OUTPUT_PERMISSIONS.replace('6', '7').replace('4', '5')
+DIR_PERMISSIONS = STORAGE_CONFIG.OUTPUT_PERMISSIONS.replace('6', '7').replace('4', '5')
 
 
 def test_init(tmp_path, process):
 def test_init(tmp_path, process):
     assert "Initializing a new ArchiveBox" in process.stdout.decode("utf-8")
     assert "Initializing a new ArchiveBox" in process.stdout.decode("utf-8")
@@ -57,7 +57,7 @@ def test_correct_permissions_output_folder(tmp_path, process):
     index_files = ['index.sqlite3', 'archive']
     index_files = ['index.sqlite3', 'archive']
     for file in index_files:
     for file in index_files:
         file_path = tmp_path / file
         file_path = tmp_path / file
-        assert oct(file_path.stat().st_mode)[-3:] in (OUTPUT_PERMISSIONS, DIR_PERMISSIONS)
+        assert oct(file_path.stat().st_mode)[-3:] in (STORAGE_CONFIG.OUTPUT_PERMISSIONS, DIR_PERMISSIONS)
 
 
 def test_correct_permissions_add_command_results(tmp_path, process, disable_extractors_dict):
 def test_correct_permissions_add_command_results(tmp_path, process, disable_extractors_dict):
     os.chdir(tmp_path)
     os.chdir(tmp_path)
@@ -65,7 +65,7 @@ def test_correct_permissions_add_command_results(tmp_path, process, disable_extr
                                   env=disable_extractors_dict)
                                   env=disable_extractors_dict)
     archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
     archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
     for path in archived_item_path.iterdir():
     for path in archived_item_path.iterdir():
-        assert oct(path.stat().st_mode)[-3:] in (OUTPUT_PERMISSIONS, DIR_PERMISSIONS)
+        assert oct(path.stat().st_mode)[-3:] in (STORAGE_CONFIG.OUTPUT_PERMISSIONS, DIR_PERMISSIONS)
 
 
 def test_collision_urls_different_timestamps(tmp_path, process, disable_extractors_dict):
 def test_collision_urls_different_timestamps(tmp_path, process, disable_extractors_dict):
     os.chdir(tmp_path)
     os.chdir(tmp_path)

+ 1 - 1
uv.lock

@@ -41,7 +41,7 @@ wheels = [
 
 
 [[package]]
 [[package]]
 name = "archivebox"
 name = "archivebox"
-version = "0.8.5rc2"
+version = "0.8.5rc3"
 source = { editable = "." }
 source = { editable = "." }
 dependencies = [
 dependencies = [
     { name = "atomicwrites" },
     { name = "atomicwrites" },