| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413 |
- __package__ = 'archivebox.config'
- import os
- import socket
- import hashlib
- import tempfile
- import platform
- from pathlib import Path
- from functools import cache
- from datetime import datetime
- from benedict import benedict
- from .permissions import SudoPermission, IS_ROOT, ARCHIVEBOX_USER
- #############################################################################################
# Directory containing the archivebox source code (two levels above this module's resolved path).
PACKAGE_DIR: Path = Path(__file__).resolve().parents[1]

# User data dir: whatever directory the process was started in.
DATA_DIR: Path = Path.cwd().resolve()

# Snapshot data lives in the "archive" subfolder of the data dir.
ARCHIVE_DIR: Path = DATA_DIR.joinpath('archive')

# True only when the IN_DOCKER env var is set to one of the recognised truthy spellings.
IN_DOCKER = os.environ.get('IN_DOCKER', False) in {'1', 'true', 'True', 'TRUE', 'yes'}

# Main SQLite index file inside the data dir.
DATABASE_FILE = DATA_DIR.joinpath('index.sqlite3')
- #############################################################################################
def _get_collection_id(DATA_DIR=DATA_DIR, force_create=False) -> str:
    """Return the short ID of the collection in DATA_DIR, generating one if none is stored.

    The ID is read from ``DATA_DIR/.archivebox_id`` when that file exists and is
    non-empty. Otherwise a new 8-character hex ID is derived from the machine ID,
    the resolved collection path and the directory's ``st_ctime``.

    The generated ID is only written to disk when DATA_DIR already looks like an
    active collection (index file + archive dir present, dir writable) or when
    ``force_create`` is true, so that no stray files are created in unrelated dirs.
    Any filesystem error while persisting is ignored; the ID is still returned.
    """
    collection_id_file = DATA_DIR / '.archivebox_id'

    try:
        stored_id = collection_id_file.read_text().strip()
    except OSError:
        # missing/unreadable file: fall through and generate a fresh ID
        stored_id = ''
    if stored_id:
        return stored_id

    # hash the machine_id + collection dir path + dir ctime to get a unique collection_id
    machine_id = get_machine_id()
    collection_path = DATA_DIR.resolve()
    try:
        # NOTE: st_ctime is the inode change time on most Unix systems, not a true creation time
        creation_date = DATA_DIR.stat().st_ctime
    except Exception:
        creation_date = datetime.now().isoformat()
    collection_id = hashlib.sha256(f'{machine_id}:{collection_path}@{creation_date}'.encode()).hexdigest()[:8]

    try:
        # only persist collection_id file if we already have an index.sqlite3 file present
        # otherwise we might be running in a directory that is not a collection, no point creating cruft files.
        # The check is made relative to the DATA_DIR argument so a non-default DATA_DIR is inspected correctly.
        collection_is_active = (
            os.path.isfile(DATA_DIR / DATABASE_FILE.name)
            and os.path.isdir(DATA_DIR / ARCHIVE_DIR.name)
            and os.access(DATA_DIR, os.W_OK)
        )
        if collection_is_active or force_create:
            collection_id_file.write_text(collection_id)

            # if we're running as root right now, make sure the collection_id file is owned by the archivebox user
            if IS_ROOT:
                with SudoPermission(uid=0):
                    # direct syscalls instead of shell strings, so unusual characters in the path are harmless
                    if ARCHIVEBOX_USER == 0:
                        os.chmod(collection_id_file, 0o777)
                    else:
                        # assumes ARCHIVEBOX_USER is a numeric uid (it is compared to 0 above); -1 leaves the group unchanged
                        os.chown(collection_id_file, ARCHIVEBOX_USER, -1)
    except OSError:
        # persisting the ID is best-effort only
        pass
    return collection_id
@cache
def get_collection_id(DATA_DIR=DATA_DIR) -> str:
    """Return a short, stable, unique ID for the current collection (e.g. abc45678).

    Results are memoized per DATA_DIR argument for the lifetime of the process.
    """
    collection_id = _get_collection_id(DATA_DIR=DATA_DIR)
    return collection_id
@cache
def get_machine_id() -> str:
    """Return a short, stable, unique ID for the current machine (e.g. abc45678).

    Tries the optional ``machineid`` package first, then falls back to hashing
    ``uuid.getnode()``; returns ``'unknown'`` if both approaches fail.
    """
    # preferred source: the machineid package (may not be installed)
    try:
        import machineid
        return machineid.hashed_id('archivebox')[:8]
    except Exception:
        pass

    # fallback: hash the hardware node number reported by the uuid module
    try:
        import uuid
        node_bytes = str(uuid.getnode()).encode()
        return hashlib.sha256(node_bytes).hexdigest()[:8]
    except Exception:
        return 'unknown'
@cache
def get_machine_type() -> str:
    """Return a short type identifier for the current machine (e.g. x86_64-linux-docker).

    The format is ``<arch>-<os>`` with a ``-docker`` suffix appended when IN_DOCKER is set.
    """
    arch_name = platform.machine().lower()    # arm64, x86_64, aarch64, etc.
    os_name = platform.system().lower()       # darwin, linux, etc.

    machine_type = f'{arch_name}-{os_name}'
    if IN_DOCKER:
        machine_type = f'{machine_type}-docker'
    return machine_type
def dir_is_writable(dir_path: Path, uid: int | None = None, gid: int | None = None, fallback=True, chown=True) -> bool:
    """Check if a given directory is writable by a specific user and group.

    Args:
        dir_path: directory to probe; a temporary ``.permissions_test`` file is written and removed.
        uid: user id to check as; defaults to the current effective uid when None.
        gid: group id used when attempting to fix ownership; defaults to the current effective gid when None.
        fallback: passed through to SudoPermission (presumably: try as the current user
            if unable to switch to the provided uid -- confirm against SudoPermission).
        chown: when the first probe fails, try once to chown the dir to uid:gid and probe again.

    Returns:
        True if the test file could be written and removed, otherwise False.
    """
    # compare against None explicitly so that uid=0 / gid=0 (root) are honoured rather than replaced
    uid = os.geteuid() if uid is None else uid
    gid = os.getegid() if gid is None else gid

    test_file = dir_path / '.permissions_test'
    try:
        with SudoPermission(uid=uid, fallback=fallback):
            test_file.exists()
            test_file.write_text(f'Checking if PUID={uid} PGID={gid} can write to dir')
            test_file.unlink()
            return True
    except OSError:
        if chown:
            # try fixing it using sudo permissions, then probe once more without retrying the fix
            with SudoPermission(uid=uid, fallback=fallback):
                try:
                    # direct syscall instead of a shell string, so unusual characters in the path are harmless
                    os.chown(dir_path, uid, gid)
                except OSError:
                    # best-effort, the re-check below reports the real outcome
                    pass
            return dir_is_writable(dir_path, uid=uid, gid=gid, fallback=fallback, chown=False)
    return False
def assert_dir_can_contain_unix_sockets(dir_path: Path) -> bool:
    """Check if a given directory can contain unix sockets (e.g. /tmp/supervisord.sock).

    Binds a throwaway AF_UNIX socket at ``dir_path/.test_socket.sock`` and removes it again.

    Returns:
        True when the socket could be created.

    Raises:
        Exception: if the test socket could not be created (the original error is chained as the cause).
    """
    def _remove_if_present(path: str) -> None:
        # ignore a missing/unremovable file, the bind() call is the real test
        try:
            os.remove(path)
        except OSError:
            pass

    socket_path = None
    try:
        socket_path = str(dir_path / '.test_socket.sock')
        _remove_if_present(socket_path)   # clear any stale socket left by a previous run
        # the context manager guarantees the socket is closed even when bind() fails
        with socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) as test_socket:
            test_socket.bind(socket_path)
    except Exception as e:
        # imported lazily: only needed to format the error message
        from archivebox.misc.logging_util import pretty_path
        raise Exception(f'ArchiveBox failed to create a test UNIX socket file in {pretty_path(dir_path, color=False)}') from e
    finally:
        # never leave the test socket file behind, on success or failure
        if socket_path is not None:
            _remove_if_present(socket_path)

    return True
def create_and_chown_dir(dir_path: Path) -> None:
    """Create dir_path (including parents) and hand ownership of it to ARCHIVEBOX_USER.

    The directory itself is chowned synchronously; its direct children are chowned by a
    backgrounded shell command so that large directories do not block the caller.
    Failures of the chown commands are not reported (stderr is discarded and the exit
    status is ignored).
    """
    import shlex

    # quote the path for the shell so quotes, `$`, backticks, etc. in it are taken literally
    quoted_dir = shlex.quote(str(dir_path))

    with SudoPermission(uid=0, fallback=True):
        dir_path.mkdir(parents=True, exist_ok=True)
        os.system(f'chown {ARCHIVEBOX_USER} {quoted_dir} 2>/dev/null')
        # the glob stays outside the quotes so the shell still expands it
        os.system(f'chown {ARCHIVEBOX_USER} {quoted_dir}/* 2>/dev/null &')
@cache
def get_or_create_working_tmp_dir(autofix=True, quiet=True):
    """Find (creating it if needed) the first candidate TMP_DIR that passes check_tmp_dir.

    When ``autofix`` is true and the chosen dir differs from STORAGE_CONFIG.TMP_DIR, the
    config is updated in place. Returns the chosen path, or None when nothing works and
    ``quiet`` is true; raises OSError when nothing works and ``quiet`` is false.
    """
    from archivebox import CONSTANTS
    from archivebox.config.common import STORAGE_CONFIG
    from archivebox.misc.checks import check_tmp_dir

    # potential directories, in order of preference
    configured_dirs = [
        STORAGE_CONFIG.TMP_DIR,         # <user-specified>
        CONSTANTS.DEFAULT_TMP_DIR,      # ./data/tmp/<machine_id>
    ]
    collection_id = get_collection_id()
    short_id = collection_id[:4]
    home_tmp = Path('~/.tmp/archivebox').expanduser()
    system_tmp = Path(tempfile.gettempdir())
    candidate_dirs = [
        *configured_dirs,
        Path('/var/run/archivebox') / collection_id,    # /var/run/archivebox/<collection_id>
        Path('/tmp') / 'archivebox' / collection_id,    # /tmp/archivebox/<collection_id>
        home_tmp / collection_id,                       # ~/.tmp/archivebox/<collection_id>
        system_tmp / 'archivebox' / collection_id,      # <system tmp>/archivebox/<collection_id>
        system_tmp / 'archivebox' / short_id,           # <system tmp>/archivebox/<first 4 chars of id>
        system_tmp / 'abx' / short_id,                  # <system tmp>/abx/<first 4 chars of id>
    ]

    for tmp_dir in candidate_dirs:
        try:
            create_and_chown_dir(tmp_dir)
        except Exception:
            # creation is best-effort, the check below decides whether the dir is usable
            pass

        if not check_tmp_dir(tmp_dir, throw=False, quiet=True, must_exist=True):
            continue

        if autofix and STORAGE_CONFIG.TMP_DIR != tmp_dir:
            STORAGE_CONFIG.update_in_place(TMP_DIR=tmp_dir)
        return tmp_dir

    if quiet:
        return None
    raise OSError(f'ArchiveBox is unable to find a writable TMP_DIR, tried {candidate_dirs}!')
@cache
def get_or_create_working_lib_dir(autofix=True, quiet=False):
    """Find (creating it if needed) the first candidate LIB_DIR that passes check_lib_dir.

    When ``autofix`` is true and the chosen dir differs from STORAGE_CONFIG.LIB_DIR, the
    config is updated in place. Returns the chosen path, or None when nothing works and
    ``quiet`` is true; raises OSError when nothing works and ``quiet`` is false.
    """
    from archivebox import CONSTANTS
    from archivebox.config.common import STORAGE_CONFIG
    from archivebox.misc.checks import check_lib_dir

    # potential directories, in order of preference
    candidate_dirs = [
        STORAGE_CONFIG.LIB_DIR,         # <user-specified>
        CONSTANTS.DEFAULT_LIB_DIR,      # ./data/lib/arm64-linux-docker
    ]
    collection_id = get_collection_id()
    candidate_dirs.append(Path('/usr/local/share/archivebox') / collection_id)     # /usr/local/share/archivebox/<collection_id>
    if os.path.isfile('/opt/homebrew/bin/archivebox'):
        # only offered when archivebox was installed via homebrew
        candidate_dirs.append(Path('/opt/homebrew/share/archivebox') / collection_id)  # /opt/homebrew/share/archivebox/<collection_id>
    candidate_dirs.append(Path('~/.local/share/archivebox').expanduser() / collection_id)  # ~/.local/share/archivebox/<collection_id>

    for lib_dir in candidate_dirs:
        try:
            create_and_chown_dir(lib_dir)
        except Exception:
            # creation is best-effort, the check below decides whether the dir is usable
            pass

        if not check_lib_dir(lib_dir, throw=False, quiet=True, must_exist=True):
            continue

        if autofix and STORAGE_CONFIG.LIB_DIR != lib_dir:
            STORAGE_CONFIG.update_in_place(LIB_DIR=lib_dir)
        return lib_dir

    if quiet:
        return None
    raise OSError(f'ArchiveBox is unable to find a writable LIB_DIR, tried {candidate_dirs}!')
@cache
def get_data_locations():
    """Return a benedict describing each data location: its path, whether it is enabled,
    whether it is currently valid (exists with read + write access), and for some entries
    whether it is a mount point.
    """
    from archivebox.config import CONSTANTS
    from archivebox.config.common import STORAGE_CONFIG

    def can_read_write(path) -> bool:
        return os.access(path, os.R_OK) and os.access(path, os.W_OK)

    def is_usable_dir(path) -> bool:
        return os.path.isdir(path) and can_read_write(path)

    def is_usable_file(path) -> bool:
        return os.path.isfile(path) and can_read_write(path)

    config_file = CONSTANTS.CONFIG_FILE
    queue_db_file = CONSTANTS.QUEUE_DATABASE_FILE
    sources_dir = CONSTANTS.SOURCES_DIR
    personas_dir = CONSTANTS.PERSONAS_DIR
    logs_dir = CONSTANTS.LOGS_DIR
    tmp_dir = STORAGE_CONFIG.TMP_DIR

    locations = {
        'DATA_DIR': {
            'path': DATA_DIR.resolve(),
            'enabled': True,
            'is_valid': is_usable_dir(DATA_DIR),
            'is_mount': os.path.ismount(DATA_DIR.resolve()),
        },
        'CONFIG_FILE': {
            'path': config_file.resolve(),
            'enabled': True,
            'is_valid': is_usable_file(config_file),
        },
        'SQL_INDEX': {
            'path': DATABASE_FILE.resolve(),
            'enabled': True,
            'is_valid': is_usable_file(DATABASE_FILE),
            'is_mount': os.path.ismount(DATABASE_FILE.resolve()),
        },
        'QUEUE_DATABASE': {
            # unlike the other entries, this path is reported without resolve()
            'path': queue_db_file,
            'enabled': True,
            'is_valid': is_usable_file(queue_db_file),
            'is_mount': os.path.ismount(queue_db_file),
        },
        'ARCHIVE_DIR': {
            'path': ARCHIVE_DIR.resolve(),
            'enabled': True,
            'is_valid': is_usable_dir(ARCHIVE_DIR),
            'is_mount': os.path.ismount(ARCHIVE_DIR.resolve()),
        },
        'SOURCES_DIR': {
            'path': sources_dir.resolve(),
            'enabled': True,
            'is_valid': is_usable_dir(sources_dir),
        },
        'PERSONAS_DIR': {
            'path': personas_dir.resolve(),
            'enabled': os.path.isdir(personas_dir),     # optional: only enabled when the dir exists
            'is_valid': is_usable_dir(personas_dir),
        },
        'LOGS_DIR': {
            'path': logs_dir.resolve(),
            'enabled': True,
            'is_valid': is_usable_dir(logs_dir),
        },
        'TMP_DIR': {
            'path': tmp_dir.resolve(),
            'enabled': True,
            'is_valid': is_usable_dir(tmp_dir),
        },
        # "CACHE_DIR": {
        #     "path": CACHE_DIR.resolve(),
        #     "enabled": True,
        #     "is_valid": os.access(CACHE_DIR, os.R_OK) and os.access(CACHE_DIR, os.W_OK),  # read + write
        # },
    }
    return benedict(locations)
@cache
def get_code_locations():
    """Return a benedict describing each code location: its path, whether it is enabled,
    and whether it currently passes the access checks needed to use it.
    """
    from archivebox.config import CONSTANTS
    from archivebox.config.common import STORAGE_CONFIG

    return benedict({
        'PACKAGE_DIR': {
            'path': (PACKAGE_DIR).resolve(),
            'enabled': True,
            'is_valid': os.access(PACKAGE_DIR / '__main__.py', os.X_OK),                                                                # executable
        },
        'TEMPLATES_DIR': {
            'path': CONSTANTS.TEMPLATES_DIR.resolve(),
            'enabled': True,
            # validate the same directory that is reported in 'path' (previously STATIC_DIR was checked here)
            'is_valid': os.access(CONSTANTS.TEMPLATES_DIR, os.R_OK) and os.access(CONSTANTS.TEMPLATES_DIR, os.X_OK),                    # read + list
        },
        'CUSTOM_TEMPLATES_DIR': {
            'path': CONSTANTS.CUSTOM_TEMPLATES_DIR.resolve(),
            'enabled': os.path.isdir(CONSTANTS.CUSTOM_TEMPLATES_DIR),                                                                   # optional: only enabled when the dir exists
            'is_valid': os.path.isdir(CONSTANTS.CUSTOM_TEMPLATES_DIR) and os.access(CONSTANTS.CUSTOM_TEMPLATES_DIR, os.R_OK),           # read
        },
        'USER_PLUGINS_DIR': {
            'path': CONSTANTS.USER_PLUGINS_DIR.resolve(),
            'enabled': os.path.isdir(CONSTANTS.USER_PLUGINS_DIR),                                                                       # optional: only enabled when the dir exists
            'is_valid': os.path.isdir(CONSTANTS.USER_PLUGINS_DIR) and os.access(CONSTANTS.USER_PLUGINS_DIR, os.R_OK),                   # read
        },
        'LIB_DIR': {
            'path': STORAGE_CONFIG.LIB_DIR.resolve(),
            'enabled': True,
            'is_valid': os.path.isdir(STORAGE_CONFIG.LIB_DIR) and os.access(STORAGE_CONFIG.LIB_DIR, os.R_OK) and os.access(STORAGE_CONFIG.LIB_DIR, os.W_OK),  # read + write
        },
    })
- # @cache
- # def get_LIB_DIR():
- # """
- # - should be shared with other collections on the same host
- # - must be scoped by CPU architecture, OS family, and archivebox version
- # - should not be shared with other hosts/archivebox versions
- # - must be writable by any archivebox user
- # - should be persistent across reboots
- # - can be on a docker bind mount but probably shouldn't be
- # - ok to have a long path (doesn't contain SOCKETS)
- # """
- # from .version import detect_installed_version
-
- # HOST_DIRS = PlatformDirs(appname='archivebox', appauthor='ArchiveBox', version=detect_installed_version(), opinion=True, ensure_exists=False)
-
- # lib_dir = tempfile.gettempdir()
- # try:
- # if 'SYSTEM_LIB_DIR' in os.environ:
- # lib_dir = Path(os.environ['SYSTEM_LIB_DIR'])
- # else:
- # with SudoPermission(uid=ARCHIVEBOX_USER, fallback=True):
- # lib_dir = HOST_DIRS.site_data_path
-
- # # Docker: /usr/local/share/archivebox/0.8.5
- # # Ubuntu: /usr/local/share/archivebox/0.8.5
- # # macOS: /Library/Application Support/archivebox
- # try:
- # with SudoPermission(uid=0, fallback=True):
- # lib_dir.mkdir(parents=True, exist_ok=True)
- # except PermissionError:
- # # our user cannot
- # lib_dir = HOST_DIRS.user_data_path
- # lib_dir.mkdir(parents=True, exist_ok=True)
-
- # if IS_ROOT or not dir_is_writable(lib_dir, uid=ARCHIVEBOX_USER):
- # if IS_ROOT:
- # # make sure lib dir is owned by the archivebox user, not root
- # with SudoPermission(uid=0):
- # if ARCHIVEBOX_USER == 0:
- # # print(f'[yellow]:warning: Warning: Creating SYSTEM_LIB_DIR {lib_dir} with mode 777 so that non-root archivebox users can share it.[/yellow] (caches shared libs used by archivebox for performance)', file=sys.stderr)
- # os.system(f'chmod -R 777 "{lib_dir}"')
- # else:
- # os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{lib_dir}"')
- # else:
- # raise PermissionError()
- # except (PermissionError, AssertionError):
- # # raise PermissionError(f'SYSTEM_LIB_DIR {lib_dir} is not writable by archivebox user {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}')
- # print(f'[red]:cross_mark: ERROR: SYSTEM_LIB_DIR {lib_dir} is not writable by archivebox user {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}[/red]', file=sys.stderr)
-
- # return lib_dir
-
- # @cache
- # def get_TMP_DIR():
- # """
- # - must NOT be inside DATA_DIR / inside a docker volume bind mount
- # - must NOT have a long PATH (UNIX socket path length restrictions)
- # - must NOT be shared with other collections/hosts
- # - must be writable by archivebox user & root
- # - must be cleared on every boot / not persisted
- # - must be cleared on every archivebox version upgrade
- # """
- # from .version import detect_installed_version
-
- # HOST_DIRS = PlatformDirs(appname='archivebox', appauthor='ArchiveBox', version=detect_installed_version(), opinion=True, ensure_exists=False)
-
- # # print('DATA_DIR OWNED BY:', ARCHIVEBOX_USER, ARCHIVEBOX_GROUP)
- # # print('RUNNING AS:', self.PUID, self.PGID)
- # run_dir = tempfile.gettempdir()
- # try:
- # if 'SYSTEM_TMP_DIR' in os.environ:
- # run_dir = Path(os.environ['SYSTEM_TMP_DIR']).resolve() / get_collection_id(DATA_DIR=DATA_DIR)
- # with SudoPermission(uid=0, fallback=True):
- # run_dir.mkdir(parents=True, exist_ok=True)
- # if not dir_is_writable(run_dir, uid=ARCHIVEBOX_USER):
- # if IS_ROOT:
- # with SudoPermission(uid=0, fallback=False):
- # if ARCHIVEBOX_USER == 0:
- # # print(f'[yellow]:warning: Warning: Creating SYSTEM_TMP_DIR {run_dir} with mode 777 so that non-root archivebox users can access it.[/yellow]', file=sys.stderr)
- # os.system(f'chmod -R 777 "{run_dir}"')
- # else:
- # os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{run_dir}"')
- # else:
- # raise PermissionError()
- # assert len(str(run_dir / 'supervisord.conf')) < 95, 'SYSTEM_TMP_DIR path is too long, please set SYSTEM_TMP_DIR env variable to a shorter path (unfortunately unix requires socket paths be < 108 chars)'
- # return run_dir
-
- # run_dir = (HOST_DIRS.site_runtime_path / get_collection_id(DATA_DIR=DATA_DIR)).resolve()
- # try:
- # assert len(str(run_dir)) + len('/supervisord.sock') < 95
- # except AssertionError:
- # run_dir = Path(tempfile.gettempdir()).resolve() / 'archivebox' / get_collection_id(DATA_DIR=DATA_DIR)
- # assert len(str(run_dir)) + len('/supervisord.sock') < 95, 'SYSTEM_TMP_DIR path is too long, please set SYSTEM_TMP_DIR env variable to a shorter path (unfortunately unix requires socket paths be < 108 chars)'
-
- # with SudoPermission(uid=0, fallback=True):
- # run_dir.mkdir(parents=True, exist_ok=True)
-
- # if IS_ROOT or not dir_is_writable(run_dir, uid=ARCHIVEBOX_USER):
- # if IS_ROOT:
- # with SudoPermission(uid=0):
- # if ARCHIVEBOX_USER == 0:
- # # print(f'[yellow]:warning: Warning: Creating SYSTEM_TMP_DIR {run_dir} with mode 777 so that non-root archivebox users can access it.[/yellow]', file=sys.stderr)
- # os.system(f'chmod -R 777 "{run_dir}"')
- # else:
- # os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{run_dir}"')
- # else:
- # raise PermissionError()
-
- # except (PermissionError, AssertionError):
- # # raise PermissionError(f'SYSTEM_TMP_DIR {run_dir} is not writable by archivebox user {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}')
- # print(f'[red]:cross_mark: ERROR: SYSTEM_TMP_DIR {run_dir} is not writable by archivebox user {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}[/red]', file=sys.stderr)
-
- # return run_dir
|