paths.py 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413
  1. __package__ = 'archivebox.config'
  2. import os
  3. import socket
  4. import hashlib
  5. import tempfile
  6. import platform
  7. from pathlib import Path
  8. from functools import cache
  9. from datetime import datetime
  10. from benedict import benedict
  11. from .permissions import SudoPermission, IS_ROOT, ARCHIVEBOX_USER
  12. #############################################################################################
  13. PACKAGE_DIR: Path = Path(__file__).resolve().parent.parent # archivebox source code dir
  14. DATA_DIR: Path = Path(os.getcwd()).resolve() # archivebox user data dir
  15. ARCHIVE_DIR: Path = DATA_DIR / 'archive' # archivebox snapshot data dir
  16. IN_DOCKER = os.environ.get('IN_DOCKER', False) in ('1', 'true', 'True', 'TRUE', 'yes')
  17. DATABASE_FILE = DATA_DIR / 'index.sqlite3'
  18. #############################################################################################
  19. def _get_collection_id(DATA_DIR=DATA_DIR, force_create=False) -> str:
  20. collection_id_file = DATA_DIR / '.archivebox_id'
  21. try:
  22. return collection_id_file.read_text().strip()
  23. except (OSError, FileNotFoundError, PermissionError):
  24. pass
  25. # hash the machine_id + collection dir path + creation time to get a unique collection_id
  26. machine_id = get_machine_id()
  27. collection_path = DATA_DIR.resolve()
  28. try:
  29. creation_date = DATA_DIR.stat().st_ctime
  30. except Exception:
  31. creation_date = datetime.now().isoformat()
  32. collection_id = hashlib.sha256(f'{machine_id}:{collection_path}@{creation_date}'.encode()).hexdigest()[:8]
  33. try:
  34. # only persist collection_id file if we already have an index.sqlite3 file present
  35. # otherwise we might be running in a directory that is not a collection, no point creating cruft files
  36. collection_is_active = os.path.isfile(DATABASE_FILE) and os.path.isdir(ARCHIVE_DIR) and os.access(DATA_DIR, os.W_OK)
  37. if collection_is_active or force_create:
  38. collection_id_file.write_text(collection_id)
  39. # if we're running as root right now, make sure the collection_id file is owned by the archivebox user
  40. if IS_ROOT:
  41. with SudoPermission(uid=0):
  42. if ARCHIVEBOX_USER == 0:
  43. os.system(f'chmod 777 "{collection_id_file}"')
  44. else:
  45. os.system(f'chown {ARCHIVEBOX_USER} "{collection_id_file}"')
  46. except (OSError, FileNotFoundError, PermissionError):
  47. pass
  48. return collection_id
  49. @cache
  50. def get_collection_id(DATA_DIR=DATA_DIR) -> str:
  51. """Get a short, stable, unique ID for the current collection (e.g. abc45678)"""
  52. return _get_collection_id(DATA_DIR=DATA_DIR)
  53. @cache
  54. def get_machine_id() -> str:
  55. """Get a short, stable, unique ID for the current machine (e.g. abc45678)"""
  56. MACHINE_ID = 'unknown'
  57. try:
  58. import machineid
  59. MACHINE_ID = machineid.hashed_id('archivebox')[:8]
  60. except Exception:
  61. try:
  62. import uuid
  63. import hashlib
  64. MACHINE_ID = hashlib.sha256(str(uuid.getnode()).encode()).hexdigest()[:8]
  65. except Exception:
  66. pass
  67. return MACHINE_ID
  68. @cache
  69. def get_machine_type() -> str:
  70. """Get a short, stable, unique type identifier for the current machine (e.g. linux-x86_64-docker)"""
  71. OS: str = platform.system().lower() # darwin, linux, etc.
  72. ARCH: str = platform.machine().lower() # arm64, x86_64, aarch64, etc.
  73. LIB_DIR_SCOPE: str = f'{ARCH}-{OS}-docker' if IN_DOCKER else f'{ARCH}-{OS}'
  74. return LIB_DIR_SCOPE
  75. def dir_is_writable(dir_path: Path, uid: int | None = None, gid: int | None = None, fallback=True, chown=True) -> bool:
  76. """Check if a given directory is writable by a specific user and group (fallback=try as current user is unable to check with provided uid)"""
  77. current_uid, current_gid = os.geteuid(), os.getegid()
  78. uid, gid = uid or current_uid, gid or current_gid
  79. test_file = dir_path / '.permissions_test'
  80. try:
  81. with SudoPermission(uid=uid, fallback=fallback):
  82. test_file.exists()
  83. test_file.write_text(f'Checking if PUID={uid} PGID={gid} can write to dir')
  84. test_file.unlink()
  85. return True
  86. except (IOError, OSError, PermissionError):
  87. if chown:
  88. # try fixing it using sudo permissions
  89. with SudoPermission(uid=uid, fallback=fallback):
  90. os.system(f'chown {uid}:{gid} "{dir_path}" 2>/dev/null')
  91. return dir_is_writable(dir_path, uid=uid, gid=gid, fallback=fallback, chown=False)
  92. return False
  93. def assert_dir_can_contain_unix_sockets(dir_path: Path) -> bool:
  94. """Check if a given directory can contain unix sockets (e.g. /tmp/supervisord.sock)"""
  95. from archivebox.misc.logging_util import pretty_path
  96. try:
  97. socket_path = str(dir_path / '.test_socket.sock')
  98. s = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
  99. try:
  100. os.remove(socket_path)
  101. except OSError:
  102. pass
  103. s.bind(socket_path)
  104. s.close()
  105. try:
  106. os.remove(socket_path)
  107. except OSError:
  108. pass
  109. except Exception as e:
  110. raise Exception(f'ArchiveBox failed to create a test UNIX socket file in {pretty_path(dir_path, color=False)}') from e
  111. return True
  112. def create_and_chown_dir(dir_path: Path) -> None:
  113. with SudoPermission(uid=0, fallback=True):
  114. dir_path.mkdir(parents=True, exist_ok=True)
  115. os.system(f'chown {ARCHIVEBOX_USER} "{dir_path}" 2>/dev/null')
  116. os.system(f'chown {ARCHIVEBOX_USER} "{dir_path}"/* 2>/dev/null &')
  117. @cache
  118. def get_or_create_working_tmp_dir(autofix=True, quiet=True):
  119. from archivebox import CONSTANTS
  120. from archivebox.config.common import STORAGE_CONFIG
  121. from archivebox.misc.checks import check_tmp_dir
  122. # try a few potential directories in order of preference
  123. CANDIDATES = [
  124. STORAGE_CONFIG.TMP_DIR, # <user-specified>
  125. CONSTANTS.DEFAULT_TMP_DIR, # ./data/tmp/<machine_id>
  126. Path('/var/run/archivebox') / get_collection_id(), # /var/run/archivebox/abc5d8512
  127. Path('/tmp') / 'archivebox' / get_collection_id(), # /tmp/archivebox/abc5d8512
  128. Path('~/.tmp/archivebox').expanduser() / get_collection_id(), # ~/.tmp/archivebox/abc5d8512
  129. Path(tempfile.gettempdir()) / 'archivebox' / get_collection_id(), # /var/folders/qy/6tpfrpx100j1t4l312nz683m0000gn/T/archivebox/abc5d8512
  130. Path(tempfile.gettempdir()) / 'archivebox' / get_collection_id()[:4], # /var/folders/qy/6tpfrpx100j1t4l312nz683m0000gn/T/archivebox/abc5d
  131. Path(tempfile.gettempdir()) / 'abx' / get_collection_id()[:4], # /var/folders/qy/6tpfrpx100j1t4l312nz683m0000gn/T/abx/abc5
  132. ]
  133. for candidate in CANDIDATES:
  134. try:
  135. create_and_chown_dir(candidate)
  136. except Exception:
  137. pass
  138. if check_tmp_dir(candidate, throw=False, quiet=True, must_exist=True):
  139. if autofix and STORAGE_CONFIG.TMP_DIR != candidate:
  140. STORAGE_CONFIG.update_in_place(TMP_DIR=candidate)
  141. return candidate
  142. if not quiet:
  143. raise OSError(f'ArchiveBox is unable to find a writable TMP_DIR, tried {CANDIDATES}!')
  144. @cache
  145. def get_or_create_working_lib_dir(autofix=True, quiet=False):
  146. from archivebox import CONSTANTS
  147. from archivebox.config.common import STORAGE_CONFIG
  148. from archivebox.misc.checks import check_lib_dir
  149. # try a few potential directories in order of preference
  150. CANDIDATES = [
  151. STORAGE_CONFIG.LIB_DIR, # <user-specified>
  152. CONSTANTS.DEFAULT_LIB_DIR, # ./data/lib/arm64-linux-docker
  153. Path('/usr/local/share/archivebox') / get_collection_id(), # /usr/local/share/archivebox/abc5
  154. *([Path('/opt/homebrew/share/archivebox') / get_collection_id()] if os.path.isfile('/opt/homebrew/bin/archivebox') else []), # /opt/homebrew/share/archivebox/abc5
  155. Path('~/.local/share/archivebox').expanduser() / get_collection_id(), # ~/.local/share/archivebox/abc5
  156. ]
  157. for candidate in CANDIDATES:
  158. try:
  159. create_and_chown_dir(candidate)
  160. except Exception:
  161. pass
  162. if check_lib_dir(candidate, throw=False, quiet=True, must_exist=True):
  163. if autofix and STORAGE_CONFIG.LIB_DIR != candidate:
  164. STORAGE_CONFIG.update_in_place(LIB_DIR=candidate)
  165. return candidate
  166. if not quiet:
  167. raise OSError(f'ArchiveBox is unable to find a writable LIB_DIR, tried {CANDIDATES}!')
  168. @cache
  169. def get_data_locations():
  170. from archivebox.config import CONSTANTS
  171. from archivebox.config.common import STORAGE_CONFIG
  172. return benedict({
  173. "DATA_DIR": {
  174. "path": DATA_DIR.resolve(),
  175. "enabled": True,
  176. "is_valid": os.path.isdir(DATA_DIR) and os.access(DATA_DIR, os.R_OK) and os.access(DATA_DIR, os.W_OK),
  177. "is_mount": os.path.ismount(DATA_DIR.resolve()),
  178. },
  179. "CONFIG_FILE": {
  180. "path": CONSTANTS.CONFIG_FILE.resolve(),
  181. "enabled": True,
  182. "is_valid": os.path.isfile(CONSTANTS.CONFIG_FILE) and os.access(CONSTANTS.CONFIG_FILE, os.R_OK) and os.access(CONSTANTS.CONFIG_FILE, os.W_OK),
  183. },
  184. "SQL_INDEX": {
  185. "path": DATABASE_FILE.resolve(),
  186. "enabled": True,
  187. "is_valid": os.path.isfile(DATABASE_FILE) and os.access(DATABASE_FILE, os.R_OK) and os.access(DATABASE_FILE, os.W_OK),
  188. "is_mount": os.path.ismount(DATABASE_FILE.resolve()),
  189. },
  190. "QUEUE_DATABASE": {
  191. "path": CONSTANTS.QUEUE_DATABASE_FILE,
  192. "enabled": True,
  193. "is_valid": os.path.isfile(CONSTANTS.QUEUE_DATABASE_FILE) and os.access(CONSTANTS.QUEUE_DATABASE_FILE, os.R_OK) and os.access(CONSTANTS.QUEUE_DATABASE_FILE, os.W_OK),
  194. "is_mount": os.path.ismount(CONSTANTS.QUEUE_DATABASE_FILE),
  195. },
  196. "ARCHIVE_DIR": {
  197. "path": ARCHIVE_DIR.resolve(),
  198. "enabled": True,
  199. "is_valid": os.path.isdir(ARCHIVE_DIR) and os.access(ARCHIVE_DIR, os.R_OK) and os.access(ARCHIVE_DIR, os.W_OK),
  200. "is_mount": os.path.ismount(ARCHIVE_DIR.resolve()),
  201. },
  202. "SOURCES_DIR": {
  203. "path": CONSTANTS.SOURCES_DIR.resolve(),
  204. "enabled": True,
  205. "is_valid": os.path.isdir(CONSTANTS.SOURCES_DIR) and os.access(CONSTANTS.SOURCES_DIR, os.R_OK) and os.access(CONSTANTS.SOURCES_DIR, os.W_OK),
  206. },
  207. "PERSONAS_DIR": {
  208. "path": CONSTANTS.PERSONAS_DIR.resolve(),
  209. "enabled": os.path.isdir(CONSTANTS.PERSONAS_DIR),
  210. "is_valid": os.path.isdir(CONSTANTS.PERSONAS_DIR) and os.access(CONSTANTS.PERSONAS_DIR, os.R_OK) and os.access(CONSTANTS.PERSONAS_DIR, os.W_OK), # read + write
  211. },
  212. "LOGS_DIR": {
  213. "path": CONSTANTS.LOGS_DIR.resolve(),
  214. "enabled": True,
  215. "is_valid": os.path.isdir(CONSTANTS.LOGS_DIR) and os.access(CONSTANTS.LOGS_DIR, os.R_OK) and os.access(CONSTANTS.LOGS_DIR, os.W_OK), # read + write
  216. },
  217. 'TMP_DIR': {
  218. 'path': STORAGE_CONFIG.TMP_DIR.resolve(),
  219. 'enabled': True,
  220. 'is_valid': os.path.isdir(STORAGE_CONFIG.TMP_DIR) and os.access(STORAGE_CONFIG.TMP_DIR, os.R_OK) and os.access(STORAGE_CONFIG.TMP_DIR, os.W_OK), # read + write
  221. },
  222. # "CACHE_DIR": {
  223. # "path": CACHE_DIR.resolve(),
  224. # "enabled": True,
  225. # "is_valid": os.access(CACHE_DIR, os.R_OK) and os.access(CACHE_DIR, os.W_OK), # read + write
  226. # },
  227. })
  228. @cache
  229. def get_code_locations():
  230. from archivebox.config import CONSTANTS
  231. from archivebox.config.common import STORAGE_CONFIG
  232. return benedict({
  233. 'PACKAGE_DIR': {
  234. 'path': (PACKAGE_DIR).resolve(),
  235. 'enabled': True,
  236. 'is_valid': os.access(PACKAGE_DIR / '__main__.py', os.X_OK), # executable
  237. },
  238. 'TEMPLATES_DIR': {
  239. 'path': CONSTANTS.TEMPLATES_DIR.resolve(),
  240. 'enabled': True,
  241. 'is_valid': os.access(CONSTANTS.STATIC_DIR, os.R_OK) and os.access(CONSTANTS.STATIC_DIR, os.X_OK), # read + list
  242. },
  243. 'CUSTOM_TEMPLATES_DIR': {
  244. 'path': CONSTANTS.CUSTOM_TEMPLATES_DIR.resolve(),
  245. 'enabled': os.path.isdir(CONSTANTS.CUSTOM_TEMPLATES_DIR),
  246. 'is_valid': os.path.isdir(CONSTANTS.CUSTOM_TEMPLATES_DIR) and os.access(CONSTANTS.CUSTOM_TEMPLATES_DIR, os.R_OK), # read
  247. },
  248. 'USER_PLUGINS_DIR': {
  249. 'path': CONSTANTS.USER_PLUGINS_DIR.resolve(),
  250. 'enabled': os.path.isdir(CONSTANTS.USER_PLUGINS_DIR),
  251. 'is_valid': os.path.isdir(CONSTANTS.USER_PLUGINS_DIR) and os.access(CONSTANTS.USER_PLUGINS_DIR, os.R_OK), # read
  252. },
  253. 'LIB_DIR': {
  254. 'path': STORAGE_CONFIG.LIB_DIR.resolve(),
  255. 'enabled': True,
  256. 'is_valid': os.path.isdir(STORAGE_CONFIG.LIB_DIR) and os.access(STORAGE_CONFIG.LIB_DIR, os.R_OK) and os.access(STORAGE_CONFIG.LIB_DIR, os.W_OK), # read + write
  257. },
  258. })
  259. # @cache
  260. # def get_LIB_DIR():
  261. # """
  262. # - should be shared with other collections on the same host
  263. # - must be scoped by CPU architecture, OS family, and archivebox version
  264. # - should not be shared with other hosts/archivebox versions
  265. # - must be writable by any archivebox user
  266. # - should be persistent across reboots
  267. # - can be on a docker bind mount but probably shouldn't be
  268. # - ok to have a long path (doesn't contain SOCKETS)
  269. # """
  270. # from .version import detect_installed_version
  271. # HOST_DIRS = PlatformDirs(appname='archivebox', appauthor='ArchiveBox', version=detect_installed_version(), opinion=True, ensure_exists=False)
  272. # lib_dir = tempfile.gettempdir()
  273. # try:
  274. # if 'SYSTEM_LIB_DIR' in os.environ:
  275. # lib_dir = Path(os.environ['SYSTEM_LIB_DIR'])
  276. # else:
  277. # with SudoPermission(uid=ARCHIVEBOX_USER, fallback=True):
  278. # lib_dir = HOST_DIRS.site_data_path
  279. # # Docker: /usr/local/share/archivebox/0.8.5
  280. # # Ubuntu: /usr/local/share/archivebox/0.8.5
  281. # # macOS: /Library/Application Support/archivebox
  282. # try:
  283. # with SudoPermission(uid=0, fallback=True):
  284. # lib_dir.mkdir(parents=True, exist_ok=True)
  285. # except PermissionError:
  286. # # our user cannot create the site-wide dir, fall back to the per-user data dir
  287. # lib_dir = HOST_DIRS.user_data_path
  288. # lib_dir.mkdir(parents=True, exist_ok=True)
  289. # if IS_ROOT or not dir_is_writable(lib_dir, uid=ARCHIVEBOX_USER):
  290. # if IS_ROOT:
  291. # # make sure lib dir is owned by the archivebox user, not root
  292. # with SudoPermission(uid=0):
  293. # if ARCHIVEBOX_USER == 0:
  294. # # print(f'[yellow]:warning: Warning: Creating SYSTEM_LIB_DIR {lib_dir} with mode 777 so that non-root archivebox users can share it.[/yellow] (caches shared libs used by archivebox for performance)', file=sys.stderr)
  295. # os.system(f'chmod -R 777 "{lib_dir}"')
  296. # else:
  297. # os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{lib_dir}"')
  298. # else:
  299. # raise PermissionError()
  300. # except (PermissionError, AssertionError):
  301. # # raise PermissionError(f'SYSTEM_LIB_DIR {lib_dir} is not writable by archivebox user {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}')
  302. # print(f'[red]:cross_mark: ERROR: SYSTEM_LIB_DIR {lib_dir} is not writable by archivebox user {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}[/red]', file=sys.stderr)
  303. # return lib_dir
  304. # @cache
  305. # def get_TMP_DIR():
  306. # """
  307. # - must NOT be inside DATA_DIR / inside a docker volume bind mount
  308. # - must NOT have a long PATH (UNIX socket path length restrictions)
  309. # - must NOT be shared with other collections/hosts
  310. # - must be writable by archivebox user & root
  311. # - must be cleared on every boot / not persisted
  312. # - must be cleared on every archivebox version upgrade
  313. # """
  314. # from .version import detect_installed_version
  315. # HOST_DIRS = PlatformDirs(appname='archivebox', appauthor='ArchiveBox', version=detect_installed_version(), opinion=True, ensure_exists=False)
  316. # # print('DATA_DIR OWNED BY:', ARCHIVEBOX_USER, ARCHIVEBOX_GROUP)
  317. # # print('RUNNING AS:', self.PUID, self.PGID)
  318. # run_dir = tempfile.gettempdir()
  319. # try:
  320. # if 'SYSTEM_TMP_DIR' in os.environ:
  321. # run_dir = Path(os.environ['SYSTEM_TMP_DIR']).resolve() / get_collection_id(DATA_DIR=DATA_DIR)
  322. # with SudoPermission(uid=0, fallback=True):
  323. # run_dir.mkdir(parents=True, exist_ok=True)
  324. # if not dir_is_writable(run_dir, uid=ARCHIVEBOX_USER):
  325. # if IS_ROOT:
  326. # with SudoPermission(uid=0, fallback=False):
  327. # if ARCHIVEBOX_USER == 0:
  328. # # print(f'[yellow]:warning: Warning: Creating SYSTEM_TMP_DIR {run_dir} with mode 777 so that non-root archivebox users can access it.[/yellow]', file=sys.stderr)
  329. # os.system(f'chmod -R 777 "{run_dir}"')
  330. # else:
  331. # os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{run_dir}"')
  332. # else:
  333. # raise PermissionError()
  334. # assert len(str(run_dir / 'supervisord.conf')) < 95, 'SYSTEM_TMP_DIR path is too long, please set SYSTEM_TMP_DIR env variable to a shorter path (unfortunately unix requires socket paths be < 108 chars)'
  335. # return run_dir
  336. # run_dir = (HOST_DIRS.site_runtime_path / get_collection_id(DATA_DIR=DATA_DIR)).resolve()
  337. # try:
  338. # assert len(str(run_dir)) + len('/supervisord.sock') < 95
  339. # except AssertionError:
  340. # run_dir = Path(tempfile.gettempdir()).resolve() / 'archivebox' / get_collection_id(DATA_DIR=DATA_DIR)
  341. # assert len(str(run_dir)) + len('/supervisord.sock') < 95, 'SYSTEM_TMP_DIR path is too long, please set SYSTEM_TMP_DIR env variable to a shorter path (unfortunately unix requires socket paths be < 108 chars)'
  342. # with SudoPermission(uid=0, fallback=True):
  343. # run_dir.mkdir(parents=True, exist_ok=True)
  344. # if IS_ROOT or not dir_is_writable(run_dir, uid=ARCHIVEBOX_USER):
  345. # if IS_ROOT:
  346. # with SudoPermission(uid=0):
  347. # if ARCHIVEBOX_USER == 0:
  348. # # print(f'[yellow]:warning: Warning: Creating SYSTEM_TMP_DIR {run_dir} with mode 777 so that non-root archivebox users can access it.[/yellow]', file=sys.stderr)
  349. # os.system(f'chmod -R 777 "{run_dir}"')
  350. # else:
  351. # os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{run_dir}"')
  352. # else:
  353. # raise PermissionError()
  354. # except (PermissionError, AssertionError):
  355. # # raise PermissionError(f'SYSTEM_TMP_DIR {run_dir} is not writable by archivebox user {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}')
  356. # print(f'[red]:cross_mark: ERROR: SYSTEM_TMP_DIR {run_dir} is not writable by archivebox user {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}[/red]', file=sys.stderr)
  357. # return run_dir