2
0

checks.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244
  1. __package__ = 'archivebox.misc'
  2. import os
  3. import sys
  4. from pathlib import Path
  5. from rich import print
  6. from rich.panel import Panel
  7. # DO NOT ADD ANY TOP-LEVEL IMPORTS HERE to anything other than builtin python libraries
  8. # this file is imported by archivebox/__init__.py
  9. # and any imports here will be imported by EVERYTHING else
  10. # so this file should only be used for pure python checks
  11. # that don't need to import other parts of ArchiveBox
  12. # if a check needs to import other parts of ArchiveBox,
  13. # the imports should be done inside the check function
  14. # and you should make sure if you need to import any django stuff
  15. # that the check is called after django.setup() has been called
  16. def check_data_folder() -> None:
  17. from archivebox import DATA_DIR, ARCHIVE_DIR
  18. from archivebox.config import CONSTANTS
  19. from archivebox.config.paths import create_and_chown_dir, get_or_create_working_tmp_dir, get_or_create_working_lib_dir
  20. archive_dir_exists = os.path.isdir(ARCHIVE_DIR)
  21. if not archive_dir_exists:
  22. print('[red][X] No archivebox index found in the current directory.[/red]', file=sys.stderr)
  23. print(f' {DATA_DIR}', file=sys.stderr)
  24. print(file=sys.stderr)
  25. print(' [violet]Hint[/violet]: Are you running archivebox in the right folder?', file=sys.stderr)
  26. print(' cd path/to/your/archive/folder', file=sys.stderr)
  27. print(' archivebox [command]', file=sys.stderr)
  28. print(file=sys.stderr)
  29. print(' [violet]Hint[/violet]: To create a new archive collection or import existing data in this folder, run:', file=sys.stderr)
  30. print(' archivebox init', file=sys.stderr)
  31. raise SystemExit(2)
  32. # Create data dir subdirs
  33. create_and_chown_dir(CONSTANTS.SOURCES_DIR)
  34. create_and_chown_dir(CONSTANTS.PERSONAS_DIR / 'Default')
  35. create_and_chown_dir(CONSTANTS.LOGS_DIR)
  36. # create_and_chown_dir(CONSTANTS.CACHE_DIR)
  37. # Create /tmp and /lib dirs if they don't exist
  38. get_or_create_working_tmp_dir(autofix=True, quiet=False)
  39. get_or_create_working_lib_dir(autofix=True, quiet=False)
  40. # Check data dir permissions, /tmp, and /lib permissions
  41. check_data_dir_permissions()
  42. def check_migrations():
  43. from archivebox import DATA_DIR
  44. from ..index.sql import list_migrations
  45. pending_migrations = [name for status, name in list_migrations() if not status]
  46. is_migrating = any(arg in sys.argv for arg in ['makemigrations', 'migrate', 'init'])
  47. if pending_migrations and not is_migrating:
  48. print('[red][X] This collection was created with an older version of ArchiveBox and must be upgraded first.[/red]')
  49. print(f' {DATA_DIR}', file=sys.stderr)
  50. print(file=sys.stderr)
  51. print(f' [violet]Hint:[/violet] To upgrade it to the latest version and apply the {len(pending_migrations)} pending migrations, run:', file=sys.stderr)
  52. print(' archivebox init', file=sys.stderr)
  53. raise SystemExit(3)
  54. def check_io_encoding():
  55. PYTHON_ENCODING = (sys.__stdout__ or sys.stdout or sys.__stderr__ or sys.stderr).encoding.upper().replace('UTF8', 'UTF-8')
  56. if PYTHON_ENCODING != 'UTF-8':
  57. print(f'[red][X] Your system is running python3 scripts with a bad locale setting: {PYTHON_ENCODING} (it should be UTF-8).[/red]', file=sys.stderr)
  58. print(' To fix it, add the line "export PYTHONIOENCODING=UTF-8" to your ~/.bashrc file (without quotes)', file=sys.stderr)
  59. print(' Or if you\'re using ubuntu/debian, run "dpkg-reconfigure locales"', file=sys.stderr)
  60. print('')
  61. print(' Confirm that it\'s fixed by opening a new shell and running:', file=sys.stderr)
  62. print(' python3 -c "import sys; print(sys.stdout.encoding)" # should output UTF-8', file=sys.stderr)
  63. raise SystemExit(2)
  64. # # hard errors: check python version
  65. # if sys.version_info[:3] < (3, 10, 0):
  66. # print('[red][X] Python version is not new enough: {sys.version} (>3.10 is required)[/red]', file=sys.stderr)
  67. # print(' See https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting#python for help upgrading your Python installation.', file=sys.stderr)
  68. # raise SystemExit(2)
  69. # # hard errors: check django version
  70. # if int(django.VERSION[0]) < 5:
  71. # print('[red][X] Django version is not new enough: {django.VERSION[:3]} (>=5.0 is required)[/red]', file=sys.stderr)
  72. # print(' Upgrade django using pip or your system package manager: pip3 install --upgrade django', file=sys.stderr)
  73. # raise SystemExit(2)
  74. def check_not_root():
  75. from archivebox.config.permissions import IS_ROOT, IN_DOCKER
  76. attempted_command = ' '.join(sys.argv[1:]) if len(sys.argv) > 1 else ''
  77. is_getting_help = '-h' in sys.argv or '--help' in sys.argv or 'help' in sys.argv
  78. is_getting_version = '--version' in sys.argv or 'version' in sys.argv
  79. is_installing = 'setup' in sys.argv or 'install' in sys.argv
  80. if IS_ROOT and not (is_getting_help or is_getting_version or is_installing):
  81. print('[red][!] ArchiveBox should never be run as root![/red]', file=sys.stderr)
  82. print(' For more information, see the security overview documentation:', file=sys.stderr)
  83. print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#do-not-run-as-root', file=sys.stderr)
  84. if IN_DOCKER:
  85. print('[red][!] When using Docker, you must run commands with [green]docker run[/green] instead of [yellow3]docker exec[/yellow3], e.g.:', file=sys.stderr)
  86. print(' docker compose run archivebox {attempted_command}', file=sys.stderr)
  87. print(f' docker run -it -v $PWD/data:/data archivebox/archivebox {attempted_command}', file=sys.stderr)
  88. print(' or:', file=sys.stderr)
  89. print(f' docker compose exec --user=archivebox archivebox /bin/bash -c "archivebox {attempted_command}"', file=sys.stderr)
  90. print(f' docker exec -it --user=archivebox <container id> /bin/bash -c "archivebox {attempted_command}"', file=sys.stderr)
  91. raise SystemExit(2)
  92. def check_data_dir_permissions():
  93. from archivebox import DATA_DIR
  94. from archivebox.misc.logging import STDERR
  95. from archivebox.config.permissions import ARCHIVEBOX_USER, ARCHIVEBOX_GROUP, DEFAULT_PUID, DEFAULT_PGID, IS_ROOT, USER
  96. data_dir_stat = Path(DATA_DIR).stat()
  97. data_dir_uid, data_dir_gid = data_dir_stat.st_uid, data_dir_stat.st_gid
  98. data_owned_by_root = data_dir_uid == 0
  99. # data_owned_by_default_user = data_dir_uid == DEFAULT_PUID or data_dir_gid == DEFAULT_PGID
  100. data_owner_doesnt_match = (data_dir_uid != ARCHIVEBOX_USER and data_dir_gid != ARCHIVEBOX_GROUP) if not IS_ROOT else False
  101. data_not_writable = not (os.path.isdir(DATA_DIR) and os.access(DATA_DIR, os.W_OK))
  102. if data_owned_by_root:
  103. STDERR.print('\n[yellow]:warning: Warning: ArchiveBox [blue]DATA_DIR[/blue] is currently owned by [red]root[/red], it must be changed before archiving can run![/yellow]')
  104. elif data_owner_doesnt_match or data_not_writable:
  105. STDERR.print(f'\n[yellow]:warning: Warning: ArchiveBox [blue]DATA_DIR[/blue] is currently owned by [red]{data_dir_uid}:{data_dir_gid}[/red], but ArchiveBox user is [blue]{ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}[/blue] ({USER})! (ArchiveBox may not be able to write to the data dir)[/yellow]')
  106. if data_owned_by_root or data_owner_doesnt_match or data_not_writable:
  107. STDERR.print(f'[violet]Hint:[/violet] Change the current ownership [red]{data_dir_uid}[/red]:{data_dir_gid} (PUID:PGID) to a non-root user & group that will run ArchiveBox, e.g.:')
  108. STDERR.print(f' [grey53]sudo[/grey53] chown -R [blue]{DEFAULT_PUID}:{DEFAULT_PGID}[/blue] {DATA_DIR.resolve()}')
  109. STDERR.print()
  110. STDERR.print('[blue]More info:[/blue]')
  111. STDERR.print(' [link=https://github.com/ArchiveBox/ArchiveBox#storage-requirements]https://github.com/ArchiveBox/ArchiveBox#storage-requirements[/link]')
  112. STDERR.print(' [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#permissions]https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#permissions[/link]')
  113. STDERR.print(' [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#puid--pgid]https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#puid--pgid[/link]')
  114. STDERR.print(' [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting#filesystem-doesnt-support-fsync-eg-network-mounts]https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting#filesystem-doesnt-support-fsync-eg-network-mounts[/link]')
  115. from archivebox.config.common import STORAGE_CONFIG
  116. # Check /tmp dir permissions
  117. check_tmp_dir(STORAGE_CONFIG.TMP_DIR, throw=False, must_exist=True)
  118. # Check /lib dir permissions
  119. check_lib_dir(STORAGE_CONFIG.LIB_DIR, throw=False, must_exist=True)
  120. os.umask(0o777 - int(STORAGE_CONFIG.DIR_OUTPUT_PERMISSIONS, base=8)) # noqa: F821
  121. def check_tmp_dir(tmp_dir=None, throw=False, quiet=False, must_exist=True):
  122. from archivebox.config.paths import assert_dir_can_contain_unix_sockets, dir_is_writable, get_or_create_working_tmp_dir
  123. from archivebox.misc.logging import STDERR
  124. from archivebox.misc.logging_util import pretty_path
  125. from archivebox.config.permissions import ARCHIVEBOX_USER, ARCHIVEBOX_GROUP
  126. from archivebox.config.common import STORAGE_CONFIG
  127. tmp_dir = tmp_dir or STORAGE_CONFIG.TMP_DIR
  128. socket_file = tmp_dir.absolute().resolve() / "supervisord.sock"
  129. if not must_exist and not os.path.isdir(tmp_dir):
  130. # just check that its viable based on its length (because dir may not exist yet, we cant check if its writable)
  131. return len(f'file://{socket_file}') <= 96
  132. tmp_is_valid = False
  133. try:
  134. tmp_is_valid = dir_is_writable(tmp_dir)
  135. tmp_is_valid = tmp_is_valid and assert_dir_can_contain_unix_sockets(tmp_dir)
  136. assert tmp_is_valid, f'ArchiveBox user PUID={ARCHIVEBOX_USER} PGID={ARCHIVEBOX_GROUP} is unable to write to TMP_DIR={tmp_dir}'
  137. assert len(f'file://{socket_file}') <= 96, f'ArchiveBox TMP_DIR={tmp_dir} is too long, dir containing unix socket files must be <90 chars.'
  138. return True
  139. except Exception as e:
  140. if not quiet:
  141. STDERR.print()
  142. ERROR_TEXT = '\n'.join((
  143. '',
  144. f'[red]:cross_mark: ArchiveBox is unable to use TMP_DIR={pretty_path(tmp_dir)}[/red]',
  145. f' [yellow]{e}[/yellow]',
  146. '',
  147. '[blue]Info:[/blue] [grey53]The TMP_DIR is used for the supervisord unix socket file and other temporary files.',
  148. ' - It [red]must[/red] be on a local drive (not inside a docker volume, remote network drive, or FUSE mount).',
  149. f' - It [red]must[/red] be readable and writable by the ArchiveBox user (PUID={ARCHIVEBOX_USER}, PGID={ARCHIVEBOX_GROUP}).',
  150. ' - It [red]must[/red] be a *short* path (less than 90 characters) due to UNIX path length restrictions for sockets.',
  151. ' - It [yellow]should[/yellow] be able to hold at least 200MB of data (in-progress downloads can be large).[/grey53]',
  152. '',
  153. '[violet]Hint:[/violet] Fix it by setting TMP_DIR to a path that meets these requirements, e.g.:',
  154. f' [green]archivebox config --set TMP_DIR={get_or_create_working_tmp_dir(autofix=False, quiet=True) or "/tmp/archivebox"}[/green]',
  155. '',
  156. ))
  157. STDERR.print(Panel(ERROR_TEXT, expand=False, border_style='red', title='[red]:cross_mark: Error with configured TMP_DIR[/red]', subtitle='Background workers may fail to start until fixed.'))
  158. STDERR.print()
  159. if throw:
  160. raise OSError(f'TMP_DIR={tmp_dir} is invalid, ArchiveBox is unable to use it and the server will fail to start!') from e
  161. return False
  162. def check_lib_dir(lib_dir: Path | None = None, throw=False, quiet=False, must_exist=True):
  163. import archivebox
  164. from archivebox.config.permissions import ARCHIVEBOX_USER, ARCHIVEBOX_GROUP
  165. from archivebox.misc.logging import STDERR
  166. from archivebox.misc.logging_util import pretty_path
  167. from archivebox.config.paths import dir_is_writable, get_or_create_working_lib_dir
  168. from archivebox.config.common import STORAGE_CONFIG
  169. lib_dir = lib_dir or STORAGE_CONFIG.LIB_DIR
  170. assert lib_dir == archivebox.pm.hook.get_LIB_DIR(), "lib_dir is not the same as the one in the flat config"
  171. if not must_exist and not os.path.isdir(lib_dir):
  172. return True
  173. lib_is_valid = False
  174. try:
  175. lib_is_valid = dir_is_writable(lib_dir)
  176. assert lib_is_valid, f'ArchiveBox user PUID={ARCHIVEBOX_USER} PGID={ARCHIVEBOX_GROUP} is unable to write to LIB_DIR={lib_dir}'
  177. return True
  178. except Exception as e:
  179. if not quiet:
  180. STDERR.print()
  181. ERROR_TEXT = '\n'.join((
  182. '',
  183. f'[red]:cross_mark: ArchiveBox is unable to use LIB_DIR={pretty_path(lib_dir)}[/red]',
  184. f' [yellow]{e}[/yellow]',
  185. '',
  186. '[blue]Info:[/blue] [grey53]The LIB_DIR is used to store ArchiveBox auto-installed plugin library and binary dependencies.',
  187. f' - It [red]must[/red] be readable and writable by the ArchiveBox user (PUID={ARCHIVEBOX_USER}, PGID={ARCHIVEBOX_GROUP}).',
  188. ' - It [yellow]should[/yellow] be on a local (ideally fast) drive like an SSD or HDD (not on a network drive or external HDD).',
  189. ' - It [yellow]should[/yellow] be able to hold at least 1GB of data (some dependencies like Chrome can be large).[/grey53]',
  190. '',
  191. '[violet]Hint:[/violet] Fix it by setting LIB_DIR to a path that meets these requirements, e.g.:',
  192. f' [green]archivebox config --set LIB_DIR={get_or_create_working_lib_dir(autofix=False, quiet=True) or "/usr/local/share/archivebox"}[/green]',
  193. '',
  194. ))
  195. STDERR.print(Panel(ERROR_TEXT, expand=False, border_style='red', title='[red]:cross_mark: Error with configured LIB_DIR[/red]', subtitle='[yellow]Dependencies may not auto-install properly until fixed.[/yellow]'))
  196. STDERR.print()
  197. if throw:
  198. raise OSError(f'LIB_DIR={lib_dir} is invalid, ArchiveBox is unable to use it and dependencies will fail to install.') from e
  199. return False