constants.py 8.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233
  1. """
  2. Constants are for things that never change at runtime.
  3. (but they can change from run-to-run or machine-to-machine)
  4. DATA_DIR will never change at runtime, but you can run
  5. archivebox from inside a different DATA_DIR on the same machine.
This file is loaded very early in the archivebox startup flow, so nothing in it
(or imported by it) should import anything from archivebox.config.common,
django, other INSTALLED_APPS, or anything else outside the standard library.
  9. """
  10. __package__ = 'archivebox.config'
  11. import re
  12. import sys
  13. from typing import Dict
  14. from pathlib import Path
  15. from collections.abc import Mapping
  16. from benedict import benedict
  17. from archivebox.misc.logging import DEFAULT_CLI_COLORS
  18. from .paths import (
  19. PACKAGE_DIR,
  20. DATA_DIR,
  21. ARCHIVE_DIR,
  22. get_collection_id,
  23. get_machine_id,
  24. get_machine_type,
  25. )
  26. from .permissions import (
  27. IS_ROOT,
  28. IN_DOCKER,
  29. RUNNING_AS_UID,
  30. RUNNING_AS_GID,
  31. DEFAULT_PUID,
  32. DEFAULT_PGID,
  33. ARCHIVEBOX_USER,
  34. ARCHIVEBOX_GROUP,
  35. )
  36. from .version import detect_installed_version
  37. ###################### Config ##########################
  38. class ConstantsDict(Mapping):
  39. PACKAGE_DIR: Path = PACKAGE_DIR
  40. DATA_DIR: Path = DATA_DIR
  41. ARCHIVE_DIR: Path = ARCHIVE_DIR
  42. MACHINE_TYPE: str = get_machine_type()
  43. MACHINE_ID: str = get_machine_id()
  44. COLLECTION_ID: str = get_collection_id(DATA_DIR)
  45. # Host system
  46. VERSION: str = detect_installed_version(PACKAGE_DIR)
  47. IN_DOCKER: bool = IN_DOCKER
  48. # Permissions
  49. IS_ROOT: bool = IS_ROOT
  50. ARCHIVEBOX_USER: int = ARCHIVEBOX_USER
  51. ARCHIVEBOX_GROUP: int = ARCHIVEBOX_GROUP
  52. RUNNING_AS_UID: int = RUNNING_AS_UID
  53. RUNNING_AS_GID: int = RUNNING_AS_GID
  54. DEFAULT_PUID: int = DEFAULT_PUID
  55. DEFAULT_PGID: int = DEFAULT_PGID
  56. IS_INSIDE_VENV: bool = sys.prefix != sys.base_prefix
  57. # Source code dirs
  58. PACKAGE_DIR_NAME: str = PACKAGE_DIR.name
  59. TEMPLATES_DIR_NAME: str = 'templates'
  60. TEMPLATES_DIR: Path = PACKAGE_DIR / TEMPLATES_DIR_NAME
  61. STATIC_DIR_NAME: str = 'static'
  62. STATIC_DIR: Path = TEMPLATES_DIR / STATIC_DIR_NAME
  63. # Data dirs
  64. ARCHIVE_DIR_NAME: str = 'archive'
  65. SOURCES_DIR_NAME: str = 'sources'
  66. PERSONAS_DIR_NAME: str = 'personas'
  67. CRONTABS_DIR_NAME: str = 'crontabs'
  68. CACHE_DIR_NAME: str = 'cache'
  69. LOGS_DIR_NAME: str = 'logs'
  70. USER_PLUGINS_DIR_NAME: str = 'user_plugins'
  71. CUSTOM_TEMPLATES_DIR_NAME: str = 'user_templates'
  72. ARCHIVE_DIR: Path = DATA_DIR / ARCHIVE_DIR_NAME
  73. SOURCES_DIR: Path = DATA_DIR / SOURCES_DIR_NAME
  74. PERSONAS_DIR: Path = DATA_DIR / PERSONAS_DIR_NAME
  75. LOGS_DIR: Path = DATA_DIR / LOGS_DIR_NAME
  76. CACHE_DIR: Path = DATA_DIR / CACHE_DIR_NAME
  77. CUSTOM_TEMPLATES_DIR: Path = DATA_DIR / CUSTOM_TEMPLATES_DIR_NAME
  78. USER_PLUGINS_DIR: Path = DATA_DIR / USER_PLUGINS_DIR_NAME
  79. # Data dir files
  80. CONFIG_FILENAME: str = 'ArchiveBox.conf'
  81. SQL_INDEX_FILENAME: str = 'index.sqlite3'
  82. QUEUE_DATABASE_FILENAME: str = 'queue.sqlite3'
  83. CONFIG_FILE: Path = DATA_DIR / CONFIG_FILENAME
  84. DATABASE_FILE: Path = DATA_DIR / SQL_INDEX_FILENAME
  85. QUEUE_DATABASE_FILE: Path = DATA_DIR / QUEUE_DATABASE_FILENAME
  86. JSON_INDEX_FILENAME: str = 'index.json'
  87. HTML_INDEX_FILENAME: str = 'index.html'
  88. ROBOTS_TXT_FILENAME: str = 'robots.txt'
  89. FAVICON_FILENAME: str = 'favicon.ico'
  90. # Runtime dirs
  91. TMP_DIR_NAME: str = 'tmp'
  92. DEFAULT_TMP_DIR: Path = DATA_DIR / TMP_DIR_NAME / MACHINE_ID # ./data/tmp/abc3244323
  93. LIB_DIR_NAME: str = 'lib'
  94. DEFAULT_LIB_DIR: Path = DATA_DIR / LIB_DIR_NAME / MACHINE_TYPE # ./data/lib/arm64-linux-docker
  95. # Config constants
  96. TIMEZONE: str = 'UTC'
  97. DEFAULT_CLI_COLORS: Dict[str, str] = DEFAULT_CLI_COLORS
  98. DISABLED_CLI_COLORS: Dict[str, str] = benedict({k: '' for k in DEFAULT_CLI_COLORS})
  99. ALLOWDENYLIST_REGEX_FLAGS: int = re.IGNORECASE | re.UNICODE | re.MULTILINE
  100. STATICFILE_EXTENSIONS: frozenset[str] = frozenset((
  101. # 99.999% of the time, URLs ending in these extensions are static files
  102. # that can be downloaded as-is, not html pages that need to be rendered
  103. 'gif', 'jpeg', 'jpg', 'png', 'tif', 'tiff', 'wbmp', 'ico', 'jng', 'bmp',
  104. 'svg', 'svgz', 'webp', 'ps', 'eps', 'ai',
  105. 'mp3', 'mp4', 'm4a', 'mpeg', 'mpg', 'mkv', 'mov', 'webm', 'm4v',
  106. 'flv', 'wmv', 'avi', 'ogg', 'ts', 'm3u8',
  107. 'pdf', 'txt', 'rtf', 'rtfd', 'doc', 'docx', 'ppt', 'pptx', 'xls', 'xlsx',
  108. 'atom', 'rss', 'css', 'js', 'json',
  109. 'dmg', 'iso', 'img',
  110. 'rar', 'war', 'hqx', 'zip', 'gz', 'bz2', '7z',
  111. # Less common extensions to consider adding later
  112. # jar, swf, bin, com, exe, dll, deb
  113. # ear, hqx, eot, wmlc, kml, kmz, cco, jardiff, jnlp, run, msi, msp, msm,
  114. # pl pm, prc pdb, rar, rpm, sea, sit, tcl tk, der, pem, crt, xpi, xspf,
  115. # ra, mng, asx, asf, 3gpp, 3gp, mid, midi, kar, jad, wml, htc, mml
  116. # These are always treated as pages, not as static files, never add them:
  117. # html, htm, shtml, xhtml, xml, aspx, php, cgi
  118. ))
  119. PIP_RELATED_NAMES: frozenset[str] = frozenset((
  120. ".venv",
  121. "venv",
  122. "virtualenv",
  123. ".virtualenv",
  124. ))
  125. NPM_RELATED_NAMES: frozenset[str] = frozenset((
  126. "node_modules",
  127. "package.json",
  128. "package-lock.json",
  129. "yarn.lock",
  130. ))
  131. # When initializing archivebox in a new directory, we check to make sure the dir is
  132. # actually empty so that we dont clobber someone's home directory or desktop by accident.
  133. # These files are exceptions to the is_empty check when we're trying to init a new dir,
  134. # as they could be from a previous archivebox version, system artifacts, dependencies, etc.
  135. ALLOWED_IN_DATA_DIR: frozenset[str] = frozenset((
  136. *PIP_RELATED_NAMES,
  137. *NPM_RELATED_NAMES,
  138. ### Dirs:
  139. ARCHIVE_DIR_NAME,
  140. SOURCES_DIR_NAME,
  141. LOGS_DIR_NAME,
  142. CACHE_DIR_NAME,
  143. LIB_DIR_NAME,
  144. TMP_DIR_NAME,
  145. PERSONAS_DIR_NAME,
  146. CUSTOM_TEMPLATES_DIR_NAME,
  147. USER_PLUGINS_DIR_NAME,
  148. CRONTABS_DIR_NAME,
  149. "static", # created by old static exports <v0.6.0
  150. "sonic", # created by docker bind mount / sonic FTS process
  151. ".git",
  152. ".svn",
  153. ### Files:
  154. CONFIG_FILENAME,
  155. SQL_INDEX_FILENAME,
  156. f"{SQL_INDEX_FILENAME}-wal",
  157. f"{SQL_INDEX_FILENAME}-shm",
  158. QUEUE_DATABASE_FILENAME,
  159. f"{QUEUE_DATABASE_FILENAME}-wal",
  160. f"{QUEUE_DATABASE_FILENAME}-shm",
  161. "search.sqlite3",
  162. JSON_INDEX_FILENAME,
  163. HTML_INDEX_FILENAME,
  164. ROBOTS_TXT_FILENAME,
  165. FAVICON_FILENAME,
  166. CONFIG_FILENAME,
  167. f"{CONFIG_FILENAME}.bak",
  168. f".{CONFIG_FILENAME}.bak",
  169. "static_index.json",
  170. ".DS_Store",
  171. ".gitignore",
  172. "lost+found",
  173. ".DS_Store",
  174. ".env",
  175. ".collection_id",
  176. ".archivebox_id",
  177. "Dockerfile",
  178. ))
  179. @classmethod
  180. def __getitem__(cls, key: str):
  181. # so it behaves like a dict[key] == dict.key or object attr
  182. return getattr(cls, key)
  183. @classmethod
  184. def __benedict__(cls):
  185. # when casting to benedict, only include uppercase keys that don't start with an underscore
  186. return benedict({key: value for key, value in cls.__dict__.items() if key.isupper() and not key.startswith('_')})
  187. @classmethod
  188. def __len__(cls):
  189. return len(cls.__benedict__())
  190. @classmethod
  191. def __iter__(cls):
  192. return iter(cls.__benedict__())
  193. CONSTANTS = ConstantsDict()
  194. CONSTANTS_CONFIG = CONSTANTS.__benedict__()
  195. # add all key: values to globals() for easier importing, e.g.:
  196. # from archivebox.config.constants import IS_ROOT, PERSONAS_DIR, ...
  197. # globals().update(CONSTANTS)