constants.py 7.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218
  1. __package__ = 'archivebox.config'
  2. import re
  3. import sys
  4. from typing import Dict
  5. from pathlib import Path
  6. from collections.abc import Mapping
  7. from benedict import benedict
  8. from ..misc.logging import DEFAULT_CLI_COLORS
  9. from .paths import (
  10. PACKAGE_DIR,
  11. DATA_DIR,
  12. ARCHIVE_DIR,
  13. get_collection_id,
  14. get_machine_id,
  15. get_machine_type,
  16. )
  17. from .permissions import (
  18. IS_ROOT,
  19. IN_DOCKER,
  20. RUNNING_AS_UID,
  21. RUNNING_AS_GID,
  22. DEFAULT_PUID,
  23. DEFAULT_PGID,
  24. ARCHIVEBOX_USER,
  25. ARCHIVEBOX_GROUP,
  26. )
  27. from .version import detect_installed_version
  28. ###################### Config ##########################
  29. class ConstantsDict(Mapping):
  30. PACKAGE_DIR: Path = PACKAGE_DIR
  31. DATA_DIR: Path = DATA_DIR
  32. ARCHIVE_DIR: Path = ARCHIVE_DIR
  33. MACHINE_TYPE: str = get_machine_type()
  34. MACHINE_ID: str = get_machine_id()
  35. COLLECTION_ID: str = get_collection_id(DATA_DIR)
  36. # Host system
  37. VERSION: str = detect_installed_version(PACKAGE_DIR)
  38. IN_DOCKER: bool = IN_DOCKER
  39. # Permissions
  40. IS_ROOT: bool = IS_ROOT
  41. ARCHIVEBOX_USER: int = ARCHIVEBOX_USER
  42. ARCHIVEBOX_GROUP: int = ARCHIVEBOX_GROUP
  43. RUNNING_AS_UID: int = RUNNING_AS_UID
  44. RUNNING_AS_GID: int = RUNNING_AS_GID
  45. DEFAULT_PUID: int = DEFAULT_PUID
  46. DEFAULT_PGID: int = DEFAULT_PGID
  47. IS_INSIDE_VENV: bool = sys.prefix != sys.base_prefix
  48. # Source code dirs
  49. PACKAGE_DIR_NAME: str = PACKAGE_DIR.name
  50. TEMPLATES_DIR_NAME: str = 'templates'
  51. TEMPLATES_DIR: Path = PACKAGE_DIR / TEMPLATES_DIR_NAME
  52. STATIC_DIR_NAME: str = 'static'
  53. STATIC_DIR: Path = TEMPLATES_DIR / STATIC_DIR_NAME
  54. # Data dirs
  55. ARCHIVE_DIR_NAME: str = 'archive'
  56. SOURCES_DIR_NAME: str = 'sources'
  57. PERSONAS_DIR_NAME: str = 'personas'
  58. CRONTABS_DIR_NAME: str = 'crontabs'
  59. CACHE_DIR_NAME: str = 'cache'
  60. LOGS_DIR_NAME: str = 'logs'
  61. USER_PLUGINS_DIR_NAME: str = 'user_plugins'
  62. CUSTOM_TEMPLATES_DIR_NAME: str = 'user_templates'
  63. ARCHIVE_DIR: Path = DATA_DIR / ARCHIVE_DIR_NAME
  64. SOURCES_DIR: Path = DATA_DIR / SOURCES_DIR_NAME
  65. PERSONAS_DIR: Path = DATA_DIR / PERSONAS_DIR_NAME
  66. LOGS_DIR: Path = DATA_DIR / LOGS_DIR_NAME
  67. CACHE_DIR: Path = DATA_DIR / CACHE_DIR_NAME
  68. CUSTOM_TEMPLATES_DIR: Path = DATA_DIR / CUSTOM_TEMPLATES_DIR_NAME
  69. USER_PLUGINS_DIR: Path = DATA_DIR / USER_PLUGINS_DIR_NAME
  70. # Data dir files
  71. CONFIG_FILENAME: str = 'ArchiveBox.conf'
  72. SQL_INDEX_FILENAME: str = 'index.sqlite3'
  73. QUEUE_DATABASE_FILENAME: str = 'queue.sqlite3'
  74. CONFIG_FILE: Path = DATA_DIR / CONFIG_FILENAME
  75. DATABASE_FILE: Path = DATA_DIR / SQL_INDEX_FILENAME
  76. QUEUE_DATABASE_FILE: Path = DATA_DIR / QUEUE_DATABASE_FILENAME
  77. JSON_INDEX_FILENAME: str = 'index.json'
  78. HTML_INDEX_FILENAME: str = 'index.html'
  79. ROBOTS_TXT_FILENAME: str = 'robots.txt'
  80. FAVICON_FILENAME: str = 'favicon.ico'
  81. # Runtime dirs
  82. TMP_DIR_NAME: str = 'tmp'
  83. DEFAULT_TMP_DIR: Path = DATA_DIR / TMP_DIR_NAME / MACHINE_ID # ./data/tmp/abc3244323
  84. LIB_DIR_NAME: str = 'lib'
  85. DEFAULT_LIB_DIR: Path = DATA_DIR / LIB_DIR_NAME / MACHINE_TYPE # ./data/lib/arm64-linux-docker
  86. # Config constants
  87. TIMEZONE: str = 'UTC'
  88. DEFAULT_CLI_COLORS: Dict[str, str] = DEFAULT_CLI_COLORS
  89. DISABLED_CLI_COLORS: Dict[str, str] = benedict({k: '' for k in DEFAULT_CLI_COLORS})
  90. ALLOWDENYLIST_REGEX_FLAGS: int = re.IGNORECASE | re.UNICODE | re.MULTILINE
  91. STATICFILE_EXTENSIONS: frozenset[str] = frozenset((
  92. # 99.999% of the time, URLs ending in these extensions are static files
  93. # that can be downloaded as-is, not html pages that need to be rendered
  94. 'gif', 'jpeg', 'jpg', 'png', 'tif', 'tiff', 'wbmp', 'ico', 'jng', 'bmp',
  95. 'svg', 'svgz', 'webp', 'ps', 'eps', 'ai',
  96. 'mp3', 'mp4', 'm4a', 'mpeg', 'mpg', 'mkv', 'mov', 'webm', 'm4v',
  97. 'flv', 'wmv', 'avi', 'ogg', 'ts', 'm3u8',
  98. 'pdf', 'txt', 'rtf', 'rtfd', 'doc', 'docx', 'ppt', 'pptx', 'xls', 'xlsx',
  99. 'atom', 'rss', 'css', 'js', 'json',
  100. 'dmg', 'iso', 'img',
  101. 'rar', 'war', 'hqx', 'zip', 'gz', 'bz2', '7z',
  102. # Less common extensions to consider adding later
  103. # jar, swf, bin, com, exe, dll, deb
  104. # ear, hqx, eot, wmlc, kml, kmz, cco, jardiff, jnlp, run, msi, msp, msm,
  105. # pl pm, prc pdb, rar, rpm, sea, sit, tcl tk, der, pem, crt, xpi, xspf,
  106. # ra, mng, asx, asf, 3gpp, 3gp, mid, midi, kar, jad, wml, htc, mml
  107. # These are always treated as pages, not as static files, never add them:
  108. # html, htm, shtml, xhtml, xml, aspx, php, cgi
  109. ))
  110. PIP_RELATED_NAMES: frozenset[str] = frozenset((
  111. ".venv",
  112. "venv",
  113. "virtualenv",
  114. ".virtualenv",
  115. ))
  116. NPM_RELATED_NAMES: frozenset[str] = frozenset((
  117. "node_modules",
  118. "package.json",
  119. "package-lock.json",
  120. "yarn.lock",
  121. ))
  122. # When initializing archivebox in a new directory, we check to make sure the dir is
  123. # actually empty so that we dont clobber someone's home directory or desktop by accident.
  124. # These files are exceptions to the is_empty check when we're trying to init a new dir,
  125. # as they could be from a previous archivebox version, system artifacts, dependencies, etc.
  126. ALLOWED_IN_DATA_DIR: frozenset[str] = frozenset((
  127. *PIP_RELATED_NAMES,
  128. *NPM_RELATED_NAMES,
  129. ### Dirs:
  130. ARCHIVE_DIR_NAME,
  131. SOURCES_DIR_NAME,
  132. LOGS_DIR_NAME,
  133. CACHE_DIR_NAME,
  134. LIB_DIR_NAME,
  135. TMP_DIR_NAME,
  136. PERSONAS_DIR_NAME,
  137. CUSTOM_TEMPLATES_DIR_NAME,
  138. USER_PLUGINS_DIR_NAME,
  139. CRONTABS_DIR_NAME,
  140. "static", # created by old static exports <v0.6.0
  141. "sonic", # created by docker bind mount / sonic FTS process
  142. ".git",
  143. ".svn",
  144. ### Files:
  145. CONFIG_FILENAME,
  146. SQL_INDEX_FILENAME,
  147. f"{SQL_INDEX_FILENAME}-wal",
  148. f"{SQL_INDEX_FILENAME}-shm",
  149. QUEUE_DATABASE_FILENAME,
  150. f"{QUEUE_DATABASE_FILENAME}-wal",
  151. f"{QUEUE_DATABASE_FILENAME}-shm",
  152. "search.sqlite3",
  153. JSON_INDEX_FILENAME,
  154. HTML_INDEX_FILENAME,
  155. ROBOTS_TXT_FILENAME,
  156. FAVICON_FILENAME,
  157. CONFIG_FILENAME,
  158. f"{CONFIG_FILENAME}.bak",
  159. f".{CONFIG_FILENAME}.bak",
  160. "static_index.json",
  161. ".DS_Store",
  162. ".gitignore",
  163. "lost+found",
  164. ".DS_Store",
  165. ".env",
  166. ".collection_id",
  167. ".archivebox_id",
  168. "Dockerfile",
  169. ))
  170. @classmethod
  171. def __getitem__(cls, key: str):
  172. return getattr(cls, key)
  173. @classmethod
  174. def __benedict__(cls):
  175. return benedict({key: value for key, value in cls.__dict__.items() if key.isupper() and not key.startswith('_')})
  176. @classmethod
  177. def __len__(cls):
  178. return len(cls.__benedict__())
  179. @classmethod
  180. def __iter__(cls):
  181. return iter(cls.__benedict__())
  182. CONSTANTS = ConstantsDict()
  183. CONSTANTS_CONFIG = CONSTANTS.__benedict__()
  184. # add all key: values to globals() for easier importing
  185. globals().update(CONSTANTS)