constants.py 7.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242
  1. __package__ = 'archivebox'
  2. import os
  3. import re
  4. from typing import Dict
  5. from pathlib import Path
  6. from benedict import benedict
  7. import archivebox
  8. from .misc.logging import DEFAULT_CLI_COLORS
  9. ###################### Config ##########################
  10. VERSION: str = archivebox.VERSION
  11. TIMEZONE: str = 'UTC'
  12. DEFAULT_CLI_COLORS: Dict[str, str] = DEFAULT_CLI_COLORS
  13. DISABLED_CLI_COLORS: Dict[str, str] = benedict({k: '' for k in DEFAULT_CLI_COLORS})
  14. PACKAGE_DIR: Path = archivebox.PACKAGE_DIR
  15. PACKAGE_DIR_NAME: str = archivebox.PACKAGE_DIR.name
  16. TEMPLATES_DIR_NAME: str = 'templates'
  17. TEMPLATES_DIR: Path = archivebox.PACKAGE_DIR / TEMPLATES_DIR_NAME
  18. STATIC_DIR: Path = TEMPLATES_DIR / 'static'
  19. USER_PLUGINS_DIR_NAME: str = 'user_plugins'
  20. CUSTOM_TEMPLATES_DIR_NAME: str = 'user_templates'
  21. ARCHIVE_DIR_NAME: str = 'archive'
  22. SOURCES_DIR_NAME: str = 'sources'
  23. PERSONAS_DIR_NAME: str = 'personas'
  24. CRONTABS_DIR_NAME: str = 'crontabs'
  25. CACHE_DIR_NAME: str = 'cache'
  26. LOGS_DIR_NAME: str = 'logs'
  27. LIB_DIR_NAME: str = 'lib'
  28. TMP_DIR_NAME: str = 'tmp'
  29. OUTPUT_DIR: Path = archivebox.DATA_DIR
  30. ARCHIVE_DIR: Path = archivebox.DATA_DIR / ARCHIVE_DIR_NAME
  31. SOURCES_DIR: Path = archivebox.DATA_DIR / SOURCES_DIR_NAME
  32. PERSONAS_DIR: Path = archivebox.DATA_DIR / PERSONAS_DIR_NAME
  33. CACHE_DIR: Path = archivebox.DATA_DIR / CACHE_DIR_NAME
  34. LOGS_DIR: Path = archivebox.DATA_DIR / LOGS_DIR_NAME
  35. LIB_DIR: Path = archivebox.DATA_DIR / LIB_DIR_NAME
  36. TMP_DIR: Path = archivebox.DATA_DIR / TMP_DIR_NAME
  37. CUSTOM_TEMPLATES_DIR: Path = archivebox.DATA_DIR / CUSTOM_TEMPLATES_DIR_NAME
  38. USER_PLUGINS_DIR: Path = archivebox.DATA_DIR / USER_PLUGINS_DIR_NAME
  39. LIB_PIP_DIR: Path = LIB_DIR / 'pip'
  40. LIB_NPM_DIR: Path = LIB_DIR / 'npm'
  41. LIB_BROWSERS_DIR: Path = LIB_DIR / 'browsers'
  42. LIB_BIN_DIR: Path = LIB_DIR / 'bin'
  43. BIN_DIR: Path = LIB_BIN_DIR
  44. CONFIG_FILENAME: str = 'ArchiveBox.conf'
  45. SQL_INDEX_FILENAME: str = 'index.sqlite3'
  46. CONFIG_FILE: Path = archivebox.DATA_DIR / CONFIG_FILENAME
  47. DATABASE_FILE: Path = archivebox.DATA_DIR / SQL_INDEX_FILENAME
  48. QUEUE_DATABASE_FILE: Path = archivebox.DATA_DIR / SQL_INDEX_FILENAME.replace('index.', 'queue.')
  49. JSON_INDEX_FILENAME: str = 'index.json'
  50. HTML_INDEX_FILENAME: str = 'index.html'
  51. ROBOTS_TXT_FILENAME: str = 'robots.txt'
  52. FAVICON_FILENAME: str = 'favicon.ico'
  53. ALLOWDENYLIST_REGEX_FLAGS: int = re.IGNORECASE | re.UNICODE | re.MULTILINE
  54. STATICFILE_EXTENSIONS: frozenset[str] = frozenset((
  55. # 99.999% of the time, URLs ending in these extensions are static files
  56. # that can be downloaded as-is, not html pages that need to be rendered
  57. 'gif', 'jpeg', 'jpg', 'png', 'tif', 'tiff', 'wbmp', 'ico', 'jng', 'bmp',
  58. 'svg', 'svgz', 'webp', 'ps', 'eps', 'ai',
  59. 'mp3', 'mp4', 'm4a', 'mpeg', 'mpg', 'mkv', 'mov', 'webm', 'm4v',
  60. 'flv', 'wmv', 'avi', 'ogg', 'ts', 'm3u8',
  61. 'pdf', 'txt', 'rtf', 'rtfd', 'doc', 'docx', 'ppt', 'pptx', 'xls', 'xlsx',
  62. 'atom', 'rss', 'css', 'js', 'json',
  63. 'dmg', 'iso', 'img',
  64. 'rar', 'war', 'hqx', 'zip', 'gz', 'bz2', '7z',
  65. # Less common extensions to consider adding later
  66. # jar, swf, bin, com, exe, dll, deb
  67. # ear, hqx, eot, wmlc, kml, kmz, cco, jardiff, jnlp, run, msi, msp, msm,
  68. # pl pm, prc pdb, rar, rpm, sea, sit, tcl tk, der, pem, crt, xpi, xspf,
  69. # ra, mng, asx, asf, 3gpp, 3gp, mid, midi, kar, jad, wml, htc, mml
  70. # These are always treated as pages, not as static files, never add them:
  71. # html, htm, shtml, xhtml, xml, aspx, php, cgi
  72. ))
  73. INGORED_PATHS: frozenset[str] = frozenset((
  74. ".git",
  75. ".svn",
  76. ".DS_Store",
  77. ".gitignore",
  78. "lost+found",
  79. ".DS_Store",
  80. ".env",
  81. "Dockerfile",
  82. ))
  83. PIP_RELATED_NAMES: frozenset[str] = frozenset((
  84. ".venv",
  85. "venv",
  86. "virtualenv",
  87. ".virtualenv",
  88. ))
  89. NPM_RELATED_NAMES: frozenset[str] = frozenset((
  90. "node_modules",
  91. "package.json",
  92. "package-lock.json",
  93. "yarn.lock",
  94. ))
  95. DATA_DIR_NAMES: frozenset[str] = frozenset((
  96. ARCHIVE_DIR_NAME,
  97. SOURCES_DIR_NAME,
  98. LOGS_DIR_NAME,
  99. CACHE_DIR_NAME,
  100. LIB_DIR_NAME,
  101. PERSONAS_DIR_NAME,
  102. CUSTOM_TEMPLATES_DIR_NAME,
  103. USER_PLUGINS_DIR_NAME,
  104. ))
  105. DATA_DIRS: frozenset[Path] = frozenset(archivebox.DATA_DIR / dirname for dirname in DATA_DIR_NAMES)
  106. DATA_FILE_NAMES: frozenset[str] = frozenset((
  107. CONFIG_FILENAME,
  108. SQL_INDEX_FILENAME,
  109. f"{SQL_INDEX_FILENAME}-wal",
  110. f"{SQL_INDEX_FILENAME}-shm",
  111. "queue.sqlite3",
  112. "queue.sqlite3-wal",
  113. "queue.sqlite3-shm",
  114. "search.sqlite3",
  115. JSON_INDEX_FILENAME,
  116. HTML_INDEX_FILENAME,
  117. ROBOTS_TXT_FILENAME,
  118. FAVICON_FILENAME,
  119. CONFIG_FILENAME,
  120. f"{CONFIG_FILENAME}.bak",
  121. "static_index.json",
  122. ))
  123. # When initializing archivebox in a new directory, we check to make sure the dir is
  124. # actually empty so that we dont clobber someone's home directory or desktop by accident.
  125. # These files are exceptions to the is_empty check when we're trying to init a new dir,
  126. # as they could be from a previous archivebox version, system artifacts, dependencies, etc.
  127. ALLOWED_IN_OUTPUT_DIR: frozenset[str] = frozenset((
  128. *INGORED_PATHS,
  129. *PIP_RELATED_NAMES,
  130. *NPM_RELATED_NAMES,
  131. *DATA_DIR_NAMES,
  132. *DATA_FILE_NAMES,
  133. "static", # created by old static exports <v0.6.0
  134. "sonic", # created by docker bind mount
  135. ))
  136. CODE_LOCATIONS = benedict({
  137. 'PACKAGE_DIR': {
  138. 'path': (archivebox.PACKAGE_DIR).resolve(),
  139. 'enabled': True,
  140. 'is_valid': (archivebox.PACKAGE_DIR / '__main__.py').exists(),
  141. },
  142. 'LIB_DIR': {
  143. 'path': LIB_DIR.resolve(),
  144. 'enabled': True,
  145. 'is_valid': LIB_DIR.is_dir(),
  146. },
  147. 'RUNTIME_CONFIG': {
  148. 'path': TMP_DIR.resolve(),
  149. 'enabled': True,
  150. 'is_valid': TMP_DIR.is_dir(),
  151. },
  152. 'TEMPLATES_DIR': {
  153. 'path': TEMPLATES_DIR.resolve(),
  154. 'enabled': True,
  155. 'is_valid': STATIC_DIR.exists(),
  156. },
  157. 'CUSTOM_TEMPLATES_DIR': {
  158. 'path': CUSTOM_TEMPLATES_DIR.resolve(),
  159. 'enabled': True,
  160. 'is_valid': CUSTOM_TEMPLATES_DIR.is_dir(),
  161. },
  162. })
  163. DATA_LOCATIONS = benedict({
  164. "OUTPUT_DIR": {
  165. "path": archivebox.DATA_DIR.resolve(),
  166. "enabled": True,
  167. "is_valid": DATABASE_FILE.exists(),
  168. "is_mount": os.path.ismount(archivebox.DATA_DIR.resolve()),
  169. },
  170. "CONFIG_FILE": {
  171. "path": CONFIG_FILE.resolve(),
  172. "enabled": True,
  173. "is_valid": CONFIG_FILE.exists(),
  174. },
  175. "SQL_INDEX": {
  176. "path": DATABASE_FILE.resolve(),
  177. "enabled": True,
  178. "is_valid": DATABASE_FILE.exists(),
  179. "is_mount": os.path.ismount(DATABASE_FILE.resolve()),
  180. },
  181. "QUEUE_DATABASE": {
  182. "path": QUEUE_DATABASE_FILE.resolve(),
  183. "enabled": True,
  184. "is_valid": QUEUE_DATABASE_FILE.exists(),
  185. "is_mount": os.path.ismount(QUEUE_DATABASE_FILE.resolve()),
  186. },
  187. "ARCHIVE_DIR": {
  188. "path": ARCHIVE_DIR.resolve(),
  189. "enabled": True,
  190. "is_valid": ARCHIVE_DIR.exists(),
  191. "is_mount": os.path.ismount(ARCHIVE_DIR.resolve()),
  192. },
  193. "SOURCES_DIR": {
  194. "path": SOURCES_DIR.resolve(),
  195. "enabled": True,
  196. "is_valid": SOURCES_DIR.exists(),
  197. },
  198. "PERSONAS_DIR": {
  199. "path": PERSONAS_DIR.resolve(),
  200. "enabled": PERSONAS_DIR.exists(),
  201. "is_valid": PERSONAS_DIR.exists(),
  202. },
  203. "LOGS_DIR": {
  204. "path": LOGS_DIR.resolve(),
  205. "enabled": True,
  206. "is_valid": LOGS_DIR.is_dir(),
  207. },
  208. "CACHE_DIR": {
  209. "path": CACHE_DIR.resolve(),
  210. "enabled": True,
  211. "is_valid": CACHE_DIR.is_dir(),
  212. },
  213. })
  214. CONSTANTS = benedict({key: value for key, value in globals().items() if key.isupper()})
  215. CONSTANTS_CONFIG = CONSTANTS