common.py 8.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188
  1. __package__ = 'archivebox.config'
  2. import re
  3. import sys
  4. import shutil
  5. from typing import Dict, Optional, List
  6. from pathlib import Path
  7. from rich import print
  8. from pydantic import Field, field_validator
  9. from django.utils.crypto import get_random_string
  10. from abx_spec_config.base_configset import BaseConfigSet
  11. from .constants import CONSTANTS
  12. from .version import get_COMMIT_HASH, get_BUILD_TIME, VERSION
  13. from .permissions import IN_DOCKER
  14. ###################### Config ##########################
  class ShellConfig(BaseConfigSet):
      """Settings describing the shell/terminal environment of the running process."""

      # Default expression: on when '--debug' is present in the process arguments.
      # NOTE(review): callable defaults (with or without a `c` argument) are presumably
      # invoked by BaseConfigSet when the config is loaded -- confirm against abx_spec_config.
      DEBUG: bool = Field(default=lambda: '--debug' in sys.argv)

      # sys.stdout.isatty() is evaluated once, when this class body is executed.
      IS_TTY: bool = Field(default=sys.stdout.isatty())

      # Color output and progress bars both follow IS_TTY by default.
      USE_COLOR: bool = Field(default=lambda c: c.IS_TTY)
      SHOW_PROGRESS: bool = Field(default=lambda c: c.IS_TTY)

      # Defaults to the IN_DOCKER flag imported from .permissions.
      IN_DOCKER: bool = Field(default=IN_DOCKER)
      IN_QEMU: bool = Field(default=False)

      # Color table: the real CLI colors when USE_COLOR is on, otherwise the disabled table.
      ANSI: Dict[str, str] = Field(
          default=lambda c: CONSTANTS.DEFAULT_CLI_COLORS if c.USE_COLOR else CONSTANTS.DISABLED_CLI_COLORS,
      )

      @property
      def TERM_WIDTH(self) -> int:
          """Return the terminal width in columns, or a fixed 200 when IS_TTY is false."""
          if self.IS_TTY:
              # (140, 10) is the fallback (columns, lines) handed to shutil
              return shutil.get_terminal_size(fallback=(140, 10)).columns
          return 200

      @property
      def COMMIT_HASH(self) -> Optional[str]:
          """Return whatever get_COMMIT_HASH() reports (may be None)."""
          return get_COMMIT_HASH()

      @property
      def BUILD_TIME(self) -> str:
          """Return whatever get_BUILD_TIME() reports."""
          return get_BUILD_TIME()


  # Module-level singleton instance of the shell settings.
  SHELL_CONFIG = ShellConfig()
  class StorageConfig(BaseConfigSet):
      """Settings for where working files live and which permissions outputs get."""

      # TMP_DIR requirements:
      #   - local, fast, and readable/writable by the archivebox user
      #   - short path: unix socket files are limited to <100 chars of path length
      #   - local SSD/tmpfs, both for speed and because bind mounts, network mounts
      #     and FUSE do not support unix sockets
      TMP_DIR: Path = Field(default=CONSTANTS.DEFAULT_TMP_DIR)

      # LIB_DIR requirements:
      #   - local, fast, and readable/writable by the archivebox user
      #   - able to hold executable binaries (up to 5GB in size)
      #   - should not be a remote/network/FUSE mount, otherwise extractors will be slow
      LIB_DIR: Path = Field(default=CONSTANTS.DEFAULT_LIB_DIR)

      OUTPUT_PERMISSIONS: str = Field(default='644')
      RESTRICT_FILE_NAMES: str = Field(default='windows')
      ENFORCE_ATOMIC_WRITES: bool = Field(default=True)

      # Not meant to be set by users: derived from OUTPUT_PERMISSIONS by turning
      # every '6' into '7' and every '4' into '5' (e.g. '644' -> '755').
      DIR_OUTPUT_PERMISSIONS: str = Field(
          default=lambda c: c['OUTPUT_PERMISSIONS'].replace('6', '7').replace('4', '5'),
      )


  # Module-level singleton instance of the storage settings.
  STORAGE_CONFIG = StorageConfig()
  class GeneralConfig(BaseConfigSet):
      """General-purpose settings that do not belong to a more specific section."""

      # Regular expression (as a string) matching the separator between tags;
      # the default character class matches a single comma.
      TAG_SEPARATOR_PATTERN: str = Field(default=r'[,]')


  # Module-level singleton instance of the general settings.
  GENERAL_CONFIG = GeneralConfig()
  class ServerConfig(BaseConfigSet):
      """Settings for the web server: secrets, bind address, access control and UI options."""

      # Default is a fresh 50-character random string drawn from [a-z0-9_].
      SECRET_KEY: str = Field(default=lambda: get_random_string(50, 'abcdefghijklmnopqrstuvwxyz0123456789_'))

      # Listen on all interfaces when running in Docker, on loopback only otherwise.
      BIND_ADDR: str = Field(default=lambda: '0.0.0.0:8000' if SHELL_CONFIG.IN_DOCKER else '127.0.0.1:8000')
      ALLOWED_HOSTS: str = Field(default='*')

      # Comma-separated origins; the configured BIND_ADDR is appended as a fourth entry.
      CSRF_TRUSTED_ORIGINS: str = Field(default=lambda c: 'http://localhost:8000,http://127.0.0.1:8000,http://0.0.0.0:8000,http://{}'.format(c.BIND_ADDR))

      SNAPSHOTS_PER_PAGE: int = Field(default=40)
      PREVIEW_ORIGINALS: bool = Field(default=True)
      FOOTER_INFO: str = Field(default='Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests.')
      # CUSTOM_TEMPLATES_DIR: Path = Field(default=None) # this is now a constant

      PUBLIC_INDEX: bool = Field(default=True)
      PUBLIC_SNAPSHOTS: bool = Field(default=True)
      PUBLIC_ADD_VIEW: bool = Field(default=False)

      # Both default to None, so the annotations must admit None as well as str.
      ADMIN_USERNAME: str | None = Field(default=None)
      ADMIN_PASSWORD: str | None = Field(default=None)

      REVERSE_PROXY_USER_HEADER: str = Field(default='Remote-User')
      REVERSE_PROXY_WHITELIST: str = Field(default='')
      LOGOUT_REDIRECT_URL: str = Field(default='/')


  # Module-level singleton instance of the server settings.
  SERVER_CONFIG = ServerConfig()
  class ArchivingConfig(BaseConfigSet):
      """Settings controlling how URLs are fetched and which URLs/methods are allowed."""

      ONLY_NEW: bool = Field(default=True)

      TIMEOUT: int = Field(default=60)
      MEDIA_TIMEOUT: int = Field(default=3600)

      MEDIA_MAX_SIZE: str = Field(default='750m')
      RESOLUTION: str = Field(default='1440,2000')
      CHECK_SSL_VALIDITY: bool = Field(default=True)
      USER_AGENT: str = Field(default=f'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)')
      COOKIES_FILE: Path | None = Field(default=None)

      # The legacy names URL_BLACKLIST / URL_WHITELIST are declared as aliases.
      URL_DENYLIST: str = Field(default=r'\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$', alias='URL_BLACKLIST')
      URL_ALLOWLIST: str | None = Field(default=None, alias='URL_WHITELIST')

      SAVE_ALLOWLIST: Dict[str, List[str]] = Field(default={})  # mapping of regex patterns to list of archive methods
      SAVE_DENYLIST: Dict[str, List[str]] = Field(default={})

      # GIT_DOMAINS: str = Field(default='github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht')
      # WGET_USER_AGENT: str = Field(default=lambda c: c['USER_AGENT'] + ' wget/{WGET_VERSION}')
      # CURL_USER_AGENT: str = Field(default=lambda c: c['USER_AGENT'] + ' curl/{CURL_VERSION}')
      # CHROME_USER_AGENT: str = Field(default=lambda c: c['USER_AGENT'])
      # CHROME_USER_DATA_DIR: str | None = Field(default=None)
      # CHROME_TIMEOUT: int = Field(default=0)
      # CHROME_HEADLESS: bool = Field(default=True)
      # CHROME_SANDBOX: bool = Field(default=lambda: not SHELL_CONFIG.IN_DOCKER)

      def validate(self):
          """Print a warning to stderr when TIMEOUT is below 5 seconds.

          Only warns; nothing is raised and nothing is returned.

          NOTE(review): the name `validate` may shadow a method inherited through
          BaseConfigSet (pydantic's BaseModel exposes a deprecated `validate`
          classmethod) -- confirm the override is intentional.
          """
          if int(self.TIMEOUT) >= 5:
              return

          # `print` is rich.print (imported at module top), so the [red] markup is rendered
          print(f'[red][!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={self.TIMEOUT} seconds)[/red]', file=sys.stderr)
          print(' You must allow *at least* 5 seconds for indexing and archive methods to run succesfully.', file=sys.stderr)
          print(' (Setting it to somewhere between 30 and 3000 seconds is recommended)', file=sys.stderr)
          print(file=sys.stderr)
          print(' If you want to make ArchiveBox run faster, disable specific archive methods instead:', file=sys.stderr)
          print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles', file=sys.stderr)
          print(file=sys.stderr)

      @field_validator('CHECK_SSL_VALIDITY', mode='after')
      @classmethod
      def validate_check_ssl_validity(cls, v):
          """SIDE EFFECT: disable "you really shouldnt disable ssl" warnings emitted by requests

          The value itself is returned unchanged.
          """
          if not v:
              # imported lazily so the libraries are only loaded when SSL checks are turned off
              import requests
              import urllib3

              # silence the warning on both the copy reachable through requests and urllib3 itself
              requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)
              urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
          return v

      @property
      def URL_ALLOWLIST_PTN(self) -> re.Pattern | None:
          """Return URL_ALLOWLIST compiled to a regex, or None when it is unset/empty."""
          if not self.URL_ALLOWLIST:
              return None
          return re.compile(self.URL_ALLOWLIST, CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS)

      @property
      def URL_DENYLIST_PTN(self) -> re.Pattern:
          """Return URL_DENYLIST compiled to a regex."""
          return re.compile(self.URL_DENYLIST, CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS)

      @property
      def SAVE_ALLOWLIST_PTNS(self) -> Dict[re.Pattern, List[str]]:
          """Return SAVE_ALLOWLIST with each key compiled to a regex (empty dict when unset)."""
          return {
              # compiled regexp: methods list
              re.compile(pattern, CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS): methods
              for pattern, methods in (self.SAVE_ALLOWLIST or {}).items()
          }

      @property
      def SAVE_DENYLIST_PTNS(self) -> Dict[re.Pattern, List[str]]:
          """Return SAVE_DENYLIST with each key compiled to a regex (empty dict when unset)."""
          return {
              # compiled regexp: methods list
              re.compile(pattern, CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS): methods
              for pattern, methods in (self.SAVE_DENYLIST or {}).items()
          }


  # Module-level singleton instance of the archiving settings.
  ARCHIVING_CONFIG = ArchivingConfig()
  class SearchBackendConfig(BaseConfigSet):
      """Settings for the full-text search backend."""

      # Independent on/off switches for the indexing side and the searching side.
      USE_INDEXING_BACKEND: bool = Field(default=True)
      USE_SEARCHING_BACKEND: bool = Field(default=True)

      # Name of the search engine to use.
      SEARCH_BACKEND_ENGINE: str = Field(default='ripgrep')
      SEARCH_PROCESS_HTML: bool = Field(default=True)
      # NOTE(review): unit is presumably seconds -- confirm against the backend callers.
      SEARCH_BACKEND_TIMEOUT: int = Field(default=10)


  # Module-level singleton instance of the search backend settings.
  SEARCH_BACKEND_CONFIG = SearchBackendConfig()