common.py 8.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191
  1. __package__ = 'archivebox.config'
  2. import re
  3. import sys
  4. import shutil
  5. from typing import Dict, Optional, List
  6. from pathlib import Path
  7. from rich import print
  8. from pydantic import Field, field_validator
  9. from django.utils.crypto import get_random_string
  10. from abx_spec_config.base_configset import BaseConfigSet
  11. from .constants import CONSTANTS
  12. from .version import get_COMMIT_HASH, get_BUILD_TIME, VERSION
  13. from .permissions import IN_DOCKER
  14. ###################### Config ##########################
  15. class ShellConfig(BaseConfigSet):
  16. DEBUG: bool = Field(default=lambda: '--debug' in sys.argv)
  17. IS_TTY: bool = Field(default=sys.stdout.isatty())
  18. USE_COLOR: bool = Field(default=lambda c: c.IS_TTY)
  19. SHOW_PROGRESS: bool = Field(default=lambda c: c.IS_TTY)
  20. IN_DOCKER: bool = Field(default=IN_DOCKER)
  21. IN_QEMU: bool = Field(default=False)
  22. ANSI: Dict[str, str] = Field(default=lambda c: CONSTANTS.DEFAULT_CLI_COLORS if c.USE_COLOR else CONSTANTS.DISABLED_CLI_COLORS)
  23. @property
  24. def TERM_WIDTH(self) -> int:
  25. if not self.IS_TTY:
  26. return 200
  27. return shutil.get_terminal_size((140, 10)).columns
  28. @property
  29. def COMMIT_HASH(self) -> Optional[str]:
  30. return get_COMMIT_HASH()
  31. @property
  32. def BUILD_TIME(self) -> str:
  33. return get_BUILD_TIME()
  34. SHELL_CONFIG = ShellConfig()
  35. class StorageConfig(BaseConfigSet):
  36. # TMP_DIR must be a local, fast, readable/writable dir by archivebox user,
  37. # must be a short path due to unix path length restrictions for socket files (<100 chars)
  38. # must be a local SSD/tmpfs for speed and because bind mounts/network mounts/FUSE dont support unix sockets
  39. TMP_DIR: Path = Field(default=CONSTANTS.DEFAULT_TMP_DIR)
  40. # LIB_DIR must be a local, fast, readable/writable dir by archivebox user,
  41. # must be able to contain executable binaries (up to 5GB size)
  42. # should not be a remote/network/FUSE mount for speed reasons, otherwise extractors will be slow
  43. LIB_DIR: Path = Field(default=CONSTANTS.DEFAULT_LIB_DIR)
  44. OUTPUT_PERMISSIONS: str = Field(default='644')
  45. RESTRICT_FILE_NAMES: str = Field(default='windows')
  46. ENFORCE_ATOMIC_WRITES: bool = Field(default=True)
  47. # not supposed to be user settable:
  48. DIR_OUTPUT_PERMISSIONS: str = Field(default=lambda c: c['OUTPUT_PERMISSIONS'].replace('6', '7').replace('4', '5'))
  49. STORAGE_CONFIG = StorageConfig()
  50. class GeneralConfig(BaseConfigSet):
  51. TAG_SEPARATOR_PATTERN: str = Field(default=r'[,]')
  52. GENERAL_CONFIG = GeneralConfig()
  53. class ServerConfig(BaseConfigSet):
  54. SECRET_KEY: str = Field(default=lambda: get_random_string(50, 'abcdefghijklmnopqrstuvwxyz0123456789_'))
  55. BIND_ADDR: str = Field(default=lambda: ['127.0.0.1:8000', '0.0.0.0:8000'][SHELL_CONFIG.IN_DOCKER])
  56. ALLOWED_HOSTS: str = Field(default='*')
  57. CSRF_TRUSTED_ORIGINS: str = Field(default=lambda c: 'http://localhost:8000,http://127.0.0.1:8000,http://0.0.0.0:8000,http://{}'.format(c.BIND_ADDR))
  58. SNAPSHOTS_PER_PAGE: int = Field(default=40)
  59. PREVIEW_ORIGINALS: bool = Field(default=True)
  60. FOOTER_INFO: str = Field(default='Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests.')
  61. # CUSTOM_TEMPLATES_DIR: Path = Field(default=None) # this is now a constant
  62. PUBLIC_INDEX: bool = Field(default=True)
  63. PUBLIC_SNAPSHOTS: bool = Field(default=True)
  64. PUBLIC_ADD_VIEW: bool = Field(default=False)
  65. ADMIN_USERNAME: str = Field(default=None)
  66. ADMIN_PASSWORD: str = Field(default=None)
  67. REVERSE_PROXY_USER_HEADER: str = Field(default='Remote-User')
  68. REVERSE_PROXY_WHITELIST: str = Field(default='')
  69. LOGOUT_REDIRECT_URL: str = Field(default='/')
  70. SERVER_CONFIG = ServerConfig()
  71. class ArchivingConfig(BaseConfigSet):
  72. ONLY_NEW: bool = Field(default=True)
  73. OVERWRITE: bool = Field(default=False)
  74. TIMEOUT: int = Field(default=60)
  75. MEDIA_TIMEOUT: int = Field(default=3600)
  76. MEDIA_MAX_SIZE: str = Field(default='750m')
  77. RESOLUTION: str = Field(default='1440,2000')
  78. CHECK_SSL_VALIDITY: bool = Field(default=True)
  79. USER_AGENT: str = Field(default=f'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)')
  80. COOKIES_FILE: Path | None = Field(default=None)
  81. URL_DENYLIST: str = Field(default=r'\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$', alias='URL_BLACKLIST')
  82. URL_ALLOWLIST: str | None = Field(default=None, alias='URL_WHITELIST')
  83. SAVE_ALLOWLIST: Dict[str, List[str]] = Field(default={}) # mapping of regex patterns to list of archive methods
  84. SAVE_DENYLIST: Dict[str, List[str]] = Field(default={})
  85. DEFAULT_PERSONA: str = Field(default='Default')
  86. # GIT_DOMAINS: str = Field(default='github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht')
  87. # WGET_USER_AGENT: str = Field(default=lambda c: c['USER_AGENT'] + ' wget/{WGET_VERSION}')
  88. # CURL_USER_AGENT: str = Field(default=lambda c: c['USER_AGENT'] + ' curl/{CURL_VERSION}')
  89. # CHROME_USER_AGENT: str = Field(default=lambda c: c['USER_AGENT'])
  90. # CHROME_USER_DATA_DIR: str | None = Field(default=None)
  91. # CHROME_TIMEOUT: int = Field(default=0)
  92. # CHROME_HEADLESS: bool = Field(default=True)
  93. # CHROME_SANDBOX: bool = Field(default=lambda: not SHELL_CONFIG.IN_DOCKER)
  94. def validate(self):
  95. if int(self.TIMEOUT) < 5:
  96. print(f'[red][!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={self.TIMEOUT} seconds)[/red]', file=sys.stderr)
  97. print(' You must allow *at least* 5 seconds for indexing and archive methods to run succesfully.', file=sys.stderr)
  98. print(' (Setting it to somewhere between 30 and 3000 seconds is recommended)', file=sys.stderr)
  99. print(file=sys.stderr)
  100. print(' If you want to make ArchiveBox run faster, disable specific archive methods instead:', file=sys.stderr)
  101. print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles', file=sys.stderr)
  102. print(file=sys.stderr)
  103. @field_validator('CHECK_SSL_VALIDITY', mode='after')
  104. def validate_check_ssl_validity(cls, v):
  105. """SIDE EFFECT: disable "you really shouldnt disable ssl" warnings emitted by requests"""
  106. if not v:
  107. import requests
  108. import urllib3
  109. requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)
  110. urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
  111. return v
  112. @property
  113. def URL_ALLOWLIST_PTN(self) -> re.Pattern | None:
  114. return re.compile(self.URL_ALLOWLIST, CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS) if self.URL_ALLOWLIST else None
  115. @property
  116. def URL_DENYLIST_PTN(self) -> re.Pattern:
  117. return re.compile(self.URL_DENYLIST, CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS)
  118. @property
  119. def SAVE_ALLOWLIST_PTNS(self) -> Dict[re.Pattern, List[str]]:
  120. return {
  121. # regexp: methods list
  122. re.compile(key, CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS): val
  123. for key, val in self.SAVE_ALLOWLIST.items()
  124. } if self.SAVE_ALLOWLIST else {}
  125. @property
  126. def SAVE_DENYLIST_PTNS(self) -> Dict[re.Pattern, List[str]]:
  127. return {
  128. # regexp: methods list
  129. re.compile(key, CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS): val
  130. for key, val in self.SAVE_DENYLIST.items()
  131. } if self.SAVE_DENYLIST else {}
  132. ARCHIVING_CONFIG = ArchivingConfig()
  133. class SearchBackendConfig(BaseConfigSet):
  134. USE_INDEXING_BACKEND: bool = Field(default=True)
  135. USE_SEARCHING_BACKEND: bool = Field(default=True)
  136. SEARCH_BACKEND_ENGINE: str = Field(default='ripgrep')
  137. SEARCH_PROCESS_HTML: bool = Field(default=True)
  138. SEARCH_BACKEND_TIMEOUT: int = Field(default=10)
  139. SEARCH_BACKEND_CONFIG = SearchBackendConfig()