common.py 7.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169
  1. __package__ = 'archivebox.config'
  2. import os
  3. import sys
  4. import shutil
  5. import tempfile
  6. from typing import Dict, Optional
  7. from pathlib import Path
  8. from rich import print
  9. from pydantic import Field, field_validator, computed_field, model_validator
  10. from django.utils.crypto import get_random_string
  11. from abx.archivebox.base_configset import BaseConfigSet
  12. from .constants import CONSTANTS
  13. from .version import get_COMMIT_HASH, get_BUILD_TIME
  14. from .permissions import IN_DOCKER
  15. ###################### Config ##########################
  16. class ShellConfig(BaseConfigSet):
  17. DEBUG: bool = Field(default=lambda: '--debug' in sys.argv)
  18. IS_TTY: bool = Field(default=sys.stdout.isatty())
  19. USE_COLOR: bool = Field(default=lambda c: c.IS_TTY)
  20. SHOW_PROGRESS: bool = Field(default=lambda c: c.IS_TTY)
  21. IN_DOCKER: bool = Field(default=IN_DOCKER)
  22. IN_QEMU: bool = Field(default=False)
  23. ANSI: Dict[str, str] = Field(default=lambda c: CONSTANTS.DEFAULT_CLI_COLORS if c.USE_COLOR else CONSTANTS.DISABLED_CLI_COLORS)
  24. VERSIONS_AVAILABLE: bool = False # .check_for_update.get_versions_available_on_github(c)},
  25. CAN_UPGRADE: bool = False # .check_for_update.can_upgrade(c)},
  26. @computed_field
  27. @property
  28. def TERM_WIDTH(self) -> int:
  29. if not self.IS_TTY:
  30. return 200
  31. return shutil.get_terminal_size((140, 10)).columns
  32. @computed_field
  33. @property
  34. def COMMIT_HASH(self) -> Optional[str]:
  35. return get_COMMIT_HASH()
  36. @computed_field
  37. @property
  38. def BUILD_TIME(self) -> str:
  39. return get_BUILD_TIME()
  40. SHELL_CONFIG = ShellConfig()
  41. class StorageConfig(BaseConfigSet):
  42. # TMP_DIR must be a local, fast, readable/writable dir by archivebox user,
  43. # must be a short path due to unix path length restrictions for socket files (<100 chars)
  44. # must be a local SSD/tmpfs for speed and because bind mounts/network mounts/FUSE dont support unix sockets
  45. TMP_DIR: Path = Field(default=CONSTANTS.DEFAULT_TMP_DIR)
  46. # LIB_DIR must be a local, fast, readable/writable dir by archivebox user,
  47. # must be able to contain executable binaries (up to 5GB size)
  48. # should not be a remote/network/FUSE mount for speed reasons, otherwise extractors will be slow
  49. LIB_DIR: Path = Field(default=CONSTANTS.DEFAULT_LIB_DIR)
  50. OUTPUT_PERMISSIONS: str = Field(default='644')
  51. RESTRICT_FILE_NAMES: str = Field(default='windows')
  52. ENFORCE_ATOMIC_WRITES: bool = Field(default=True)
  53. # not supposed to be user settable:
  54. DIR_OUTPUT_PERMISSIONS: str = Field(default=lambda c: c['OUTPUT_PERMISSIONS'].replace('6', '7').replace('4', '5'))
  55. STORAGE_CONFIG = StorageConfig()
  56. class GeneralConfig(BaseConfigSet):
  57. TAG_SEPARATOR_PATTERN: str = Field(default=r'[,]')
  58. GENERAL_CONFIG = GeneralConfig()
  59. class ServerConfig(BaseConfigSet):
  60. SECRET_KEY: str = Field(default=lambda: get_random_string(50, 'abcdefghijklmnopqrstuvwxyz0123456789_'))
  61. BIND_ADDR: str = Field(default=lambda: ['127.0.0.1:8000', '0.0.0.0:8000'][SHELL_CONFIG.IN_DOCKER])
  62. ALLOWED_HOSTS: str = Field(default='*')
  63. CSRF_TRUSTED_ORIGINS: str = Field(default=lambda c: 'http://localhost:8000,http://127.0.0.1:8000,http://0.0.0.0:8000,http://{}'.format(c.BIND_ADDR))
  64. SNAPSHOTS_PER_PAGE: int = Field(default=40)
  65. FOOTER_INFO: str = Field(default='Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests.')
  66. # CUSTOM_TEMPLATES_DIR: Path = Field(default=None) # this is now a constant
  67. PUBLIC_INDEX: bool = Field(default=True)
  68. PUBLIC_SNAPSHOTS: bool = Field(default=True)
  69. PUBLIC_ADD_VIEW: bool = Field(default=False)
  70. ADMIN_USERNAME: str = Field(default=None)
  71. ADMIN_PASSWORD: str = Field(default=None)
  72. REVERSE_PROXY_USER_HEADER: str = Field(default='Remote-User')
  73. REVERSE_PROXY_WHITELIST: str = Field(default='')
  74. LOGOUT_REDIRECT_URL: str = Field(default='/')
  75. PREVIEW_ORIGINALS: bool = Field(default=True)
  76. SERVER_CONFIG = ServerConfig()
  77. class ArchivingConfig(BaseConfigSet):
  78. ONLY_NEW: bool = Field(default=True)
  79. TIMEOUT: int = Field(default=60)
  80. MEDIA_TIMEOUT: int = Field(default=3600)
  81. MEDIA_MAX_SIZE: str = Field(default='750m')
  82. RESOLUTION: str = Field(default='1440,2000')
  83. CHECK_SSL_VALIDITY: bool = Field(default=True)
  84. USER_AGENT: str = Field(default='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)')
  85. COOKIES_FILE: Path | None = Field(default=None)
  86. URL_DENYLIST: str = Field(default=r'\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$', alias='URL_BLACKLIST')
  87. URL_ALLOWLIST: str | None = Field(default=None, alias='URL_WHITELIST')
  88. # GIT_DOMAINS: str = Field(default='github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht')
  89. # WGET_USER_AGENT: str = Field(default=lambda c: c['USER_AGENT'] + ' wget/{WGET_VERSION}')
  90. # CURL_USER_AGENT: str = Field(default=lambda c: c['USER_AGENT'] + ' curl/{CURL_VERSION}')
  91. # CHROME_USER_AGENT: str = Field(default=lambda c: c['USER_AGENT'])
  92. # CHROME_USER_DATA_DIR: str | None = Field(default=None)
  93. # CHROME_TIMEOUT: int = Field(default=0)
  94. # CHROME_HEADLESS: bool = Field(default=True)
  95. # CHROME_SANDBOX: bool = Field(default=lambda: not SHELL_CONFIG.IN_DOCKER)
  96. @field_validator('TIMEOUT', mode='after')
  97. def validate_timeout(cls, v):
  98. if int(v) < 5:
  99. print(f'[red][!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={v} seconds)[/red]', file=sys.stderr)
  100. print(' You must allow *at least* 5 seconds for indexing and archive methods to run succesfully.', file=sys.stderr)
  101. print(' (Setting it to somewhere between 30 and 3000 seconds is recommended)', file=sys.stderr)
  102. print(file=sys.stderr)
  103. print(' If you want to make ArchiveBox run faster, disable specific archive methods instead:', file=sys.stderr)
  104. print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles', file=sys.stderr)
  105. print(file=sys.stderr)
  106. return v
  107. @field_validator('CHECK_SSL_VALIDITY', mode='after')
  108. def validate_check_ssl_validity(cls, v):
  109. """SIDE EFFECT: disable "you really shouldnt disable ssl" warnings emitted by requests"""
  110. if not v:
  111. import requests
  112. import urllib3
  113. requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)
  114. urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
  115. return v
  116. ARCHIVING_CONFIG = ArchivingConfig()
  117. class SearchBackendConfig(BaseConfigSet):
  118. USE_INDEXING_BACKEND: bool = Field(default=True)
  119. USE_SEARCHING_BACKEND: bool = Field(default=True)
  120. SEARCH_BACKEND_ENGINE: str = Field(default='ripgrep')
  121. SEARCH_PROCESS_HTML: bool = Field(default=True)
  122. SEARCH_BACKEND_TIMEOUT: int = Field(default=10)
  123. SEARCH_BACKEND_CONFIG = SearchBackendConfig()