base_configset.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334
  1. __package__ = 'abx.archivebox'
  2. import os
  3. import sys
  4. import re
  5. from pathlib import Path
  6. from typing import Type, Tuple, Callable, ClassVar, Dict, Any
  7. import toml
  8. from rich import print
  9. from benedict import benedict
  10. from pydantic import model_validator, TypeAdapter, AliasChoices
  11. from pydantic_settings import BaseSettings, SettingsConfigDict, PydanticBaseSettingsSource
  12. from pydantic_settings.sources import TomlConfigSettingsSource
  13. from pydantic_pkgr import func_takes_args_or_kwargs
  14. from . import toml_util
  15. PACKAGE_DIR = Path(__file__).resolve().parent.parent
  16. DATA_DIR = Path(os.getcwd()).resolve()
  17. ARCHIVEBOX_CONFIG_FILE = DATA_DIR / "ArchiveBox.conf"
  18. ARCHIVEBOX_CONFIG_FILE_BAK = ARCHIVEBOX_CONFIG_FILE.parent / ".ArchiveBox.conf.bak"
  19. AUTOFIXES_HEADER = "[AUTOFIXES]"
  20. AUTOFIXES_SUBHEADER = "# The following config was added automatically to fix problems detected at startup:"
  21. _ALREADY_WARNED_ABOUT_UPDATED_CONFIG = set()
  22. class FlatTomlConfigSettingsSource(TomlConfigSettingsSource):
  23. """
  24. A source class that loads variables from a TOML file
  25. """
  26. def __init__(
  27. self,
  28. settings_cls: type[BaseSettings],
  29. toml_file: Path | None=None,
  30. ):
  31. self.toml_file_path = toml_file or settings_cls.model_config.get("toml_file")
  32. self.nested_toml_data = self._read_files(self.toml_file_path)
  33. self.toml_data = {}
  34. for top_level_key, top_level_value in self.nested_toml_data.items():
  35. if isinstance(top_level_value, dict):
  36. # value is nested, flatten it
  37. for key, value in top_level_value.items():
  38. self.toml_data[key] = value
  39. else:
  40. # value is already flat, just set it as-is
  41. self.toml_data[top_level_key] = top_level_value
  42. # filter toml_data to only include keys that are defined on this settings_cls
  43. self.toml_data = {
  44. key: value
  45. for key, value in self.toml_data.items()
  46. if key in settings_cls.model_fields
  47. }
  48. super(TomlConfigSettingsSource, self).__init__(settings_cls, self.toml_data)
  49. class BaseConfigSet(BaseSettings):
  50. """
  51. This is the base class for an ArchiveBox ConfigSet.
  52. It handles loading values from schema defaults, ArchiveBox.conf TOML config, and environment variables.
  53. class WgetConfig(ArchiveBoxBaseConfig):
  54. WGET_BINARY: str = Field(default='wget', alias='WGET_BINARY_PATH')
  55. c = WgetConfig()
  56. print(c.WGET_BINARY) # outputs: wget
  57. # you can mutate process environment variable and reload config using .__init__()
  58. os.environ['WGET_BINARY_PATH'] = 'wget2'
  59. c.__init__()
  60. print(c.WGET_BINARY) # outputs: wget2
  61. """
  62. # these pydantic config options are all VERY carefully chosen, make sure to test thoroughly before changing!!!
  63. model_config = SettingsConfigDict(
  64. validate_default=False,
  65. case_sensitive=True,
  66. extra="ignore",
  67. arbitrary_types_allowed=False,
  68. populate_by_name=True,
  69. from_attributes=True,
  70. loc_by_alias=False,
  71. validate_assignment=True,
  72. validate_return=True,
  73. revalidate_instances="subclass-instances",
  74. )
  75. load_from_defaults: ClassVar[bool] = True
  76. load_from_collection: ClassVar[bool] = True
  77. load_from_environment: ClassVar[bool] = True
  78. @classmethod
  79. def settings_customise_sources(
  80. cls,
  81. settings_cls: Type[BaseSettings],
  82. init_settings: PydanticBaseSettingsSource,
  83. env_settings: PydanticBaseSettingsSource,
  84. dotenv_settings: PydanticBaseSettingsSource,
  85. file_secret_settings: PydanticBaseSettingsSource,
  86. ) -> Tuple[PydanticBaseSettingsSource, ...]:
  87. """Defines the config precedence order: Schema defaults -> ArchiveBox.conf (TOML) -> Environment variables"""
  88. # import ipdb; ipdb.set_trace()
  89. precedence_order = {}
  90. # if ArchiveBox.conf does not exist yet, return defaults -> env order
  91. if not ARCHIVEBOX_CONFIG_FILE.is_file():
  92. precedence_order = {
  93. 'defaults': init_settings,
  94. 'environment': env_settings,
  95. }
  96. # if ArchiveBox.conf exists and is in TOML format, return default -> TOML -> env order
  97. try:
  98. precedence_order = precedence_order or {
  99. 'defaults': init_settings,
  100. # 'collection': FlatTomlConfigSettingsSource(settings_cls, toml_file=ARCHIVEBOX_CONFIG_FILE),
  101. 'collection': FlatTomlConfigSettingsSource(settings_cls, toml_file=ARCHIVEBOX_CONFIG_FILE),
  102. 'environment': env_settings,
  103. }
  104. except Exception as err:
  105. if err.__class__.__name__ != "TOMLDecodeError":
  106. raise
  107. # if ArchiveBox.conf exists and is in INI format, convert it then return default -> TOML -> env order
  108. # Convert ArchiveBox.conf in INI format to TOML and save original to .ArchiveBox.bak
  109. original_ini = ARCHIVEBOX_CONFIG_FILE.read_text()
  110. ARCHIVEBOX_CONFIG_FILE_BAK.write_text(original_ini)
  111. new_toml = toml_util.convert(original_ini)
  112. ARCHIVEBOX_CONFIG_FILE.write_text(new_toml)
  113. precedence_order = {
  114. 'defaults': init_settings,
  115. # 'collection': FlatTomlConfigSettingsSource(settings_cls, toml_file=ARCHIVEBOX_CONFIG_FILE),
  116. 'collection': FlatTomlConfigSettingsSource(settings_cls, toml_file=ARCHIVEBOX_CONFIG_FILE),
  117. 'environment': env_settings,
  118. }
  119. if not cls.load_from_environment:
  120. precedence_order.pop('environment')
  121. if not cls.load_from_collection:
  122. precedence_order.pop('collection')
  123. if not cls.load_from_defaults:
  124. precedence_order.pop('defaults')
  125. return tuple(precedence_order.values())
  126. @model_validator(mode="after")
  127. def fill_defaults(self):
  128. """Populate any unset values using function provided as their default"""
  129. for key in self.model_fields.keys():
  130. if isinstance(getattr(self, key), Callable):
  131. if self.load_from_defaults:
  132. computed_default = self.get_default_value(key)
  133. # set generated default value as final validated value
  134. setattr(self, key, computed_default)
  135. return self
  136. def validate(self):
  137. """Manual validation method, to be called from plugin/__init__.py:get_CONFIG()"""
  138. pass
  139. def get_default_value(self, key):
  140. """Get the default value for a given config key"""
  141. field = self.model_fields[key]
  142. value = getattr(self, key)
  143. if isinstance(value, Callable):
  144. # if value is a function, execute it to get the actual value, passing existing config as a dict arg if expected
  145. if func_takes_args_or_kwargs(value):
  146. # assemble dict of existing field values to pass to default factory functions
  147. config_so_far = benedict(self.model_dump(include=set(self.model_fields.keys()), warnings=False))
  148. computed_default = field.default(config_so_far)
  149. else:
  150. # otherwise it's a pure function with no args, just call it
  151. computed_default = field.default()
  152. # coerce/check to make sure default factory return value matches type annotation
  153. TypeAdapter(field.annotation).validate_python(computed_default)
  154. return computed_default
  155. return value
  156. def update_in_place(self, warn=True, persist=False, hint='', **kwargs):
  157. """
  158. Update the config with new values. Use this sparingly! We should almost never be updating config at runtime.
  159. Sets them in the environment so they propagate to spawned subprocesses / across future re-__init__()s and reload from environment
  160. Example acceptable use case: user config says SEARCH_BACKEND_ENGINE=sonic but sonic_client pip library is not installed so we cannot use it.
  161. SEARCH_BACKEND_CONFIG.update_in_place(SEARCH_BACKEND_ENGINE='ripgrep') can be used to reset it back to ripgrep so we can continue.
  162. """
  163. from archivebox.misc.toml_util import CustomTOMLEncoder
  164. # silence warnings if they've already been shown once
  165. if all(key in _ALREADY_WARNED_ABOUT_UPDATED_CONFIG for key in kwargs.keys()):
  166. warn = False
  167. if warn:
  168. fix_scope = 'in ArchiveBox.conf' if persist else 'just for current run'
  169. print(f'\n[yellow]:warning: WARNING: Some config cannot be used as-is, fixing automatically {fix_scope}:[/yellow] {hint}', file=sys.stderr)
  170. # set the new values in the environment
  171. for key, value in kwargs.items():
  172. os.environ[key] = str(value)
  173. original_value = getattr(self, key)
  174. if warn:
  175. print(f' {key}={original_value} -> {value}')
  176. _ALREADY_WARNED_ABOUT_UPDATED_CONFIG.add(key)
  177. # if persist=True, write config changes to data/ArchiveBox.conf [AUTOFIXES] section
  178. try:
  179. if persist and ARCHIVEBOX_CONFIG_FILE.is_file():
  180. autofixes_to_add = benedict(kwargs).to_toml(encoder=CustomTOMLEncoder())
  181. existing_config = ARCHIVEBOX_CONFIG_FILE.read_text().split(AUTOFIXES_HEADER, 1)[0].strip()
  182. if AUTOFIXES_HEADER in existing_config:
  183. existing_autofixes = existing_config.split(AUTOFIXES_HEADER, 1)[-1].strip().replace(AUTOFIXES_SUBHEADER, '').replace(AUTOFIXES_HEADER, '').strip()
  184. else:
  185. existing_autofixes = ''
  186. new_config = '\n'.join(line for line in [
  187. existing_config,
  188. '\n' + AUTOFIXES_HEADER,
  189. AUTOFIXES_SUBHEADER,
  190. existing_autofixes,
  191. autofixes_to_add,
  192. ] if line.strip()).strip() + '\n'
  193. ARCHIVEBOX_CONFIG_FILE.write_text(new_config)
  194. except Exception:
  195. pass
  196. self.__init__()
  197. if warn:
  198. print(file=sys.stderr)
  199. return self
  200. @property
  201. def aliases(self) -> Dict[str, str]:
  202. alias_map = {}
  203. for key, field in self.model_fields.items():
  204. alias_map[key] = key
  205. if field.validation_alias is None:
  206. continue
  207. if isinstance(field.validation_alias, AliasChoices):
  208. for alias in field.validation_alias.choices:
  209. alias_map[alias] = key
  210. elif isinstance(field.alias, str):
  211. alias_map[field.alias] = key
  212. else:
  213. raise ValueError(f'Unknown alias type for field {key}: {field.alias}')
  214. return benedict(alias_map)
  215. @property
  216. def toml_section_header(self):
  217. """Convert the class name to a TOML section header e.g. ShellConfig -> SHELL_CONFIG"""
  218. class_name = self.__class__.__name__
  219. return re.sub('([A-Z]+)', r'_\1', class_name).upper().strip('_')
  220. def from_defaults(self) -> Dict[str, Any]:
  221. """Get the dictionary of {key: value} config loaded from the default values"""
  222. class OnlyDefaultsConfig(self.__class__):
  223. load_from_defaults = True
  224. load_from_collection = False
  225. load_from_environment = False
  226. return benedict(OnlyDefaultsConfig().model_dump(exclude_unset=False, exclude_defaults=False, exclude=set(self.model_computed_fields.keys())))
  227. def from_collection(self) -> Dict[str, Any]:
  228. """Get the dictionary of {key: value} config loaded from the collection ArchiveBox.conf"""
  229. class OnlyConfigFileConfig(self.__class__):
  230. load_from_defaults = False
  231. load_from_collection = True
  232. load_from_environment = False
  233. return benedict(OnlyConfigFileConfig().model_dump(exclude_unset=True, exclude_defaults=False, exclude=set(self.model_computed_fields.keys())))
  234. def from_environment(self) -> Dict[str, Any]:
  235. """Get the dictionary of {key: value} config loaded from the environment variables"""
  236. class OnlyEnvironmentConfig(self.__class__):
  237. load_from_defaults = False
  238. load_from_collection = False
  239. load_from_environment = True
  240. return benedict(OnlyEnvironmentConfig().model_dump(exclude_unset=True, exclude_defaults=False, exclude=set(self.model_computed_fields.keys())))
  241. def from_computed(self) -> Dict[str, Any]:
  242. """Get the dictionary of {key: value} config loaded from the computed fields"""
  243. return benedict(self.model_dump(include=set(self.model_computed_fields.keys())))
  244. def to_toml_dict(self, defaults=False) -> Dict[str, Any]:
  245. """Get the current config as a TOML-ready dict"""
  246. config_dict = {}
  247. for key, value in benedict(self).items():
  248. if defaults or value != self.get_default_value(key):
  249. config_dict[key] = value
  250. return benedict({self.toml_section_header: config_dict})
  251. def to_toml_str(self, defaults=False) -> str:
  252. """Get the current config as a TOML string"""
  253. from archivebox.misc.toml_util import CustomTOMLEncoder
  254. toml_dict = self.to_toml_dict(defaults=defaults)
  255. if not toml_dict[self.toml_section_header]:
  256. # if the section is empty, don't write it
  257. toml_dict.pop(self.toml_section_header)
  258. return toml.dumps(toml_dict, encoder=CustomTOMLEncoder())
  259. def as_legacy_config_schema(self) -> Dict[str, Any]:
  260. # shim for backwards compatibility with old config schema style
  261. model_values = self.model_dump()
  262. return benedict({
  263. key: {'type': field.annotation, 'default': model_values[key]}
  264. for key, field in self.model_fields.items()
  265. })