# configset.py
  1. """
  2. Simplified config system for ArchiveBox.
  3. This replaces the complex abx_spec_config/base_configset.py with a simpler
  4. approach that still supports environment variables, config files, and
  5. per-object overrides.
  6. """
  7. __package__ = "archivebox.config"
  8. import os
  9. import json
  10. from pathlib import Path
  11. from typing import Any, Dict, Optional, List, Type, Tuple, TYPE_CHECKING, cast
  12. from configparser import ConfigParser
  13. from pydantic import Field, ConfigDict
  14. from pydantic_settings import BaseSettings, PydanticBaseSettingsSource
  15. class IniConfigSettingsSource(PydanticBaseSettingsSource):
  16. """
  17. Custom settings source that reads from ArchiveBox.conf (INI format).
  18. Flattens all sections into a single namespace.
  19. """
  20. def get_field_value(self, field: Any, field_name: str) -> Tuple[Any, str, bool]:
  21. config_vals = self._load_config_file()
  22. field_value = config_vals.get(field_name.upper())
  23. return field_value, field_name, False
  24. def __call__(self) -> Dict[str, Any]:
  25. return self._load_config_file()
  26. def _load_config_file(self) -> Dict[str, Any]:
  27. try:
  28. from archivebox.config.constants import CONSTANTS
  29. config_path = CONSTANTS.CONFIG_FILE
  30. except ImportError:
  31. return {}
  32. if not config_path.exists():
  33. return {}
  34. parser = ConfigParser()
  35. parser.optionxform = lambda x: x # preserve case
  36. parser.read(config_path)
  37. # Flatten all sections into single namespace (ignore section headers)
  38. return {key.upper(): value for section in parser.sections() for key, value in parser.items(section)}
  39. class BaseConfigSet(BaseSettings):
  40. """
  41. Base class for config sections.
  42. Automatically loads values from (highest to lowest priority):
  43. 1. Environment variables
  44. 2. ArchiveBox.conf file (INI format, flattened)
  45. 3. Default values
  46. Subclasses define fields with defaults and types:
  47. class ShellConfig(BaseConfigSet):
  48. DEBUG: bool = Field(default=False)
  49. USE_COLOR: bool = Field(default=True)
  50. """
  51. model_config = ConfigDict(
  52. env_prefix="",
  53. extra="ignore",
  54. validate_default=True,
  55. )
  56. @classmethod
  57. def settings_customise_sources(
  58. cls,
  59. settings_cls: Type[BaseSettings],
  60. init_settings: PydanticBaseSettingsSource,
  61. env_settings: PydanticBaseSettingsSource,
  62. dotenv_settings: PydanticBaseSettingsSource,
  63. file_secret_settings: PydanticBaseSettingsSource,
  64. ) -> Tuple[PydanticBaseSettingsSource, ...]:
  65. """
  66. Define the order of settings sources (first = highest priority).
  67. """
  68. return (
  69. init_settings, # 1. Passed to __init__
  70. env_settings, # 2. Environment variables
  71. IniConfigSettingsSource(settings_cls), # 3. ArchiveBox.conf file
  72. # dotenv_settings, # Skip .env files
  73. # file_secret_settings, # Skip secrets files
  74. )
  75. @classmethod
  76. def load_from_file(cls, config_path: Path) -> Dict[str, str]:
  77. """Load config values from INI file."""
  78. if not config_path.exists():
  79. return {}
  80. parser = ConfigParser()
  81. parser.optionxform = lambda x: x # preserve case
  82. parser.read(config_path)
  83. # Flatten all sections into single namespace
  84. return {key.upper(): value for section in parser.sections() for key, value in parser.items(section)}
  85. def update_in_place(self, warn: bool = True, persist: bool = False, **kwargs) -> None:
  86. """
  87. Update config values in place.
  88. This allows runtime updates to config without reloading.
  89. """
  90. for key, value in kwargs.items():
  91. if hasattr(self, key):
  92. # Use object.__setattr__ to bypass pydantic's frozen model
  93. object.__setattr__(self, key, value)
def get_config(
    defaults: Optional[Dict] = None,
    persona: Any = None,
    user: Any = None,
    crawl: Any = None,
    snapshot: Any = None,
    archiveresult: Any = None,
    machine: Any = None,
) -> Dict[str, Any]:
    """
    Get merged config from all sources.

    Priority (highest to lowest):
      1. Per-snapshot config (snapshot.config JSON field)
      2. Per-crawl config (crawl.config JSON field)
      3. Per-user config (user.config JSON field)
      4. Per-persona config (persona.get_derived_config() - includes CHROME_USER_DATA_DIR etc.)
      5. Environment variables
      6. Per-machine config (machine.config JSON field - resolved binary paths)
      7. Config file (ArchiveBox.conf)
      8. Plugin schema defaults (config.json)
      9. Core config defaults

    Args:
        defaults: Default values to start with
        persona: Persona object (provides derived paths like CHROME_USER_DATA_DIR)
        user: User object with config JSON field
        crawl: Crawl object with config JSON field
        snapshot: Snapshot object with config JSON field
        archiveresult: ArchiveResult object (auto-fetches snapshot)
        machine: Machine object with config JSON field (defaults to Machine.current())

    Note: Objects are auto-fetched from relationships if not provided:
        - snapshot auto-fetched from archiveresult.snapshot
        - crawl auto-fetched from snapshot.crawl
        - user auto-fetched from crawl.created_by

    Returns:
        Merged config dict
    """
    # Auto-fetch related objects from relationships (each only if not passed explicitly)
    if snapshot is None and archiveresult and hasattr(archiveresult, "snapshot"):
        snapshot = archiveresult.snapshot
    if crawl is None and snapshot and hasattr(snapshot, "crawl"):
        crawl = snapshot.crawl
    if user is None and crawl and hasattr(crawl, "created_by"):
        user = crawl.created_by
    # Imports are deferred to call time to avoid circular imports during startup
    from archivebox.config.constants import CONSTANTS
    from archivebox.config.common import (
        SHELL_CONFIG,
        STORAGE_CONFIG,
        GENERAL_CONFIG,
        SERVER_CONFIG,
        ARCHIVING_CONFIG,
        SEARCH_BACKEND_CONFIG,
    )
    # Start with caller-provided defaults (lowest priority)
    config = dict(defaults or {})
    # Add plugin config defaults from JSONSchema config.json files
    try:
        from archivebox.hooks import get_config_defaults_from_plugins
        plugin_defaults = get_config_defaults_from_plugins()
        config.update(plugin_defaults)
    except ImportError:
        pass  # hooks not available yet during early startup
    # Add all core config sections (each BaseConfigSet is iterable as key/value pairs)
    config.update(dict(SHELL_CONFIG))
    config.update(dict(STORAGE_CONFIG))
    config.update(dict(GENERAL_CONFIG))
    config.update(dict(SERVER_CONFIG))
    config.update(dict(ARCHIVING_CONFIG))
    config.update(dict(SEARCH_BACKEND_CONFIG))
    # Load from the ArchiveBox.conf config file
    config_file = CONSTANTS.CONFIG_FILE
    if config_file.exists():
        file_config = BaseConfigSet.load_from_file(config_file)
        config.update(file_config)
    # Apply machine config overrides (cached binary paths, etc.)
    if machine is None:
        # Default to current machine if not provided
        try:
            from archivebox.machine.models import Machine
            machine = Machine.current()
        except Exception:
            pass  # Machine might not be available during early init
    if machine and hasattr(machine, "config") and machine.config:
        config.update(machine.config)
    # Override with environment variables (for keys that already exist in config,
    # parsed to the same type as the current value)
    for key in config:
        env_val = os.environ.get(key)
        if env_val is not None:
            config[key] = _parse_env_value(env_val, config.get(key))
    # Also add NEW environment variables (not yet in config)
    # This is important for worker subprocesses that receive config via Process.env
    # NOTE(review): this also pulls in unrelated uppercase env vars (PATH, HOME, ...)
    for key, value in os.environ.items():
        if key.isupper() and key not in config:  # Only uppercase keys (config convention)
            config[key] = _parse_env_value(value, None)
    # Also check plugin config aliases in environment
    try:
        from archivebox.hooks import discover_plugin_configs
        plugin_configs = discover_plugin_configs()
        for plugin_name, schema in plugin_configs.items():
            for key, prop_schema in schema.get('properties', {}).items():
                # Check x-aliases: an alias env var sets the canonical key,
                # but only if the canonical key itself is not set in the environment
                for alias in prop_schema.get('x-aliases', []):
                    if alias in os.environ and key not in os.environ:
                        config[key] = _parse_env_value(os.environ[alias], config.get(key))
                        break  # first matching alias wins
                # Check x-fallback: copy another key's value if this one is unset
                fallback = prop_schema.get('x-fallback')
                if fallback and fallback in config and key not in config:
                    config[key] = config[fallback]
    except ImportError:
        pass
    # Apply persona config overrides (includes derived paths like CHROME_USER_DATA_DIR)
    if persona and hasattr(persona, "get_derived_config"):
        config.update(persona.get_derived_config())
    # Apply user config overrides
    if user and hasattr(user, "config") and user.config:
        config.update(user.config)
    # Apply crawl config overrides
    if crawl and hasattr(crawl, "config") and crawl.config:
        config.update(crawl.config)
    # Add CRAWL_OUTPUT_DIR for snapshot hooks to find shared Chrome session
    if crawl and hasattr(crawl, "output_dir"):
        config['CRAWL_OUTPUT_DIR'] = str(crawl.output_dir)
        config['CRAWL_ID'] = str(getattr(crawl, "id", "")) if getattr(crawl, "id", None) else config.get('CRAWL_ID')
    # Apply snapshot config overrides (highest priority)
    if snapshot and hasattr(snapshot, "config") and snapshot.config:
        config.update(snapshot.config)
    if snapshot:
        config['SNAPSHOT_ID'] = str(getattr(snapshot, "id", "")) if getattr(snapshot, "id", None) else config.get('SNAPSHOT_ID')
        config['SNAPSHOT_DEPTH'] = int(getattr(snapshot, "depth", 0) or 0)
        if getattr(snapshot, "crawl_id", None):
            config['CRAWL_ID'] = str(snapshot.crawl_id)
    # Normalize all aliases to canonical names (after all sources merged)
    # This handles aliases that came from user/crawl/snapshot configs, not just env
    try:
        from archivebox.hooks import discover_plugin_configs
        plugin_configs = discover_plugin_configs()
        aliases_to_normalize = {}  # {alias_key: canonical_key}
        # Build alias mapping from all plugin schemas
        for plugin_name, schema in plugin_configs.items():
            for canonical_key, prop_schema in schema.get('properties', {}).items():
                for alias in prop_schema.get('x-aliases', []):
                    aliases_to_normalize[alias] = canonical_key
        # Normalize: copy alias values to canonical keys (aliases take precedence)
        for alias_key, canonical_key in aliases_to_normalize.items():
            if alias_key in config:
                # Alias exists - copy to canonical key (overwriting any default)
                config[canonical_key] = config[alias_key]
                # Remove alias from config to keep it clean
                del config[alias_key]
    except ImportError:
        pass
    return config
  246. def get_flat_config() -> Dict[str, Any]:
  247. """
  248. Get a flat dictionary of all config values.
  249. Replaces abx.pm.hook.get_FLAT_CONFIG()
  250. """
  251. return get_config()
  252. def get_all_configs() -> Dict[str, BaseConfigSet]:
  253. """
  254. Get all config section objects as a dictionary.
  255. Replaces abx.pm.hook.get_CONFIGS()
  256. """
  257. from archivebox.config.common import (
  258. SHELL_CONFIG, SERVER_CONFIG, ARCHIVING_CONFIG, SEARCH_BACKEND_CONFIG
  259. )
  260. return {
  261. 'SHELL_CONFIG': SHELL_CONFIG,
  262. 'SERVER_CONFIG': SERVER_CONFIG,
  263. 'ARCHIVING_CONFIG': ARCHIVING_CONFIG,
  264. 'SEARCH_BACKEND_CONFIG': SEARCH_BACKEND_CONFIG,
  265. }
  266. def _parse_env_value(value: str, default: Any = None) -> Any:
  267. """Parse an environment variable value based on expected type."""
  268. if default is None:
  269. # Try to guess the type
  270. if value.lower() in ("true", "false", "yes", "no", "1", "0"):
  271. return value.lower() in ("true", "yes", "1")
  272. try:
  273. return int(value)
  274. except ValueError:
  275. pass
  276. try:
  277. return json.loads(value)
  278. except (json.JSONDecodeError, ValueError):
  279. pass
  280. return value
  281. # Parse based on default's type
  282. if isinstance(default, bool):
  283. return value.lower() in ("true", "yes", "1")
  284. elif isinstance(default, int):
  285. return int(value)
  286. elif isinstance(default, float):
  287. return float(value)
  288. elif isinstance(default, (list, dict)):
  289. return json.loads(value)
  290. elif isinstance(default, Path):
  291. return Path(value)
  292. else:
  293. return value
# Default worker concurrency settings: max concurrent processes per worker type.
# Overridable at runtime via the WORKER_CONCURRENCY config key (JSON dict);
# see get_worker_concurrency(). Values presumably reflect each extractor's
# resource cost (lighter extractors like title/favicon allow more) — tune as needed.
DEFAULT_WORKER_CONCURRENCY = {
    "crawl": 2,
    "snapshot": 3,
    "wget": 2,
    "ytdlp": 2,
    "screenshot": 3,
    "singlefile": 2,
    "title": 5,
    "favicon": 5,
    "headers": 5,
    "archivedotorg": 2,
    "readability": 3,
    "mercury": 3,
    "git": 2,
    "pdf": 2,
    "dom": 3,
}
  312. def get_worker_concurrency() -> Dict[str, int]:
  313. """
  314. Get worker concurrency settings.
  315. Can be configured via WORKER_CONCURRENCY env var as JSON dict.
  316. """
  317. config = get_config()
  318. # Start with defaults
  319. concurrency = DEFAULT_WORKER_CONCURRENCY.copy()
  320. # Override with config
  321. if "WORKER_CONCURRENCY" in config:
  322. custom = config["WORKER_CONCURRENCY"]
  323. if isinstance(custom, str):
  324. custom = json.loads(custom)
  325. concurrency.update(custom)
  326. return concurrency