# configset.py
  1. """
  2. Simplified config system for ArchiveBox.
  3. This replaces the complex abx_spec_config/base_configset.py with a simpler
  4. approach that still supports environment variables, config files, and
  5. per-object overrides.
  6. """
  7. __package__ = "archivebox.config"
  8. import os
  9. import json
  10. from pathlib import Path
  11. from typing import Any, Dict, Optional, List, Type, Tuple, TYPE_CHECKING, cast
  12. from configparser import ConfigParser
  13. from pydantic import Field, ConfigDict
  14. from pydantic_settings import BaseSettings, PydanticBaseSettingsSource
  15. class IniConfigSettingsSource(PydanticBaseSettingsSource):
  16. """
  17. Custom settings source that reads from ArchiveBox.conf (INI format).
  18. Flattens all sections into a single namespace.
  19. """
  20. def get_field_value(self, field: Any, field_name: str) -> Tuple[Any, str, bool]:
  21. config_vals = self._load_config_file()
  22. field_value = config_vals.get(field_name.upper())
  23. return field_value, field_name, False
  24. def __call__(self) -> Dict[str, Any]:
  25. return self._load_config_file()
  26. def _load_config_file(self) -> Dict[str, Any]:
  27. try:
  28. from archivebox.config.constants import CONSTANTS
  29. config_path = CONSTANTS.CONFIG_FILE
  30. except ImportError:
  31. return {}
  32. if not config_path.exists():
  33. return {}
  34. parser = ConfigParser()
  35. parser.optionxform = lambda x: x # preserve case
  36. parser.read(config_path)
  37. # Flatten all sections into single namespace (ignore section headers)
  38. return {key.upper(): value for section in parser.sections() for key, value in parser.items(section)}
  39. class BaseConfigSet(BaseSettings):
  40. """
  41. Base class for config sections.
  42. Automatically loads values from (highest to lowest priority):
  43. 1. Environment variables
  44. 2. ArchiveBox.conf file (INI format, flattened)
  45. 3. Default values
  46. Subclasses define fields with defaults and types:
  47. class ShellConfig(BaseConfigSet):
  48. DEBUG: bool = Field(default=False)
  49. USE_COLOR: bool = Field(default=True)
  50. """
  51. model_config = ConfigDict(
  52. env_prefix="",
  53. extra="ignore",
  54. validate_default=True,
  55. )
  56. @classmethod
  57. def settings_customise_sources(
  58. cls,
  59. settings_cls: Type[BaseSettings],
  60. init_settings: PydanticBaseSettingsSource,
  61. env_settings: PydanticBaseSettingsSource,
  62. dotenv_settings: PydanticBaseSettingsSource,
  63. file_secret_settings: PydanticBaseSettingsSource,
  64. ) -> Tuple[PydanticBaseSettingsSource, ...]:
  65. """
  66. Define the order of settings sources (first = highest priority).
  67. """
  68. return (
  69. init_settings, # 1. Passed to __init__
  70. env_settings, # 2. Environment variables
  71. IniConfigSettingsSource(settings_cls), # 3. ArchiveBox.conf file
  72. # dotenv_settings, # Skip .env files
  73. # file_secret_settings, # Skip secrets files
  74. )
  75. @classmethod
  76. def load_from_file(cls, config_path: Path) -> Dict[str, str]:
  77. """Load config values from INI file."""
  78. if not config_path.exists():
  79. return {}
  80. parser = ConfigParser()
  81. parser.optionxform = lambda x: x # preserve case
  82. parser.read(config_path)
  83. # Flatten all sections into single namespace
  84. return {key.upper(): value for section in parser.sections() for key, value in parser.items(section)}
  85. def update_in_place(self, warn: bool = True, persist: bool = False, **kwargs) -> None:
  86. """
  87. Update config values in place.
  88. This allows runtime updates to config without reloading.
  89. """
  90. for key, value in kwargs.items():
  91. if hasattr(self, key):
  92. # Use object.__setattr__ to bypass pydantic's frozen model
  93. object.__setattr__(self, key, value)
def get_config(
    defaults: Optional[Dict] = None,
    persona: Any = None,
    user: Any = None,
    crawl: Any = None,
    snapshot: Any = None,
    archiveresult: Any = None,
    machine: Any = None,
) -> Dict[str, Any]:
    """
    Get merged config from all sources.

    Priority (highest to lowest):
      1. Per-snapshot config (snapshot.config JSON field)
      2. Per-crawl config (crawl.config JSON field)
      3. Per-user config (user.config JSON field)
      4. Per-persona config (persona.get_derived_config() - includes CHROME_USER_DATA_DIR etc.)
      5. Environment variables
      6. Per-machine config (machine.config JSON field - resolved binary paths)
      7. Config file (ArchiveBox.conf)
      8. Plugin schema defaults (config.json)
      9. Core config defaults

    Args:
        defaults: Default values to start with
        persona: Persona object (provides derived paths like CHROME_USER_DATA_DIR)
        user: User object with config JSON field
        crawl: Crawl object with config JSON field
        snapshot: Snapshot object with config JSON field
        archiveresult: ArchiveResult object (auto-fetches snapshot)
        machine: Machine object with config JSON field (defaults to Machine.current())

    Note: Objects are auto-fetched from relationships if not provided:
        - snapshot auto-fetched from archiveresult.snapshot
        - crawl auto-fetched from snapshot.crawl
        - user auto-fetched from crawl.created_by

    Returns:
        Merged config dict
    """
    # Auto-fetch related objects from relationships (each only if not passed explicitly)
    if snapshot is None and archiveresult and hasattr(archiveresult, "snapshot"):
        snapshot = archiveresult.snapshot
    if crawl is None and snapshot and hasattr(snapshot, "crawl"):
        crawl = snapshot.crawl
    if user is None and crawl and hasattr(crawl, "created_by"):
        user = crawl.created_by
    # Imports are deferred to call time to avoid circular imports during startup
    from archivebox.config.constants import CONSTANTS
    from archivebox.config.common import (
        SHELL_CONFIG,
        STORAGE_CONFIG,
        GENERAL_CONFIG,
        SERVER_CONFIG,
        ARCHIVING_CONFIG,
        SEARCH_BACKEND_CONFIG,
    )
    # Start with caller-provided defaults (lowest priority)
    config = dict(defaults or {})
    # Add plugin config defaults from JSONSchema config.json files
    try:
        from archivebox.hooks import get_config_defaults_from_plugins
        plugin_defaults = get_config_defaults_from_plugins()
        config.update(plugin_defaults)
    except ImportError:
        pass  # hooks not available yet during early startup
    # Add all core config sections (each BaseConfigSet is iterable as key/value pairs)
    config.update(dict(SHELL_CONFIG))
    config.update(dict(STORAGE_CONFIG))
    config.update(dict(GENERAL_CONFIG))
    config.update(dict(SERVER_CONFIG))
    config.update(dict(ARCHIVING_CONFIG))
    config.update(dict(SEARCH_BACKEND_CONFIG))
    # Load from the ArchiveBox.conf config file
    config_file = CONSTANTS.CONFIG_FILE
    if config_file.exists():
        file_config = BaseConfigSet.load_from_file(config_file)
        config.update(file_config)
    # Apply machine config overrides (cached binary paths, etc.)
    if machine is None:
        # Default to current machine if not provided
        try:
            from archivebox.machine.models import Machine
            machine = Machine.current()
        except Exception:
            pass  # Machine might not be available during early init
    if machine and hasattr(machine, "config") and machine.config:
        config.update(machine.config)
    # Override with environment variables (for keys that already exist in config,
    # parsed to the same type as the current value)
    for key in config:
        env_val = os.environ.get(key)
        if env_val is not None:
            config[key] = _parse_env_value(env_val, config.get(key))
    # Also add NEW environment variables (not yet in config)
    # This is important for worker subprocesses that receive config via Process.env
    # NOTE(review): this also pulls in unrelated uppercase env vars (PATH, HOME, ...)
    for key, value in os.environ.items():
        if key.isupper() and key not in config:  # Only uppercase keys (config convention)
            config[key] = _parse_env_value(value, None)
    # Also check plugin config aliases in environment
    try:
        from archivebox.hooks import discover_plugin_configs
        plugin_configs = discover_plugin_configs()
        for plugin_name, schema in plugin_configs.items():
            for key, prop_schema in schema.get('properties', {}).items():
                # Check x-aliases: an alias env var sets the canonical key,
                # but only if the canonical key itself is not set in the environment
                for alias in prop_schema.get('x-aliases', []):
                    if alias in os.environ and key not in os.environ:
                        config[key] = _parse_env_value(os.environ[alias], config.get(key))
                        break  # first matching alias wins
                # Check x-fallback: copy another key's value if this one is unset
                fallback = prop_schema.get('x-fallback')
                if fallback and fallback in config and key not in config:
                    config[key] = config[fallback]
    except ImportError:
        pass
    # Apply persona config overrides (includes derived paths like CHROME_USER_DATA_DIR)
    if persona and hasattr(persona, "get_derived_config"):
        config.update(persona.get_derived_config())
    # Apply user config overrides
    if user and hasattr(user, "config") and user.config:
        config.update(user.config)
    # Apply crawl config overrides
    if crawl and hasattr(crawl, "config") and crawl.config:
        config.update(crawl.config)
    # Add CRAWL_OUTPUT_DIR for snapshot hooks to find shared Chrome session
    if crawl and hasattr(crawl, "output_dir"):
        config['CRAWL_OUTPUT_DIR'] = str(crawl.output_dir)
        config['CRAWL_ID'] = str(getattr(crawl, "id", "")) if getattr(crawl, "id", None) else config.get('CRAWL_ID')
    # Apply snapshot config overrides (highest priority)
    if snapshot and hasattr(snapshot, "config") and snapshot.config:
        config.update(snapshot.config)
    if snapshot:
        config['SNAPSHOT_ID'] = str(getattr(snapshot, "id", "")) if getattr(snapshot, "id", None) else config.get('SNAPSHOT_ID')
        config['SNAPSHOT_DEPTH'] = int(getattr(snapshot, "depth", 0) or 0)
        if getattr(snapshot, "crawl_id", None):
            config['CRAWL_ID'] = str(snapshot.crawl_id)
    # Normalize all aliases to canonical names (after all sources merged)
    # This handles aliases that came from user/crawl/snapshot configs, not just env
    try:
        from archivebox.hooks import discover_plugin_configs
        plugin_configs = discover_plugin_configs()
        aliases_to_normalize = {}  # {alias_key: canonical_key}
        # Build alias mapping from all plugin schemas
        for plugin_name, schema in plugin_configs.items():
            for canonical_key, prop_schema in schema.get('properties', {}).items():
                for alias in prop_schema.get('x-aliases', []):
                    aliases_to_normalize[alias] = canonical_key
        # Normalize: copy alias values to canonical keys (aliases take precedence)
        for alias_key, canonical_key in aliases_to_normalize.items():
            if alias_key in config:
                # Alias exists - copy to canonical key (overwriting any default)
                config[canonical_key] = config[alias_key]
                # Remove alias from config to keep it clean
                del config[alias_key]
    except ImportError:
        pass
    return config
  246. def get_flat_config() -> Dict[str, Any]:
  247. """
  248. Get a flat dictionary of all config values.
  249. Replaces abx.pm.hook.get_FLAT_CONFIG()
  250. """
  251. return get_config()
  252. def get_all_configs() -> Dict[str, BaseConfigSet]:
  253. """
  254. Get all config section objects as a dictionary.
  255. Replaces abx.pm.hook.get_CONFIGS()
  256. """
  257. from archivebox.config.common import (
  258. SHELL_CONFIG, SERVER_CONFIG, ARCHIVING_CONFIG, SEARCH_BACKEND_CONFIG
  259. )
  260. return {
  261. 'SHELL_CONFIG': SHELL_CONFIG,
  262. 'SERVER_CONFIG': SERVER_CONFIG,
  263. 'ARCHIVING_CONFIG': ARCHIVING_CONFIG,
  264. 'SEARCH_BACKEND_CONFIG': SEARCH_BACKEND_CONFIG,
  265. }
  266. def _parse_env_value(value: str, default: Any = None) -> Any:
  267. """Parse an environment variable value based on expected type."""
  268. if default is None:
  269. # Try to guess the type
  270. if value.lower() in ("true", "false", "yes", "no", "1", "0"):
  271. return value.lower() in ("true", "yes", "1")
  272. try:
  273. return int(value)
  274. except ValueError:
  275. pass
  276. try:
  277. return json.loads(value)
  278. except (json.JSONDecodeError, ValueError):
  279. pass
  280. return value
  281. # Parse based on default's type
  282. if isinstance(default, bool):
  283. return value.lower() in ("true", "yes", "1")
  284. elif isinstance(default, int):
  285. return int(value)
  286. elif isinstance(default, float):
  287. return float(value)
  288. elif isinstance(default, (list, dict)):
  289. return json.loads(value)
  290. elif isinstance(default, Path):
  291. return Path(value)
  292. else:
  293. return value
# Default worker concurrency settings: max concurrent processes per worker type.
# Overridable at runtime via the WORKER_CONCURRENCY config key (JSON dict);
# see get_worker_concurrency(). Values presumably reflect each extractor's
# resource cost (lighter extractors like title/favicon allow more) — tune as needed.
DEFAULT_WORKER_CONCURRENCY = {
    "crawl": 2,
    "snapshot": 3,
    "wget": 2,
    "ytdlp": 2,
    "screenshot": 3,
    "singlefile": 2,
    "title": 5,
    "favicon": 5,
    "headers": 5,
    "archivedotorg": 2,
    "readability": 3,
    "mercury": 3,
    "git": 2,
    "pdf": 2,
    "dom": 3,
}
  312. def get_worker_concurrency() -> Dict[str, int]:
  313. """
  314. Get worker concurrency settings.
  315. Can be configured via WORKER_CONCURRENCY env var as JSON dict.
  316. """
  317. config = get_config()
  318. # Start with defaults
  319. concurrency = DEFAULT_WORKER_CONCURRENCY.copy()
  320. # Override with config
  321. if "WORKER_CONCURRENCY" in config:
  322. custom = config["WORKER_CONCURRENCY"]
  323. if isinstance(custom, str):
  324. custom = json.loads(custom)
  325. concurrency.update(custom)
  326. return concurrency