Bläddra i källkod

rename configfile to collection

Nick Sweeting 1 år sedan
förälder
incheckning
60f0458c77

+ 1 - 1
archivebox/abx/archivebox/base_binary.py

@@ -14,7 +14,6 @@ from pydantic_pkgr import (
     EnvProvider,
 )
 
-from archivebox.config import CONSTANTS
 from archivebox.config.permissions import ARCHIVEBOX_USER
 
 import abx
@@ -34,6 +33,7 @@ class BaseBinProvider(BinProvider):
         return [self]
 
 class BaseBinary(Binary):
+    # TODO: formalize state diagram, final states, transitions, side effects, etc.
 
     @staticmethod
     def symlink_to_lib(binary, bin_dir=None) -> None:

+ 12 - 10
archivebox/abx/archivebox/base_configset.py

@@ -99,7 +99,7 @@ class BaseConfigSet(BaseSettings):
     )
     
     load_from_defaults: ClassVar[bool] = True
-    load_from_configfile: ClassVar[bool] = True
+    load_from_collection: ClassVar[bool] = True
     load_from_environment: ClassVar[bool] = True
 
     @classmethod
@@ -128,7 +128,8 @@ class BaseConfigSet(BaseSettings):
         try:
             precedence_order = precedence_order or {
                 'defaults': init_settings,
-                'configfile': FlatTomlConfigSettingsSource(settings_cls, toml_file=ARCHIVEBOX_CONFIG_FILE),
+                # 'collection': FlatTomlConfigSettingsSource(settings_cls, toml_file=ARCHIVEBOX_CONFIG_FILE),
+                'collection': FlatTomlConfigSettingsSource(settings_cls, toml_file=ARCHIVEBOX_CONFIG_FILE),
                 'environment': env_settings,
             }
         except Exception as err:
@@ -144,14 +145,15 @@ class BaseConfigSet(BaseSettings):
 
             precedence_order = {
                 'defaults': init_settings,
-                'configfile': FlatTomlConfigSettingsSource(settings_cls, toml_file=ARCHIVEBOX_CONFIG_FILE),
+                # 'collection': FlatTomlConfigSettingsSource(settings_cls, toml_file=ARCHIVEBOX_CONFIG_FILE),
+                'collection': FlatTomlConfigSettingsSource(settings_cls, toml_file=ARCHIVEBOX_CONFIG_FILE),
                 'environment': env_settings,
             }
             
         if not cls.load_from_environment:
             precedence_order.pop('environment')
-        if not cls.load_from_configfile:
-            precedence_order.pop('configfile')
+        if not cls.load_from_collection:
+            precedence_order.pop('collection')
         if not cls.load_from_defaults:
             precedence_order.pop('defaults')
 
@@ -278,15 +280,15 @@ class BaseConfigSet(BaseSettings):
         """Get the dictionary of {key: value} config loaded from the default values"""
         class OnlyDefaultsConfig(self.__class__):
             load_from_defaults = True
-            load_from_configfile = False
+            load_from_collection = False
             load_from_environment = False
         return benedict(OnlyDefaultsConfig().model_dump(exclude_unset=False, exclude_defaults=False, exclude=set(self.model_computed_fields.keys())))
     
-    def from_configfile(self) -> Dict[str, Any]:
-        """Get the dictionary of {key: value} config loaded from the configfile ArchiveBox.conf"""
+    def from_collection(self) -> Dict[str, Any]:
+        """Get the dictionary of {key: value} config loaded from the collection config file (ArchiveBox.conf)"""
         class OnlyConfigFileConfig(self.__class__):
             load_from_defaults = False
-            load_from_configfile = True
+            load_from_collection = True
             load_from_environment = False
         return benedict(OnlyConfigFileConfig().model_dump(exclude_unset=True, exclude_defaults=False, exclude=set(self.model_computed_fields.keys())))
     
@@ -294,7 +296,7 @@ class BaseConfigSet(BaseSettings):
         """Get the dictionary of {key: value} config loaded from the environment variables"""
         class OnlyEnvironmentConfig(self.__class__):
             load_from_defaults = False
-            load_from_configfile = False
+            load_from_collection = False
             load_from_environment = True
         return benedict(OnlyEnvironmentConfig().model_dump(exclude_unset=True, exclude_defaults=False, exclude=set(self.model_computed_fields.keys())))
     

+ 5 - 20
archivebox/abx/archivebox/base_extractor.py

@@ -4,10 +4,9 @@ import json
 import os
 
 from typing import Optional, List, Literal, Annotated, Dict, Any, Tuple
-from typing_extensions import Self
 from pathlib import Path
 
-from pydantic import model_validator, AfterValidator
+from pydantic import AfterValidator
 from pydantic_pkgr import BinName
 from django.utils.functional import cached_property
 from django.utils import timezone
@@ -17,36 +16,22 @@ import abx
 from .base_binary import BaseBinary
 
 
-def no_empty_args(args: List[str]) -> List[str]:
+def assert_no_empty_args(args: List[str]) -> List[str]:
     assert all(len(arg) for arg in args)
     return args
 
-ExtractorName = Literal['wget', 'warc', 'media', 'singlefile'] | str
+ExtractorName = Annotated[str, AfterValidator(lambda s: s.isidentifier())]
 
 HandlerFuncStr = Annotated[str, AfterValidator(lambda s: s.startswith('self.'))]
-CmdArgsList = Annotated[List[str] | Tuple[str, ...], AfterValidator(no_empty_args)]
+CmdArgsList = Annotated[List[str] | Tuple[str, ...], AfterValidator(assert_no_empty_args)]
 
 
 class BaseExtractor:
-    
     name: ExtractorName
     binary: BinName
 
-    output_path_func: HandlerFuncStr = 'self.get_output_path'
-    should_extract_func: HandlerFuncStr = 'self.should_extract'
-    extract_func: HandlerFuncStr = 'self.extract'
-    exec_func: HandlerFuncStr = 'self.exec'
-
     default_args: CmdArgsList = []
     extra_args: CmdArgsList = []
-    args: Optional[CmdArgsList] = None
-
-    @model_validator(mode='after')
-    def validate_model(self) -> Self:
-        if self.args is None:
-            self.args = [*self.default_args, *self.extra_args]
-        return self
-
 
     def get_output_path(self, snapshot) -> Path:
         return Path(self.__class__.__name__.lower())
@@ -71,7 +56,7 @@ class BaseExtractor:
         
         snapshot = Snapshot.objects.get(id=snapshot_id)
         
-        if not self.should_extract(snapshot):
+        if not self.should_extract(snapshot.url):
             return {}
         
         status = 'failed'

+ 1 - 1
archivebox/abx/archivebox/reads.py

@@ -57,7 +57,7 @@ def get_HOOKS() -> Set[str]:
             for hook_name in get_PLUGIN(plugin_id).hooks
     }
 
-def get_CONFIGS() -> Dict[str, 'BaseConfigSet']:
+def get_CONFIGS() -> benedict:   # Dict[str, 'BaseConfigSet']
     return benedict({
         config_id: configset
         for plugin_configs in pm.hook.get_CONFIG()

+ 1 - 1
archivebox/abx/archivebox/writes.py

@@ -88,7 +88,7 @@ def create_root_snapshot_from_seed(crawl):
 def create_archiveresults_pending_from_snapshot(snapshot, config):
     config = get_scope_config(
         # defaults=settings.CONFIG_FROM_DEFAULTS,
-        # configfile=settings.CONFIG_FROM_FILE,
+        # collection=settings.CONFIG_FROM_FILE,
         # environment=settings.CONFIG_FROM_ENVIRONMENT,
         persona=archiveresult.snapshot.crawl.persona,
         seed=archiveresult.snapshot.crawl.seed,

+ 1 - 1
archivebox/config/configfile.py

@@ -15,7 +15,7 @@ from archivebox.misc.logging import stderr
 
 
 def get_real_name(key: str) -> str:
-    """get the current canonical name for a given deprecated config key"""
+    """get the up-to-date canonical name for a given old alias or current key"""
     from django.conf import settings
     
     for section in settings.CONFIGS.values():

+ 17 - 2
archivebox/config/constants.py

@@ -1,3 +1,15 @@
+"""
+Constants are for things that never change at runtime.
+(but they can change from run-to-run or machine-to-machine)
+
+DATA_DIR will never change at runtime, but you can run
+archivebox from inside a different DATA_DIR on the same machine.
+
+This is loaded very early in the archivebox startup flow, so nothing in this file,
+or imported by this file, should import anything from archivebox.config.common,
+django, other INSTALLED_APPS, or anything else that is not in the standard library.
+"""
+
 __package__ = 'archivebox.config'
 
 import re
@@ -197,10 +209,12 @@ class ConstantsDict(Mapping):
 
     @classmethod
     def __getitem__(cls, key: str):
+        # item access mirrors attribute access: obj[key] returns the same value as obj.key
         return getattr(cls, key)
     
     @classmethod
     def __benedict__(cls):
+        # when casting to benedict, only include uppercase keys that don't start with an underscore
         return benedict({key: value for key, value in cls.__dict__.items() if key.isupper() and not key.startswith('_')})
     
     @classmethod
@@ -214,5 +228,6 @@ class ConstantsDict(Mapping):
 CONSTANTS = ConstantsDict()
 CONSTANTS_CONFIG = CONSTANTS.__benedict__()
 
-# add all key: values to globals() for easier importing
-globals().update(CONSTANTS)
+# DISABLED: adding all key: values to globals() would allow easier importing, e.g.:
+#   from archivebox.config.constants import IS_ROOT, PERSONAS_DIR, ...
+# globals().update(CONSTANTS)

+ 1 - 1
archivebox/main.py

@@ -22,7 +22,7 @@ from archivebox.misc.logging import stderr, hint
 from archivebox.config import CONSTANTS, VERSION, DATA_DIR, ARCHIVE_DIR
 from archivebox.config.common import SHELL_CONFIG, SEARCH_BACKEND_CONFIG, STORAGE_CONFIG, SERVER_CONFIG, ARCHIVING_CONFIG
 from archivebox.config.permissions import SudoPermission, IN_DOCKER
-from archivebox.config.configfile import (
+from archivebox.config.collection import (
     write_config_file,
     load_all_config,
     get_real_name,

+ 2 - 0
archivebox/misc/util.py

@@ -126,6 +126,7 @@ def is_static_file(url: str):
 def enforce_types(func):
     """
     Enforce function arg and kwarg types at runtime using its python3 type hints
+    Simpler version of pydantic @validate_call decorator
     """
     # TODO: check return type as well
 
@@ -283,6 +284,7 @@ def get_headers(url: str, timeout: int=None) -> str:
 def ansi_to_html(text: str) -> str:
     """
     Based on: https://stackoverflow.com/questions/19212665/python-converting-ansi-color-codes-to-html
+    Simple way to render colored CLI stdout/stderr in HTML properly, Textual/rich is probably better though.
     """
 
     TEMPLATE = '<span style="color: rgb{}"><br>'