Browse Source

new vastly simplified plugin spec without pydantic

Nick Sweeting 1 year ago
parent
commit
01ba6d49d3
100 changed files with 2012 additions and 1969 deletions
  1. 4 3
      archivebox/abx/__init__.py
  2. 8 22
      archivebox/abx/archivebox/__init__.py
  3. 0 38
      archivebox/abx/archivebox/base_admindataview.py
  4. 2 6
      archivebox/abx/archivebox/base_binary.py
  5. 2 27
      archivebox/abx/archivebox/base_configset.py
  6. 2 4
      archivebox/abx/archivebox/base_extractor.py
  7. 0 80
      archivebox/abx/archivebox/base_hook.py
  8. 0 175
      archivebox/abx/archivebox/base_plugin.py
  9. 0 106
      archivebox/abx/archivebox/base_queue.py
  10. 1 5
      archivebox/abx/archivebox/base_replayer.py
  11. 7 15
      archivebox/abx/archivebox/base_searchbackend.py
  12. 4 2
      archivebox/abx/archivebox/hookspec.py
  13. 111 73
      archivebox/abx/archivebox/use.py
  14. 30 1
      archivebox/config/__init__.py
  15. 0 57
      archivebox/config/apps.py
  16. 3 7
      archivebox/config/legacy.py
  17. 53 30
      archivebox/config/views.py
  18. 8 9
      archivebox/core/settings.py
  19. 1 1
      archivebox/core/views.py
  20. 3 2
      archivebox/extractors/archive_org.py
  21. 3 3
      archivebox/extractors/dom.py
  22. 3 2
      archivebox/extractors/favicon.py
  23. 3 1
      archivebox/extractors/git.py
  24. 2 1
      archivebox/extractors/headers.py
  25. 3 6
      archivebox/extractors/media.py
  26. 2 1
      archivebox/extractors/mercury.py
  27. 4 4
      archivebox/extractors/pdf.py
  28. 5 4
      archivebox/extractors/readability.py
  29. 4 3
      archivebox/extractors/screenshot.py
  30. 6 5
      archivebox/extractors/singlefile.py
  31. 3 1
      archivebox/extractors/title.py
  32. 2 2
      archivebox/extractors/wget.py
  33. 1 1
      archivebox/index/html.py
  34. 1 1
      archivebox/index/schema.py
  35. 5 5
      archivebox/machine/models.py
  36. 3 3
      archivebox/main.py
  37. 61 0
      archivebox/plugins_auth/ldap/__init__.py
  38. 6 42
      archivebox/plugins_auth/ldap/binaries.py
  39. 1 1
      archivebox/plugins_auth/ldap/config.py
  40. 39 0
      archivebox/plugins_extractor/archivedotorg/__init__.py
  41. 0 28
      archivebox/plugins_extractor/archivedotorg/apps.py
  42. 11 0
      archivebox/plugins_extractor/archivedotorg/config.py
  43. 46 0
      archivebox/plugins_extractor/chrome/__init__.py
  44. 145 0
      archivebox/plugins_extractor/chrome/binaries.py
  45. 24 118
      archivebox/plugins_extractor/chrome/config.py
  46. 38 0
      archivebox/plugins_extractor/curl/__init__.py
  47. 0 79
      archivebox/plugins_extractor/curl/apps.py
  48. 18 0
      archivebox/plugins_extractor/curl/binaries.py
  49. 33 0
      archivebox/plugins_extractor/curl/config.py
  50. 39 0
      archivebox/plugins_extractor/favicon/__init__.py
  51. 0 30
      archivebox/plugins_extractor/favicon/apps.py
  52. 13 0
      archivebox/plugins_extractor/favicon/config.py
  53. 46 0
      archivebox/plugins_extractor/git/__init__.py
  54. 0 66
      archivebox/plugins_extractor/git/apps.py
  55. 18 0
      archivebox/plugins_extractor/git/binaries.py
  56. 28 0
      archivebox/plugins_extractor/git/config.py
  57. 17 0
      archivebox/plugins_extractor/git/extractors.py
  58. 46 0
      archivebox/plugins_extractor/mercury/__init__.py
  59. 0 80
      archivebox/plugins_extractor/mercury/apps.py
  60. 32 0
      archivebox/plugins_extractor/mercury/binaries.py
  61. 31 0
      archivebox/plugins_extractor/mercury/config.py
  62. 19 0
      archivebox/plugins_extractor/mercury/extractors.py
  63. 46 0
      archivebox/plugins_extractor/readability/__init__.py
  64. 0 86
      archivebox/plugins_extractor/readability/apps.py
  65. 27 0
      archivebox/plugins_extractor/readability/binaries.py
  66. 19 0
      archivebox/plugins_extractor/readability/config.py
  67. 20 0
      archivebox/plugins_extractor/readability/extractors.py
  68. 51 0
      archivebox/plugins_extractor/singlefile/__init__.py
  69. 0 110
      archivebox/plugins_extractor/singlefile/apps.py
  70. 48 0
      archivebox/plugins_extractor/singlefile/binaries.py
  71. 25 0
      archivebox/plugins_extractor/singlefile/config.py
  72. 19 0
      archivebox/plugins_extractor/singlefile/extractors.py
  73. 0 26
      archivebox/plugins_extractor/singlefile/migrations/0001_initial.py
  74. 0 0
      archivebox/plugins_extractor/singlefile/migrations/__init__.py
  75. 0 40
      archivebox/plugins_extractor/singlefile/tasks.py
  76. 47 0
      archivebox/plugins_extractor/wget/__init__.py
  77. 0 127
      archivebox/plugins_extractor/wget/apps.py
  78. 18 0
      archivebox/plugins_extractor/wget/binaries.py
  79. 72 0
      archivebox/plugins_extractor/wget/config.py
  80. 37 0
      archivebox/plugins_extractor/wget/extractors.py
  81. 37 0
      archivebox/plugins_extractor/ytdlp/__init__.py
  82. 0 98
      archivebox/plugins_extractor/ytdlp/apps.py
  83. 42 0
      archivebox/plugins_extractor/ytdlp/binaries.py
  84. 35 0
      archivebox/plugins_extractor/ytdlp/config.py
  85. 47 0
      archivebox/plugins_pkg/npm/__init__.py
  86. 0 114
      archivebox/plugins_pkg/npm/apps.py
  87. 48 0
      archivebox/plugins_pkg/npm/binaries.py
  88. 40 0
      archivebox/plugins_pkg/npm/binproviders.py
  89. 20 0
      archivebox/plugins_pkg/npm/config.py
  90. 51 0
      archivebox/plugins_pkg/pip/__init__.py
  91. 7 109
      archivebox/plugins_pkg/pip/binaries.py
  92. 80 0
      archivebox/plugins_pkg/pip/binproviders.py
  93. 16 0
      archivebox/plugins_pkg/pip/config.py
  94. 44 0
      archivebox/plugins_pkg/playwright/__init__.py
  95. 23 0
      archivebox/plugins_pkg/playwright/binaries.py
  96. 10 57
      archivebox/plugins_pkg/playwright/binproviders.py
  97. 10 0
      archivebox/plugins_pkg/playwright/config.py
  98. 46 0
      archivebox/plugins_pkg/puppeteer/__init__.py
  99. 23 0
      archivebox/plugins_pkg/puppeteer/binaries.py
  100. 4 52
      archivebox/plugins_pkg/puppeteer/binproviders.py

+ 4 - 3
archivebox/abx/__init__.py

@@ -5,8 +5,8 @@ from pathlib import Path
 from typing import Dict
 from typing import Dict
 
 
 from . import hookspec as base_spec
 from . import hookspec as base_spec
-from .hookspec import hookimpl, hookspec           # noqa
-from .manager import pm, PluginManager             # noqa
+from abx.hookspec import hookimpl, hookspec           # noqa
+from abx.manager import pm, PluginManager             # noqa
 
 
 
 
 pm.add_hookspecs(base_spec)
 pm.add_hookspecs(base_spec)
@@ -32,7 +32,8 @@ def register_hookspecs(hookspecs):
 def find_plugins_in_dir(plugins_dir: Path, prefix: str) -> Dict[str, Path]:
 def find_plugins_in_dir(plugins_dir: Path, prefix: str) -> Dict[str, Path]:
     return {
     return {
         f"{prefix}.{plugin_entrypoint.parent.name}": plugin_entrypoint.parent
         f"{prefix}.{plugin_entrypoint.parent.name}": plugin_entrypoint.parent
-        for plugin_entrypoint in sorted(plugins_dir.glob("*/apps.py"), key=get_plugin_order)
+        for plugin_entrypoint in sorted(plugins_dir.glob("*/__init__.py"), key=get_plugin_order)
+        if plugin_entrypoint.parent.name != 'abx'
     }   # "plugins_pkg.pip": "/app/archivebox/plugins_pkg/pip"
     }   # "plugins_pkg.pip": "/app/archivebox/plugins_pkg/pip"
 
 
 
 

+ 8 - 22
archivebox/abx/archivebox/__init__.py

@@ -10,35 +10,21 @@ from pathlib import Path
 def load_archivebox_plugins(pm, plugins_dict: Dict[str, Path]):
 def load_archivebox_plugins(pm, plugins_dict: Dict[str, Path]):
     """Load archivebox plugins, very similar to abx.load_plugins but it looks for a pydantic PLUGIN model + hooks in apps.py"""
     """Load archivebox plugins, very similar to abx.load_plugins but it looks for a pydantic PLUGIN model + hooks in apps.py"""
     LOADED_PLUGINS = {}
     LOADED_PLUGINS = {}
-    for plugin_module, plugin_dir in plugins_dict.items():
+    for plugin_module, plugin_dir in reversed(plugins_dict.items()):
         # print(f'Loading plugin: {plugin_module} from {plugin_dir}')
         # print(f'Loading plugin: {plugin_module} from {plugin_dir}')
         
         
-        archivebox_plugins_found = []
-        
         # 1. register the plugin module directly in case it contains any look hookimpls (e.g. in __init__.py)
         # 1. register the plugin module directly in case it contains any look hookimpls (e.g. in __init__.py)
-        plugin_module_loaded = importlib.import_module(plugin_module)
-        pm.register(plugin_module_loaded)
-        if hasattr(plugin_module_loaded, 'PLUGIN'):
-            archivebox_plugins_found.append(plugin_module_loaded.PLUGIN)
+        try:
+            plugin_module_loaded = importlib.import_module(plugin_module)
+            pm.register(plugin_module_loaded)
+        except Exception as e:
+            print(f'Error registering plugin: {plugin_module} - {e}')
+            
         
         
         # 2. then try to import plugin_module.apps as well
         # 2. then try to import plugin_module.apps as well
         if os.access(plugin_dir / 'apps.py', os.R_OK):
         if os.access(plugin_dir / 'apps.py', os.R_OK):
             plugin_apps = importlib.import_module(plugin_module + '.apps')
             plugin_apps = importlib.import_module(plugin_module + '.apps')
             pm.register(plugin_apps)                                           # register the whole .apps  in case it contains loose hookimpls (not in a class)
             pm.register(plugin_apps)                                           # register the whole .apps  in case it contains loose hookimpls (not in a class)
-            if hasattr(plugin_apps, 'PLUGIN'):
-                archivebox_plugins_found.append(plugin_apps.PLUGIN)
-        
-        # 3. then try to look for plugin_module.PLUGIN and register it + all its hooks
-        for ab_plugin in archivebox_plugins_found:
-            pm.register(ab_plugin)
-            for hook in ab_plugin.hooks:
-                try:
-                    # if hook is a pydantic class, fix its __signature__ to make it usable as a Pluggy plugin
-                    hook.__signature__ = hook.__class__.__signature__              # fix to make pydantic model usable as Pluggy plugin
-                except Exception:
-                    pass
-                pm.register(hook)
-            LOADED_PLUGINS[plugin_module] = ab_plugin
             
             
-        print(f'    √ Loaded plugin: {plugin_module} {len(archivebox_plugins_found) * "🧩"}')
+        # print(f'    √ Loaded plugin: {plugin_module} {len(archivebox_plugins_found) * "🧩"}')
     return LOADED_PLUGINS
     return LOADED_PLUGINS

+ 0 - 38
archivebox/abx/archivebox/base_admindataview.py

@@ -1,38 +0,0 @@
-__package__ = 'abx.archivebox'
-
-from typing import Dict
-
-import abx
-
-from .base_hook import BaseHook, HookType
-
-
-class BaseAdminDataView(BaseHook):
-    hook_type: HookType = "ADMINDATAVIEW"
-    
-    name: str = 'example_admin_data_view_list'
-    verbose_name: str = 'Data View'
-    route: str = '/__OVERRIDE_THIS__/'
-    view: str = 'plugins_example.example.views.example_view_list'
-    
-    items: Dict[str, str] = {
-        'route': '<str:key>/',
-        "name": 'example_admin_data_view_item',
-        'view': 'plugins_example.example.views.example_view_item',
-    }
-    
-    @abx.hookimpl
-    def get_ADMINDATAVIEWS(self):
-        return [self]
-    
-    @abx.hookimpl
-    def get_ADMIN_DATA_VIEWS_URLS(self):
-        """routes to be added to django.conf.settings.ADMIN_DATA_VIEWS['urls']"""
-        route = {
-            "route": self.route,
-            "view": self.view,
-            "name": self.verbose_name,
-            "items": self.items,
-        }
-        return [route]
-

+ 2 - 6
archivebox/abx/archivebox/base_binary.py

@@ -18,12 +18,9 @@ from archivebox.config import CONSTANTS
 from archivebox.config.permissions import ARCHIVEBOX_USER
 from archivebox.config.permissions import ARCHIVEBOX_USER
 
 
 import abx
 import abx
-from .base_hook import BaseHook, HookType
 
 
 
 
-class BaseBinProvider(BaseHook, BinProvider):
-    hook_type: HookType = "BINPROVIDER"
-
+class BaseBinProvider(BinProvider):
     
     
     # TODO: add install/load/load_or_install methods as abx.hookimpl methods
     # TODO: add install/load/load_or_install methods as abx.hookimpl methods
     
     
@@ -36,8 +33,7 @@ class BaseBinProvider(BaseHook, BinProvider):
     def get_BINPROVIDERS(self):
     def get_BINPROVIDERS(self):
         return [self]
         return [self]
 
 
-class BaseBinary(BaseHook, Binary):
-    hook_type: HookType = "BINARY"
+class BaseBinary(Binary):
 
 
     @staticmethod
     @staticmethod
     def symlink_to_lib(binary, bin_dir=None) -> None:
     def symlink_to_lib(binary, bin_dir=None) -> None:

+ 2 - 27
archivebox/abx/archivebox/base_configset.py

@@ -11,9 +11,7 @@ from pydantic_settings.sources import TomlConfigSettingsSource
 
 
 from pydantic_pkgr import func_takes_args_or_kwargs
 from pydantic_pkgr import func_takes_args_or_kwargs
 
 
-import abx
 
 
-from .base_hook import BaseHook, HookType
 from . import toml_util
 from . import toml_util
 
 
 
 
@@ -201,29 +199,6 @@ class ArchiveBoxBaseConfig(BaseSettings):
         })
         })
 
 
 
 
-class BaseConfigSet(ArchiveBoxBaseConfig, BaseHook):      # type: ignore[type-arg]
-    hook_type: ClassVar[HookType] = 'CONFIG'
+class BaseConfigSet(ArchiveBoxBaseConfig):      # type: ignore[type-arg]
 
 
-    # @abx.hookimpl
-    # def ready(self, settings):
-    #    # reload config from environment, in case it's been changed by any other plugins
-    #    self.__init__()
-
-
-    @abx.hookimpl
-    def get_CONFIGS(self):
-        try:
-            return {self.id: self}
-        except Exception as e:
-            # raise Exception(f'Error computing CONFIGS for {type(self)}: {e.__class__.__name__}: {e}')
-            print(f'Error computing CONFIGS for {type(self)}: {e.__class__.__name__}: {e}')
-        return {}
-
-    @abx.hookimpl
-    def get_FLAT_CONFIG(self):
-        try:
-            return self.model_dump()
-        except Exception as e:
-            # raise Exception(f'Error computing FLAT_CONFIG for {type(self)}: {e.__class__.__name__}: {e}')
-            print(f'Error computing FLAT_CONFIG for {type(self)}: {e.__class__.__name__}: {e}')
-        return {}
+    pass

+ 2 - 4
archivebox/abx/archivebox/base_extractor.py

@@ -14,7 +14,6 @@ from django.utils import timezone
 
 
 import abx
 import abx
 
 
-from .base_hook import BaseHook, HookType
 from .base_binary import BaseBinary
 from .base_binary import BaseBinary
 
 
 
 
@@ -28,8 +27,7 @@ HandlerFuncStr = Annotated[str, AfterValidator(lambda s: s.startswith('self.'))]
 CmdArgsList = Annotated[List[str] | Tuple[str, ...], AfterValidator(no_empty_args)]
 CmdArgsList = Annotated[List[str] | Tuple[str, ...], AfterValidator(no_empty_args)]
 
 
 
 
-class BaseExtractor(BaseHook):
-    hook_type: HookType = 'EXTRACTOR'
+class BaseExtractor:
     
     
     name: ExtractorName
     name: ExtractorName
     binary: BinName
     binary: BinName
@@ -51,7 +49,7 @@ class BaseExtractor(BaseHook):
 
 
 
 
     def get_output_path(self, snapshot) -> Path:
     def get_output_path(self, snapshot) -> Path:
-        return Path(self.id.lower())
+        return Path(self.__class__.__name__.lower())
 
 
     def should_extract(self, uri: str, config: dict | None=None) -> bool:
     def should_extract(self, uri: str, config: dict | None=None) -> bool:
         try:
         try:

+ 0 - 80
archivebox/abx/archivebox/base_hook.py

@@ -1,80 +0,0 @@
-__package__ = 'abx.archivebox'
-
-import inspect
-from huey.api import TaskWrapper
-
-from pathlib import Path
-from typing import Tuple, Literal, ClassVar, get_args
-from pydantic import BaseModel, ConfigDict
-from django.utils.functional import cached_property
-
-import abx
-
-HookType = Literal['CONFIG', 'BINPROVIDER', 'BINARY', 'EXTRACTOR', 'REPLAYER', 'CHECK', 'ADMINDATAVIEW', 'QUEUE', 'SEARCHBACKEND']
-hook_type_names: Tuple[HookType] = get_args(HookType)
-
-class BaseHook(BaseModel):
-    model_config = ConfigDict(
-        extra="allow",
-        arbitrary_types_allowed=True,
-        from_attributes=True,
-        populate_by_name=True,
-        validate_defaults=True,
-        validate_assignment=False,
-        revalidate_instances="subclass-instances",
-        ignored_types=(TaskWrapper, cached_property),
-    )
-    
-    hook_type: ClassVar[HookType]     # e.g. = 'CONFIG'
-    
-    # verbose_name: str = Field()
-    
-    _is_registered: bool = False
-    _is_ready: bool = False
-
-
-    @property
-    def id(self) -> str:
-        return self.__class__.__name__
-
-    @property
-    def hook_module(self) -> str:
-        """e.g. plugins_extractor.singlefile.apps.SinglefileConfigSet"""
-        return f'{self.__module__}.{self.__class__.__name__}'
-
-    @property
-    def hook_file(self) -> Path:
-        """e.g. plugins_extractor.singlefile.apps.SinglefileConfigSet"""
-        return Path(inspect.getfile(self.__class__))
-
-    @property
-    def plugin_module(self) -> str:
-        """e.g. plugins_extractor.singlefile"""
-        return f"{self.__module__}.{self.__class__.__name__}".split("archivebox.", 1)[-1].rsplit(".apps.", 1)[0]
-
-    @property
-    def plugin_dir(self) -> Path:
-        return Path(inspect.getfile(self.__class__)).parent.resolve()
-    
-    @property
-    def admin_url(self) -> str:
-        # e.g. /admin/environment/config/LdapConfig/
-        return f"/admin/environment/{self.hook_type.lower()}/{self.id}/"
-
-
-    @abx.hookimpl
-    def register(self, settings):
-        """Called when django.apps.AppConfig.ready() is called"""
-        
-        # print("REGISTERED HOOK:", self.hook_module)
-        self._is_registered = True
-        
-
-    @abx.hookimpl
-    def ready(self):
-        """Called when django.apps.AppConfig.ready() is called"""
-        
-        assert self._is_registered, f"Tried to run {self.hook_module}.ready() but it was never registered!"
-       
-        # print("READY HOOK:", self.hook_module)
-        self._is_ready = True

+ 0 - 175
archivebox/abx/archivebox/base_plugin.py

@@ -1,175 +0,0 @@
-__package__ = 'abx.archivebox'
-
-import abx
-import inspect
-from pathlib import Path
-
-from django.apps import AppConfig
-
-from typing import List, Type, Dict
-from typing_extensions import Self
-from types import ModuleType
-
-from pydantic import (
-    BaseModel,
-    ConfigDict,
-    Field,
-    model_validator,
-    InstanceOf,
-    computed_field,
-)
-from benedict import benedict
-
-from .base_hook import BaseHook, HookType
-
-def convert_flat_module_to_hook_class(hook_module: ModuleType) -> Type[BaseHook]:
-    plugin_name = hook_module.__module__.split('.')[-1]  # e.g. core
-    hook_id = hook_module.__name__                       # e.g. admin
-    
-    class_name = f"{plugin_name.title()}{hook_id.title()}"   # e.g. CoreAdmin
-    
-    return type(class_name, (BaseHook,),
-                {key: staticmethod(value) if callable(value) else value
-                 for key, value in ((name, getattr(hook_module, name))
-                                    for name in dir(hook_module))})
-
-
-class BasePlugin(BaseModel):
-    model_config = ConfigDict(
-        extra='forbid',
-        arbitrary_types_allowed=True,
-        populate_by_name=True,
-        from_attributes=True,
-        validate_defaults=False,
-        validate_assignment=False,
-        revalidate_instances="always",
-        # frozen=True,
-    )
-
-    # Required by AppConfig:
-    app_label: str = Field()                      # e.g. 'singlefile'                  (one-word machine-readable representation, to use as url-safe id/db-table prefix_/attr name)
-    verbose_name: str = Field()                   # e.g. 'SingleFile'                  (human-readable *short* label, for use in column names, form labels, etc.)
-    docs_url: str = Field(default=None)           # e.g. 'https://github.com/...'
-    
-    # All the hooks the plugin will install:
-    hooks: List[InstanceOf[BaseHook] | InstanceOf[ModuleType]] = Field(default=[])
-    
-    _is_registered: bool = False
-    _is_ready: bool = False
-    
-    @computed_field
-    @property
-    def id(self) -> str:
-        return self.__class__.__name__
-    
-    @property
-    def name(self) -> str:
-        return self.app_label
-    
-    # @computed_field
-    @property
-    def plugin_module(self) -> str:  # DottedImportPath
-        """ "
-        Dotted import path of the plugin's module (after its loaded via settings.INSTALLED_APPS).
-        e.g. 'archivebox.plugins_pkg.npm.apps.NpmPlugin' -> 'plugins_pkg.npm'
-        """
-        return f"{self.__module__}.{self.__class__.__name__}".split("archivebox.", 1)[-1].rsplit('.apps.', 1)[0]
-
-
-    @property
-    def plugin_module_full(self) -> str:  # DottedImportPath
-        """e.g. 'archivebox.plugins_pkg.npm.apps.NpmPlugin'"""
-        return f"{self.__module__}.{self.__class__.__name__}"
-    
-    # @computed_field
-    @property
-    def plugin_dir(self) -> Path:
-        return Path(inspect.getfile(self.__class__)).parent.resolve()
-    
-    @model_validator(mode='after')
-    def validate(self) -> Self:
-        """Validate the plugin's build-time configuration here before it's registered in Django at runtime."""
-        
-        # VERY IMPORTANT:
-        # preserve references to original default objects,
-        # pydantic deepcopies them by default which breaks mutability
-        # see https://github.com/pydantic/pydantic/issues/7608
-        # if we dont do this, then plugins_extractor.SINGLEFILE_CONFIG != settings.CONFIGS.SingleFileConfig for example
-        # and calling .__init__() on one of them will not update the other
-        self.hooks = []
-        for hook in self.model_fields['hooks'].default:
-            if isinstance(hook, BaseHook):
-                self.hooks.append(hook)
-            elif isinstance(hook, ModuleType):
-                # if hook is a module, turn it into a Hook class instance
-                # hook_instance = convert_flat_module_to_hook_class(hook)()
-                # self.hooks.extend(hook_instance)
-                print('SKIPPING INVALID HOOK:', hook)
-        
-        assert self.app_label and self.app_label and self.verbose_name, f'{self.__class__.__name__} is missing .name or .app_label or .verbose_name'
-        
-        # assert json.dumps(self.model_json_schema(), indent=4), f"Plugin {self.plugin_module} has invalid JSON schema."
-        
-        return self
-    
-    @property
-    def AppConfig(plugin_self) -> Type[AppConfig]:
-        """Generate a Django AppConfig class for this plugin."""
-
-
-        class PluginAppConfig(AppConfig):
-            """Django AppConfig for plugin, allows it to be loaded as a Django app listed in settings.INSTALLED_APPS."""
-            name = plugin_self.plugin_module
-            app_label = plugin_self.app_label
-            verbose_name = plugin_self.verbose_name
-
-            default_auto_field = 'django.db.models.AutoField'
-
-            # handled by abx.hookimpl  ready()
-            # def ready(self):
-            #     from django.conf import settings
-            #     plugin_self.ready(settings)
-
-        return PluginAppConfig
-
-    @property
-    def HOOKS_BY_ID(self) -> Dict[str, InstanceOf[BaseHook]]:
-        return benedict({hook.id: hook for hook in self.hooks})
-
-    @property
-    def HOOKS_BY_TYPE(self) -> Dict[HookType, Dict[str, InstanceOf[BaseHook]]]:
-        hooks = benedict({})
-        for hook in self.hooks:
-            hooks[hook.hook_type] = hooks.get(hook.hook_type) or benedict({})
-            hooks[hook.hook_type][hook.id] = hook
-        return hooks
-
-
-
-    @abx.hookimpl
-    def register(self, settings):
-        from archivebox.config.legacy import bump_startup_progress_bar
-
-        self._is_registered = True
-        bump_startup_progress_bar()
-
-        # print('◣----------------- REGISTERED PLUGIN:', self.plugin_module, '-----------------◢')
-        # print()
-
-    @abx.hookimpl
-    def ready(self, settings=None):
-        """Runs any runtime code needed when AppConfig.ready() is called (after all models are imported)."""
-
-        from archivebox.config.legacy import bump_startup_progress_bar
-
-        assert self._is_registered, f"Tried to run {self.plugin_module}.ready() but it was never registered!"
-        self._is_ready = True
-
-        # settings.PLUGINS[self.id]._is_ready = True
-        bump_startup_progress_bar()
-
-
-    @abx.hookimpl
-    def get_INSTALLED_APPS(self):
-        return [self.plugin_module]
-

+ 0 - 106
archivebox/abx/archivebox/base_queue.py

@@ -1,106 +0,0 @@
-__package__ = 'abx.archivebox'
-
-import importlib
-
-from typing import Dict, List, TYPE_CHECKING
-from pydantic import Field, InstanceOf
-from benedict import benedict
-
-if TYPE_CHECKING:
-    from huey.api import TaskWrapper
-
-import abx
-
-from .base_hook import BaseHook, HookType
-from .base_binary import BaseBinary
-
-
-
-class BaseQueue(BaseHook):
-    hook_type: HookType = 'QUEUE'
-
-    name: str = Field()       # e.g. 'singlefile'
-
-    binaries: List[InstanceOf[BaseBinary]] = Field()
-
-    @property
-    def tasks(self) -> Dict[str, 'TaskWrapper']:
-        """Return an dict of all the background worker tasks defined in the plugin's tasks.py file."""
-        tasks = importlib.import_module(f"{self.plugin_module}.tasks")
-
-        all_tasks = {}
-
-        for task_name, task in tasks.__dict__.items():
-            # if attr is a Huey task and its queue_name matches our hook's queue name
-            if hasattr(task, "task_class") and task.huey.name == self.name:
-                all_tasks[task_name] = task
-
-        return benedict(all_tasks)
-
-    def get_django_huey_config(self, QUEUE_DATABASE_NAME) -> dict:
-        """Get the config dict to insert into django.conf.settings.DJANGO_HUEY['queues']."""
-        return {
-            "huey_class": "huey.SqliteHuey",
-            "filename": QUEUE_DATABASE_NAME,
-            "name": self.name,
-            "results": True,
-            "store_none": True,
-            "immediate": False,
-            "utc": True,
-            "consumer": {
-                "workers": 1,
-                "worker_type": "thread",
-                "initial_delay": 0.1,  # Smallest polling interval, same as -d.
-                "backoff": 1.15,  # Exponential backoff using this rate, -b.
-                "max_delay": 10.0,  # Max possible polling interval, -m.
-                "scheduler_interval": 1,  # Check schedule every second, -s.
-                "periodic": True,  # Enable crontab feature.
-                "check_worker_health": True,  # Enable worker health checks.
-                "health_check_interval": 1,  # Check worker health every second.
-            },
-        }
-        
-    def get_supervisord_config(self, settings) -> dict:
-        """Ge the config dict used to tell sueprvisord to start a huey consumer for this queue."""
-        return {
-            "name": f"worker_{self.name}",
-            "command": f"archivebox manage djangohuey --queue {self.name}",
-            "stdout_logfile": f"logs/worker_{self.name}.log",
-            "redirect_stderr": "true",
-            "autorestart": "true",
-            "autostart": "false",
-        }
-        
-    def start_supervisord_worker(self, settings, lazy=True):
-        from queues.supervisor_util import get_or_create_supervisord_process, start_worker
-        print()
-        try:
-            supervisor = get_or_create_supervisord_process(daemonize=False)
-        except Exception as e:
-            print(f"Error starting worker for queue {self.name}: {e}")
-            return None
-        print()
-        worker = start_worker(supervisor, self.get_supervisord_config(settings), lazy=lazy)
-
-        # Update settings.WORKERS to include this worker
-        settings.WORKERS = getattr(settings, "WORKERS", None) or benedict({})
-        settings.WORKERS[self.id] = self.start_supervisord_worker(settings, lazy=True)
-
-        return worker
-
-    @abx.hookimpl
-    def get_QUEUES(self):
-        return [self]
-
-    @abx.hookimpl
-    def get_DJANGO_HUEY_QUEUES(self, QUEUE_DATABASE_NAME):
-        """queue configs to be added to django.conf.settings.DJANGO_HUEY['queues']"""
-        return {
-            self.name: self.get_django_huey_config(QUEUE_DATABASE_NAME)
-        }
-        
-        
-    # @abx.hookimpl
-    # def ready(self, settings):
-    #     self.start_supervisord_worker(settings, lazy=True)
-    #     super().ready(settings)

+ 1 - 5
archivebox/abx/archivebox/base_replayer.py

@@ -2,14 +2,10 @@ __package__ = 'abx.archivebox'
 
 
 import abx
 import abx
 
 
-from .base_hook import BaseHook, HookType
 
 
-
-class BaseReplayer(BaseHook):
+class BaseReplayer:
     """Describes how to render an ArchiveResult in several contexts"""
     """Describes how to render an ArchiveResult in several contexts"""
     
     
-    hook_type: HookType = 'REPLAYER'
-    
     url_pattern: str = '*'
     url_pattern: str = '*'
 
 
     row_template: str = 'plugins/generic_replayer/templates/row.html'
     row_template: str = 'plugins/generic_replayer/templates/row.html'

+ 7 - 15
archivebox/abx/archivebox/base_searchbackend.py

@@ -1,33 +1,25 @@
 __package__ = 'abx.archivebox'
 __package__ = 'abx.archivebox'
 
 
 from typing import Iterable, List
 from typing import Iterable, List
-from pydantic import Field
+import abc
 
 
-import abx
-from .base_hook import BaseHook, HookType
 
 
 
 
-
-class BaseSearchBackend(BaseHook):
-    hook_type: HookType = 'SEARCHBACKEND'
-
-    name: str = Field()       # e.g. 'singlefile'
-
-
-    # TODO: move these to a hookimpl
+class BaseSearchBackend(abc.ABC):
+    name: str
 
 
     @staticmethod
     @staticmethod
+    @abc.abstractmethod
     def index(snapshot_id: str, texts: List[str]):
     def index(snapshot_id: str, texts: List[str]):
         return
         return
 
 
     @staticmethod
     @staticmethod
+    @abc.abstractmethod
     def flush(snapshot_ids: Iterable[str]):
     def flush(snapshot_ids: Iterable[str]):
         return
         return
 
 
     @staticmethod
     @staticmethod
+    @abc.abstractmethod
     def search(text: str) -> List[str]:
     def search(text: str) -> List[str]:
         raise NotImplementedError("search method must be implemented by subclass")
         raise NotImplementedError("search method must be implemented by subclass")
-    
-    @abx.hookimpl
-    def get_SEARCHBACKENDS(self):
-        return [self]
+

+ 4 - 2
archivebox/abx/archivebox/hookspec.py

@@ -4,10 +4,12 @@ from typing import Dict, Any
 
 
 from .. import hookspec
 from .. import hookspec
 
 
+from .base_configset import BaseConfigSet
 
 
 @hookspec
 @hookspec
-def get_CONFIGS():
-    return {}
+def get_CONFIG() -> BaseConfigSet:
+    ...
+
 
 
 @hookspec
 @hookspec
 def get_EXTRACTORS():
 def get_EXTRACTORS():

+ 111 - 73
archivebox/abx/archivebox/use.py

@@ -1,130 +1,168 @@
 __package__ = 'abx.archivebox'
 __package__ = 'abx.archivebox'
 
 
+import importlib
 from typing import Dict, Any, TYPE_CHECKING
 from typing import Dict, Any, TYPE_CHECKING
 
 
-from django.utils import timezone
 from benedict import benedict
 from benedict import benedict
 
 
 from .. import pm
 from .. import pm
 
 
 if TYPE_CHECKING:
 if TYPE_CHECKING:
-    from .base_hook import BaseHook
     from .base_configset import BaseConfigSet
     from .base_configset import BaseConfigSet
     from .base_binary import BaseBinary, BaseBinProvider
     from .base_binary import BaseBinary, BaseBinProvider
     from .base_extractor import BaseExtractor
     from .base_extractor import BaseExtractor
-    from .base_replayer import BaseReplayer
-    from .base_queue import BaseQueue
-    from .base_admindataview import BaseAdminDataView
     from .base_searchbackend import BaseSearchBackend
     from .base_searchbackend import BaseSearchBackend
+    # from .base_replayer import BaseReplayer
+    # from .base_queue import BaseQueue
+    # from .base_admindataview import BaseAdminDataView
 
 
 # API exposed to ArchiveBox code
 # API exposed to ArchiveBox code
 
 
-def get_PLUGINS():
+def get_PLUGINS() -> Dict[str, Dict[str, Any]]:
     return benedict({
     return benedict({
-        plugin.PLUGIN.id: plugin.PLUGIN
-        for plugin in pm.get_plugins()
-    })
-
-def get_HOOKS(PLUGINS) -> Dict[str, 'BaseHook']:
-    return benedict({
-        hook.id: hook
-        for plugin in PLUGINS.values()
-            for hook in plugin.hooks
+        plugin_id: plugin
+        for plugin_dict in pm.hook.get_PLUGIN()
+            for plugin_id, plugin in plugin_dict.items()
     })
     })
+    
+def get_PLUGIN(plugin_id: str):
+    plugin_info = get_PLUGINS().get(plugin_id, {})
+    assert plugin_info and getattr(plugin_info, 'PACKAGE', None), f'Plugin {plugin_id} not found'
+    
+    module = importlib.import_module(plugin_info['PACKAGE'])
+    extra_info = {
+        'ID': plugin_id,
+        'id': plugin_id,
+        **plugin_info,
+        'SOURCE_PATH': module.__file__,
+        'MODULE': module,
+        'CONFIG': {},
+        'BINARIES': {},
+        'BINPROVIDERS': {},
+        'EXTRACTORS': {},
+        'SEARCHBACKENDS': {},
+    }
+    try:
+        extra_info['CONFIG'] = module.get_CONFIG()[plugin_id]
+    except AttributeError:
+        pass
+    try:
+        extra_info['BINARIES'] = module.get_BINARIES()
+    except AttributeError:
+        pass
+    try:
+        extra_info['BINPROVIDERS'] = module.get_BINPROVIDERS()
+    except AttributeError:
+        pass
+    try:
+        extra_info['EXTRACTORS'] = module.get_EXTRACTORS()
+    except AttributeError:
+        pass
+    try:
+        extra_info['SEARCHBACKENDS'] = module.get_SEARCHBACKENDS()
+    except AttributeError:
+        pass
+    return benedict(extra_info)
+
+# def get_HOOKS(PLUGINS) -> Dict[str, 'BaseHook']:
+#     return benedict({
+#         hook.id: hook
+#         for plugin in PLUGINS.values()
+#             for hook in plugin.hooks
+#     })
 
 
 def get_CONFIGS() -> Dict[str, 'BaseConfigSet']:
 def get_CONFIGS() -> Dict[str, 'BaseConfigSet']:
     return benedict({
     return benedict({
-        config_id: config
-        for plugin_configs in pm.hook.get_CONFIGS()
-            for config_id, config in plugin_configs.items()
+        config_id: configset
+        for plugin_configs in pm.hook.get_CONFIG()
+            for config_id, configset in plugin_configs.items()
     })
     })
     
     
 def get_FLAT_CONFIG() -> Dict[str, Any]:
 def get_FLAT_CONFIG() -> Dict[str, Any]:
     return benedict({
     return benedict({
         key: value
         key: value
-        for plugin_config_dict in pm.hook.get_FLAT_CONFIG()
-            for key, value in plugin_config_dict.items()
+        for configset in get_CONFIGS().values()
+            for key, value in configset.model_dump().items()
     })
     })
 
 
 def get_BINPROVIDERS() -> Dict[str, 'BaseBinProvider']:
 def get_BINPROVIDERS() -> Dict[str, 'BaseBinProvider']:
     # TODO: move these to plugins
     # TODO: move these to plugins
     from abx.archivebox.base_binary import apt, brew, env
     from abx.archivebox.base_binary import apt, brew, env
-    builtin_binproviders = [apt, brew, env]
+    builtin_binproviders = {
+        'apt': apt,
+        'brew': brew,
+        'env': env,
+    }
     
     
     return benedict({
     return benedict({
-        binprovider.id: binprovider
+        binprovider_id: binprovider
         for plugin_binproviders in [builtin_binproviders, *pm.hook.get_BINPROVIDERS()]
         for plugin_binproviders in [builtin_binproviders, *pm.hook.get_BINPROVIDERS()]
-            for binprovider in plugin_binproviders
+            for binprovider_id, binprovider in plugin_binproviders.items()
     })
     })
 
 
 def get_BINARIES() -> Dict[str, 'BaseBinary']:
 def get_BINARIES() -> Dict[str, 'BaseBinary']:
     return benedict({
     return benedict({
-        binary.id: binary
+        binary_id: binary
         for plugin_binaries in pm.hook.get_BINARIES()
         for plugin_binaries in pm.hook.get_BINARIES()
-            for binary in plugin_binaries
+            for binary_id, binary in plugin_binaries.items()
     })
     })
 
 
 def get_EXTRACTORS() -> Dict[str, 'BaseExtractor']:
 def get_EXTRACTORS() -> Dict[str, 'BaseExtractor']:
     return benedict({
     return benedict({
-        extractor.id: extractor
+        extractor_id: extractor
         for plugin_extractors in pm.hook.get_EXTRACTORS()
         for plugin_extractors in pm.hook.get_EXTRACTORS()
-            for extractor in plugin_extractors
-    })
-
-def get_REPLAYERS() -> Dict[str, 'BaseReplayer']:
-    return benedict({
-        replayer.id: replayer
-        for plugin_replayers in pm.hook.get_REPLAYERS()
-            for replayer in plugin_replayers
+            for extractor_id, extractor in plugin_extractors.items()
     })
     })
 
 
-def get_ADMINDATAVIEWS() -> Dict[str, 'BaseAdminDataView']:
-    return benedict({
-        admin_dataview.id: admin_dataview
-        for plugin_admin_dataviews in pm.hook.get_ADMINDATAVIEWS()
-            for admin_dataview in plugin_admin_dataviews
-    })
-
-def get_QUEUES() -> Dict[str, 'BaseQueue']:
-    return benedict({
-        queue.id: queue
-        for plugin_queues in pm.hook.get_QUEUES()
-            for queue in plugin_queues
-    })
+# def get_REPLAYERS() -> Dict[str, 'BaseReplayer']:
+#     return benedict({
+#         replayer.id: replayer
+#         for plugin_replayers in pm.hook.get_REPLAYERS()
+#             for replayer in plugin_replayers
+#     })
+
+# def get_ADMINDATAVIEWS() -> Dict[str, 'BaseAdminDataView']:
+#     return benedict({
+#         admin_dataview.id: admin_dataview
+#         for plugin_admin_dataviews in pm.hook.get_ADMINDATAVIEWS()
+#             for admin_dataview in plugin_admin_dataviews
+#     })
+
+# def get_QUEUES() -> Dict[str, 'BaseQueue']:
+#     return benedict({
+#         queue.id: queue
+#         for plugin_queues in pm.hook.get_QUEUES()
+#             for queue in plugin_queues
+#     })
 
 
 def get_SEARCHBACKENDS() -> Dict[str, 'BaseSearchBackend']:
 def get_SEARCHBACKENDS() -> Dict[str, 'BaseSearchBackend']:
     return benedict({
     return benedict({
-        searchbackend.id: searchbackend
+        searchbackend_id: searchbackend
         for plugin_searchbackends in pm.hook.get_SEARCHBACKENDS()
         for plugin_searchbackends in pm.hook.get_SEARCHBACKENDS()
-            for searchbackend in plugin_searchbackends
+            for searchbackend_id, searchbackend in plugin_searchbackends.items()
     })
     })
 
 
 
 
 ###########################
 ###########################
 
 
 
 
-def register_all_hooks(settings):
-    pm.hook.register(settings=settings)
-
-
-
-def extract(url_or_snapshot_id):
-    from core.models import Snapshot
+# def extract(url_or_snapshot_id):
+#     from core.models import Snapshot
     
     
-    url, snapshot_abid, snapshot_id = None, None, None
-    snapshot = None
-    if '://' in url_or_snapshot_id:
-        url = url_or_snapshot_id
-        try:
-            snapshot = Snapshot.objects.get(url=url)
-        except Snapshot.DoesNotExist:
-            snapshot = Snapshot(url=url_or_snapshot_id, timestamp=str(timezone.now().timestamp()), bookmarked_at=timezone.now())
-            snapshot.save()
-    elif '-' in url_or_snapshot_id:
-        snapshot_id = url_or_snapshot_id
-        snapshot = Snapshot.objects.get(id=snapshot_id)
-    else:
-        snapshot_abid = url_or_snapshot_id
-        snapshot = Snapshot.objects.get(abid=snapshot_abid)
-
-    return pm.hook.extract(snapshot_id=snapshot.id)
+#     url, snapshot_abid, snapshot_id = None, None, None
+#     snapshot = None
+#     if '://' in url_or_snapshot_id:
+#         url = url_or_snapshot_id
+#         try:
+#             snapshot = Snapshot.objects.get(url=url)
+#         except Snapshot.DoesNotExist:
+#             snapshot = Snapshot(url=url_or_snapshot_id, timestamp=str(timezone.now().timestamp()), bookmarked_at=timezone.now())
+#             snapshot.save()
+#     elif '-' in url_or_snapshot_id:
+#         snapshot_id = url_or_snapshot_id
+#         snapshot = Snapshot.objects.get(id=snapshot_id)
+#     else:
+#         snapshot_abid = url_or_snapshot_id
+#         snapshot = Snapshot.objects.get(abid=snapshot_abid)
+
+#     return pm.hook.extract(snapshot_id=snapshot.id)

+ 30 - 1
archivebox/config/__init__.py

@@ -5,5 +5,34 @@ from .paths import (
     DATA_DIR,                                       # noqa
     DATA_DIR,                                       # noqa
     ARCHIVE_DIR,                                    # noqa
     ARCHIVE_DIR,                                    # noqa
 )
 )
-from .constants import CONSTANTS, CONSTANTS_CONFIG  # noqa
+from .constants import CONSTANTS, CONSTANTS_CONFIG, PACKAGE_DIR, DATA_DIR, ARCHIVE_DIR      # noqa
 from .version import VERSION                        # noqa
 from .version import VERSION                        # noqa
+
+
+import abx
+
+
+# @abx.hookimpl
+# def get_INSTALLED_APPS():
+#     return ['config']
+
+
[email protected]
+def get_CONFIG():
+    from .common import (
+        SHELL_CONFIG,
+        STORAGE_CONFIG,
+        GENERAL_CONFIG,
+        SERVER_CONFIG,
+        ARCHIVING_CONFIG,
+        SEARCH_BACKEND_CONFIG,
+    )
+    return {
+        'SHELL': SHELL_CONFIG,
+        'STORAGE': STORAGE_CONFIG,
+        'GENERAL': GENERAL_CONFIG,
+        'SERVER': SERVER_CONFIG,
+        'ARCHIVING': ARCHIVING_CONFIG,
+        'SEARCHBACKEND': SEARCH_BACKEND_CONFIG,
+    }
+

+ 0 - 57
archivebox/config/apps.py

@@ -1,57 +0,0 @@
-__package__ = 'archivebox.config'
-
-from typing import List
-from pydantic import InstanceOf
-
-from abx.archivebox.base_plugin import BasePlugin
-from abx.archivebox.base_hook import BaseHook
-
-
-from .constants import CONSTANTS, CONSTANTS_CONFIG, PACKAGE_DIR, DATA_DIR, ARCHIVE_DIR      # noqa
-from .common import (
-    ShellConfig,                    # noqa: F401
-    StorageConfig,                  # noqa: F401
-    GeneralConfig,                  # noqa: F401
-    ServerConfig,                   # noqa: F401
-    ArchivingConfig,                # noqa: F401
-    SearchBackendConfig,            # noqa: F401
-    SHELL_CONFIG,
-    STORAGE_CONFIG,
-    GENERAL_CONFIG,
-    SERVER_CONFIG,
-    ARCHIVING_CONFIG,
-    SEARCH_BACKEND_CONFIG,
-)
-
-###################### Config ##########################
-
-
-class ConfigPlugin(BasePlugin):
-    app_label: str = 'CONFIG'
-    verbose_name: str = 'Configuration'
-
-    hooks: List[InstanceOf[BaseHook]] = [
-        SHELL_CONFIG,
-        GENERAL_CONFIG,
-        STORAGE_CONFIG,
-        SERVER_CONFIG,
-        ARCHIVING_CONFIG,
-        SEARCH_BACKEND_CONFIG,
-    ]
-
-
-PLUGIN = ConfigPlugin()
-DJANGO_APP = PLUGIN.AppConfig
-
-
-
-# # register django apps
-# @abx.hookimpl
-# def get_INSTALLED_APPS():
-#     return [DJANGO_APP.name]
-
-# # register configs
-# @abx.hookimpl
-# def register_CONFIG():
-#     return PLUGIN.HOOKS_BY_TYPE['CONFIG'].values()
-

+ 3 - 7
archivebox/config/legacy.py

@@ -50,13 +50,11 @@ from ..misc.logging import (
 )
 )
 
 
 from .common import SHELL_CONFIG, GENERAL_CONFIG, ARCHIVING_CONFIG, SERVER_CONFIG, SEARCH_BACKEND_CONFIG, STORAGE_CONFIG
 from .common import SHELL_CONFIG, GENERAL_CONFIG, ARCHIVING_CONFIG, SERVER_CONFIG, SEARCH_BACKEND_CONFIG, STORAGE_CONFIG
-from archivebox.plugins_auth.ldap.apps import LDAP_CONFIG
-from archivebox.plugins_extractor.favicon.apps import FAVICON_CONFIG
-from archivebox.plugins_extractor.wget.apps import WGET_CONFIG
-from archivebox.plugins_extractor.curl.apps import CURL_CONFIG
+from archivebox.plugins_extractor.favicon.config import FAVICON_CONFIG
+from archivebox.plugins_extractor.wget.config import WGET_CONFIG
+from archivebox.plugins_extractor.curl.config import CURL_CONFIG
 
 
 ANSI = SHELL_CONFIG.ANSI
 ANSI = SHELL_CONFIG.ANSI
-LDAP = LDAP_CONFIG.LDAP_ENABLED
 
 
 ############################### Config Schema ##################################
 ############################### Config Schema ##################################
 
 
@@ -73,8 +71,6 @@ CONFIG_SCHEMA: Dict[str, Dict[str, Any]] = {
 
 
     'STORAGE_CONFIG': STORAGE_CONFIG.as_legacy_config_schema(),
     'STORAGE_CONFIG': STORAGE_CONFIG.as_legacy_config_schema(),
     
     
-    'LDAP_CONFIG': LDAP_CONFIG.as_legacy_config_schema(),
-    
     # 'FAVICON_CONFIG': FAVICON_CONFIG.as_legacy_config_schema(),
     # 'FAVICON_CONFIG': FAVICON_CONFIG.as_legacy_config_schema(),
     
     
     # 'WGET_CONFIG': WGET_CONFIG.as_legacy_config_schema(),
     # 'WGET_CONFIG': WGET_CONFIG.as_legacy_config_schema(),

+ 53 - 30
archivebox/config/views.py

@@ -2,6 +2,7 @@ __package__ = 'abx.archivebox'
 
 
 import os
 import os
 import inspect
 import inspect
+from pathlib import Path
 from typing import Any, List, Dict, cast
 from typing import Any, List, Dict, cast
 from benedict import benedict
 from benedict import benedict
 
 
@@ -13,6 +14,8 @@ from django.utils.html import format_html, mark_safe
 from admin_data_views.typing import TableContext, ItemContext
 from admin_data_views.typing import TableContext, ItemContext
 from admin_data_views.utils import render_with_table_view, render_with_item_view, ItemLink
 from admin_data_views.utils import render_with_table_view, render_with_item_view, ItemLink
 
 
+import abx.archivebox.use
+
 from archivebox.config import CONSTANTS
 from archivebox.config import CONSTANTS
 from archivebox.misc.util import parse_date
 from archivebox.misc.util import parse_date
 
 
@@ -82,8 +85,10 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
         if '_BINARY' in key or '_VERSION' in key
         if '_BINARY' in key or '_VERSION' in key
     }
     }
 
 
-    for plugin in settings.PLUGINS.values():
-        for binary in plugin.HOOKS_BY_TYPE.get('BINARY', {}).values():
+    for plugin_id in abx.archivebox.use.get_PLUGINS().keys():
+        plugin = abx.archivebox.use.get_PLUGIN(plugin_id)
+        
+        for binary in plugin.BINARIES.values():
             try:
             try:
                 installed_binary = InstalledBinary.objects.get_from_db_or_cache(binary)
                 installed_binary = InstalledBinary.objects.get_from_db_or_cache(binary)
                 binary = installed_binary.load_from_db()
                 binary = installed_binary.load_from_db()
@@ -92,7 +97,7 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
 
 
             rows['Binary Name'].append(ItemLink(binary.name, key=binary.name))
             rows['Binary Name'].append(ItemLink(binary.name, key=binary.name))
             rows['Found Version'].append(f'✅ {binary.loaded_version}' if binary.loaded_version else '❌ missing')
             rows['Found Version'].append(f'✅ {binary.loaded_version}' if binary.loaded_version else '❌ missing')
-            rows['From Plugin'].append(plugin.plugin_module)
+            rows['From Plugin'].append(plugin.PACKAGE)
             rows['Provided By'].append(
             rows['Provided By'].append(
                 ', '.join(
                 ', '.join(
                     f'[{binprovider.name}]' if binprovider.name == getattr(binary.loaded_binprovider, 'name', None) else binprovider.name
                     f'[{binprovider.name}]' if binprovider.name == getattr(binary.loaded_binprovider, 'name', None) else binprovider.name
@@ -128,8 +133,9 @@ def binary_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
 
 
     binary = None
     binary = None
     plugin = None
     plugin = None
-    for loaded_plugin in settings.PLUGINS.values():
-        for loaded_binary in loaded_plugin.HOOKS_BY_TYPE.get('BINARY', {}).values():
+    for plugin_id in abx.archivebox.use.get_PLUGINS().keys():
+        loaded_plugin = abx.archivebox.use.get_PLUGIN(plugin_id)
+        for loaded_binary in loaded_plugin.BINARIES.values():
             if loaded_binary.name == key:
             if loaded_binary.name == key:
                 binary = loaded_binary
                 binary = loaded_binary
                 plugin = loaded_plugin
                 plugin = loaded_plugin
@@ -149,7 +155,7 @@ def binary_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
                 "name": binary.name,
                 "name": binary.name,
                 "description": binary.abspath,
                 "description": binary.abspath,
                 "fields": {
                 "fields": {
-                    'plugin': plugin.name,
+                    'plugin': plugin.PACKAGE,
                     'binprovider': binary.loaded_binprovider,
                     'binprovider': binary.loaded_binprovider,
                     'abspath': binary.loaded_abspath,
                     'abspath': binary.loaded_abspath,
                     'version': binary.loaded_version,
                     'version': binary.loaded_version,
@@ -170,28 +176,43 @@ def plugins_list_view(request: HttpRequest, **kwargs) -> TableContext:
     assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
     assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
 
 
     rows = {
     rows = {
-        "Name": [],
-        "verbose_name": [],
-        "module": [],
-        "source_code": [],
-        "hooks": [],
+        "Label": [],
+        "Version": [],
+        "Author": [],
+        "Package": [],
+        "Source Code": [],
+        "Config": [],
+        "Binaries": [],
+        "Package Managers": [],
+        # "Search Backends": [],
     }
     }
 
 
 
 
-    for plugin in settings.PLUGINS.values():
-        # try:
-        #     plugin.load_binaries()
-        # except Exception as e:
-        #     print(e)
-
-        rows['Name'].append(ItemLink(plugin.id, key=plugin.id))
-        rows['verbose_name'].append(mark_safe(f'<a href="{plugin.docs_url}" target="_blank">{plugin.verbose_name}</a>'))
-        rows['module'].append(str(plugin.plugin_module))
-        rows['source_code'].append(str(plugin.plugin_dir))
-        rows['hooks'].append(mark_safe(', '.join(
-            f'<a href="{hook.admin_url}">{hook.id}</a>'
-            for hook in plugin.hooks
+    for plugin_id in settings.PLUGINS.keys():
+        
+        plugin = abx.archivebox.use.get_PLUGIN(plugin_id)
+
+        rows['Label'].append(mark_safe(f'<a href="{plugin.HOMEPAGE}" target="_blank">{plugin.LABEL}</a>'))
+        rows['Version'].append(str(plugin.VERSION))
+        rows['Author'].append(str(plugin.AUTHOR))
+        rows['Package'].append(ItemLink(plugin.PACKAGE, key=plugin.PACKAGE))
+        rows['Source Code'].append(format_html('<code>{}</code>', str(plugin.SOURCE_PATH).replace(str(Path('~').expanduser()), '~')))
+        rows['Config'].append(mark_safe(''.join(
+            f'<a href="/admin/environment/config/{key}/"><b><code>{key}</code></b>=<code>{value}</code></a><br/>'
+            for key, value in plugin.CONFIG.model_dump().items()
+        )))
+        rows['Binaries'].append(mark_safe(', '.join(
+            f'<a href="/admin/environment/binaries/{binary.name}/"><code>{binary.name}</code></a>'
+            for binary in plugin.BINARIES.values()
+        )))
+        rows['Package Managers'].append(mark_safe(', '.join(
+            f'<a href="/admin/environment/binproviders/{binprovider.name}/"><code>{binprovider.name}</code></a>'
+            for binprovider in plugin.BINPROVIDERS.values()
         )))
         )))
+        # rows['Search Backends'].append(mark_safe(', '.join(
+        #     f'<a href="/admin/environment/searchbackends/{searchbackend.name}/"><code>{searchbackend.name}</code></a>'
+        #     for searchbackend in plugin.SEARCHBACKENDS.values()
+        # )))
 
 
     return TableContext(
     return TableContext(
         title="Installed plugins",
         title="Installed plugins",
@@ -204,8 +225,8 @@ def plugin_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
     assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
     assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
 
 
     plugin = None
     plugin = None
-    for loaded_plugin in settings.PLUGINS.values():
-        if loaded_plugin.id == key:
+    for plugin_id, loaded_plugin in settings.PLUGINS.items():
+        if loaded_plugin.PACKAGE == key or plugin_id == key:
             plugin = loaded_plugin
             plugin = loaded_plugin
 
 
     assert plugin, f'Could not find a plugin matching the specified name: {key}'
     assert plugin, f'Could not find a plugin matching the specified name: {key}'
@@ -220,11 +241,13 @@ def plugin_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
         title=key,
         title=key,
         data=[
         data=[
             {
             {
-                "name": plugin.id,
-                "description": plugin.verbose_name,
+                "name": plugin.PACKAGE,
+                "description": plugin.LABEL,
                 "fields": {
                 "fields": {
-                    "hooks": plugin.hooks,
-                    "schema": obj_to_yaml(plugin.model_dump(include=("name", "verbose_name", "app_label", "hooks"))),
+                    "version": plugin.VERSION,
+                    "author": plugin.AUTHOR,
+                    "homepage": plugin.HOMEPAGE,
+                    "dependencies": getattr(plugin, 'DEPENDENCIES', []),
                 },
                 },
                 "help_texts": {
                 "help_texts": {
                     # TODO
                     # TODO

+ 8 - 9
archivebox/core/settings.py

@@ -41,7 +41,7 @@ BUILTIN_PLUGIN_DIRS = {
     'plugins_extractor':       PACKAGE_DIR / 'plugins_extractor',
     'plugins_extractor':       PACKAGE_DIR / 'plugins_extractor',
 }
 }
 USER_PLUGIN_DIRS = {
 USER_PLUGIN_DIRS = {
-    'user_plugins':            DATA_DIR / 'user_plugins',
+    # 'user_plugins':            DATA_DIR / 'user_plugins',
 }
 }
 
 
 # Discover ArchiveBox plugins
 # Discover ArchiveBox plugins
@@ -52,19 +52,18 @@ ALL_PLUGINS = {**BUILTIN_PLUGINS, **PIP_PLUGINS, **USER_PLUGINS}
 
 
 # Load ArchiveBox plugins
 # Load ArchiveBox plugins
 PLUGIN_MANAGER = abx.pm
 PLUGIN_MANAGER = abx.pm
-PLUGINS = abx.archivebox.load_archivebox_plugins(PLUGIN_MANAGER, ALL_PLUGINS)
-HOOKS = abx.archivebox.use.get_HOOKS(PLUGINS)
+abx.archivebox.load_archivebox_plugins(PLUGIN_MANAGER, ALL_PLUGINS)
+PLUGINS = abx.archivebox.use.get_PLUGINS()
 
 
 # Load ArchiveBox config from plugins
 # Load ArchiveBox config from plugins
 CONFIGS = abx.archivebox.use.get_CONFIGS()
 CONFIGS = abx.archivebox.use.get_CONFIGS()
-FLAT_CONFIG = abx.archivebox.use.get_FLAT_CONFIG()
+CONFIG = FLAT_CONFIG = abx.archivebox.use.get_FLAT_CONFIG()
 BINPROVIDERS = abx.archivebox.use.get_BINPROVIDERS()
 BINPROVIDERS = abx.archivebox.use.get_BINPROVIDERS()
 BINARIES = abx.archivebox.use.get_BINARIES()
 BINARIES = abx.archivebox.use.get_BINARIES()
 EXTRACTORS = abx.archivebox.use.get_EXTRACTORS()
 EXTRACTORS = abx.archivebox.use.get_EXTRACTORS()
-REPLAYERS = abx.archivebox.use.get_REPLAYERS()
-ADMINDATAVIEWS = abx.archivebox.use.get_ADMINDATAVIEWS()
-QUEUES = abx.archivebox.use.get_QUEUES()
 SEARCHBACKENDS = abx.archivebox.use.get_SEARCHBACKENDS()
 SEARCHBACKENDS = abx.archivebox.use.get_SEARCHBACKENDS()
+# REPLAYERS = abx.archivebox.use.get_REPLAYERS()
+# ADMINDATAVIEWS = abx.archivebox.use.get_ADMINDATAVIEWS()
 
 
 
 
 ################################################################################
 ################################################################################
@@ -101,7 +100,7 @@ INSTALLED_APPS = [
     'django_object_actions',     # provides easy Django Admin action buttons on change views       https://github.com/crccheck/django-object-actions
     'django_object_actions',     # provides easy Django Admin action buttons on change views       https://github.com/crccheck/django-object-actions
 
 
     # Our ArchiveBox-provided apps
     # Our ArchiveBox-provided apps
-    #'config',                   # ArchiveBox config settings (loaded as a plugin, don't need to add it here)
+    'config',                    # ArchiveBox config settings (now registered as a regular Django app instead of a plugin)
     'machine',                   # handles collecting and storing information about the host machine, network interfaces, installed binaries, etc.
     'machine',                   # handles collecting and storing information about the host machine, network interfaces, installed binaries, etc.
     'queues',                    # handles starting and managing background workers and processes
     'queues',                    # handles starting and managing background workers and processes
     'abid_utils',                # handles ABID ID creation, handling, and models
     'abid_utils',                # handles ABID ID creation, handling, and models
@@ -610,6 +609,6 @@ if DEBUG_REQUESTS_TRACKER:
 
 
 
 
 abx.django.use.register_checks()
 abx.django.use.register_checks()
-abx.archivebox.use.register_all_hooks(globals())
+# abx.archivebox.use.register_all_hooks(globals())
 
 
 # import ipdb; ipdb.set_trace()
 # import ipdb; ipdb.set_trace()

+ 1 - 1
archivebox/core/views.py

@@ -32,7 +32,7 @@ from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG
 from archivebox.misc.util import base_url, htmlencode, ts_to_date_str
 from archivebox.misc.util import base_url, htmlencode, ts_to_date_str
 from archivebox.misc.serve_static import serve_static_with_byterange_support
 from archivebox.misc.serve_static import serve_static_with_byterange_support
 
 
-from ..plugins_extractor.archivedotorg.apps import ARCHIVEDOTORG_CONFIG
+from ..plugins_extractor.archivedotorg.config import ARCHIVEDOTORG_CONFIG
 from ..logging_util import printable_filesize
 from ..logging_util import printable_filesize
 from ..search import query_search_index
 from ..search import query_search_index
 
 

+ 3 - 2
archivebox/extractors/archive_org.py

@@ -8,8 +8,9 @@ from collections import defaultdict
 from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
 from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
 from archivebox.misc.system import run, chmod_file
 from archivebox.misc.system import run, chmod_file
 from archivebox.misc.util import enforce_types, is_static_file, dedupe
 from archivebox.misc.util import enforce_types, is_static_file, dedupe
-from archivebox.plugins_extractor.archivedotorg.apps import ARCHIVEDOTORG_CONFIG
-from archivebox.plugins_extractor.curl.apps import CURL_CONFIG, CURL_BINARY
+from archivebox.plugins_extractor.archivedotorg.config import ARCHIVEDOTORG_CONFIG
+from archivebox.plugins_extractor.curl.config import CURL_CONFIG
+from archivebox.plugins_extractor.curl.binaries import CURL_BINARY
 
 
 from ..logging_util import TimedProgress
 from ..logging_util import TimedProgress
 
 

+ 3 - 3
archivebox/extractors/dom.py

@@ -11,6 +11,9 @@ from archivebox.misc.util import (
 )
 )
 from ..logging_util import TimedProgress
 from ..logging_util import TimedProgress
 
 
+from plugins_extractor.chrome.config import CHROME_CONFIG
+from plugins_extractor.chrome.binaries import CHROME_BINARY
+
 
 
 def get_output_path():
 def get_output_path():
     return 'output.html'
     return 'output.html'
@@ -18,7 +21,6 @@ def get_output_path():
 
 
 @enforce_types
 @enforce_types
 def should_save_dom(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
 def should_save_dom(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
-    from plugins_extractor.chrome.apps import CHROME_CONFIG
     
     
     if is_static_file(link.url):
     if is_static_file(link.url):
         return False
         return False
@@ -34,8 +36,6 @@ def should_save_dom(link: Link, out_dir: Optional[Path]=None, overwrite: Optiona
 def save_dom(link: Link, out_dir: Optional[Path]=None, timeout: int=60) -> ArchiveResult:
 def save_dom(link: Link, out_dir: Optional[Path]=None, timeout: int=60) -> ArchiveResult:
     """print HTML of site to file using chrome --dump-html"""
     """print HTML of site to file using chrome --dump-html"""
 
 
-    from plugins_extractor.chrome.apps import CHROME_CONFIG, CHROME_BINARY
-
     CHROME_BIN = CHROME_BINARY.load()
     CHROME_BIN = CHROME_BINARY.load()
     assert CHROME_BIN.abspath and CHROME_BIN.version
     assert CHROME_BIN.abspath and CHROME_BIN.version
 
 

+ 3 - 2
archivebox/extractors/favicon.py

@@ -4,8 +4,9 @@ from pathlib import Path
 
 
 from archivebox.misc.system import chmod_file, run
 from archivebox.misc.system import chmod_file, run
 from archivebox.misc.util import enforce_types, domain, dedupe
 from archivebox.misc.util import enforce_types, domain, dedupe
-from archivebox.plugins_extractor.favicon.apps import FAVICON_CONFIG
-from archivebox.plugins_extractor.curl.apps import CURL_CONFIG, CURL_BINARY
+from archivebox.plugins_extractor.favicon.config import FAVICON_CONFIG
+from archivebox.plugins_extractor.curl.config import CURL_CONFIG
+from archivebox.plugins_extractor.curl.binaries import CURL_BINARY
 from ..index.schema import Link, ArchiveResult, ArchiveOutput
 from ..index.schema import Link, ArchiveResult, ArchiveOutput
 from ..logging_util import TimedProgress
 from ..logging_util import TimedProgress
 
 

+ 3 - 1
archivebox/extractors/git.py

@@ -13,10 +13,12 @@ from archivebox.misc.util import (
     without_query,
     without_query,
     without_fragment,
     without_fragment,
 )
 )
-from archivebox.plugins_extractor.git.apps import GIT_CONFIG, GIT_BINARY
 from ..logging_util import TimedProgress
 from ..logging_util import TimedProgress
 from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
 from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
 
 
+from archivebox.plugins_extractor.git.config import GIT_CONFIG
+from archivebox.plugins_extractor.git.binaries import GIT_BINARY
+
 
 
 def get_output_path():
 def get_output_path():
     return 'git/'
     return 'git/'

+ 2 - 1
archivebox/extractors/headers.py

@@ -10,7 +10,8 @@ from archivebox.misc.util import (
     get_headers,
     get_headers,
     dedupe,
     dedupe,
 )
 )
-from archivebox.plugins_extractor.curl.apps import CURL_CONFIG, CURL_BINARY
+from archivebox.plugins_extractor.curl.config import CURL_CONFIG
+from archivebox.plugins_extractor.curl.binaries import CURL_BINARY
 from ..index.schema import Link, ArchiveResult, ArchiveOutput
 from ..index.schema import Link, ArchiveResult, ArchiveOutput
 from ..logging_util import TimedProgress
 from ..logging_util import TimedProgress
 
 

+ 3 - 6
archivebox/extractors/media.py

@@ -3,11 +3,13 @@ __package__ = 'archivebox.extractors'
 from pathlib import Path
 from pathlib import Path
 from typing import Optional
 from typing import Optional
 
 
-from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
 from archivebox.misc.system import run, chmod_file
 from archivebox.misc.system import run, chmod_file
 from archivebox.misc.util import enforce_types, is_static_file, dedupe
 from archivebox.misc.util import enforce_types, is_static_file, dedupe
+from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
 from ..logging_util import TimedProgress
 from ..logging_util import TimedProgress
 
 
+from plugins_extractor.ytdlp.config import YTDLP_CONFIG
+from plugins_extractor.ytdlp.binaries import YTDLP_BINARY
 
 
 def get_output_path():
 def get_output_path():
     return 'media/'
     return 'media/'
@@ -25,7 +27,6 @@ def get_embed_path(archiveresult=None):
 
 
 @enforce_types
 @enforce_types
 def should_save_media(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
 def should_save_media(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
-    from plugins_extractor.ytdlp.apps import YTDLP_CONFIG
     
     
     if is_static_file(link.url):
     if is_static_file(link.url):
         return False
         return False
@@ -40,10 +41,6 @@ def should_save_media(link: Link, out_dir: Optional[Path]=None, overwrite: Optio
 def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=0) -> ArchiveResult:
 def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=0) -> ArchiveResult:
     """Download playlists or individual video, audio, and subtitles using youtube-dl or yt-dlp"""
     """Download playlists or individual video, audio, and subtitles using youtube-dl or yt-dlp"""
 
 
-
-    # from plugins_extractor.chrome.apps import CHROME_CONFIG
-    from plugins_extractor.ytdlp.apps import YTDLP_BINARY, YTDLP_CONFIG
-
     YTDLP_BIN = YTDLP_BINARY.load()
     YTDLP_BIN = YTDLP_BINARY.load()
     assert YTDLP_BIN.abspath and YTDLP_BIN.version
     assert YTDLP_BIN.abspath and YTDLP_BIN.version
 
 

+ 2 - 1
archivebox/extractors/mercury.py

@@ -12,7 +12,8 @@ from archivebox.misc.util import (
     enforce_types,
     enforce_types,
     is_static_file,
     is_static_file,
 )
 )
-from archivebox.plugins_extractor.mercury.apps import MERCURY_CONFIG, MERCURY_BINARY
+from archivebox.plugins_extractor.mercury.config import MERCURY_CONFIG
+from archivebox.plugins_extractor.mercury.binaries import MERCURY_BINARY
 
 
 from ..logging_util import TimedProgress
 from ..logging_util import TimedProgress
 
 

+ 4 - 4
archivebox/extractors/pdf.py

@@ -3,14 +3,17 @@ __package__ = 'archivebox.extractors'
 from pathlib import Path
 from pathlib import Path
 from typing import Optional
 from typing import Optional
 
 
-from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
 from archivebox.misc.system import run, chmod_file
 from archivebox.misc.system import run, chmod_file
 from archivebox.misc.util import (
 from archivebox.misc.util import (
     enforce_types,
     enforce_types,
     is_static_file,
     is_static_file,
 )
 )
+from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
 from ..logging_util import TimedProgress
 from ..logging_util import TimedProgress
 
 
+from plugins_extractor.chrome.config import CHROME_CONFIG
+from plugins_extractor.chrome.binaries import CHROME_BINARY
+
 
 
 def get_output_path():
 def get_output_path():
     return 'output.pdf'
     return 'output.pdf'
@@ -18,7 +21,6 @@ def get_output_path():
 
 
 @enforce_types
 @enforce_types
 def should_save_pdf(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
 def should_save_pdf(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
-    from plugins_extractor.chrome.apps import CHROME_CONFIG
     
     
     if is_static_file(link.url):
     if is_static_file(link.url):
         return False
         return False
@@ -34,8 +36,6 @@ def should_save_pdf(link: Link, out_dir: Optional[Path]=None, overwrite: Optiona
 def save_pdf(link: Link, out_dir: Optional[Path]=None, timeout: int=60) -> ArchiveResult:
 def save_pdf(link: Link, out_dir: Optional[Path]=None, timeout: int=60) -> ArchiveResult:
     """print PDF of site to file using chrome --headless"""
     """print PDF of site to file using chrome --headless"""
 
 
-    from plugins_extractor.chrome.apps import CHROME_CONFIG, CHROME_BINARY
-
     CHROME_BIN = CHROME_BINARY.load()
     CHROME_BIN = CHROME_BINARY.load()
     assert CHROME_BIN.abspath and CHROME_BIN.version
     assert CHROME_BIN.abspath and CHROME_BIN.version
 
 

+ 5 - 4
archivebox/extractors/readability.py

@@ -6,12 +6,16 @@ from tempfile import NamedTemporaryFile
 from typing import Optional
 from typing import Optional
 import json
 import json
 
 
-from ..index.schema import Link, ArchiveResult, ArchiveError
 from archivebox.misc.system import run, atomic_write
 from archivebox.misc.system import run, atomic_write
 from archivebox.misc.util import enforce_types, is_static_file
 from archivebox.misc.util import enforce_types, is_static_file
+from ..index.schema import Link, ArchiveResult, ArchiveError
 from ..logging_util import TimedProgress
 from ..logging_util import TimedProgress
 from .title import get_html
 from .title import get_html
 
 
+from plugins_extractor.readability.config import READABILITY_CONFIG
+from plugins_extractor.readability.binaries import READABILITY_BINARY
+
+
 def get_output_path():
 def get_output_path():
     return 'readability/'
     return 'readability/'
 
 
@@ -21,7 +25,6 @@ def get_embed_path(archiveresult=None):
 
 
 @enforce_types
 @enforce_types
 def should_save_readability(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
 def should_save_readability(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
-    from plugins_extractor.readability.apps import READABILITY_CONFIG
     
     
     if is_static_file(link.url):
     if is_static_file(link.url):
         return False
         return False
@@ -37,8 +40,6 @@ def should_save_readability(link: Link, out_dir: Optional[str]=None, overwrite:
 def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=0) -> ArchiveResult:
 def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=0) -> ArchiveResult:
     """download reader friendly version using @mozilla/readability"""
     """download reader friendly version using @mozilla/readability"""
     
     
-    from plugins_extractor.readability.apps import READABILITY_CONFIG, READABILITY_BINARY
-    
     READABILITY_BIN = READABILITY_BINARY.load()
     READABILITY_BIN = READABILITY_BINARY.load()
     assert READABILITY_BIN.abspath and READABILITY_BIN.version
     assert READABILITY_BIN.abspath and READABILITY_BIN.version
 
 

+ 4 - 3
archivebox/extractors/screenshot.py

@@ -3,11 +3,14 @@ __package__ = 'archivebox.extractors'
 from pathlib import Path
 from pathlib import Path
 from typing import Optional
 from typing import Optional
 
 
-from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
 from archivebox.misc.system import run, chmod_file
 from archivebox.misc.system import run, chmod_file
 from archivebox.misc.util import enforce_types, is_static_file
 from archivebox.misc.util import enforce_types, is_static_file
+from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
 from ..logging_util import TimedProgress
 from ..logging_util import TimedProgress
 
 
+from plugins_extractor.chrome.config import CHROME_CONFIG
+from plugins_extractor.chrome.binaries import CHROME_BINARY
+
 
 
 def get_output_path():
 def get_output_path():
     return 'screenshot.png'
     return 'screenshot.png'
@@ -15,7 +18,6 @@ def get_output_path():
 
 
 @enforce_types
 @enforce_types
 def should_save_screenshot(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
 def should_save_screenshot(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
-    from plugins_extractor.chrome.apps import CHROME_CONFIG
     
     
     if is_static_file(link.url):
     if is_static_file(link.url):
         return False
         return False
@@ -30,7 +32,6 @@ def should_save_screenshot(link: Link, out_dir: Optional[Path]=None, overwrite:
 def save_screenshot(link: Link, out_dir: Optional[Path]=None, timeout: int=60) -> ArchiveResult:
 def save_screenshot(link: Link, out_dir: Optional[Path]=None, timeout: int=60) -> ArchiveResult:
     """take screenshot of site using chrome --headless"""
     """take screenshot of site using chrome --headless"""
     
     
-    from plugins_extractor.chrome.apps import CHROME_CONFIG, CHROME_BINARY
     CHROME_BIN = CHROME_BINARY.load()
     CHROME_BIN = CHROME_BINARY.load()
     assert CHROME_BIN.abspath and CHROME_BIN.version
     assert CHROME_BIN.abspath and CHROME_BIN.version
 
 

+ 6 - 5
archivebox/extractors/singlefile.py

@@ -10,6 +10,11 @@ from archivebox.misc.system import run, chmod_file
 from archivebox.misc.util import enforce_types, is_static_file, dedupe
 from archivebox.misc.util import enforce_types, is_static_file, dedupe
 from ..logging_util import TimedProgress
 from ..logging_util import TimedProgress
 
 
+from plugins_extractor.chrome.config import CHROME_CONFIG
+from plugins_extractor.chrome.binaries import CHROME_BINARY
+from plugins_extractor.singlefile.config import SINGLEFILE_CONFIG
+from plugins_extractor.singlefile.binaries import SINGLEFILE_BINARY
+
 
 
 def get_output_path():
 def get_output_path():
     return 'singlefile.html'
     return 'singlefile.html'
@@ -17,7 +22,6 @@ def get_output_path():
 
 
 @enforce_types
 @enforce_types
 def should_save_singlefile(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
 def should_save_singlefile(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
-    from plugins_extractor.singlefile.apps import SINGLEFILE_CONFIG
     
     
     if is_static_file(link.url):
     if is_static_file(link.url):
         return False
         return False
@@ -26,15 +30,12 @@ def should_save_singlefile(link: Link, out_dir: Optional[Path]=None, overwrite:
     if not overwrite and (out_dir / get_output_path()).exists():
     if not overwrite and (out_dir / get_output_path()).exists():
         return False
         return False
 
 
-    return SINGLEFILE_CONFIG.SAVE_SINGLEFILE
+    return CHROME_CONFIG.USE_CHROME and SINGLEFILE_CONFIG.SAVE_SINGLEFILE
 
 
 
 
 @enforce_types
 @enforce_types
 def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=60) -> ArchiveResult:
 def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=60) -> ArchiveResult:
     """download full site using single-file"""
     """download full site using single-file"""
-    
-    from plugins_extractor.chrome.apps import CHROME_CONFIG, CHROME_BINARY
-    from plugins_extractor.singlefile.apps import SINGLEFILE_CONFIG, SINGLEFILE_BINARY
 
 
     CHROME_BIN = CHROME_BINARY.load()
     CHROME_BIN = CHROME_BINARY.load()
     assert CHROME_BIN.abspath and CHROME_BIN.version
     assert CHROME_BIN.abspath and CHROME_BIN.version

+ 3 - 1
archivebox/extractors/title.py

@@ -11,7 +11,9 @@ from archivebox.misc.util import (
     htmldecode,
     htmldecode,
     dedupe,
     dedupe,
 )
 )
-from archivebox.plugins_extractor.curl.apps import CURL_CONFIG, CURL_BINARY
+from archivebox.plugins_extractor.curl.config import CURL_CONFIG
+from archivebox.plugins_extractor.curl.binaries import CURL_BINARY
+
 from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
 from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
 from ..logging_util import TimedProgress
 from ..logging_util import TimedProgress
 
 

+ 2 - 2
archivebox/extractors/wget.py

@@ -17,8 +17,8 @@ from archivebox.misc.util import (
     urldecode,
     urldecode,
     dedupe,
     dedupe,
 )
 )
-from archivebox.plugins_extractor.wget.apps import WGET_BINARY, WGET_CONFIG
-
+from archivebox.plugins_extractor.wget.config import WGET_CONFIG
+from archivebox.plugins_extractor.wget.binaries import WGET_BINARY
 from ..logging_util import TimedProgress
 from ..logging_util import TimedProgress
 from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
 from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
 
 

+ 1 - 1
archivebox/index/html.py

@@ -19,7 +19,7 @@ from archivebox.misc.util import (
 from archivebox.config import CONSTANTS, DATA_DIR, VERSION
 from archivebox.config import CONSTANTS, DATA_DIR, VERSION
 from archivebox.config.common import SERVER_CONFIG
 from archivebox.config.common import SERVER_CONFIG
 from archivebox.config.version import get_COMMIT_HASH
 from archivebox.config.version import get_COMMIT_HASH
-from archivebox.plugins_extractor.archivedotorg.apps import ARCHIVEDOTORG_CONFIG
+from archivebox.plugins_extractor.archivedotorg.config import ARCHIVEDOTORG_CONFIG
 
 
 from .schema import Link
 from .schema import Link
 from ..logging_util import printable_filesize
 from ..logging_util import printable_filesize

+ 1 - 1
archivebox/index/schema.py

@@ -19,7 +19,7 @@ from django.utils.functional import cached_property
 
 
 from archivebox.config import ARCHIVE_DIR, CONSTANTS
 from archivebox.config import ARCHIVE_DIR, CONSTANTS
 
 
-from plugins_extractor.favicon.apps import FAVICON_CONFIG
+from plugins_extractor.favicon.config import FAVICON_CONFIG
 
 
 from archivebox.misc.system import get_dir_size
 from archivebox.misc.system import get_dir_size
 from archivebox.misc.util import ts_to_date_str, parse_date
 from archivebox.misc.util import ts_to_date_str, parse_date

+ 5 - 5
archivebox/machine/models.py

@@ -183,7 +183,7 @@ class InstalledBinaryManager(models.Manager):
         """Get or create an InstalledBinary record for a Binary on the local machine"""
         """Get or create an InstalledBinary record for a Binary on the local machine"""
         
         
         global _CURRENT_BINARIES
         global _CURRENT_BINARIES
-        cached_binary = _CURRENT_BINARIES.get(binary.id)
+        cached_binary = _CURRENT_BINARIES.get(binary.name)
         if cached_binary:
         if cached_binary:
             expires_at = cached_binary.modified_at + timedelta(seconds=INSTALLED_BINARY_RECHECK_INTERVAL)
             expires_at = cached_binary.modified_at + timedelta(seconds=INSTALLED_BINARY_RECHECK_INTERVAL)
             if timezone.now() < expires_at:
             if timezone.now() < expires_at:
@@ -198,7 +198,7 @@ class InstalledBinaryManager(models.Manager):
                         or binary.sha256 != cached_binary.sha256
                         or binary.sha256 != cached_binary.sha256
                     )
                     )
                     if is_different_from_cache:
                     if is_different_from_cache:
-                        _CURRENT_BINARIES.pop(binary.id)
+                        _CURRENT_BINARIES.pop(binary.name)
                     else:
                     else:
                         return cached_binary
                         return cached_binary
                 else:
                 else:
@@ -209,7 +209,7 @@ class InstalledBinaryManager(models.Manager):
                     return cached_binary
                     return cached_binary
             else:
             else:
                 # cached binary is too old, reload it from scratch
                 # cached binary is too old, reload it from scratch
-                _CURRENT_BINARIES.pop(binary.id)
+                _CURRENT_BINARIES.pop(binary.name)
         
         
         if not binary.abspath or not binary.version or not binary.sha256:
         if not binary.abspath or not binary.version or not binary.sha256:
             # if binary was not yet loaded from filesystem, do it now
             # if binary was not yet loaded from filesystem, do it now
@@ -219,7 +219,7 @@ class InstalledBinaryManager(models.Manager):
 
 
         assert binary.loaded_binprovider and binary.loaded_abspath and binary.loaded_version and binary.loaded_sha256, f'Failed to load binary {binary.name} abspath, version, and sha256'
         assert binary.loaded_binprovider and binary.loaded_abspath and binary.loaded_version and binary.loaded_sha256, f'Failed to load binary {binary.name} abspath, version, and sha256'
         
         
-        _CURRENT_BINARIES[binary.id], _created = self.update_or_create(
+        _CURRENT_BINARIES[binary.name], _created = self.update_or_create(
             machine=Machine.objects.current(),
             machine=Machine.objects.current(),
             name=binary.name,
             name=binary.name,
             binprovider=binary.loaded_binprovider.name,
             binprovider=binary.loaded_binprovider.name,
@@ -227,7 +227,7 @@ class InstalledBinaryManager(models.Manager):
             abspath=str(binary.loaded_abspath),
             abspath=str(binary.loaded_abspath),
             sha256=str(binary.loaded_sha256),
             sha256=str(binary.loaded_sha256),
         )
         )
-        cached_binary = _CURRENT_BINARIES[binary.id]
+        cached_binary = _CURRENT_BINARIES[binary.name]
         cached_binary.save()   # populate ABID
         cached_binary.save()   # populate ABID
         
         
         # if we get this far make sure DB record matches in-memroy cache
         # if we get this far make sure DB record matches in-memroy cache

+ 3 - 3
archivebox/main.py

@@ -193,7 +193,7 @@ def version(quiet: bool=False,
     console = Console()
     console = Console()
     prnt = console.print
     prnt = console.print
     
     
-    from plugins_auth.ldap.apps import LDAP_CONFIG
+    from plugins_auth.ldap.config import LDAP_CONFIG
     from django.conf import settings
     from django.conf import settings
     from archivebox.config.version import get_COMMIT_HASH, get_BUILD_TIME
     from archivebox.config.version import get_COMMIT_HASH, get_BUILD_TIME
     from archivebox.config.permissions import ARCHIVEBOX_USER, ARCHIVEBOX_GROUP, RUNNING_AS_UID, RUNNING_AS_GID
     from archivebox.config.permissions import ARCHIVEBOX_USER, ARCHIVEBOX_GROUP, RUNNING_AS_UID, RUNNING_AS_GID
@@ -1122,7 +1122,7 @@ def install(out_dir: Path=DATA_DIR, binproviders: Optional[List[str]]=None, bina
     
     
     print('\n[green][√] Set up ArchiveBox and its dependencies successfully.[/green]\n', file=sys.stderr)
     print('\n[green][√] Set up ArchiveBox and its dependencies successfully.[/green]\n', file=sys.stderr)
     
     
-    from plugins_pkg.pip.apps import ARCHIVEBOX_BINARY
+    from plugins_pkg.pip.binaries import ARCHIVEBOX_BINARY
     
     
     extra_args = []
     extra_args = []
     if binproviders:
     if binproviders:
@@ -1253,7 +1253,7 @@ def schedule(add: bool=False,
     """Set ArchiveBox to regularly import URLs at specific times using cron"""
     """Set ArchiveBox to regularly import URLs at specific times using cron"""
     
     
     check_data_folder()
     check_data_folder()
-    from archivebox.plugins_pkg.pip.apps import ARCHIVEBOX_BINARY
+    from archivebox.plugins_pkg.pip.binaries import ARCHIVEBOX_BINARY
     from archivebox.config.permissions import USER
     from archivebox.config.permissions import USER
 
 
     Path(CONSTANTS.LOGS_DIR).mkdir(exist_ok=True)
     Path(CONSTANTS.LOGS_DIR).mkdir(exist_ok=True)

+ 61 - 0
archivebox/plugins_auth/ldap/__init__.py

@@ -0,0 +1,61 @@
+__package__ = 'plugins_auth.ldap'
+__label__ = 'ldap'
+__version__ = '2024.10.14'
+__author__ = 'Nick Sweeting'
+__homepage__ = 'https://github.com/django-auth-ldap/django-auth-ldap'
+# __dependencies__ = ['pip']
+
+import abx
+
+
[email protected]
+def get_PLUGIN():
+    return {
+        'ldap': {
+            'PACKAGE': __package__,
+            'LABEL': __label__,
+            'VERSION': __version__,
+            'AUTHOR': __author__,
+            'HOMEPAGE': __homepage__,
+            # 'DEPENDENCIES': __dependencies__,
+        }
+    }
+
[email protected]
+def get_CONFIG():
+    from .config import LDAP_CONFIG
+    
+    return {
+        'ldap': LDAP_CONFIG
+    }
+
[email protected]
+def get_BINARIES():
+    from .binaries import LDAP_BINARY
+    
+    return {
+        'ldap': LDAP_BINARY,
+    }
+
+
+def create_superuser_from_ldap_user(sender, user=None, ldap_user=None, **kwargs):
+    """django_auth_ldap populate_user signal handler.
+
+    Marks every LDAP-authenticated user as staff; additionally marks them as
+    superuser when LDAP_CREATE_SUPERUSER is enabled and the user has no DB row
+    yet (user.id unset, i.e. first login via LDAP).
+    """
+    from django.conf import settings
+    
+    if user is None:
+        return                        # not authenticated at all
+    
+    if not user.id and settings.CONFIGS.ldap.LDAP_CREATE_SUPERUSER:
+        user.is_superuser = True      # authenticated via LDAP, but user is not set up in DB yet
+
+    user.is_staff = True
+    print(f'[!] WARNING: Creating new user {user} based on LDAP user {ldap_user} (is_staff={user.is_staff}, is_superuser={user.is_superuser})')
+
+
[email protected]
+def ready():
+    from django.conf import settings
+    
+    if settings.CONFIGS.ldap.LDAP_ENABLED:
+        import django_auth_ldap.backend
+        django_auth_ldap.backend.populate_user.connect(create_superuser_from_ldap_user)
+    

+ 6 - 42
archivebox/plugins_auth/ldap/apps.py → archivebox/plugins_auth/ldap/binaries.py

@@ -1,4 +1,4 @@
-__package__ = 'archivebox.plugins_auth.ldap'
+__package__ = 'plugins_auth.ldap'
 
 
 
 
 import inspect
 import inspect
@@ -9,17 +9,14 @@ from pydantic import InstanceOf
 
 
 from pydantic_pkgr import BinaryOverrides, SemVer
 from pydantic_pkgr import BinaryOverrides, SemVer
 
 
-import abx
 
 
-from abx.archivebox.base_plugin import BasePlugin
-from abx.archivebox.base_hook import BaseHook
 from abx.archivebox.base_binary import BaseBinary, BaseBinProvider, apt
 from abx.archivebox.base_binary import BaseBinary, BaseBinProvider, apt
 
 
-from plugins_pkg.pip.apps import SYS_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, LIB_PIP_BINPROVIDER, VENV_SITE_PACKAGES, LIB_SITE_PACKAGES, USER_SITE_PACKAGES, SYS_SITE_PACKAGES
-from .settings import LDAP_CONFIG, get_ldap_lib
+from plugins_pkg.pip.binproviders import SYS_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, LIB_PIP_BINPROVIDER, VENV_SITE_PACKAGES, LIB_SITE_PACKAGES, USER_SITE_PACKAGES, SYS_SITE_PACKAGES
+
+from .config import get_ldap_lib
 
 
 
 
-###################### Config ##########################
 
 
 def get_LDAP_LIB_path(paths=()):
 def get_LDAP_LIB_path(paths=()):
     LDAP_LIB = get_ldap_lib()[0]
     LDAP_LIB = get_ldap_lib()[0]
@@ -36,10 +33,12 @@ def get_LDAP_LIB_path(paths=()):
             return lib_path
             return lib_path
     return None
     return None
 
 
+
 def get_LDAP_LIB_version():
 def get_LDAP_LIB_version():
     LDAP_LIB = get_ldap_lib()[0]
     LDAP_LIB = get_ldap_lib()[0]
     return LDAP_LIB and SemVer(LDAP_LIB.__version__)
     return LDAP_LIB and SemVer(LDAP_LIB.__version__)
 
 
+
 class LdapBinary(BaseBinary):
 class LdapBinary(BaseBinary):
     name: str = 'ldap'
     name: str = 'ldap'
     description: str = 'LDAP Authentication'
     description: str = 'LDAP Authentication'
@@ -69,38 +68,3 @@ class LdapBinary(BaseBinary):
     }
     }
 
 
 LDAP_BINARY = LdapBinary()
 LDAP_BINARY = LdapBinary()
-
-
-def create_superuser_from_ldap_user(sender, user=None, ldap_user=None, **kwargs):
-    if user is None:
-        # not authenticated at all
-        return
-    
-    if not user.id and LDAP_CONFIG.LDAP_CREATE_SUPERUSER:
-        # authenticated via LDAP, but user is not set up in DB yet
-        user.is_superuser = True
-
-    user.is_staff = True
-    print(f'[!] WARNING: Creating new user {user} based on LDAP user {ldap_user} (is_staff={user.is_staff}, is_superuser={user.is_superuser})')
-
-
-class LdapAuthPlugin(BasePlugin):
-    app_label: str = 'ldap'
-    verbose_name: str = 'LDAP Authentication'
-
-    hooks: List[InstanceOf[BaseHook]] = [
-        LDAP_CONFIG,
-        *([LDAP_BINARY] if LDAP_CONFIG.LDAP_ENABLED else []),
-    ]
-    
-    @abx.hookimpl
-    def ready(self):
-        super().ready()
-        
-        if LDAP_CONFIG.LDAP_ENABLED:
-            import django_auth_ldap.backend
-            django_auth_ldap.backend.populate_user.connect(create_superuser_from_ldap_user)
-        
-
-PLUGIN = LdapAuthPlugin()
-DJANGO_APP = PLUGIN.AppConfig

+ 1 - 1
archivebox/plugins_auth/ldap/settings.py → archivebox/plugins_auth/ldap/config.py

@@ -1,4 +1,4 @@
-__package__ = 'archivebox.plugins_auth.ldap'
+__package__ = 'plugins_auth.ldap'
 
 
 import sys
 import sys
 
 

+ 39 - 0
archivebox/plugins_extractor/archivedotorg/__init__.py

@@ -0,0 +1,39 @@
+__package__ = 'plugins_extractor.archivedotorg'
+__label__ = 'archivedotorg'
+__version__ = '2024.10.14'
+__author__ = 'Nick Sweeting'
+__homepage__ = 'https://archive.org'
+__dependencies__ = []
+
+import abx
+
+
[email protected]
+def get_PLUGIN():
+    return {
+        'archivedotorg': {
+            'PACKAGE': __package__,
+            'LABEL': __label__,
+            'VERSION': __version__,
+            'AUTHOR': __author__,
+            'HOMEPAGE': __homepage__,
+            'DEPENDENCIES': __dependencies__,
+        }
+    }
+
[email protected]
+def get_CONFIG():
+    from .config import ARCHIVEDOTORG_CONFIG
+    
+    return {
+        'archivedotorg': ARCHIVEDOTORG_CONFIG
+    }
+
+
+# @abx.hookimpl
+# def get_EXTRACTORS():
+#     from .extractors import ARCHIVEDOTORG_EXTRACTOR
+#
+#     return {
+#         'archivedotorg': ARCHIVEDOTORG_EXTRACTOR,
+#     }

+ 0 - 28
archivebox/plugins_extractor/archivedotorg/apps.py

@@ -1,28 +0,0 @@
-__package__ = 'archivebox.plugins_extractor.archivedotorg'
-
-from typing import List
-
-from abx.archivebox.base_plugin import BasePlugin
-from abx.archivebox.base_configset import BaseConfigSet
-from abx.archivebox.base_hook import BaseHook
-
-###################### Config ##########################
-
-
-class ArchivedotorgConfig(BaseConfigSet):
-    SAVE_ARCHIVE_DOT_ORG: bool = True
-
-
-ARCHIVEDOTORG_CONFIG = ArchivedotorgConfig()
-
-
-class ArchivedotorgPlugin(BasePlugin):
-    app_label: str = 'archivedotorg'
-    verbose_name: str = 'Archive.org'
-    
-    hooks: List[BaseHook] = [
-        ARCHIVEDOTORG_CONFIG
-    ]
-
-PLUGIN = ArchivedotorgPlugin()
-DJANGO_APP = PLUGIN.AppConfig

+ 11 - 0
archivebox/plugins_extractor/archivedotorg/config.py

@@ -0,0 +1,11 @@
+__package__ = 'plugins_extractor.archivedotorg'
+
+
+from abx.archivebox.base_configset import BaseConfigSet
+
+
+class ArchivedotorgConfig(BaseConfigSet):
+    # master toggle: submit archived URLs to archive.org during archiving
+    SAVE_ARCHIVE_DOT_ORG: bool = True
+
+
+ARCHIVEDOTORG_CONFIG = ArchivedotorgConfig()

+ 46 - 0
archivebox/plugins_extractor/chrome/__init__.py

@@ -0,0 +1,46 @@
+__package__ = 'plugins_extractor.chrome'
+__label__ = 'chrome'
+__version__ = '2024.10.14'
+__author__ = 'Nick Sweeting'
+__homepage__ = 'https://github.com/ArchiveBox/ArchiveBox/tree/main/archivebox/plugins_extractor/chrome'
+__dependencies__ = []
+
+import abx
+
+
[email protected]
+def get_PLUGIN():
+    return {
+        'chrome': {
+            'PACKAGE': __package__,
+            'LABEL': __label__,
+            'VERSION': __version__,
+            'AUTHOR': __author__,
+            'HOMEPAGE': __homepage__,
+            'DEPENDENCIES': __dependencies__,
+        }
+    }
+
[email protected]
+def get_CONFIG():
+    from .config import CHROME_CONFIG
+    
+    return {
+        'chrome': CHROME_CONFIG
+    }
+
[email protected]
+def get_BINARIES():
+    from .binaries import CHROME_BINARY
+    
+    return {
+        'chrome': CHROME_BINARY,
+    }
+
+# @abx.hookimpl
+# def get_EXTRACTORS():
+#     return {
+#         'pdf': PDF_EXTRACTOR,
+#         'screenshot': SCREENSHOT_EXTRACTOR,
+#         'dom': DOM_EXTRACTOR,
+#     }

+ 145 - 0
archivebox/plugins_extractor/chrome/binaries.py

@@ -0,0 +1,145 @@
+__package__ = 'plugins_extractor.chrome'
+
+import os
+import platform
+from pathlib import Path
+from typing import List, Optional
+
+from pydantic import InstanceOf
+from pydantic_pkgr import (
+    BinProvider,
+    BinName,
+    BinaryOverrides,
+    bin_abspath,
+)
+
+from abx.archivebox.base_binary import BaseBinary, env, apt, brew
+
+# Depends on Other Plugins:
+from archivebox.config import CONSTANTS
+from archivebox.config.common import SHELL_CONFIG
+from plugins_pkg.puppeteer.binproviders import PUPPETEER_BINPROVIDER
+from plugins_pkg.playwright.binproviders import PLAYWRIGHT_BINPROVIDER
+
+
+from .config import CHROME_CONFIG
+CHROMIUM_BINARY_NAMES_LINUX = [
+    "chromium",
+    "chromium-browser",
+    "chromium-browser-beta",
+    "chromium-browser-unstable",
+    "chromium-browser-canary",
+    "chromium-browser-dev",
+]
+CHROMIUM_BINARY_NAMES_MACOS = ["/Applications/Chromium.app/Contents/MacOS/Chromium"]
+CHROMIUM_BINARY_NAMES = CHROMIUM_BINARY_NAMES_LINUX + CHROMIUM_BINARY_NAMES_MACOS
+
+CHROME_BINARY_NAMES_LINUX = [
+    "google-chrome",
+    "google-chrome-stable",
+    "google-chrome-beta",
+    "google-chrome-canary",
+    "google-chrome-unstable",
+    "google-chrome-dev",
+    "chrome"
+]
+CHROME_BINARY_NAMES_MACOS = [
+    "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
+    "/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary",
+]
+CHROME_BINARY_NAMES = CHROME_BINARY_NAMES_LINUX + CHROME_BINARY_NAMES_MACOS
+
+APT_DEPENDENCIES = [
+    'apt-transport-https', 'at-spi2-common', 'chromium-browser',
+    'fontconfig', 'fonts-freefont-ttf', 'fonts-ipafont-gothic', 'fonts-kacst', 'fonts-khmeros', 'fonts-liberation', 'fonts-noto', 'fonts-noto-color-emoji', 'fonts-symbola', 'fonts-thai-tlwg', 'fonts-tlwg-loma-otf', 'fonts-unifont', 'fonts-wqy-zenhei',
+    'libasound2', 'libatk-bridge2.0-0', 'libatk1.0-0', 'libatspi2.0-0', 'libavahi-client3', 'libavahi-common-data', 'libavahi-common3', 'libcairo2', 'libcups2',
+    'libdbus-1-3', 'libdrm2', 'libfontenc1', 'libgbm1', 'libglib2.0-0', 'libice6', 'libnspr4', 'libnss3', 'libsm6', 'libunwind8', 'libx11-6', 'libxaw7', 'libxcb1',
+    'libxcomposite1', 'libxdamage1', 'libxext6', 'libxfixes3', 'libxfont2', 'libxkbcommon0', 'libxkbfile1', 'libxmu6', 'libxpm4', 'libxrandr2', 'libxt6', 'x11-utils', 'x11-xkb-utils', 'xfonts-encodings',
+]
+
+
+def autodetect_system_chrome_install(PATH=None) -> Optional[Path]:
+    for bin_name in CHROME_BINARY_NAMES + CHROMIUM_BINARY_NAMES:
+        abspath = bin_abspath(bin_name, PATH=env.PATH)
+        if abspath:
+            return abspath
+    return None
+
+def create_macos_app_symlink(target: Path, shortcut: Path):
+    """
+    on macOS, some binaries are inside of .app, so we need to
+    create a tiny bash script instead of a symlink
+    (so that ../ parent relationships are relative to original .app instead of callsite dir)
+    """
+    # TODO: should we enforce this? is it useful in any other situation?
+    # if platform.system().lower() != 'darwin':
+    #     raise Exception(...)
+    shortcut.unlink(missing_ok=True)   # remove any stale symlink/script first
+    shortcut.write_text(f"""#!/usr/bin/env bash\nexec '{target}' "$@"\n""")
+    shortcut.chmod(0o777)   # make sure its executable by everyone
+
+###################### Config ##########################
+
+
+class ChromeBinary(BaseBinary):
+    name: BinName = CHROME_CONFIG.CHROME_BINARY
+    binproviders_supported: List[InstanceOf[BinProvider]] = [PUPPETEER_BINPROVIDER, env, PLAYWRIGHT_BINPROVIDER, apt, brew]
+    
+    overrides: BinaryOverrides = {
+        env.name: {
+            'abspath': lambda: autodetect_system_chrome_install(PATH=env.PATH),  # /usr/bin/google-chrome-stable
+        },
+        PUPPETEER_BINPROVIDER.name: {
+            'packages': ['chrome@stable'],              # npx @puppeteer/browsers install chrome@stable
+        },
+        PLAYWRIGHT_BINPROVIDER.name: {
+            'packages': ['chromium'],                   # playwright install chromium
+        },
+        apt.name: {
+            'packages': APT_DEPENDENCIES,
+        },
+        brew.name: {
+            'packages': ['--cask', 'chromium'],
+        },
+    }
+
+    @staticmethod
+    def symlink_to_lib(binary, bin_dir=CONSTANTS.LIB_BIN_DIR) -> None:
+        if not (binary.abspath and os.access(binary.abspath, os.F_OK)):
+            return
+        
+        bin_dir.mkdir(parents=True, exist_ok=True)
+        symlink = bin_dir / binary.name
+        
+        try:
+            if platform.system().lower() == 'darwin':
+                # if on macOS, browser binary is inside a .app, so we need to create a tiny bash script instead of a symlink
+                create_macos_app_symlink(binary.abspath, symlink)
+            else:
+                # otherwise on linux we can symlink directly to binary executable
+                symlink.unlink(missing_ok=True)
+                symlink.symlink_to(binary.abspath)
+        except Exception as err:
+            # print(f'[red]:warning: Failed to symlink {symlink} -> {binary.abspath}[/red] {err}')
+            # not actually needed, we can just run without it
+            pass
+
+    @staticmethod            
+    def chrome_cleanup_lockfile():
+        """
+        Cleans up any state or runtime files that chrome leaves behind when killed by
+        a timeout or other error
+        """
+        lock_file = Path("~/.config/chromium/SingletonLock").expanduser()
+
+        if SHELL_CONFIG.IN_DOCKER and os.access(lock_file, os.F_OK):
+            lock_file.unlink()
+        
+        if CHROME_CONFIG.CHROME_USER_DATA_DIR:
+            if os.access(CHROME_CONFIG.CHROME_USER_DATA_DIR / 'SingletonLock', os.F_OK):
+                lock_file.unlink()
+
+
+
+CHROME_BINARY = ChromeBinary()
+

+ 24 - 118
archivebox/plugins_extractor/chrome/apps.py → archivebox/plugins_extractor/chrome/config.py

@@ -1,35 +1,18 @@
-__package__ = 'archivebox.plugins_extractor.chrome'
+__package__ = 'plugins_extractor.chrome'
 
 
 import os
 import os
-import sys
-import platform
+
 from pathlib import Path
 from pathlib import Path
 from typing import List, Optional
 from typing import List, Optional
 
 
-# Depends on other PyPI/vendor packages:
-from rich import print
-from pydantic import InstanceOf, Field, model_validator
-from pydantic_pkgr import (
-    BinProvider,
-    BinName,
-    BinaryOverrides,
-    bin_abspath,
-)
+from pydantic import Field, model_validator
+from pydantic_pkgr import bin_abspath
 
 
-# Depends on other Django apps:
-from abx.archivebox.base_plugin import BasePlugin
 from abx.archivebox.base_configset import BaseConfigSet
 from abx.archivebox.base_configset import BaseConfigSet
-from abx.archivebox.base_binary import BaseBinary, env, apt, brew
-# from abx.archivebox.base_extractor import BaseExtractor
-# from abx.archivebox.base_queue import BaseQueue
-from abx.archivebox.base_hook import BaseHook
+from abx.archivebox.base_binary import env
 
 
-# Depends on Other Plugins:
-from archivebox.config import CONSTANTS
 from archivebox.config.common import ARCHIVING_CONFIG, SHELL_CONFIG
 from archivebox.config.common import ARCHIVING_CONFIG, SHELL_CONFIG
-from plugins_pkg.puppeteer.apps import PUPPETEER_BINPROVIDER
-from plugins_pkg.playwright.apps import PLAYWRIGHT_BINPROVIDER
-
+from archivebox.misc.logging import STDERR
 from archivebox.misc.util import dedupe
 from archivebox.misc.util import dedupe
 
 
 
 
@@ -129,33 +112,34 @@ class ChromeConfig(BaseConfigSet):
     @model_validator(mode='after')
     @model_validator(mode='after')
     def validate_use_chrome(self):
     def validate_use_chrome(self):
         if self.USE_CHROME and self.CHROME_TIMEOUT < 15:
         if self.USE_CHROME and self.CHROME_TIMEOUT < 15:
-            print(f'[red][!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={self.CHROME_TIMEOUT} seconds)[/red]', file=sys.stderr)
-            print('    Chrome will fail to archive all sites if set to less than ~15 seconds.', file=sys.stderr)
-            print('    (Setting it to somewhere between 30 and 300 seconds is recommended)', file=sys.stderr)
-            print(file=sys.stderr)
-            print('    If you want to make ArchiveBox run faster, disable specific archive methods instead:', file=sys.stderr)
-            print('        https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles', file=sys.stderr)
-            print(file=sys.stderr)
+            STDERR.print(f'[red][!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={self.CHROME_TIMEOUT} seconds)[/red]')
+            STDERR.print('    Chrome will fail to archive all sites if set to less than ~15 seconds.')
+            STDERR.print('    (Setting it to somewhere between 30 and 300 seconds is recommended)')
+            STDERR.print()
+            STDERR.print('    If you want to make ArchiveBox run faster, disable specific archive methods instead:')
+            STDERR.print('        https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles')
+            STDERR.print()
             
             
         # if user has specified a user data dir, make sure its valid
         # if user has specified a user data dir, make sure its valid
         if self.CHROME_USER_DATA_DIR and os.access(self.CHROME_USER_DATA_DIR, os.R_OK):
         if self.CHROME_USER_DATA_DIR and os.access(self.CHROME_USER_DATA_DIR, os.R_OK):
             # check to make sure user_data_dir/<profile_name> exists
             # check to make sure user_data_dir/<profile_name> exists
             if not (self.CHROME_USER_DATA_DIR / self.CHROME_PROFILE_NAME).is_dir():
             if not (self.CHROME_USER_DATA_DIR / self.CHROME_PROFILE_NAME).is_dir():
-                print(f'[red][X] Could not find profile "{self.CHROME_PROFILE_NAME}" in CHROME_USER_DATA_DIR.[/red]', file=sys.stderr)
-                print(f'    {self.CHROME_USER_DATA_DIR}', file=sys.stderr)
-                print('    Make sure you set it to a Chrome user data directory containing a Default profile folder.', file=sys.stderr)
-                print('    For more info see:', file=sys.stderr)
-                print('        https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#CHROME_USER_DATA_DIR', file=sys.stderr)
+                STDERR.print(f'[red][X] Could not find profile "{self.CHROME_PROFILE_NAME}" in CHROME_USER_DATA_DIR.[/red]')
+                STDERR.print(f'    {self.CHROME_USER_DATA_DIR}')
+                STDERR.print('    Make sure you set it to a Chrome user data directory containing a Default profile folder.')
+                STDERR.print('    For more info see:')
+                STDERR.print('        https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#CHROME_USER_DATA_DIR')
                 if '/Default' in str(self.CHROME_USER_DATA_DIR):
                 if '/Default' in str(self.CHROME_USER_DATA_DIR):
-                    print(file=sys.stderr)
-                    print('    Try removing /Default from the end e.g.:', file=sys.stderr)
-                    print('        CHROME_USER_DATA_DIR="{}"'.format(str(self.CHROME_USER_DATA_DIR).split('/Default')[0]), file=sys.stderr)
+                    STDERR.print()
+                    STDERR.print('    Try removing /Default from the end e.g.:')
+                    STDERR.print('        CHROME_USER_DATA_DIR="{}"'.format(str(self.CHROME_USER_DATA_DIR).split('/Default')[0]))
                 
                 
                 # hard error is too annoying here, instead just set it to nothing
                 # hard error is too annoying here, instead just set it to nothing
                 # raise SystemExit(2)
                 # raise SystemExit(2)
-                self.CHROME_USER_DATA_DIR = None
+                self.update_in_place(CHROME_USER_DATA_DIR=None)
         else:
         else:
-            self.CHROME_USER_DATA_DIR = None
+            if self.CHROME_USER_DATA_DIR is not None:
+                self.update_in_place(CHROME_USER_DATA_DIR=None)
             
             
         return self
         return self
 
 
@@ -206,81 +190,3 @@ class ChromeConfig(BaseConfigSet):
 
 
 CHROME_CONFIG = ChromeConfig()
 CHROME_CONFIG = ChromeConfig()
 
 
-
-class ChromeBinary(BaseBinary):
-    name: BinName = CHROME_CONFIG.CHROME_BINARY
-    binproviders_supported: List[InstanceOf[BinProvider]] = [PUPPETEER_BINPROVIDER, env, PLAYWRIGHT_BINPROVIDER, apt, brew]
-    
-    overrides: BinaryOverrides = {
-        env.name: {
-            'abspath': lambda: autodetect_system_chrome_install(PATH=env.PATH),  # /usr/bin/google-chrome-stable
-        },
-        PUPPETEER_BINPROVIDER.name: {
-            'packages': ['chrome@stable'],              # npx @puppeteer/browsers install chrome@stable
-        },
-        PLAYWRIGHT_BINPROVIDER.name: {
-            'packages': ['chromium'],                   # playwright install chromium
-        },
-        apt.name: {
-            'packages': APT_DEPENDENCIES,
-        },
-        brew.name: {
-            'packages': ['--cask', 'chromium'],
-        },
-    }
-
-    @staticmethod
-    def symlink_to_lib(binary, bin_dir=CONSTANTS.LIB_BIN_DIR) -> None:
-        if not (binary.abspath and os.access(binary.abspath, os.F_OK)):
-            return
-        
-        bin_dir.mkdir(parents=True, exist_ok=True)
-        symlink = bin_dir / binary.name
-        
-        try:
-            if platform.system().lower() == 'darwin':
-                # if on macOS, browser binary is inside a .app, so we need to create a tiny bash script instead of a symlink
-                create_macos_app_symlink(binary.abspath, symlink)
-            else:
-                # otherwise on linux we can symlink directly to binary executable
-                symlink.unlink(missing_ok=True)
-                symlink.symlink_to(binary.abspath)
-        except Exception as err:
-            # print(f'[red]:warning: Failed to symlink {symlink} -> {binary.abspath}[/red] {err}')
-            # not actually needed, we can just run without it
-            pass
-
-    @staticmethod            
-    def chrome_cleanup_lockfile():
-        """
-        Cleans up any state or runtime files that chrome leaves behind when killed by
-        a timeout or other error
-        """
-        lock_file = Path("~/.config/chromium/SingletonLock").expanduser()
-
-        if SHELL_CONFIG.IN_DOCKER and os.access(lock_file, os.F_OK):
-            lock_file.unlink()
-        
-        if CHROME_CONFIG.CHROME_USER_DATA_DIR:
-            if os.access(CHROME_CONFIG.CHROME_USER_DATA_DIR / 'SingletonLock', os.F_OK):
-                lock_file.unlink()
-
-
-
-CHROME_BINARY = ChromeBinary()
-
-
-class ChromePlugin(BasePlugin):
-    app_label: str = 'chrome'
-    verbose_name: str = 'Chrome Browser'
-
-    hooks: List[InstanceOf[BaseHook]] = [
-        CHROME_CONFIG,
-        CHROME_BINARY,
-    ]
-
-
-
-PLUGIN = ChromePlugin()
-# PLUGIN.register(settings)
-DJANGO_APP = PLUGIN.AppConfig

+ 38 - 0
archivebox/plugins_extractor/curl/__init__.py

@@ -0,0 +1,38 @@
+__package__ = 'plugins_extractor.curl'
+__label__ = 'curl'
+__version__ = '2024.10.14'
+__author__ = 'Nick Sweeting'
+__homepage__ = 'https://github.com/curl/curl'
+__dependencies__ = []
+
+import abx
+
+
[email protected]
+def get_PLUGIN():
+    return {
+        'curl': {
+            'PACKAGE': __package__,
+            'LABEL': __label__,
+            'VERSION': __version__,
+            'AUTHOR': __author__,
+            'HOMEPAGE': __homepage__,
+            'DEPENDENCIES': __dependencies__,
+        }
+    }
+
[email protected]
+def get_CONFIG():
+    from .config import CURL_CONFIG
+    
+    return {
+        'curl': CURL_CONFIG
+    }
+
[email protected]
+def get_BINARIES():
+    from .binaries import CURL_BINARY
+    
+    return {
+        'curl': CURL_BINARY,
+    }

+ 0 - 79
archivebox/plugins_extractor/curl/apps.py

@@ -1,79 +0,0 @@
-__package__ = 'plugins_extractor.curl'
-
-from typing import List, Optional
-from pathlib import Path
-
-from pydantic import InstanceOf, Field
-from pydantic_pkgr import BinProvider, BinName
-
-from abx.archivebox.base_plugin import BasePlugin, BaseHook
-from abx.archivebox.base_configset import BaseConfigSet
-from abx.archivebox.base_binary import BaseBinary, env, apt, brew
-# from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
-
-from archivebox.config.common import ARCHIVING_CONFIG
-from archivebox.plugins_extractor.favicon.apps import FAVICON_CONFIG
-from archivebox.plugins_extractor.archivedotorg.apps import ARCHIVEDOTORG_CONFIG
-
-class CurlConfig(BaseConfigSet):
-    
-    SAVE_TITLE: bool = Field(default=True)
-    SAVE_HEADERS: bool = Field(default=True)
-    USE_CURL: bool = Field(default=lambda c: 
-        ARCHIVEDOTORG_CONFIG.SAVE_ARCHIVE_DOT_ORG
-        or FAVICON_CONFIG.SAVE_FAVICON
-        or c.SAVE_HEADERS
-        or c.SAVE_TITLE
-    )
-    
-    CURL_BINARY: str = Field(default='curl')
-    CURL_ARGS: List[str] = [
-        '--silent',
-        '--location',
-        '--compressed',
-    ]
-    CURL_EXTRA_ARGS: List[str] = []
-    
-    CURL_TIMEOUT: int =  Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
-    CURL_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
-    CURL_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
-    CURL_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE)
-    
-
-CURL_CONFIG = CurlConfig()
-
-
-class CurlBinary(BaseBinary):
-    name: BinName = CURL_CONFIG.CURL_BINARY
-    binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
-
-CURL_BINARY = CurlBinary()
-
-
-# class CurlExtractor(BaseExtractor):
-#     name: ExtractorName = 'curl'
-#     binary: str = CURL_BINARY.name
-
-#     def get_output_path(self, snapshot) -> Path | None:
-#         curl_index_path = curl_output_path(snapshot.as_link())
-#         if curl_index_path:
-#             return Path(curl_index_path)
-#         return None
-
-# CURL_EXTRACTOR = CurlExtractor()
-
-
-
-class CurlPlugin(BasePlugin):
-    app_label: str = 'curl'
-    verbose_name: str = 'CURL'
-    
-    hooks: List[InstanceOf[BaseHook]] = [
-        CURL_CONFIG,
-        CURL_BINARY,
-        # CURL_EXTRACTOR,
-    ]
-
-
-PLUGIN = CurlPlugin()
-DJANGO_APP = PLUGIN.AppConfig

+ 18 - 0
archivebox/plugins_extractor/curl/binaries.py

@@ -0,0 +1,18 @@
+__package__ = 'plugins_extractor.curl'
+
+from typing import List
+
+from pydantic import InstanceOf
+from pydantic_pkgr import BinProvider, BinName
+
+from abx.archivebox.base_binary import BaseBinary, env, apt, brew
+
+
+from .config import CURL_CONFIG
+
+
+class CurlBinary(BaseBinary):
+    name: BinName = CURL_CONFIG.CURL_BINARY
+    binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
+
+CURL_BINARY = CurlBinary()

+ 33 - 0
archivebox/plugins_extractor/curl/config.py

@@ -0,0 +1,33 @@
+__package__ = 'plugins_extractor.curl'
+
+from typing import List, Optional
+from pathlib import Path
+
+from pydantic import Field
+
+from abx.archivebox.base_configset import BaseConfigSet
+
+from archivebox.config.common import ARCHIVING_CONFIG
+
+
+class CurlConfig(BaseConfigSet):
+    
+    SAVE_TITLE: bool = Field(default=True)
+    SAVE_HEADERS: bool = Field(default=True)
+    USE_CURL: bool = Field(default=True)
+    
+    CURL_BINARY: str = Field(default='curl')
+    CURL_ARGS: List[str] = [
+        '--silent',
+        '--location',
+        '--compressed',
+    ]
+    CURL_EXTRA_ARGS: List[str] = []
+    
+    CURL_TIMEOUT: int =  Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
+    CURL_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
+    CURL_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
+    CURL_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE)
+    
+
+CURL_CONFIG = CurlConfig()

+ 39 - 0
archivebox/plugins_extractor/favicon/__init__.py

@@ -0,0 +1,39 @@
+__package__ = 'plugins_extractor.favicon'
+__label__ = 'favicon'
+__version__ = '2024.10.14'
+__author__ = 'Nick Sweeting'
+__homepage__ = 'https://github.com/ArchiveBox/archivebox'
+__dependencies__ = []
+
+import abx
+
+
[email protected]
+def get_PLUGIN():
+    return {
+        'favicon': {
+            'PACKAGE': __package__,
+            'LABEL': __label__,
+            'VERSION': __version__,
+            'AUTHOR': __author__,
+            'HOMEPAGE': __homepage__,
+            'DEPENDENCIES': __dependencies__,
+        }
+    }
+
[email protected]
+def get_CONFIG():
+    from .config import FAVICON_CONFIG
+    
+    return {
+        'favicon': FAVICON_CONFIG
+    }
+
+
+# @abx.hookimpl
+# def get_EXTRACTORS():
+#     from .extractors import FAVICON_EXTRACTOR
+    
+#     return {
+#         'favicon': FAVICON_EXTRACTOR,
+#     }

+ 0 - 30
archivebox/plugins_extractor/favicon/apps.py

@@ -1,30 +0,0 @@
-__package__ = 'archivebox.plugins_extractor.favicon'
-
-from typing import List
-
-from abx.archivebox.base_plugin import BasePlugin
-from abx.archivebox.base_configset import BaseConfigSet
-from abx.archivebox.base_hook import BaseHook
-
-###################### Config ##########################
-
-
-class FaviconConfig(BaseConfigSet):
-    SAVE_FAVICON: bool = True
-    
-    FAVICON_PROVIDER: str = 'https://www.google.com/s2/favicons?domain={}'
-
-
-FAVICON_CONFIG = FaviconConfig()
-
-
-class FaviconPlugin(BasePlugin):
-    app_label: str = 'favicon'
-    verbose_name: str = 'Favicon'
-    
-    hooks: List[BaseHook] = [
-        FAVICON_CONFIG
-    ]
-
-PLUGIN = FaviconPlugin()
-DJANGO_APP = PLUGIN.AppConfig

+ 13 - 0
archivebox/plugins_extractor/favicon/config.py

@@ -0,0 +1,13 @@
+__package__ = 'plugins_extractor.favicon'
+
+
+from abx.archivebox.base_configset import BaseConfigSet
+
+
+class FaviconConfig(BaseConfigSet):
+    SAVE_FAVICON: bool = True
+    
+    FAVICON_PROVIDER: str = 'https://www.google.com/s2/favicons?domain={}'
+
+
+FAVICON_CONFIG = FaviconConfig()

+ 46 - 0
archivebox/plugins_extractor/git/__init__.py

@@ -0,0 +1,46 @@
+__package__ = 'plugins_extractor.git'
+__label__ = 'git'
+__version__ = '2024.10.14'
+__author__ = 'Nick Sweeting'
+__homepage__ = 'https://github.com/git/git'
+__dependencies__ = []
+
+import abx
+
+
[email protected]
+def get_PLUGIN():
+    return {
+        'git': {
+            'PACKAGE': __package__,
+            'LABEL': __label__,
+            'VERSION': __version__,
+            'AUTHOR': __author__,
+            'HOMEPAGE': __homepage__,
+            'DEPENDENCIES': __dependencies__,
+        }
+    }
+
[email protected]
+def get_CONFIG():
+    from .config import GIT_CONFIG
+    
+    return {
+        'git': GIT_CONFIG
+    }
+
[email protected]
+def get_BINARIES():
+    from .binaries import GIT_BINARY
+    
+    return {
+        'git': GIT_BINARY,
+    }
+
[email protected]
+def get_EXTRACTORS():
+    from .extractors import GIT_EXTRACTOR
+    
+    return {
+        'git': GIT_EXTRACTOR,
+    }

+ 0 - 66
archivebox/plugins_extractor/git/apps.py

@@ -1,66 +0,0 @@
-__package__ = 'plugins_extractor.git'
-
-from typing import List
-from pathlib import Path
-
-from pydantic import InstanceOf, Field
-from pydantic_pkgr import BinProvider, BinName
-
-from abx.archivebox.base_plugin import BasePlugin, BaseHook
-from abx.archivebox.base_configset import BaseConfigSet
-from abx.archivebox.base_binary import BaseBinary, env, apt, brew
-from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
-
-from archivebox.config.common import ARCHIVING_CONFIG
-
-
-class GitConfig(BaseConfigSet):
-
-    SAVE_GIT: bool = True
-    
-    GIT_DOMAINS: str = Field(default='github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht')
-    
-    GIT_BINARY: str = Field(default='git')
-    GIT_ARGS: List[str] = [
-        '--recursive',
-    ]
-    GIT_EXTRA_ARGS: List[str] = []
-    
-    GIT_TIMEOUT: int =  Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
-    GIT_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
-    
-
-GIT_CONFIG = GitConfig()
-
-
-class GitBinary(BaseBinary):
-    name: BinName = GIT_CONFIG.GIT_BINARY
-    binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
-
-GIT_BINARY = GitBinary()
-
-
-class GitExtractor(BaseExtractor):
-    name: ExtractorName = 'git'
-    binary: str = GIT_BINARY.name
-
-    def get_output_path(self, snapshot) -> Path | None:
-        return snapshot.as_link() / 'git'
-
-GIT_EXTRACTOR = GitExtractor()
-
-
-
-class GitPlugin(BasePlugin):
-    app_label: str = 'git'
-    verbose_name: str = 'GIT'
-    
-    hooks: List[InstanceOf[BaseHook]] = [
-        GIT_CONFIG,
-        GIT_BINARY,
-        GIT_EXTRACTOR,
-    ]
-
-
-PLUGIN = GitPlugin()
-DJANGO_APP = PLUGIN.AppConfig

+ 18 - 0
archivebox/plugins_extractor/git/binaries.py

@@ -0,0 +1,18 @@
+__package__ = 'plugins_extractor.git'
+
+from typing import List
+
+from pydantic import InstanceOf
+from pydantic_pkgr import BinProvider, BinName
+
+from abx.archivebox.base_binary import BaseBinary, env, apt, brew
+
+from .config import GIT_CONFIG
+
+
+
+class GitBinary(BaseBinary):
+    name: BinName = GIT_CONFIG.GIT_BINARY
+    binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
+
+GIT_BINARY = GitBinary()

+ 28 - 0
archivebox/plugins_extractor/git/config.py

@@ -0,0 +1,28 @@
+__package__ = 'plugins_extractor.git'
+
+from typing import List
+
+from pydantic import Field
+
+from abx.archivebox.base_configset import BaseConfigSet
+
+from archivebox.config.common import ARCHIVING_CONFIG
+
+
+class GitConfig(BaseConfigSet):
+
+    SAVE_GIT: bool = True
+    
+    GIT_DOMAINS: str = Field(default='github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht')
+    
+    GIT_BINARY: str = Field(default='git')
+    GIT_ARGS: List[str] = [
+        '--recursive',
+    ]
+    GIT_EXTRA_ARGS: List[str] = []
+    
+    GIT_TIMEOUT: int =  Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
+    GIT_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
+    
+
+GIT_CONFIG = GitConfig()

+ 17 - 0
archivebox/plugins_extractor/git/extractors.py

@@ -0,0 +1,17 @@
+__package__ = 'plugins_extractor.git'
+
+from pathlib import Path
+
+from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
+
+from .binaries import GIT_BINARY
+
+
+class GitExtractor(BaseExtractor):
+    name: ExtractorName = 'git'
+    binary: str = GIT_BINARY.name
+
+    def get_output_path(self, snapshot) -> Path | None:
+        return snapshot.as_link() / 'git'
+
+GIT_EXTRACTOR = GitExtractor()

+ 46 - 0
archivebox/plugins_extractor/mercury/__init__.py

@@ -0,0 +1,46 @@
+__package__ = 'plugins_extractor.mercury'
+__label__ = 'mercury'
+__version__ = '2024.10.14'
+__author__ = 'Nick Sweeting'
+__homepage__ = 'https://github.com/postlight/mercury-parser'
+__dependencies__ = ['npm']
+
+import abx
+
+
[email protected]
+def get_PLUGIN():
+    return {
+        'mercury': {
+            'PACKAGE': __package__,
+            'LABEL': __label__,
+            'VERSION': __version__,
+            'AUTHOR': __author__,
+            'HOMEPAGE': __homepage__,
+            'DEPENDENCIES': __dependencies__,
+        }
+    }
+
[email protected]
+def get_CONFIG():
+    from .config import MERCURY_CONFIG
+    
+    return {
+        'mercury': MERCURY_CONFIG
+    }
+
[email protected]
+def get_BINARIES():
+    from .binaries import MERCURY_BINARY
+    
+    return {
+        'mercury': MERCURY_BINARY,
+    }
+
[email protected]
+def get_EXTRACTORS():
+    from .extractors import MERCURY_EXTRACTOR
+    
+    return {
+        'mercury': MERCURY_EXTRACTOR,
+    }

+ 0 - 80
archivebox/plugins_extractor/mercury/apps.py

@@ -1,80 +0,0 @@
-__package__ = 'plugins_extractor.mercury'
-
-from typing import List, Optional
-from pathlib import Path
-
-from pydantic import InstanceOf, Field
-from pydantic_pkgr import BinProvider, BinName, BinaryOverrides, bin_abspath
-
-from abx.archivebox.base_plugin import BasePlugin, BaseHook
-from abx.archivebox.base_configset import BaseConfigSet
-from abx.archivebox.base_binary import BaseBinary, env
-from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
-
-from archivebox.config.common import ARCHIVING_CONFIG, STORAGE_CONFIG
-from archivebox.plugins_pkg.npm.apps import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER
-
-class MercuryConfig(BaseConfigSet):
-
-    SAVE_MERCURY: bool = Field(default=True, alias='USE_MERCURY')
-    
-    MERCURY_BINARY: str = Field(default='postlight-parser')
-    MERCURY_EXTRA_ARGS: List[str] = []
-    
-    SAVE_MERCURY_REQUISITES: bool = Field(default=True)
-    MERCURY_RESTRICT_FILE_NAMES: str = Field(default=lambda: STORAGE_CONFIG.RESTRICT_FILE_NAMES)
-    
-    MERCURY_TIMEOUT: int =  Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
-    MERCURY_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
-    MERCURY_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
-    MERCURY_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE)
-    
-
-
-MERCURY_CONFIG = MercuryConfig()
-
-
-class MercuryBinary(BaseBinary):
-    name: BinName = MERCURY_CONFIG.MERCURY_BINARY
-    binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env]
-
-    overrides: BinaryOverrides = {
-        LIB_NPM_BINPROVIDER.name: {
-            'packages': ['@postlight/parser@^2.2.3'],
-        },
-        SYS_NPM_BINPROVIDER.name: {
-            'packages': ['@postlight/parser@^2.2.3'],
-            'install': lambda: None,                          # never try to install things into global prefix
-        },
-        env.name: {
-            'version': lambda: '999.999.999' if bin_abspath('postlight-parser', PATH=env.PATH) else None,
-        },
-    }
-
-MERCURY_BINARY = MercuryBinary()
-
-
-class MercuryExtractor(BaseExtractor):
-    name: ExtractorName = 'mercury'
-    binary: str = MERCURY_BINARY.name
-
-    def get_output_path(self, snapshot) -> Path | None:
-        return snapshot.link_dir / 'mercury' / 'content.html'
-
-MERCURY_EXTRACTOR = MercuryExtractor()
-
-
-
-class MercuryPlugin(BasePlugin):
-    app_label: str = 'mercury'
-    verbose_name: str = 'MERCURY'
-    
-    hooks: List[InstanceOf[BaseHook]] = [
-        MERCURY_CONFIG,
-        MERCURY_BINARY,
-        MERCURY_EXTRACTOR,
-    ]
-
-
-PLUGIN = MercuryPlugin()
-DJANGO_APP = PLUGIN.AppConfig

+ 32 - 0
archivebox/plugins_extractor/mercury/binaries.py

@@ -0,0 +1,32 @@
+__package__ = 'plugins_extractor.mercury'
+
+from typing import List
+
+from pydantic import InstanceOf
+from pydantic_pkgr import BinProvider, BinName, BinaryOverrides, bin_abspath
+
+from abx.archivebox.base_binary import BaseBinary, env
+
+from archivebox.plugins_pkg.npm.binproviders import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER
+
+from .config import MERCURY_CONFIG
+
+
+class MercuryBinary(BaseBinary):
+    name: BinName = MERCURY_CONFIG.MERCURY_BINARY
+    binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env]
+
+    overrides: BinaryOverrides = {
+        LIB_NPM_BINPROVIDER.name: {
+            'packages': ['@postlight/parser@^2.2.3'],
+        },
+        SYS_NPM_BINPROVIDER.name: {
+            'packages': ['@postlight/parser@^2.2.3'],
+            'install': lambda: None,                          # never try to install things into global prefix
+        },
+        env.name: {
+            'version': lambda: '999.999.999' if bin_abspath('postlight-parser', PATH=env.PATH) else None,
+        },
+    }
+
+MERCURY_BINARY = MercuryBinary()

+ 31 - 0
archivebox/plugins_extractor/mercury/config.py

@@ -0,0 +1,31 @@
+__package__ = 'plugins_extractor.mercury'
+
+from typing import List, Optional
+from pathlib import Path
+
+from pydantic import Field
+
+from abx.archivebox.base_configset import BaseConfigSet
+
+from archivebox.config.common import ARCHIVING_CONFIG, STORAGE_CONFIG
+
+
+
+class MercuryConfig(BaseConfigSet):
+
+    SAVE_MERCURY: bool = Field(default=True, alias='USE_MERCURY')
+    
+    MERCURY_BINARY: str = Field(default='postlight-parser')
+    MERCURY_EXTRA_ARGS: List[str] = []
+    
+    SAVE_MERCURY_REQUISITES: bool = Field(default=True)
+    MERCURY_RESTRICT_FILE_NAMES: str = Field(default=lambda: STORAGE_CONFIG.RESTRICT_FILE_NAMES)
+    
+    MERCURY_TIMEOUT: int =  Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
+    MERCURY_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
+    MERCURY_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
+    MERCURY_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE)
+    
+
+
+MERCURY_CONFIG = MercuryConfig()

+ 19 - 0
archivebox/plugins_extractor/mercury/extractors.py

@@ -0,0 +1,19 @@
+__package__ = 'plugins_extractor.mercury'
+
+from pathlib import Path
+
+from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
+
+from .binaries import MERCURY_BINARY
+
+
+
+class MercuryExtractor(BaseExtractor):
+    name: ExtractorName = 'mercury'
+    binary: str = MERCURY_BINARY.name
+
+    def get_output_path(self, snapshot) -> Path | None:
+        return snapshot.link_dir / 'mercury' / 'content.html'
+
+
+MERCURY_EXTRACTOR = MercuryExtractor()

+ 46 - 0
archivebox/plugins_extractor/readability/__init__.py

@@ -0,0 +1,46 @@
+__package__ = 'plugins_extractor.readability'
+__label__ = 'readability'
+__version__ = '2024.10.14'
+__author__ = 'Nick Sweeting'
+__homepage__ = 'https://github.com/ArchiveBox/readability-extractor'
+__dependencies__ = ['npm']
+
+import abx
+
+
[email protected]
+def get_PLUGIN():
+    return {
+        'readability': {
+            'PACKAGE': __package__,
+            'LABEL': __label__,
+            'VERSION': __version__,
+            'AUTHOR': __author__,
+            'HOMEPAGE': __homepage__,
+            'DEPENDENCIES': __dependencies__,
+        }
+    }
+
[email protected]
+def get_CONFIG():
+    from .config import READABILITY_CONFIG
+    
+    return {
+        'readability': READABILITY_CONFIG
+    }
+
[email protected]
+def get_BINARIES():
+    from .binaries import READABILITY_BINARY
+    
+    return {
+        'readability': READABILITY_BINARY,
+    }
+
[email protected]
+def get_EXTRACTORS():
+    from .extractors import READABILITY_EXTRACTOR
+    
+    return {
+        'readability': READABILITY_EXTRACTOR,
+    }

+ 0 - 86
archivebox/plugins_extractor/readability/apps.py

@@ -1,86 +0,0 @@
-__package__ = 'archivebox.plugins_extractor.readability'
-
-from pathlib import Path
-from typing import List
-# from typing_extensions import Self
-
-# Depends on other PyPI/vendor packages:
-from pydantic import InstanceOf, Field
-from pydantic_pkgr import BinProvider, BinaryOverrides, BinName
-
-# Depends on other Django apps:
-from abx.archivebox.base_plugin import BasePlugin
-from abx.archivebox.base_configset import BaseConfigSet
-from abx.archivebox.base_binary import BaseBinary, env
-from abx.archivebox.base_extractor import BaseExtractor
-from abx.archivebox.base_hook import BaseHook
-
-# Depends on Other Plugins:
-from archivebox.config.common import ARCHIVING_CONFIG
-from plugins_pkg.npm.apps import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER
-
-###################### Config ##########################
-
-class ReadabilityConfig(BaseConfigSet):
-    SAVE_READABILITY: bool = Field(default=True, alias='USE_READABILITY')
-
-    READABILITY_TIMEOUT: int                 = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
-
-    READABILITY_BINARY: str = Field(default='readability-extractor')
-    # READABILITY_EXTRA_ARGS: List[str] = []                                # readability-extractor doesn't take any extra args
-
-
-READABILITY_CONFIG = ReadabilityConfig()
-
-
-READABILITY_PACKAGE_NAME = 'github:ArchiveBox/readability-extractor'
-
-class ReadabilityBinary(BaseBinary):
-    name: BinName = READABILITY_CONFIG.READABILITY_BINARY
-    binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env]
-
-    overrides: BinaryOverrides = {
-        LIB_NPM_BINPROVIDER.name: {"packages": [READABILITY_PACKAGE_NAME]},
-        SYS_NPM_BINPROVIDER.name: {"packages": [READABILITY_PACKAGE_NAME], "install": lambda: None},    # prevent modifying system global npm packages
-    }
-
-
-
-
-READABILITY_BINARY = ReadabilityBinary()
-
-
-class ReadabilityExtractor(BaseExtractor):
-    name: str = 'readability'
-    binary: BinName = READABILITY_BINARY.name
-
-    def get_output_path(self, snapshot) -> Path:
-        return Path(snapshot.link_dir) / 'readability' / 'content.html'
-
-
-READABILITY_BINARY = ReadabilityBinary()
-READABILITY_EXTRACTOR = ReadabilityExtractor()
-
-# class ReadabilityQueue(BaseQueue):
-#     name: str = 'singlefile'
-    
-#     binaries: List[InstanceOf[BaseBinary]] = [READABILITY_BINARY]
-
-# READABILITY_QUEUE = ReadabilityQueue()
-
-class ReadabilityPlugin(BasePlugin):
-    app_label: str ='readability'
-    verbose_name: str = 'Readability'
-
-    hooks: List[InstanceOf[BaseHook]] = [
-        READABILITY_CONFIG,
-        READABILITY_BINARY,
-        READABILITY_EXTRACTOR,
-        # READABILITY_QUEUE,
-    ]
-
-
-
-PLUGIN = ReadabilityPlugin()
-# PLUGIN.register(settings)
-DJANGO_APP = PLUGIN.AppConfig

+ 27 - 0
archivebox/plugins_extractor/readability/binaries.py

@@ -0,0 +1,27 @@
+__package__ = 'plugins_extractor.readability'
+
+from typing import List
+
+from pydantic import InstanceOf
+from pydantic_pkgr import BinProvider, BinaryOverrides, BinName
+
+from abx.archivebox.base_binary import BaseBinary, env
+
+from plugins_pkg.npm.binproviders import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER
+
+from .config import READABILITY_CONFIG
+
+
+READABILITY_PACKAGE_NAME = 'github:ArchiveBox/readability-extractor'
+
+class ReadabilityBinary(BaseBinary):
+    name: BinName = READABILITY_CONFIG.READABILITY_BINARY
+    binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env]
+
+    overrides: BinaryOverrides = {
+        LIB_NPM_BINPROVIDER.name: {"packages": [READABILITY_PACKAGE_NAME]},
+        SYS_NPM_BINPROVIDER.name: {"packages": [READABILITY_PACKAGE_NAME], "install": lambda: None},    # prevent modifying system global npm packages
+    }
+
+
+READABILITY_BINARY = ReadabilityBinary()

+ 19 - 0
archivebox/plugins_extractor/readability/config.py

@@ -0,0 +1,19 @@
+__package__ = 'plugins_extractor.readability'
+
+from pydantic import Field
+
+from abx.archivebox.base_configset import BaseConfigSet
+
+from archivebox.config.common import ARCHIVING_CONFIG
+
+
+class ReadabilityConfig(BaseConfigSet):
+    SAVE_READABILITY: bool = Field(default=True, alias='USE_READABILITY')
+
+    READABILITY_TIMEOUT: int                 = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
+
+    READABILITY_BINARY: str = Field(default='readability-extractor')
+    # READABILITY_EXTRA_ARGS: List[str] = []                                # readability-extractor doesn't take any extra args
+
+
+READABILITY_CONFIG = ReadabilityConfig()

+ 20 - 0
archivebox/plugins_extractor/readability/extractors.py

@@ -0,0 +1,20 @@
+__package__ = 'plugins_extractor.readability'
+
+from pathlib import Path
+
+from pydantic_pkgr import BinName
+
+from abx.archivebox.base_extractor import BaseExtractor
+
+from .binaries import READABILITY_BINARY
+
+
+class ReadabilityExtractor(BaseExtractor):
+    name: str = 'readability'
+    binary: BinName = READABILITY_BINARY.name
+
+    def get_output_path(self, snapshot) -> Path:
+        return Path(snapshot.link_dir) / 'readability' / 'content.html'
+
+
+READABILITY_EXTRACTOR = ReadabilityExtractor()

+ 51 - 0
archivebox/plugins_extractor/singlefile/__init__.py

@@ -0,0 +1,51 @@
+__package__ = 'plugins_extractor.singlefile'
+__label__ = 'singlefile'
+__version__ = '2024.10.14'
+__author__ = 'Nick Sweeting'
+__homepage__ = 'https://github.com/gildas-lormeau/singlefile'
+__dependencies__ = ['npm']
+
+import abx
+
+
[email protected]
+def get_PLUGIN():
+    return {
+        'singlefile': {
+            'PACKAGE': __package__,
+            'LABEL': __label__,
+            'VERSION': __version__,
+            'AUTHOR': __author__,
+            'HOMEPAGE': __homepage__,
+            'DEPENDENCIES': __dependencies__,
+        }
+    }
+
[email protected]
+def get_CONFIG():
+    from .config import SINGLEFILE_CONFIG
+    
+    return {
+        'singlefile': SINGLEFILE_CONFIG
+    }
+
[email protected]
+def get_BINARIES():
+    from .binaries import SINGLEFILE_BINARY
+    
+    return {
+        'singlefile': SINGLEFILE_BINARY,
+    }
+
[email protected]
+def get_EXTRACTORS():
+    from .extractors import SINGLEFILE_EXTRACTOR
+    
+    return {
+        'singlefile': SINGLEFILE_EXTRACTOR,
+    }
+
+# @abx.hookimpl
+# def get_INSTALLED_APPS():
+#     # needed to load ./models.py
+#     return [__package__]

+ 0 - 110
archivebox/plugins_extractor/singlefile/apps.py

@@ -1,110 +0,0 @@
-__package__ = 'archivebox.plugins_extractor.singlefile'
-
-from pathlib import Path
-from typing import List, Optional
-# from typing_extensions import Self
-
-# Depends on other PyPI/vendor packages:
-from pydantic import InstanceOf, Field
-from pydantic_pkgr import BinProvider, BinaryOverrides, BinName, bin_abspath
-
-# Depends on other Django apps:
-from abx.archivebox.base_plugin import BasePlugin
-from abx.archivebox.base_configset import BaseConfigSet
-from abx.archivebox.base_binary import BaseBinary, env
-from abx.archivebox.base_extractor import BaseExtractor
-from abx.archivebox.base_queue import BaseQueue
-from abx.archivebox.base_hook import BaseHook
-
-# Depends on Other Plugins:
-from archivebox.config.common import ARCHIVING_CONFIG
-from plugins_pkg.npm.apps import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER
-
-###################### Config ##########################
-
-class SinglefileConfig(BaseConfigSet):
-    SAVE_SINGLEFILE: bool = True
-
-    SINGLEFILE_USER_AGENT: str              = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
-    SINGLEFILE_TIMEOUT: int                 = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
-    SINGLEFILE_CHECK_SSL_VALIDITY: bool     = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
-    SINGLEFILE_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE)
-
-    SINGLEFILE_BINARY: str = Field(default='single-file')
-    SINGLEFILE_EXTRA_ARGS: List[str] = []
-
-
-SINGLEFILE_CONFIG = SinglefileConfig()
-
-
-SINGLEFILE_MIN_VERSION = '1.1.54'
-SINGLEFILE_MAX_VERSION = '1.1.60'
-
-
-class SinglefileBinary(BaseBinary):
-    name: BinName = SINGLEFILE_CONFIG.SINGLEFILE_BINARY
-    binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env]
-
-    overrides: BinaryOverrides = {
-        LIB_NPM_BINPROVIDER.name: {
-            "abspath": lambda:
-                bin_abspath(SINGLEFILE_CONFIG.SINGLEFILE_BINARY, PATH=LIB_NPM_BINPROVIDER.PATH)
-                or bin_abspath("single-file", PATH=LIB_NPM_BINPROVIDER.PATH)
-                or bin_abspath("single-file-node.js", PATH=LIB_NPM_BINPROVIDER.PATH),
-            "packages": [f"single-file-cli@>={SINGLEFILE_MIN_VERSION} <{SINGLEFILE_MAX_VERSION}"],
-        },
-        SYS_NPM_BINPROVIDER.name: {
-            "abspath": lambda:
-                bin_abspath(SINGLEFILE_CONFIG.SINGLEFILE_BINARY, PATH=SYS_NPM_BINPROVIDER.PATH)
-                or bin_abspath("single-file", PATH=SYS_NPM_BINPROVIDER.PATH)
-                or bin_abspath("single-file-node.js", PATH=SYS_NPM_BINPROVIDER.PATH),
-            "packages": [f"single-file-cli@>={SINGLEFILE_MIN_VERSION} <{SINGLEFILE_MAX_VERSION}"],
-            "install": lambda: None,
-        },
-        env.name: {
-            'abspath': lambda:
-                bin_abspath(SINGLEFILE_CONFIG.SINGLEFILE_BINARY, PATH=env.PATH)
-                or bin_abspath('single-file', PATH=env.PATH)
-                or bin_abspath('single-file-node.js', PATH=env.PATH),
-        },
-    }
-
-
-SINGLEFILE_BINARY = SinglefileBinary()
-
-PLUGIN_BINARIES = [SINGLEFILE_BINARY]
-
-class SinglefileExtractor(BaseExtractor):
-    name: str = 'singlefile'
-    binary: BinName = SINGLEFILE_BINARY.name
-
-    def get_output_path(self, snapshot) -> Path:
-        return Path(snapshot.link_dir) / 'singlefile.html'
-
-
-SINGLEFILE_BINARY = SinglefileBinary()
-SINGLEFILE_EXTRACTOR = SinglefileExtractor()
-
-class SinglefileQueue(BaseQueue):
-    name: str = 'singlefile'
-    
-    binaries: List[InstanceOf[BaseBinary]] = [SINGLEFILE_BINARY]
-
-SINGLEFILE_QUEUE = SinglefileQueue()
-
-class SinglefilePlugin(BasePlugin):
-    app_label: str ='singlefile'
-    verbose_name: str = 'SingleFile'
-
-    hooks: List[InstanceOf[BaseHook]] = [
-        SINGLEFILE_CONFIG,
-        SINGLEFILE_BINARY,
-        SINGLEFILE_EXTRACTOR,
-        SINGLEFILE_QUEUE,
-    ]
-
-
-
-PLUGIN = SinglefilePlugin()
-# PLUGIN.register(settings)
-DJANGO_APP = PLUGIN.AppConfig

+ 48 - 0
archivebox/plugins_extractor/singlefile/binaries.py

@@ -0,0 +1,48 @@
+__package__ = 'plugins_extractor.singlefile'
+
+from typing import List
+
+from pydantic import InstanceOf
+from pydantic_pkgr import BinProvider, BinaryOverrides, BinName, bin_abspath
+
+from abx.archivebox.base_binary import BaseBinary, env
+
+from plugins_pkg.npm.binproviders import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER
+
+from .config import SINGLEFILE_CONFIG
+
+
+SINGLEFILE_MIN_VERSION = '1.1.54'
+SINGLEFILE_MAX_VERSION = '1.1.60'
+
+
+class SinglefileBinary(BaseBinary):
+    name: BinName = SINGLEFILE_CONFIG.SINGLEFILE_BINARY
+    binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env]
+
+    overrides: BinaryOverrides = {
+        LIB_NPM_BINPROVIDER.name: {
+            "abspath": lambda:
+                bin_abspath(SINGLEFILE_CONFIG.SINGLEFILE_BINARY, PATH=LIB_NPM_BINPROVIDER.PATH)
+                or bin_abspath("single-file", PATH=LIB_NPM_BINPROVIDER.PATH)
+                or bin_abspath("single-file-node.js", PATH=LIB_NPM_BINPROVIDER.PATH),
+            "packages": [f"single-file-cli@>={SINGLEFILE_MIN_VERSION} <{SINGLEFILE_MAX_VERSION}"],
+        },
+        SYS_NPM_BINPROVIDER.name: {
+            "abspath": lambda:
+                bin_abspath(SINGLEFILE_CONFIG.SINGLEFILE_BINARY, PATH=SYS_NPM_BINPROVIDER.PATH)
+                or bin_abspath("single-file", PATH=SYS_NPM_BINPROVIDER.PATH)
+                or bin_abspath("single-file-node.js", PATH=SYS_NPM_BINPROVIDER.PATH),
+            "packages": [f"single-file-cli@>={SINGLEFILE_MIN_VERSION} <{SINGLEFILE_MAX_VERSION}"],
+            "install": lambda: None,
+        },
+        env.name: {
+            'abspath': lambda:
+                bin_abspath(SINGLEFILE_CONFIG.SINGLEFILE_BINARY, PATH=env.PATH)
+                or bin_abspath('single-file', PATH=env.PATH)
+                or bin_abspath('single-file-node.js', PATH=env.PATH),
+        },
+    }
+
+
+SINGLEFILE_BINARY = SinglefileBinary()

+ 25 - 0
archivebox/plugins_extractor/singlefile/config.py

@@ -0,0 +1,25 @@
+__package__ = 'plugins_extractor.singlefile'
+
+from pathlib import Path
+from typing import List, Optional
+
+from pydantic import Field
+
+from abx.archivebox.base_configset import BaseConfigSet
+
+from archivebox.config.common import ARCHIVING_CONFIG
+
+
+class SinglefileConfig(BaseConfigSet):
+    SAVE_SINGLEFILE: bool = True
+
+    SINGLEFILE_USER_AGENT: str              = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
+    SINGLEFILE_TIMEOUT: int                 = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
+    SINGLEFILE_CHECK_SSL_VALIDITY: bool     = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
+    SINGLEFILE_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE)
+
+    SINGLEFILE_BINARY: str = Field(default='single-file')
+    SINGLEFILE_EXTRA_ARGS: List[str] = []
+
+
+SINGLEFILE_CONFIG = SinglefileConfig()

+ 19 - 0
archivebox/plugins_extractor/singlefile/extractors.py

@@ -0,0 +1,19 @@
+__package__ = 'plugins_extractor.singlefile'
+
+from pathlib import Path
+
+from pydantic_pkgr import BinName
+from abx.archivebox.base_extractor import BaseExtractor
+
+from .binaries import SINGLEFILE_BINARY
+
+
+class SinglefileExtractor(BaseExtractor):
+    name: str = 'singlefile'
+    binary: BinName = SINGLEFILE_BINARY.name
+
+    def get_output_path(self, snapshot) -> Path:
+        return Path(snapshot.link_dir) / 'singlefile.html'
+
+
+SINGLEFILE_EXTRACTOR = SinglefileExtractor()

+ 0 - 26
archivebox/plugins_extractor/singlefile/migrations/0001_initial.py

@@ -1,26 +0,0 @@
-# Generated by Django 5.1.1 on 2024-09-10 05:05
-
-from django.db import migrations
-
-
-class Migration(migrations.Migration):
-
-    initial = True
-
-    dependencies = [
-        ('core', '0074_alter_snapshot_downloaded_at'),
-    ]
-
-    operations = [
-        migrations.CreateModel(
-            name='SinglefileResult',
-            fields=[
-            ],
-            options={
-                'proxy': True,
-                'indexes': [],
-                'constraints': [],
-            },
-            bases=('core.archiveresult',),
-        ),
-    ]

+ 0 - 0
archivebox/plugins_extractor/singlefile/migrations/__init__.py


+ 0 - 40
archivebox/plugins_extractor/singlefile/tasks.py

@@ -1,40 +0,0 @@
-__package__ = 'archivebox.queues'
-
-import time
-
-from django.core.cache import cache
-
-from huey import crontab
-from django_huey import db_task, on_startup, db_periodic_task
-from huey_monitor.models import TaskModel
-from huey_monitor.tqdm import ProcessInfo
-
-@db_task(queue="singlefile", context=True)
-def extract(url, out_dir, config, task=None, parent_task_id=None):
-    if task and parent_task_id:
-        TaskModel.objects.set_parent_task(main_task_id=parent_task_id, sub_task_id=task.id)
-
-    process_info = ProcessInfo(task, desc="extract_singlefile", parent_task_id=parent_task_id, total=1)
-
-    time.sleep(5)
-
-    process_info.update(n=1)
-    return {'output': 'singlefile.html', 'status': 'succeeded'}
-
-
-# @on_startup(queue='singlefile')
-# def start_singlefile_queue():
-#     print("[+] Starting singlefile worker...")
-#     update_version.call_local()
-
-
-# @db_periodic_task(crontab(minute='*/5'), queue='singlefile')
-# def update_version():
-#     print('[*] Updating singlefile version... 5 minute interval')
-#     from django.conf import settings
-    
-#     bin = settings.BINARIES.SinglefileBinary.load()
-#     if bin.version:
-#         cache.set(f"bin:abspath:{bin.name}", bin.abspath)
-#         cache.set(f"bin:version:{bin.name}:{bin.abspath}", bin.version)
-#         print('[√] Updated singlefile version:', bin.version, bin.abspath)

+ 47 - 0
archivebox/plugins_extractor/wget/__init__.py

@@ -0,0 +1,47 @@
+__package__ = 'plugins_extractor.wget'
+__label__ = 'wget'
+__version__ = '2024.10.14'
+__author__ = 'Nick Sweeting'
+__homepage__ = 'https://github.com/ArchiveBox/ArchiveBox/tree/main/archivebox/plugins_extractor/wget'
+__dependencies__ = []
+
+import abx
+
+
[email protected]
+def get_PLUGIN():
+    return {
+        'wget': {
+            'PACKAGE': __package__,
+            'LABEL': __label__,
+            'VERSION': __version__,
+            'AUTHOR': __author__,
+            'HOMEPAGE': __homepage__,
+            'DEPENDENCIES': __dependencies__,
+        }
+    }
+
[email protected]
+def get_CONFIG():
+    from .config import WGET_CONFIG
+    
+    return {
+        'wget': WGET_CONFIG
+    }
+
[email protected]
+def get_BINARIES():
+    from .binaries import WGET_BINARY
+    
+    return {
+        'wget': WGET_BINARY,
+    }
+
[email protected]
+def get_EXTRACTORS():
+    from .extractors import WGET_EXTRACTOR, WARC_EXTRACTOR
+    
+    return {
+        'wget': WGET_EXTRACTOR,
+        'warc': WARC_EXTRACTOR,
+    }

+ 0 - 127
archivebox/plugins_extractor/wget/apps.py

@@ -1,127 +0,0 @@
-__package__ = 'plugins_extractor.wget'
-
-import sys
-from typing import List, Optional
-from pathlib import Path
-from subprocess import run, DEVNULL
-
-from rich import print
-from pydantic import InstanceOf, Field, model_validator
-from pydantic_pkgr import BinProvider, BinName
-
-from abx.archivebox.base_plugin import BasePlugin, BaseHook
-from abx.archivebox.base_configset import BaseConfigSet
-from abx.archivebox.base_binary import BaseBinary, env, apt, brew
-from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
-
-from archivebox.config.common import ARCHIVING_CONFIG, STORAGE_CONFIG
-from .wget_util import wget_output_path
-
-
-class WgetConfig(BaseConfigSet):
-
-    SAVE_WGET: bool = True
-    SAVE_WARC: bool = True
-    
-    USE_WGET: bool = Field(default=lambda c: c.SAVE_WGET or c.SAVE_WARC)
-    
-    WGET_BINARY: str = Field(default='wget')
-    WGET_ARGS: List[str] = [
-        '--no-verbose',
-        '--adjust-extension',
-        '--convert-links',
-        '--force-directories',
-        '--backup-converted',
-        '--span-hosts',
-        '--no-parent',
-        '-e', 'robots=off',
-    ]
-    WGET_EXTRA_ARGS: List[str] = []
-    
-    SAVE_WGET_REQUISITES: bool = Field(default=True)
-    WGET_RESTRICT_FILE_NAMES: str = Field(default=lambda: STORAGE_CONFIG.RESTRICT_FILE_NAMES)
-    
-    WGET_TIMEOUT: int =  Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
-    WGET_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
-    WGET_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
-    WGET_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE)
-    
-    @model_validator(mode='after')
-    def validate_use_ytdlp(self):
-        if self.USE_WGET and self.WGET_TIMEOUT < 10:
-            print(f'[red][!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={self.WGET_TIMEOUT} seconds)[/red]', file=sys.stderr)
-            print('    wget will fail to archive any sites if set to less than ~20 seconds.', file=sys.stderr)
-            print('    (Setting it somewhere over 60 seconds is recommended)', file=sys.stderr)
-            print(file=sys.stderr)
-            print('    If you want to disable media archiving entirely, set SAVE_MEDIA=False instead:', file=sys.stderr)
-            print('        https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#save_media', file=sys.stderr)
-            print(file=sys.stderr)
-        return self
-    
-    @property
-    def WGET_AUTO_COMPRESSION(self) -> bool:
-        if hasattr(self, '_WGET_AUTO_COMPRESSION'):
-            return self._WGET_AUTO_COMPRESSION
-        try:
-            cmd = [
-                self.WGET_BINARY,
-                "--compression=auto",
-                "--help",
-            ]
-            self._WGET_AUTO_COMPRESSION = not run(cmd, stdout=DEVNULL, stderr=DEVNULL, timeout=3).returncode
-            return self._WGET_AUTO_COMPRESSION
-        except (FileNotFoundError, OSError):
-            self._WGET_AUTO_COMPRESSION = False
-            return False
-
-WGET_CONFIG = WgetConfig()
-
-
-class WgetBinary(BaseBinary):
-    name: BinName = WGET_CONFIG.WGET_BINARY
-    binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
-
-WGET_BINARY = WgetBinary()
-
-
-class WgetExtractor(BaseExtractor):
-    name: ExtractorName = 'wget'
-    binary: BinName = WGET_BINARY.name
-
-    def get_output_path(self, snapshot) -> Path | None:
-        wget_index_path = wget_output_path(snapshot.as_link())
-        if wget_index_path:
-            return Path(wget_index_path)
-        return None
-
-WGET_EXTRACTOR = WgetExtractor()
-
-
-class WarcExtractor(BaseExtractor):
-    name: ExtractorName = 'warc'
-    binary: BinName = WGET_BINARY.name
-
-    def get_output_path(self, snapshot) -> Path | None:
-        warc_files = list((Path(snapshot.link_dir) / 'warc').glob('*.warc.gz'))
-        if warc_files:
-            return sorted(warc_files, key=lambda x: x.stat().st_size, reverse=True)[0]
-        return None
-
-
-WARC_EXTRACTOR = WarcExtractor()
-
-
-class WgetPlugin(BasePlugin):
-    app_label: str = 'wget'
-    verbose_name: str = 'WGET'
-    
-    hooks: List[InstanceOf[BaseHook]] = [
-        WGET_CONFIG,
-        WGET_BINARY,
-        WGET_EXTRACTOR,
-        WARC_EXTRACTOR,
-    ]
-
-
-PLUGIN = WgetPlugin()
-DJANGO_APP = PLUGIN.AppConfig

+ 18 - 0
archivebox/plugins_extractor/wget/binaries.py

@@ -0,0 +1,18 @@
+__package__ = 'plugins_extractor.wget'
+
+from typing import List
+
+
+from pydantic import InstanceOf
+from pydantic_pkgr import BinProvider, BinName
+
+from abx.archivebox.base_binary import BaseBinary, env, apt, brew
+
+from .config import WGET_CONFIG
+
+
+class WgetBinary(BaseBinary):
+    name: BinName = WGET_CONFIG.WGET_BINARY
+    binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
+
+WGET_BINARY = WgetBinary()

+ 72 - 0
archivebox/plugins_extractor/wget/config.py

@@ -0,0 +1,72 @@
+__package__ = 'plugins_extractor.wget'
+
+import subprocess
+from typing import List, Optional
+from pathlib import Path
+
+from pydantic import Field, model_validator
+
+from abx.archivebox.base_configset import BaseConfigSet
+
+from archivebox.config.common import ARCHIVING_CONFIG, STORAGE_CONFIG
+from archivebox.misc.logging import STDERR
+
+
+class WgetConfig(BaseConfigSet):
+
+    SAVE_WGET: bool = True
+    SAVE_WARC: bool = True
+    
+    USE_WGET: bool = Field(default=lambda c: c.SAVE_WGET or c.SAVE_WARC)
+    
+    WGET_BINARY: str = Field(default='wget')
+    WGET_ARGS: List[str] = [
+        '--no-verbose',
+        '--adjust-extension',
+        '--convert-links',
+        '--force-directories',
+        '--backup-converted',
+        '--span-hosts',
+        '--no-parent',
+        '-e', 'robots=off',
+    ]
+    WGET_EXTRA_ARGS: List[str] = []
+    
+    SAVE_WGET_REQUISITES: bool = Field(default=True)
+    WGET_RESTRICT_FILE_NAMES: str = Field(default=lambda: STORAGE_CONFIG.RESTRICT_FILE_NAMES)
+    
+    WGET_TIMEOUT: int =  Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
+    WGET_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
+    WGET_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
+    WGET_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE)
+    
+    @model_validator(mode='after')
+    def validate_use_ytdlp(self):
+        if self.USE_WGET and self.WGET_TIMEOUT < 10:
+            STDERR.print(f'[red][!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={self.WGET_TIMEOUT} seconds)[/red]')
+            STDERR.print('    wget will fail to archive any sites if set to less than ~20 seconds.')
+            STDERR.print('    (Setting it somewhere over 60 seconds is recommended)')
+            STDERR.print()
+            STDERR.print('    If you want to disable media archiving entirely, set SAVE_MEDIA=False instead:')
+            STDERR.print('        https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#save_media')
+            STDERR.print()
+        return self
+
+    @property
+    def WGET_AUTO_COMPRESSION(self) -> bool:
+        if hasattr(self, '_WGET_AUTO_COMPRESSION'):
+            return self._WGET_AUTO_COMPRESSION
+        try:
+            cmd = [
+                self.WGET_BINARY,
+                "--compression=auto",
+                "--help",
+            ]
+            self._WGET_AUTO_COMPRESSION = not subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, timeout=3).returncode
+            return self._WGET_AUTO_COMPRESSION
+        except (FileNotFoundError, OSError):
+            self._WGET_AUTO_COMPRESSION = False
+            return False
+
+WGET_CONFIG = WgetConfig()
+

+ 37 - 0
archivebox/plugins_extractor/wget/extractors.py

@@ -0,0 +1,37 @@
+__package__ = 'plugins_extractor.wget'
+
+from pathlib import Path
+
+from pydantic_pkgr import BinName
+
+from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
+
+from .binaries import WGET_BINARY
+from .wget_util import wget_output_path
+
+class WgetExtractor(BaseExtractor):
+    name: ExtractorName = 'wget'
+    binary: BinName = WGET_BINARY.name
+
+    def get_output_path(self, snapshot) -> Path | None:
+        wget_index_path = wget_output_path(snapshot.as_link())
+        if wget_index_path:
+            return Path(wget_index_path)
+        return None
+
+WGET_EXTRACTOR = WgetExtractor()
+
+
+class WarcExtractor(BaseExtractor):
+    name: ExtractorName = 'warc'
+    binary: BinName = WGET_BINARY.name
+
+    def get_output_path(self, snapshot) -> Path | None:
+        warc_files = list((Path(snapshot.link_dir) / 'warc').glob('*.warc.gz'))
+        if warc_files:
+            return sorted(warc_files, key=lambda x: x.stat().st_size, reverse=True)[0]
+        return None
+
+
+WARC_EXTRACTOR = WarcExtractor()
+

+ 37 - 0
archivebox/plugins_extractor/ytdlp/__init__.py

@@ -0,0 +1,37 @@
+__package__ = 'plugins_extractor.ytdlp'
+__label__ = 'YT-DLP'
+__version__ = '2024.10.14'
+__author__ = 'Nick Sweeting'
+__homepage__ = 'https://github.com/yt-dlp/yt-dlp'
+
+import abx
+
+
[email protected]
+def get_PLUGIN():
+    return {
+        'ytdlp': {
+            'PACKAGE': __package__,
+            'LABEL': __label__,
+            'VERSION': __version__,
+            'AUTHOR': __author__,
+            'HOMEPAGE': __homepage__,
+        }
+    }
+
[email protected]
+def get_CONFIG():
+    from .config import YTDLP_CONFIG
+    
+    return {
+        'ytdlp': YTDLP_CONFIG
+    }
+
[email protected]
+def get_BINARIES():
+    from .binaries import YTDLP_BINARY, FFMPEG_BINARY
+    
+    return {
+        'ytdlp': YTDLP_BINARY,
+        'ffmpeg': FFMPEG_BINARY,
+    }

+ 0 - 98
archivebox/plugins_extractor/ytdlp/apps.py

@@ -1,98 +0,0 @@
-import sys
-from typing import List
-from subprocess import run, PIPE
-
-from rich import print
-from pydantic import InstanceOf, Field, model_validator, AliasChoices
-from pydantic_pkgr import BinProvider, BinName, BinaryOverrides
-
-from abx.archivebox.base_plugin import BasePlugin
-from abx.archivebox.base_configset import BaseConfigSet
-from abx.archivebox.base_binary import BaseBinary, env, apt, brew
-from abx.archivebox.base_hook import BaseHook
-
-from archivebox.config.common import ARCHIVING_CONFIG
-from plugins_pkg.pip.apps import pip
-
-###################### Config ##########################
-
-
-class YtdlpConfig(BaseConfigSet):
-    USE_YTDLP: bool               = Field(default=True, validation_alias=AliasChoices('USE_YOUTUBEDL', 'SAVE_MEDIA'))
-
-    YTDLP_BINARY: str             = Field(default='yt-dlp', alias='YOUTUBEDL_BINARY')
-    YTDLP_EXTRA_ARGS: List[str]   = Field(default=[], alias='YOUTUBEDL_EXTRA_ARGS')
-    
-    YTDLP_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
-    YTDLP_TIMEOUT: int             = Field(default=lambda: ARCHIVING_CONFIG.MEDIA_TIMEOUT)
-    
-    @model_validator(mode='after')
-    def validate_use_ytdlp(self):
-        if self.USE_YTDLP and self.YTDLP_TIMEOUT < 20:
-            print(f'[red][!] Warning: MEDIA_TIMEOUT is set too low! (currently set to MEDIA_TIMEOUT={self.YTDLP_TIMEOUT} seconds)[/red]', file=sys.stderr)
-            print('    youtube-dl/yt-dlp will fail to archive any media if set to less than ~20 seconds.', file=sys.stderr)
-            print('    (Setting it somewhere over 60 seconds is recommended)', file=sys.stderr)
-            print(file=sys.stderr)
-            print('    If you want to disable media archiving entirely, set SAVE_MEDIA=False instead:', file=sys.stderr)
-            print('        https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#save_media', file=sys.stderr)
-            print(file=sys.stderr)
-        return self
-
-
-YTDLP_CONFIG = YtdlpConfig()
-
-
-
-class YtdlpBinary(BaseBinary):
-    name: BinName = YTDLP_CONFIG.YTDLP_BINARY
-    binproviders_supported: List[InstanceOf[BinProvider]] = [pip, apt, brew, env]
-
-YTDLP_BINARY = YtdlpBinary()
-
-
-class FfmpegBinary(BaseBinary):
-    name: BinName = 'ffmpeg'
-    binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
-
-    overrides: BinaryOverrides = {
-        'env': {
-            # 'abspath': lambda: shutil.which('ffmpeg', PATH=env.PATH),
-            'version': lambda: run(['ffmpeg', '-version'], stdout=PIPE, stderr=PIPE, text=True).stdout,
-        },
-        'apt': {
-            # 'abspath': lambda: shutil.which('ffmpeg', PATH=apt.PATH),
-            'version': lambda: run(['apt', 'show', 'ffmpeg'], stdout=PIPE, stderr=PIPE, text=True).stdout,
-        },
-        'brew': {
-            # 'abspath': lambda: shutil.which('ffmpeg', PATH=brew.PATH),
-            'version': lambda: run(['brew', 'info', 'ffmpeg', '--quiet'], stdout=PIPE, stderr=PIPE, text=True).stdout,
-        },
-    }
-
-    # def get_ffmpeg_version(self) -> Optional[str]:
-    #     return self.exec(cmd=['-version']).stdout
-
-FFMPEG_BINARY = FfmpegBinary()
-
-
-# class YtdlpExtractor(BaseExtractor):
-#     name: str = 'ytdlp'
-#     binary: str = 'ytdlp'
-
-
-
-class YtdlpPlugin(BasePlugin):
-    app_label: str = 'ytdlp'
-    verbose_name: str = 'YT-DLP'
-    docs_url: str = 'https://github.com/yt-dlp/yt-dlp'
-
-    hooks: List[InstanceOf[BaseHook]] = [
-        YTDLP_CONFIG,
-        YTDLP_BINARY,
-        FFMPEG_BINARY,
-    ]
-
-
-PLUGIN = YtdlpPlugin()
-# PLUGIN.register(settings)
-DJANGO_APP = PLUGIN.AppConfig

+ 42 - 0
archivebox/plugins_extractor/ytdlp/binaries.py

@@ -0,0 +1,42 @@
+__package__ = 'plugins_extractor.ytdlp'
+
+import subprocess
+from typing import List
+
+from pydantic import InstanceOf
+from pydantic_pkgr import BinProvider, BinName, BinaryOverrides
+
+from abx.archivebox.base_binary import BaseBinary, env, apt, brew
+
+from plugins_pkg.pip.binproviders import LIB_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, SYS_PIP_BINPROVIDER
+
+from .config import YTDLP_CONFIG
+
+
+class YtdlpBinary(BaseBinary):
+    name: BinName = YTDLP_CONFIG.YTDLP_BINARY
+    binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, SYS_PIP_BINPROVIDER, apt, brew, env]
+
+YTDLP_BINARY = YtdlpBinary()
+
+
+class FfmpegBinary(BaseBinary):
+    name: BinName = 'ffmpeg'
+    binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
+
+    overrides: BinaryOverrides = {
+        'env': {
+            # 'abspath': lambda: shutil.which('ffmpeg', PATH=env.PATH),
+            'version': lambda: subprocess.run(['ffmpeg', '-version'], capture_output=True, text=True).stdout,
+        },
+        'apt': {
+            # 'abspath': lambda: shutil.which('ffmpeg', PATH=apt.PATH),
+            'version': lambda: subprocess.run(['apt', 'show', 'ffmpeg'], capture_output=True, text=True).stdout,
+        },
+        'brew': {
+            # 'abspath': lambda: shutil.which('ffmpeg', PATH=brew.PATH),
+            'version': lambda: subprocess.run(['brew', 'info', 'ffmpeg', '--quiet'], capture_output=True, text=True).stdout,
+        },
+    }
+
+FFMPEG_BINARY = FfmpegBinary()

+ 35 - 0
archivebox/plugins_extractor/ytdlp/config.py

@@ -0,0 +1,35 @@
+__package__ = 'plugins_extractor.ytdlp'
+
+from typing import List
+
+from pydantic import Field, model_validator, AliasChoices
+
+from abx.archivebox.base_configset import BaseConfigSet
+
+from archivebox.config.common import ARCHIVING_CONFIG
+from archivebox.misc.logging import STDERR
+
+
+class YtdlpConfig(BaseConfigSet):
+    USE_YTDLP: bool                = Field(default=True, validation_alias=AliasChoices('USE_YOUTUBEDL', 'SAVE_MEDIA'))
+
+    YTDLP_BINARY: str              = Field(default='yt-dlp', alias='YOUTUBEDL_BINARY')
+    YTDLP_EXTRA_ARGS: List[str]    = Field(default=[], alias='YOUTUBEDL_EXTRA_ARGS')
+    
+    YTDLP_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
+    YTDLP_TIMEOUT: int             = Field(default=lambda: ARCHIVING_CONFIG.MEDIA_TIMEOUT)
+    
+    @model_validator(mode='after')
+    def validate_use_ytdlp(self):
+        if self.USE_YTDLP and self.YTDLP_TIMEOUT < 20:
+            STDERR.print(f'[red][!] Warning: MEDIA_TIMEOUT is set too low! (currently set to MEDIA_TIMEOUT={self.YTDLP_TIMEOUT} seconds)[/red]')
+            STDERR.print('    youtube-dl/yt-dlp will fail to archive any media if set to less than ~20 seconds.')
+            STDERR.print('    (Setting it somewhere over 60 seconds is recommended)')
+            STDERR.print()
+            STDERR.print('    If you want to disable media archiving entirely, set SAVE_MEDIA=False instead:')
+            STDERR.print('        https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#save_media')
+            STDERR.print()
+        return self
+
+
+YTDLP_CONFIG = YtdlpConfig()

+ 47 - 0
archivebox/plugins_pkg/npm/__init__.py

@@ -0,0 +1,47 @@
+__package__ = 'plugins_pkg.npm'
+__label__ = 'npm'
+__version__ = '2024.10.14'
+__author__ = 'Nick Sweeting'
+__homepage__ = 'https://www.npmjs.com/'
+
+import abx
+
+
[email protected]
+def get_PLUGIN():
+    return {
+        'npm': {
+            'PACKAGE': __package__,
+            'LABEL': __label__,
+            'VERSION': __version__,
+            'AUTHOR': __author__,
+            'HOMEPAGE': __homepage__,
+        }
+    }
+
[email protected]
+def get_CONFIG():
+    from .config import NPM_CONFIG
+    
+    return {
+        'npm': NPM_CONFIG,
+    }
+
[email protected]
+def get_BINARIES():
+    from .binaries import NODE_BINARY, NPM_BINARY, NPX_BINARY
+    
+    return {
+        'node': NODE_BINARY,
+        'npm': NPM_BINARY,
+        'npx': NPX_BINARY,
+    }
+
[email protected]
+def get_BINPROVIDERS():
+    from .binproviders import LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER
+    
+    return {
+        'lib_npm': LIB_NPM_BINPROVIDER,
+        'sys_npm': SYS_NPM_BINPROVIDER,
+    }

+ 0 - 114
archivebox/plugins_pkg/npm/apps.py

@@ -1,114 +0,0 @@
-__package__ = 'archivebox.plugins_pkg.npm'
-
-from pathlib import Path
-from typing import List, Optional
-
-from pydantic import InstanceOf, model_validator
-
-from pydantic_pkgr import BinProvider, NpmProvider, BinName, PATHStr, BinProviderName, BinaryOverrides
-
-from archivebox.config import DATA_DIR, CONSTANTS
-
-from abx.archivebox.base_plugin import BasePlugin
-from abx.archivebox.base_configset import BaseConfigSet
-from abx.archivebox.base_binary import BaseBinary, BaseBinProvider, env, apt, brew
-from abx.archivebox.base_hook import BaseHook
-
-
-###################### Config ##########################
-
-
-class NpmDependencyConfigs(BaseConfigSet):
-    # USE_NPM: bool = True
-    # NPM_BINARY: str = Field(default='npm')
-    # NPM_ARGS: Optional[List[str]] = Field(default=None)
-    # NPM_EXTRA_ARGS: List[str] = []
-    # NPM_DEFAULT_ARGS: List[str] = []
-    pass
-
-
-DEFAULT_GLOBAL_CONFIG = {
-}
-NPM_CONFIG = NpmDependencyConfigs(**DEFAULT_GLOBAL_CONFIG)
-
-
-OLD_NODE_BIN_PATH = DATA_DIR / 'node_modules' / '.bin'
-NEW_NODE_BIN_PATH = CONSTANTS.LIB_NPM_DIR / 'node_modules' / '.bin'
-
-class SystemNpmBinProvider(NpmProvider, BaseBinProvider):
-    name: BinProviderName = "sys_npm"
-    
-    npm_prefix: Optional[Path] = None
-
-class LibNpmBinProvider(NpmProvider, BaseBinProvider):
-    name: BinProviderName = "lib_npm"
-    PATH: PATHStr = f'{NEW_NODE_BIN_PATH}:{OLD_NODE_BIN_PATH}'
-    
-    npm_prefix: Optional[Path] = CONSTANTS.LIB_NPM_DIR
-    
-    @model_validator(mode='after')
-    def validate_path(self):
-        assert self.npm_prefix == NEW_NODE_BIN_PATH.parent.parent
-        return self
-
-
-SYS_NPM_BINPROVIDER = SystemNpmBinProvider()
-LIB_NPM_BINPROVIDER = LibNpmBinProvider()
-npm = LIB_NPM_BINPROVIDER
-
-class NodeBinary(BaseBinary):
-    name: BinName = 'node'
-    binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
-    
-    overrides: BinaryOverrides = {
-        apt.name: {'packages': ['nodejs']},
-    }
-
-
-NODE_BINARY = NodeBinary()
-
-
-class NpmBinary(BaseBinary):
-    name: BinName = 'npm'
-    binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
-
-    overrides: BinaryOverrides = {
-        apt.name: {'packages': ['npm']},   # already installed when nodejs is installed
-        brew.name: {'install': lambda: None},  # already installed when nodejs is installed
-    }
-    
-NPM_BINARY = NpmBinary()
-
-
-class NpxBinary(BaseBinary):
-    name: BinName = 'npx'
-    binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
-    
-    overrides: BinaryOverrides = {
-        apt.name: {'install': lambda: None},   # already installed when nodejs is installed
-        brew.name: {'install': lambda: None},  # already installed when nodejs is installed
-    }
-
-NPX_BINARY = NpxBinary()
-
-
-
-
-
-class NpmPlugin(BasePlugin):
-    app_label: str = 'npm'
-    verbose_name: str = 'NPM'
-    
-    hooks: List[InstanceOf[BaseHook]] = [
-        NPM_CONFIG,
-        SYS_NPM_BINPROVIDER,
-        LIB_NPM_BINPROVIDER,
-        NODE_BINARY,
-        NPM_BINARY,
-        NPX_BINARY,
-    ]
-
-
-PLUGIN = NpmPlugin()
-# PLUGIN.register(settings)
-DJANGO_APP = PLUGIN.AppConfig

+ 48 - 0
archivebox/plugins_pkg/npm/binaries.py

@@ -0,0 +1,48 @@
+__package__ = 'plugins_pkg.npm'
+
+
+from typing import List
+
+from pydantic import InstanceOf
+
+from pydantic_pkgr import BinProvider, BinName, BinaryOverrides
+
+
+from abx.archivebox.base_binary import BaseBinary, env, apt, brew
+
+
+class NodeBinary(BaseBinary):
+    name: BinName = 'node'
+    binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
+    
+    overrides: BinaryOverrides = {
+        apt.name: {'packages': ['nodejs']},
+    }
+
+
+NODE_BINARY = NodeBinary()
+
+
+class NpmBinary(BaseBinary):
+    name: BinName = 'npm'
+    binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
+
+    overrides: BinaryOverrides = {
+        apt.name: {'packages': ['npm']},   # already installed when nodejs is installed
+        brew.name: {'install': lambda: None},  # already installed when nodejs is installed
+    }
+    
+NPM_BINARY = NpmBinary()
+
+
+class NpxBinary(BaseBinary):
+    name: BinName = 'npx'
+    binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
+    
+    overrides: BinaryOverrides = {
+        apt.name: {'install': lambda: None},   # already installed when nodejs is installed
+        brew.name: {'install': lambda: None},  # already installed when nodejs is installed
+    }
+
+NPX_BINARY = NpxBinary()
+

+ 40 - 0
archivebox/plugins_pkg/npm/binproviders.py

@@ -0,0 +1,40 @@
+__package__ = 'plugins_pkg.npm'
+
+from pathlib import Path
+from typing import Optional
+
+from pydantic import model_validator
+
+from pydantic_pkgr import NpmProvider, PATHStr, BinProviderName
+
+from archivebox.config import DATA_DIR, CONSTANTS
+
+from abx.archivebox.base_binary import BaseBinProvider
+
+
+
+OLD_NODE_BIN_PATH = DATA_DIR / 'node_modules' / '.bin'
+NEW_NODE_BIN_PATH = CONSTANTS.LIB_NPM_DIR / 'node_modules' / '.bin'
+
+
+class SystemNpmBinProvider(NpmProvider, BaseBinProvider):
+    name: BinProviderName = "sys_npm"
+    
+    npm_prefix: Optional[Path] = None
+
+
+class LibNpmBinProvider(NpmProvider, BaseBinProvider):
+    name: BinProviderName = "lib_npm"
+    PATH: PATHStr = f'{NEW_NODE_BIN_PATH}:{OLD_NODE_BIN_PATH}'
+    
+    npm_prefix: Optional[Path] = CONSTANTS.LIB_NPM_DIR
+    
+    @model_validator(mode='after')
+    def validate_path(self):
+        assert self.npm_prefix == NEW_NODE_BIN_PATH.parent.parent
+        return self
+
+
+SYS_NPM_BINPROVIDER = SystemNpmBinProvider()
+LIB_NPM_BINPROVIDER = LibNpmBinProvider()
+npm = LIB_NPM_BINPROVIDER

+ 20 - 0
archivebox/plugins_pkg/npm/config.py

@@ -0,0 +1,20 @@
+__package__ = 'plugins_pkg.npm'
+
+
+from abx.archivebox.base_configset import BaseConfigSet
+
+
+###################### Config ##########################
+
+
+class NpmDependencyConfigs(BaseConfigSet):
+    # USE_NPM: bool = True
+    # NPM_BINARY: str = Field(default='npm')
+    # NPM_ARGS: Optional[List[str]] = Field(default=None)
+    # NPM_EXTRA_ARGS: List[str] = []
+    # NPM_DEFAULT_ARGS: List[str] = []
+    pass
+
+
+NPM_CONFIG = NpmDependencyConfigs()
+

+ 51 - 0
archivebox/plugins_pkg/pip/__init__.py

@@ -0,0 +1,51 @@
+__package__ = 'plugins_pkg.pip'
+__label__ = 'pip'
+__version__ = '2024.10.14'
+__author__ = 'Nick Sweeting'
+__homepage__ = 'https://github.com/pypa/pip'
+
+import abx
+
+
[email protected]
+def get_PLUGIN():
+    return {
+        'pip': {
+            'PACKAGE': __package__,
+            'LABEL': __label__,
+            'VERSION': __version__,
+            'AUTHOR': __author__,
+            'HOMEPAGE': __homepage__,
+        }
+    }
+
[email protected]
+def get_CONFIG():
+    from .config import PIP_CONFIG
+    
+    return {
+        'pip': PIP_CONFIG
+    }
+
[email protected]
+def get_BINARIES():
+    from .binaries import ARCHIVEBOX_BINARY, PYTHON_BINARY, DJANGO_BINARY, SQLITE_BINARY, PIP_BINARY, PIPX_BINARY
+    
+    return {
+        'archivebox': ARCHIVEBOX_BINARY,
+        'python': PYTHON_BINARY,
+        'django': DJANGO_BINARY,
+        'sqlite': SQLITE_BINARY,
+        'pip': PIP_BINARY,
+        'pipx': PIPX_BINARY,
+    }
+
[email protected]
+def get_BINPROVIDERS():
+    from .binproviders import SYS_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, LIB_PIP_BINPROVIDER
+    
+    return {
+        'sys_pip': SYS_PIP_BINPROVIDER,
+        'venv_pip': VENV_PIP_BINPROVIDER,
+        'lib_pip': LIB_PIP_BINPROVIDER,
+    }

+ 7 - 109
archivebox/plugins_pkg/pip/apps.py → archivebox/plugins_pkg/pip/binaries.py

@@ -1,105 +1,27 @@
-__package__ = 'archivebox.plugins_pkg.pip'
+__package__ = 'plugins_pkg.pip'
 
 
-import os
 import sys
 import sys
-import site
 from pathlib import Path
 from pathlib import Path
-from typing import List, Optional
-from pydantic import InstanceOf, Field, model_validator, validate_call
+from typing import List
+from pydantic import InstanceOf, Field, model_validator
 
 
 
 
 import django
 import django
 import django.db.backends.sqlite3.base
 import django.db.backends.sqlite3.base
 from django.db.backends.sqlite3.base import Database as django_sqlite3     # type: ignore[import-type]
 from django.db.backends.sqlite3.base import Database as django_sqlite3     # type: ignore[import-type]
-from pydantic_pkgr import BinProvider, PipProvider, BinName, BinProviderName, BinaryOverrides, SemVer
+from pydantic_pkgr import BinProvider, BinName, BinaryOverrides, SemVer
 
 
-from archivebox.config import CONSTANTS, VERSION
+from archivebox import VERSION
 
 
-from abx.archivebox.base_plugin import BasePlugin
-from abx.archivebox.base_configset import BaseConfigSet
 from abx.archivebox.base_binary import BaseBinary, BaseBinProvider, env, apt, brew
 from abx.archivebox.base_binary import BaseBinary, BaseBinProvider, env, apt, brew
-from abx.archivebox.base_hook import BaseHook
 
 
-from ...misc.logging import hint
+from archivebox.misc.logging import hint
 
 
+from .binproviders import LIB_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, SYS_PIP_BINPROVIDER
 
 
 ###################### Config ##########################
 ###################### Config ##########################
 
 
 
 
-class PipDependencyConfigs(BaseConfigSet):
-    USE_PIP: bool = True
-    PIP_BINARY: str = Field(default='pip')
-    PIP_ARGS: Optional[List[str]] = Field(default=None)
-    PIP_EXTRA_ARGS: List[str] = []
-    PIP_DEFAULT_ARGS: List[str] = []
-    
-PIP_CONFIG = PipDependencyConfigs()
-
-
-class SystemPipBinProvider(PipProvider, BaseBinProvider):
-    name: BinProviderName = "sys_pip"
-    INSTALLER_BIN: BinName = "pip"
-    
-    pip_venv: Optional[Path] = None        # global pip scope
-    
-    def on_install(self, bin_name: str, **kwargs):
-        # never modify system pip packages
-        return 'refusing to install packages globally with system pip, use a venv instead'
-
-class SystemPipxBinProvider(PipProvider, BaseBinProvider):
-    name: BinProviderName = "pipx"
-    INSTALLER_BIN: BinName = "pipx"
-    
-    pip_venv: Optional[Path] = None        # global pipx scope
-
-
-IS_INSIDE_VENV = sys.prefix != sys.base_prefix
-
-class VenvPipBinProvider(PipProvider, BaseBinProvider):
-    name: BinProviderName = "venv_pip"
-    INSTALLER_BIN: BinName = "pip"
-
-    pip_venv: Optional[Path] = Path(sys.prefix if IS_INSIDE_VENV else os.environ.get("VIRTUAL_ENV", '/tmp/NotInsideAVenv/lib'))
-    
-    def setup(self):
-        """never attempt to create a venv here, this is just used to detect if we are inside an existing one"""
-        return None
-    
-
-class LibPipBinProvider(PipProvider, BaseBinProvider):
-    name: BinProviderName = "lib_pip"
-    INSTALLER_BIN: BinName = "pip"
-    
-    pip_venv: Optional[Path] = CONSTANTS.LIB_PIP_DIR / 'venv'
-
-SYS_PIP_BINPROVIDER = SystemPipBinProvider()
-PIPX_PIP_BINPROVIDER = SystemPipxBinProvider()
-VENV_PIP_BINPROVIDER = VenvPipBinProvider()
-LIB_PIP_BINPROVIDER = LibPipBinProvider()
-pip = LIB_PIP_BINPROVIDER
-
-# ensure python libraries are importable from these locations (if archivebox wasnt executed from one of these then they wont already be in sys.path)
-assert VENV_PIP_BINPROVIDER.pip_venv is not None
-assert LIB_PIP_BINPROVIDER.pip_venv is not None
-
-major, minor, patch = sys.version_info[:3]
-site_packages_dir = f'lib/python{major}.{minor}/site-packages'
-
-LIB_SITE_PACKAGES = (LIB_PIP_BINPROVIDER.pip_venv / site_packages_dir,)
-VENV_SITE_PACKAGES = (VENV_PIP_BINPROVIDER.pip_venv / site_packages_dir,)
-USER_SITE_PACKAGES = site.getusersitepackages()
-SYS_SITE_PACKAGES = site.getsitepackages()
-
-ALL_SITE_PACKAGES = (
-    *LIB_SITE_PACKAGES,
-    *VENV_SITE_PACKAGES,
-    *USER_SITE_PACKAGES,
-    *SYS_SITE_PACKAGES,
-)
-for site_packages_dir in ALL_SITE_PACKAGES:
-    if site_packages_dir not in sys.path:
-        sys.path.append(str(site_packages_dir))
-
 
 
 class ArchiveboxBinary(BaseBinary):
 class ArchiveboxBinary(BaseBinary):
     name: BinName = 'archivebox'
     name: BinName = 'archivebox'
@@ -237,27 +159,3 @@ class PipxBinary(BaseBinary):
     binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, SYS_PIP_BINPROVIDER, apt, brew, env]
     binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, SYS_PIP_BINPROVIDER, apt, brew, env]
 
 
 PIPX_BINARY = PipxBinary()
 PIPX_BINARY = PipxBinary()
-
-
-class PipPlugin(BasePlugin):
-    app_label: str = 'pip'
-    verbose_name: str = 'PIP'
-
-    hooks: List[InstanceOf[BaseHook]] = [
-        PIP_CONFIG,
-        SYS_PIP_BINPROVIDER,
-        PIPX_PIP_BINPROVIDER,
-        VENV_PIP_BINPROVIDER,
-        LIB_PIP_BINPROVIDER,
-        PIP_BINARY,
-        PIPX_BINARY,
-        ARCHIVEBOX_BINARY,
-        PYTHON_BINARY,
-        SQLITE_BINARY,
-        DJANGO_BINARY,
-    ]
-
-
-PLUGIN = PipPlugin()
-# PLUGIN.register(settings)
-DJANGO_APP = PLUGIN.AppConfig

+ 80 - 0
archivebox/plugins_pkg/pip/binproviders.py

@@ -0,0 +1,80 @@
+__package__ = 'plugins_pkg.pip'
+
+import os
+import sys
+import site
+from pathlib import Path
+from typing import Optional
+
+from pydantic_pkgr import PipProvider, BinName, BinProviderName
+
+from archivebox.config import CONSTANTS
+
+from abx.archivebox.base_binary import BaseBinProvider
+
+
+###################### Config ##########################
+
+class SystemPipBinProvider(PipProvider, BaseBinProvider):
+    name: BinProviderName = "sys_pip"
+    INSTALLER_BIN: BinName = "pip"
+    
+    pip_venv: Optional[Path] = None        # global pip scope
+    
+    def on_install(self, bin_name: str, **kwargs):
+        # never modify system pip packages
+        return 'refusing to install packages globally with system pip, use a venv instead'
+
+class SystemPipxBinProvider(PipProvider, BaseBinProvider):
+    name: BinProviderName = "pipx"
+    INSTALLER_BIN: BinName = "pipx"
+    
+    pip_venv: Optional[Path] = None        # global pipx scope
+
+
+IS_INSIDE_VENV = sys.prefix != sys.base_prefix
+
+class VenvPipBinProvider(PipProvider, BaseBinProvider):
+    name: BinProviderName = "venv_pip"
+    INSTALLER_BIN: BinName = "pip"
+
+    pip_venv: Optional[Path] = Path(sys.prefix if IS_INSIDE_VENV else os.environ.get("VIRTUAL_ENV", '/tmp/NotInsideAVenv/lib'))
+    
+    def setup(self):
+        """never attempt to create a venv here, this is just used to detect if we are inside an existing one"""
+        return None
+    
+
+class LibPipBinProvider(PipProvider, BaseBinProvider):
+    name: BinProviderName = "lib_pip"
+    INSTALLER_BIN: BinName = "pip"
+    
+    pip_venv: Optional[Path] = CONSTANTS.LIB_PIP_DIR / 'venv'
+
+SYS_PIP_BINPROVIDER = SystemPipBinProvider()
+PIPX_PIP_BINPROVIDER = SystemPipxBinProvider()
+VENV_PIP_BINPROVIDER = VenvPipBinProvider()
+LIB_PIP_BINPROVIDER = LibPipBinProvider()
+pip = LIB_PIP_BINPROVIDER
+
+# ensure python libraries are importable from these locations (if archivebox wasnt executed from one of these then they wont already be in sys.path)
+assert VENV_PIP_BINPROVIDER.pip_venv is not None
+assert LIB_PIP_BINPROVIDER.pip_venv is not None
+
+major, minor, patch = sys.version_info[:3]
+site_packages_dir = f'lib/python{major}.{minor}/site-packages'
+
+LIB_SITE_PACKAGES = (LIB_PIP_BINPROVIDER.pip_venv / site_packages_dir,)
+VENV_SITE_PACKAGES = (VENV_PIP_BINPROVIDER.pip_venv / site_packages_dir,)
+USER_SITE_PACKAGES = site.getusersitepackages()
+SYS_SITE_PACKAGES = site.getsitepackages()
+
+ALL_SITE_PACKAGES = (
+    *LIB_SITE_PACKAGES,
+    *VENV_SITE_PACKAGES,
+    *USER_SITE_PACKAGES,
+    *SYS_SITE_PACKAGES,
+)
+for site_packages_dir in ALL_SITE_PACKAGES:
+    if site_packages_dir not in sys.path:
+        sys.path.append(str(site_packages_dir))

+ 16 - 0
archivebox/plugins_pkg/pip/config.py

@@ -0,0 +1,16 @@
+__package__ = 'plugins_pkg.pip'
+
+from typing import List, Optional
+from pydantic import Field
+
+from abx.archivebox.base_configset import BaseConfigSet
+
+
+class PipDependencyConfigs(BaseConfigSet):
+    USE_PIP: bool = True
+    PIP_BINARY: str = Field(default='pip')
+    PIP_ARGS: Optional[List[str]] = Field(default=None)
+    PIP_EXTRA_ARGS: List[str] = []
+    PIP_DEFAULT_ARGS: List[str] = []
+    
+PIP_CONFIG = PipDependencyConfigs()

+ 44 - 0
archivebox/plugins_pkg/playwright/__init__.py

@@ -0,0 +1,44 @@
+__package__ = 'plugins_pkg.playwright'
+__label__ = 'playwright'
+__version__ = '2024.10.14'
+__author__ = 'Nick Sweeting'
+__homepage__ = 'https://github.com/microsoft/playwright-python'
+
+import abx
+
+
[email protected]
+def get_PLUGIN():
+    return {
+        'playwright': {
+            'PACKAGE': __package__,
+            'LABEL': __label__,
+            'VERSION': __version__,
+            'AUTHOR': __author__,
+            'HOMEPAGE': __homepage__,
+        }
+    }
+
[email protected]
+def get_CONFIG():
+    from .config import PLAYWRIGHT_CONFIG
+    
+    return {
+        'playwright': PLAYWRIGHT_CONFIG
+    }
+
[email protected]
+def get_BINARIES():
+    from .binaries import PLAYWRIGHT_BINARY
+    
+    return {
+        'playwright': PLAYWRIGHT_BINARY,
+    }
+
[email protected]
+def get_BINPROVIDERS():
+    from .binproviders import PLAYWRIGHT_BINPROVIDER
+    
+    return {
+        'playwright': PLAYWRIGHT_BINPROVIDER,
+    }

+ 23 - 0
archivebox/plugins_pkg/playwright/binaries.py

@@ -0,0 +1,23 @@
+__package__ = 'plugins_pkg.playwright'
+
+from typing import List
+
+from pydantic import InstanceOf
+from pydantic_pkgr import BinName, BinProvider
+
+from abx.archivebox.base_binary import BaseBinary, env
+
+from plugins_pkg.pip.binproviders import SYS_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, LIB_PIP_BINPROVIDER
+
+from .config import PLAYWRIGHT_CONFIG
+
+
+
+
+class PlaywrightBinary(BaseBinary):
+    name: BinName = PLAYWRIGHT_CONFIG.PLAYWRIGHT_BINARY
+
+    binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, SYS_PIP_BINPROVIDER, env]
+    
+
+PLAYWRIGHT_BINARY = PlaywrightBinary()

+ 10 - 57
archivebox/plugins_pkg/playwright/apps.py → archivebox/plugins_pkg/playwright/binproviders.py

@@ -1,15 +1,13 @@
-__package__ = 'archivebox.plugins_pkg.playwright'
+__package__ = 'plugins_pkg.playwright'
 
 
 import os
 import os
 import platform
 import platform
 from pathlib import Path
 from pathlib import Path
 from typing import List, Optional, Dict, ClassVar
 from typing import List, Optional, Dict, ClassVar
 
 
-# Depends on other PyPI/vendor packages:
-from pydantic import InstanceOf, computed_field, Field
+from pydantic import computed_field, Field
 from pydantic_pkgr import (
 from pydantic_pkgr import (
     BinName,
     BinName,
-    BinProvider,
     BinProviderName,
     BinProviderName,
     BinProviderOverrides,
     BinProviderOverrides,
     InstallArgs,
     InstallArgs,
@@ -22,42 +20,15 @@ from pydantic_pkgr import (
 
 
 from archivebox.config import CONSTANTS
 from archivebox.config import CONSTANTS
 
 
-# Depends on other Django apps:
-from abx.archivebox.base_plugin import BasePlugin
-from abx.archivebox.base_configset import BaseConfigSet
-from abx.archivebox.base_binary import BaseBinary, BaseBinProvider, env
-# from abx.archivebox.base_extractor import BaseExtractor
-# from abx.archivebox.base_queue import BaseQueue
-from abx.archivebox.base_hook import BaseHook
+from abx.archivebox.base_binary import BaseBinProvider, env
 
 
-from plugins_pkg.pip.apps import SYS_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, LIB_PIP_BINPROVIDER
+from plugins_pkg.pip.binproviders import SYS_PIP_BINPROVIDER
 
 
+from .binaries import PLAYWRIGHT_BINARY
 
 
-###################### Config ##########################
 
 
-
-class PlaywrightConfigs(BaseConfigSet):
-    # PLAYWRIGHT_BINARY: str = Field(default='wget')
-    # PLAYWRIGHT_ARGS: Optional[List[str]] = Field(default=None)
-    # PLAYWRIGHT_EXTRA_ARGS: List[str] = []
-    # PLAYWRIGHT_DEFAULT_ARGS: List[str] = ['--timeout={TIMEOUT-10}']
-    pass
-
-
-PLAYWRIGHT_CONFIG = PlaywrightConfigs()
-
-LIB_DIR_BROWSERS = CONSTANTS.LIB_BROWSERS_DIR
-
-
-
-class PlaywrightBinary(BaseBinary):
-    name: BinName = "playwright"
-
-    binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, SYS_PIP_BINPROVIDER, env]
-    
-
-
-PLAYWRIGHT_BINARY = PlaywrightBinary()
+MACOS_PLAYWRIGHT_CACHE_DIR: Path = Path("~/Library/Caches/ms-playwright")
+LINUX_PLAYWRIGHT_CACHE_DIR: Path = Path("~/.cache/ms-playwright")
 
 
 
 
 class PlaywrightBinProvider(BaseBinProvider):
 class PlaywrightBinProvider(BaseBinProvider):
@@ -67,11 +38,11 @@ class PlaywrightBinProvider(BaseBinProvider):
     PATH: PATHStr = f"{CONSTANTS.LIB_BIN_DIR}:{DEFAULT_ENV_PATH}"
     PATH: PATHStr = f"{CONSTANTS.LIB_BIN_DIR}:{DEFAULT_ENV_PATH}"
 
 
     playwright_browsers_dir: Path = (
     playwright_browsers_dir: Path = (
-        Path("~/Library/Caches/ms-playwright").expanduser()      # macos playwright cache dir
+        MACOS_PLAYWRIGHT_CACHE_DIR.expanduser()
         if OPERATING_SYSTEM == "darwin" else
         if OPERATING_SYSTEM == "darwin" else
-        Path("~/.cache/ms-playwright").expanduser()              # linux playwright cache dir
+        LINUX_PLAYWRIGHT_CACHE_DIR.expanduser()
     )
     )
-    playwright_install_args: List[str] = ["install"]              # --with-deps
+    playwright_install_args: List[str] = ["install"]
 
 
     packages_handler: BinProviderOverrides = Field(default={
     packages_handler: BinProviderOverrides = Field(default={
         "chrome": ["chromium"],
         "chrome": ["chromium"],
@@ -183,21 +154,3 @@ class PlaywrightBinProvider(BaseBinProvider):
         return (proc.stderr.strip() + "\n" + proc.stdout.strip()).strip()
         return (proc.stderr.strip() + "\n" + proc.stdout.strip()).strip()
 
 
 PLAYWRIGHT_BINPROVIDER = PlaywrightBinProvider()
 PLAYWRIGHT_BINPROVIDER = PlaywrightBinProvider()
-
-
-
-class PlaywrightPlugin(BasePlugin):
-    app_label: str = 'playwright'
-    verbose_name: str = 'Playwright (PIP)'
-
-    hooks: List[InstanceOf[BaseHook]] = [
-        PLAYWRIGHT_CONFIG,
-        PLAYWRIGHT_BINPROVIDER,
-        PLAYWRIGHT_BINARY,
-    ]
-
-
-
-PLUGIN = PlaywrightPlugin()
-# PLUGIN.register(settings)
-DJANGO_APP = PLUGIN.AppConfig

+ 10 - 0
archivebox/plugins_pkg/playwright/config.py

@@ -0,0 +1,10 @@
+__package__ = 'plugins_pkg.playwright'
+
+from abx.archivebox.base_configset import BaseConfigSet
+
+
+class PlaywrightConfigs(BaseConfigSet):
+    PLAYWRIGHT_BINARY: str = 'playwright'
+
+
+PLAYWRIGHT_CONFIG = PlaywrightConfigs()

+ 46 - 0
archivebox/plugins_pkg/puppeteer/__init__.py

@@ -0,0 +1,46 @@
+__package__ = 'plugins_pkg.puppeteer'
+__label__ = 'puppeteer'
+__version__ = '2024.10.14'
+__author__ = 'Nick Sweeting'
+__homepage__ = 'https://github.com/puppeteer/puppeteer'
+__dependencies__ = ['npm']
+
+import abx
+
+
[email protected]
+def get_PLUGIN():
+    return {
+        'puppeteer': {
+            'PACKAGE': __package__,
+            'LABEL': __label__,
+            'VERSION': __version__,
+            'AUTHOR': __author__,
+            'HOMEPAGE': __homepage__,
+            'DEPENDENCIES': __dependencies__,
+        }
+    }
+
[email protected]
+def get_CONFIG():
+    from .config import PUPPETEER_CONFIG
+    
+    return {
+        'puppeteer': PUPPETEER_CONFIG
+    }
+
[email protected]
+def get_BINARIES():
+    from .binaries import PUPPETEER_BINARY
+    
+    return {
+        'puppeteer': PUPPETEER_BINARY,
+    }
+
[email protected]
+def get_BINPROVIDERS():
+    from .binproviders import PUPPETEER_BINPROVIDER
+    
+    return {
+        'puppeteer': PUPPETEER_BINPROVIDER,
+    }

+ 23 - 0
archivebox/plugins_pkg/puppeteer/binaries.py

@@ -0,0 +1,23 @@
+__package__ = 'plugins_pkg.puppeteer'
+
+from typing import List
+
+from pydantic import InstanceOf
+from pydantic_pkgr import BinProvider, BinName
+
+
+from abx.archivebox.base_binary import BaseBinary, env
+
+from plugins_pkg.npm.binproviders import LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER
+
+
+###################### Config ##########################
+
+
+class PuppeteerBinary(BaseBinary):
+    name: BinName = "puppeteer"
+
+    binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env]
+
+
+PUPPETEER_BINARY = PuppeteerBinary()

+ 4 - 52
archivebox/plugins_pkg/puppeteer/apps.py → archivebox/plugins_pkg/puppeteer/binproviders.py

@@ -1,14 +1,12 @@
-__package__ = 'archivebox.plugins_pkg.puppeteer'
+__package__ = 'plugins_pkg.puppeteer'
 
 
 import os
 import os
 import platform
 import platform
 from pathlib import Path
 from pathlib import Path
 from typing import List, Optional, Dict, ClassVar
 from typing import List, Optional, Dict, ClassVar
 
 
-# Depends on other PyPI/vendor packages:
-from pydantic import InstanceOf, Field
+from pydantic import Field
 from pydantic_pkgr import (
 from pydantic_pkgr import (
-    BinProvider,
     BinName,
     BinName,
     BinProviderName,
     BinProviderName,
     BinProviderOverrides,
     BinProviderOverrides,
@@ -20,43 +18,14 @@ from pydantic_pkgr import (
 from archivebox.config import CONSTANTS
 from archivebox.config import CONSTANTS
 from archivebox.config.permissions import ARCHIVEBOX_USER
 from archivebox.config.permissions import ARCHIVEBOX_USER
 
 
-# Depends on other Django apps:
-from abx.archivebox.base_plugin import BasePlugin
-from abx.archivebox.base_configset import BaseConfigSet
-from abx.archivebox.base_binary import BaseBinary, BaseBinProvider, env
-# from abx.archivebox.base_extractor import BaseExtractor
-# from abx.archivebox.base_queue import BaseQueue
-from abx.archivebox.base_hook import BaseHook
+from abx.archivebox.base_binary import BaseBinProvider
 
 
-# Depends on Other Plugins:
-from plugins_pkg.npm.apps import LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER
+from plugins_pkg.npm.binproviders import SYS_NPM_BINPROVIDER
 
 
 
 
-###################### Config ##########################
-
-
-class PuppeteerConfigs(BaseConfigSet):
-    # PUPPETEER_BINARY: str = Field(default='wget')
-    # PUPPETEER_ARGS: Optional[List[str]] = Field(default=None)
-    # PUPPETEER_EXTRA_ARGS: List[str] = []
-    # PUPPETEER_DEFAULT_ARGS: List[str] = ['--timeout={TIMEOUT-10}']
-    pass
-
-
-PUPPETEER_CONFIG = PuppeteerConfigs()
-
 LIB_DIR_BROWSERS = CONSTANTS.LIB_BROWSERS_DIR
 LIB_DIR_BROWSERS = CONSTANTS.LIB_BROWSERS_DIR
 
 
 
 
-class PuppeteerBinary(BaseBinary):
-    name: BinName = "puppeteer"
-
-    binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env]
-
-
-PUPPETEER_BINARY = PuppeteerBinary()
-
-
 class PuppeteerBinProvider(BaseBinProvider):
 class PuppeteerBinProvider(BaseBinProvider):
     name: BinProviderName = "puppeteer"
     name: BinProviderName = "puppeteer"
     INSTALLER_BIN: BinName = "npx"
     INSTALLER_BIN: BinName = "npx"
@@ -157,20 +126,3 @@ PUPPETEER_BINPROVIDER = PuppeteerBinProvider()
 #         "binproviders_supported": self.binproviders_supported,
 #         "binproviders_supported": self.binproviders_supported,
 #     }
 #     }
 # )
 # )
-
-
-class PuppeteerPlugin(BasePlugin):
-    app_label: str ='puppeteer'
-    verbose_name: str = 'Puppeteer (NPM)'
-
-    hooks: List[InstanceOf[BaseHook]] = [
-        PUPPETEER_CONFIG,
-        PUPPETEER_BINPROVIDER,
-        PUPPETEER_BINARY,
-    ]
-
-
-
-PLUGIN = PuppeteerPlugin()
-# PLUGIN.register(settings)
-DJANGO_APP = PLUGIN.AppConfig

Some files were not shown because too many files changed in this diff