v0.8.6-rc: Moving plugins to independent python packages with finite state machine interfaces (#1576)

Nick Sweeting, 1 year ago
commit b7b3addbab
100 changed files with 1998 additions and 1414 deletions
  1. 1 1
      .github/workflows/test.yml
  2. 0 6
      .gitmodules
  3. 1 1
      archivebox/.flake8
  4. 46 3
      archivebox/__init__.py
  5. 1 1
      archivebox/abid_utils/models.py
  6. 0 131
      archivebox/abx/__init__.py
  7. 0 30
      archivebox/abx/archivebox/__init__.py
  8. 0 106
      archivebox/abx/archivebox/base_binary.py
  9. 0 219
      archivebox/abx/archivebox/base_extractor.py
  10. 0 25
      archivebox/abx/archivebox/base_replayer.py
  11. 0 25
      archivebox/abx/archivebox/base_searchbackend.py
  12. 0 52
      archivebox/abx/archivebox/hookspec.py
  13. 0 160
      archivebox/abx/archivebox/reads.py
  14. 0 1
      archivebox/abx/django/__init__.py
  15. 0 13
      archivebox/abx/django/apps.py
  16. 0 125
      archivebox/abx/django/hookspec.py
  17. 0 101
      archivebox/abx/django/use.py
  18. 0 22
      archivebox/abx/hookspec.py
  19. 0 30
      archivebox/abx/manager.py
  20. 0 1
      archivebox/abx/pydantic_pkgr/__init__.py
  21. 0 13
      archivebox/abx/pydantic_pkgr/hookspec.py
  22. 0 0
      archivebox/actors/__init__.py
  23. 313 0
      archivebox/actors/actor.py
  24. 3 0
      archivebox/actors/admin.py
  25. 6 0
      archivebox/actors/apps.py
  26. 0 0
      archivebox/actors/migrations/__init__.py
  27. 3 0
      archivebox/actors/models.py
  28. 244 0
      archivebox/actors/orchestrator.py
  29. 286 0
      archivebox/actors/statemachine.py
  30. 3 0
      archivebox/actors/tests.py
  31. 3 0
      archivebox/actors/views.py
  32. 24 30
      archivebox/config/__init__.py
  33. 10 10
      archivebox/config/collection.py
  34. 1 3
      archivebox/config/common.py
  35. 17 2
      archivebox/config/constants.py
  36. 2 2
      archivebox/config/django.py
  37. 8 4
      archivebox/config/version.py
  38. 12 16
      archivebox/config/views.py
  39. 29 0
      archivebox/core/__init__.py
  40. 73 0
      archivebox/core/actors.py
  41. 2 2
      archivebox/core/admin_archiveresults.py
  42. 2 2
      archivebox/core/admin_site.py
  43. 4 9
      archivebox/core/apps.py
  44. 61 10
      archivebox/core/models.py
  45. 19 58
      archivebox/core/settings.py
  46. 0 5
      archivebox/core/settings_logging.py
  47. 115 0
      archivebox/core/statemachines.py
  48. 25 18
      archivebox/core/views.py
  49. 69 0
      archivebox/crawls/actors.py
  50. 48 5
      archivebox/crawls/models.py
  51. 48 0
      archivebox/crawls/statemachines.py
  52. 16 30
      archivebox/extractors/__init__.py
  53. 7 4
      archivebox/index/html.py
  54. 3 4
      archivebox/index/json.py
  55. 6 3
      archivebox/index/schema.py
  56. 15 14
      archivebox/machine/models.py
  57. 33 30
      archivebox/main.py
  58. 3 0
      archivebox/misc/checks.py
  59. 1 2
      archivebox/misc/shell_welcome_message.py
  60. 20 10
      archivebox/misc/util.py
  61. 2 5
      archivebox/parsers/generic_jsonl.py
  62. 6 5
      archivebox/parsers/pocket_api.py
  63. 16 9
      archivebox/parsers/readwise_reader_api.py
  64. 39 0
      archivebox/pkgs/__init__.py
  65. 0 0
      archivebox/pkgs/abx-plugin-archivedotorg/README.md
  66. 21 0
      archivebox/pkgs/abx-plugin-archivedotorg/abx_plugin_archivedotorg/__init__.py
  67. 0 0
      archivebox/pkgs/abx-plugin-archivedotorg/abx_plugin_archivedotorg/archive_org.py
  68. 1 4
      archivebox/pkgs/abx-plugin-archivedotorg/abx_plugin_archivedotorg/config.py
  69. 18 0
      archivebox/pkgs/abx-plugin-archivedotorg/pyproject.toml
  70. 0 0
      archivebox/pkgs/abx-plugin-chrome/README.md
  71. 34 0
      archivebox/pkgs/abx-plugin-chrome/abx_plugin_chrome/__init__.py
  72. 23 21
      archivebox/pkgs/abx-plugin-chrome/abx_plugin_chrome/binaries.py
  73. 13 13
      archivebox/pkgs/abx-plugin-chrome/abx_plugin_chrome/config.py
  74. 0 0
      archivebox/pkgs/abx-plugin-chrome/abx_plugin_chrome/dom.py
  75. 0 0
      archivebox/pkgs/abx-plugin-chrome/abx_plugin_chrome/pdf.py
  76. 0 0
      archivebox/pkgs/abx-plugin-chrome/abx_plugin_chrome/screenshot.py
  77. 18 0
      archivebox/pkgs/abx-plugin-chrome/pyproject.toml
  78. 0 0
      archivebox/pkgs/abx-plugin-curl/README.md
  79. 18 0
      archivebox/pkgs/abx-plugin-curl/abx_plugin_curl/__init__.py
  80. 4 4
      archivebox/pkgs/abx-plugin-curl/abx_plugin_curl/binaries.py
  81. 2 2
      archivebox/pkgs/abx-plugin-curl/abx_plugin_curl/config.py
  82. 0 0
      archivebox/pkgs/abx-plugin-curl/abx_plugin_curl/headers.py
  83. 18 0
      archivebox/pkgs/abx-plugin-curl/pyproject.toml
  84. 0 0
      archivebox/pkgs/abx-plugin-default-binproviders/README.md
  85. 23 0
      archivebox/pkgs/abx-plugin-default-binproviders/abx_plugin_default_binproviders.py
  86. 18 0
      archivebox/pkgs/abx-plugin-default-binproviders/pyproject.toml
  87. 0 0
      archivebox/pkgs/abx-plugin-favicon/README.md
  88. 29 0
      archivebox/pkgs/abx-plugin-favicon/abx_plugin_favicon/__init__.py
  89. 1 4
      archivebox/pkgs/abx-plugin-favicon/abx_plugin_favicon/config.py
  90. 0 0
      archivebox/pkgs/abx-plugin-favicon/abx_plugin_favicon/favicon.py
  91. 18 0
      archivebox/pkgs/abx-plugin-favicon/pyproject.toml
  92. 0 0
      archivebox/pkgs/abx-plugin-git/README.md
  93. 29 0
      archivebox/pkgs/abx-plugin-git/abx_plugin_git/__init__.py
  94. 4 4
      archivebox/pkgs/abx-plugin-git/abx_plugin_git/binaries.py
  95. 2 2
      archivebox/pkgs/abx-plugin-git/abx_plugin_git/config.py
  96. 15 0
      archivebox/pkgs/abx-plugin-git/abx_plugin_git/extractors.py
  97. 2 2
      archivebox/pkgs/abx-plugin-git/abx_plugin_git/git.py
  98. 19 0
      archivebox/pkgs/abx-plugin-git/pyproject.toml
  99. 0 0
      archivebox/pkgs/abx-plugin-htmltotext/README.md
  100. 22 0
      archivebox/pkgs/abx-plugin-htmltotext/abx_plugin_htmltotext/__init__.py

+ 1 - 1
.github/workflows/test.yml

@@ -102,7 +102,7 @@ jobs:
         # TODO: remove this exception for windows once we get tests passing on that platform
         if: ${{ !contains(matrix.os, 'windows') }}
         run: |
-          python -m pytest -s --basetemp=tests/out --ignore=archivebox/vendor --ignore=deb_dist --ignore=pip_dist --ignore=brew_dist
+          python -m pytest -s --basetemp=tests/out --ignore=archivebox/pkgs
 
   docker_tests:
     runs-on: ubuntu-latest

+ 0 - 6
.gitmodules

@@ -1,9 +1,3 @@
 [submodule "docs"]
     path = docs
     url = https://github.com/ArchiveBox/ArchiveBox.wiki.git
-[submodule "archivebox/vendor/pocket"]
-	path = archivebox/vendor/pocket
-	url = https://github.com/tapanpandita/pocket
-[submodule "archivebox/vendor/pydantic-pkgr"]
-	path = archivebox/vendor/pydantic-pkgr
-	url = https://github.com/ArchiveBox/pydantic-pkgr

+ 1 - 1
archivebox/.flake8

@@ -3,4 +3,4 @@ ignore = D100,D101,D102,D103,D104,D105,D202,D203,D205,D400,E131,E241,E252,E266,E
 select = F,E9,W
 max-line-length = 130
 max-complexity = 10
-exclude = migrations,tests,node_modules,vendor,venv,.venv,.venv2,.docker-venv
+exclude = migrations,tests,node_modules,vendor,venv,.venv,.venv2,.docker-venv,data,data*

+ 46 - 3
archivebox/__init__.py

@@ -13,8 +13,8 @@ __package__ = 'archivebox'
 
 import os
 import sys
-
 from pathlib import Path
+from typing import cast
 
 ASCII_LOGO = """
  █████╗ ██████╗  ██████╗██╗  ██╗██╗██╗   ██╗███████╗ ██████╗  ██████╗ ██╗  ██╗
@@ -47,11 +47,54 @@ from .monkey_patches import *                    # noqa
 
 
 # print('LOADING VENDORED LIBRARIES')
-from .vendor import load_vendored_libs           # noqa
-load_vendored_libs()
+from .pkgs import load_vendored_pkgs             # noqa
+load_vendored_pkgs()
 # print('DONE LOADING VENDORED LIBRARIES')
 
+# Load ABX Plugin Specifications + Default Implementations
+import abx                                       # noqa
+import abx_spec_archivebox                       # noqa
+import abx_spec_config                           # noqa
+import abx_spec_pydantic_pkgr                    # noqa
+import abx_spec_django                           # noqa
+import abx_spec_searchbackend                    # noqa
+
+abx.pm.add_hookspecs(abx_spec_config.PLUGIN_SPEC)
+abx.pm.register(abx_spec_config.PLUGIN_SPEC())
+
+abx.pm.add_hookspecs(abx_spec_pydantic_pkgr.PLUGIN_SPEC)
+abx.pm.register(abx_spec_pydantic_pkgr.PLUGIN_SPEC())
+
+abx.pm.add_hookspecs(abx_spec_django.PLUGIN_SPEC)
+abx.pm.register(abx_spec_django.PLUGIN_SPEC())
+
+abx.pm.add_hookspecs(abx_spec_searchbackend.PLUGIN_SPEC)
+abx.pm.register(abx_spec_searchbackend.PLUGIN_SPEC())
+
+# Cast to ArchiveBoxPluginSpec to enable static type checking of pm.hook.call() methods
+abx.pm = cast(abx.ABXPluginManager[abx_spec_archivebox.ArchiveBoxPluginSpec], abx.pm)
+pm = abx.pm
+
+
+# Load all pip-installed ABX-compatible plugins
+ABX_ECOSYSTEM_PLUGINS = abx.get_pip_installed_plugins(group='abx')
+
+# Load all built-in ArchiveBox plugins
+ARCHIVEBOX_BUILTIN_PLUGINS = {
+    'config': PACKAGE_DIR / 'config',
+    'core': PACKAGE_DIR / 'core',
+    # 'search': PACKAGE_DIR / 'search',
+    # 'core': PACKAGE_DIR / 'core',
+}
+
+# Load all user-defined ArchiveBox plugins
+USER_PLUGINS = abx.find_plugins_in_dir(Path(os.getcwd()) / 'user_plugins')
+
+# Import all plugins and register them with ABX Plugin Manager
+ALL_PLUGINS = {**ABX_ECOSYSTEM_PLUGINS, **ARCHIVEBOX_BUILTIN_PLUGINS, **USER_PLUGINS}
+LOADED_PLUGINS = abx.load_plugins(ALL_PLUGINS)
 
+# Setup basic config, constants, paths, and version
 from .config.constants import CONSTANTS                         # noqa
 from .config.paths import PACKAGE_DIR, DATA_DIR, ARCHIVE_DIR    # noqa
 from .config.version import VERSION                             # noqa
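
For context on the new loading flow above: plugins are now discovered three ways: pip-installed packages that register an entrypoint in the 'abx' group, the built-in packages listed in ARCHIVEBOX_BUILTIN_PLUGINS, and any packages dropped into a user_plugins/ directory under the current working directory. A minimal sketch of what a user plugin module could look like, assuming the new abx package still exposes the hookimpl marker and the get_CONFIG hookspec seen elsewhere in this commit (the plugin name and body here are hypothetical):

    # user_plugins/example_plugin/__init__.py  (hypothetical sketch)
    import abx

    __label__ = 'Example Plugin'     # optional metadata surfaced by plugin introspection
    __version__ = '0.0.1'

    @abx.hookimpl
    def get_CONFIG():
        # implements the get_CONFIG hookspec from abx_spec_config:
        # should return a mapping of {config_id: configset}
        return {}

Since load_plugins() just imports each discovered module and registers it with the plugin manager, a package like this should get picked up automatically the next time archivebox is imported.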

+ 1 - 1
archivebox/abid_utils/models.py

@@ -175,7 +175,7 @@ class ABIDModel(models.Model):
             'uri': self.abid_uri_src,
             'subtype': self.abid_subtype_src,
             'rand': self.abid_rand_src,
-            'salt': 'self.abid_salt',               # defined as static class vars at build time
+            'salt': 'self.abid_salt',                 # defined as static class vars at build time
         }
 
     @property

+ 0 - 131
archivebox/abx/__init__.py

@@ -1,131 +0,0 @@
-__package__ = 'abx'
-
-import importlib
-from pathlib import Path
-from typing import Dict, Callable, List
-
-from . import hookspec as base_spec
-from abx.hookspec import hookimpl, hookspec           # noqa
-from abx.manager import pm, PluginManager             # noqa
-
-
-pm.add_hookspecs(base_spec)
-
-
-###### PLUGIN DISCOVERY AND LOADING ########################################################
-
-def get_plugin_order(plugin_entrypoint: Path):
-    order = 999
-    try:
-        # if .plugin_order file exists, use it to set the load priority
-        order = int((plugin_entrypoint.parent / '.plugin_order').read_text())
-    except FileNotFoundError:
-        pass
-    return (order, plugin_entrypoint)
-
-def register_hookspecs(hookspecs: List[str]):
-    """
-    Register all the hookspecs from a list of module names.
-    """
-    for hookspec_import_path in hookspecs:
-        hookspec_module = importlib.import_module(hookspec_import_path)
-        pm.add_hookspecs(hookspec_module)
-
-
-def find_plugins_in_dir(plugins_dir: Path, prefix: str) -> Dict[str, Path]:
-    """
-    Find all the plugins in a given directory. Just looks for an __init__.py file.
-    """
-    return {
-        f"{prefix}.{plugin_entrypoint.parent.name}": plugin_entrypoint.parent
-        for plugin_entrypoint in sorted(plugins_dir.glob("*/__init__.py"), key=get_plugin_order)
-        if plugin_entrypoint.parent.name != 'abx'
-    }   # "plugins_pkg.pip": "/app/archivebox/plugins_pkg/pip"
-
-
-def get_pip_installed_plugins(group='abx'):
-    """replaces pm.load_setuptools_entrypoints("abx"), finds plugins that registered entrypoints via pip"""
-    import importlib.metadata
-
-    DETECTED_PLUGINS = {}   # module_name: module_dir_path
-    for dist in list(importlib.metadata.distributions()):
-        for entrypoint in dist.entry_points:
-            if entrypoint.group != group or pm.is_blocked(entrypoint.name):
-                continue
-            DETECTED_PLUGINS[entrypoint.name] = Path(entrypoint.load().__file__).parent
-            # pm.register(plugin, name=ep.name)
-            # pm._plugin_distinfo.append((plugin, DistFacade(dist)))
-    return DETECTED_PLUGINS
-
-
-def get_plugins_in_dirs(plugin_dirs: Dict[str, Path]):
-    """
-    Get the mapping of dir_name: {plugin_id: plugin_dir} for all plugins in the given directories.
-    """
-    DETECTED_PLUGINS = {}
-    for plugin_prefix, plugin_dir in plugin_dirs.items():
-        DETECTED_PLUGINS.update(find_plugins_in_dir(plugin_dir, prefix=plugin_prefix))
-    return DETECTED_PLUGINS
-
-
-# Load all plugins from pip packages, archivebox built-ins, and user plugins
-
-def load_plugins(plugins_dict: Dict[str, Path]):
-    """
-    Load all the plugins from a dictionary of module names and directory paths.
-    """
-    LOADED_PLUGINS = {}
-    for plugin_module, plugin_dir in plugins_dict.items():
-        # print(f'Loading plugin: {plugin_module} from {plugin_dir}')
-        plugin_module_loaded = importlib.import_module(plugin_module)
-        pm.register(plugin_module_loaded)
-        LOADED_PLUGINS[plugin_module] = plugin_module_loaded.PLUGIN
-        # print(f'    √ Loaded plugin: {plugin_module}')
-    return LOADED_PLUGINS
-
-def get_registered_plugins():
-    """
-    Get all the plugins registered with Pluggy.
-    """
-    plugins = {}
-    plugin_to_distinfo = dict(pm.list_plugin_distinfo())
-    for plugin in pm.get_plugins():
-        plugin_info = {
-            "name": plugin.__name__,
-            "hooks": [h.name for h in pm.get_hookcallers(plugin) or ()],
-        }
-        distinfo = plugin_to_distinfo.get(plugin)
-        if distinfo:
-            plugin_info["version"] = distinfo.version
-            plugin_info["name"] = (
-                getattr(distinfo, "name", None) or distinfo.project_name
-            )
-        plugins[plugin_info["name"]] = plugin_info
-    return plugins
-
-
-
-
-def get_plugin_hooks(plugin_pkg: str | None) -> Dict[str, Callable]:
-    """
-    Get all the functions marked with @hookimpl on a module.
-    """
-    if not plugin_pkg:
-        return {}
-    
-    hooks = {}
-    
-    plugin_module = importlib.import_module(plugin_pkg)
-    for attr_name in dir(plugin_module):
-        if attr_name.startswith('_'):
-            continue
-        try:
-            attr = getattr(plugin_module, attr_name)
-            if isinstance(attr, Callable):
-                hooks[attr_name] = None
-                pm.parse_hookimpl_opts(plugin_module, attr_name)
-                hooks[attr_name] = attr
-        except Exception as e:
-            print(f'Error getting hookimpls for {plugin_pkg}: {e}')
-
-    return hooks

+ 0 - 30
archivebox/abx/archivebox/__init__.py

@@ -1,30 +0,0 @@
-__package__ = 'abx.archivebox'
-
-import os
-import importlib
-
-from typing import Dict
-from pathlib import Path
-
-
-def load_archivebox_plugins(pm, plugins_dict: Dict[str, Path]):
-    """Load archivebox plugins, very similar to abx.load_plugins but it looks for a pydantic PLUGIN model + hooks in apps.py"""
-    LOADED_PLUGINS = {}
-    for plugin_module, plugin_dir in reversed(plugins_dict.items()):
-        # print(f'Loading plugin: {plugin_module} from {plugin_dir}')
-        
-        # 1. register the plugin module directly in case it contains any loose hookimpls (e.g. in __init__.py)
-        try:
-            plugin_module_loaded = importlib.import_module(plugin_module)
-            pm.register(plugin_module_loaded)
-        except Exception as e:
-            print(f'Error registering plugin: {plugin_module} - {e}')
-            
-        
-        # 2. then try to import plugin_module.apps as well
-        if os.access(plugin_dir / 'apps.py', os.R_OK):
-            plugin_apps = importlib.import_module(plugin_module + '.apps')
-            pm.register(plugin_apps)                                           # register the whole .apps  in case it contains loose hookimpls (not in a class)
-            
-        # print(f'    √ Loaded plugin: {plugin_module} {len(archivebox_plugins_found) * "🧩"}')
-    return LOADED_PLUGINS

+ 0 - 106
archivebox/abx/archivebox/base_binary.py

@@ -1,106 +0,0 @@
-__package__ = "abx.archivebox"
-
-import os
-from typing import Optional, cast
-from typing_extensions import Self
-
-from pydantic import validate_call
-from pydantic_pkgr import (
-    Binary,
-    BinProvider,
-    BinProviderName,
-    AptProvider,
-    BrewProvider,
-    EnvProvider,
-)
-
-from archivebox.config.permissions import ARCHIVEBOX_USER
-
-
-class BaseBinProvider(BinProvider):
-    
-    # TODO: add install/load/load_or_install methods as abx.hookimpl methods
-    
-    @property
-    def admin_url(self) -> str:
-        # e.g. /admin/environment/binproviders/NpmBinProvider/   TODO
-        return "/admin/environment/binaries/"
-
-class BaseBinary(Binary):
-
-    @staticmethod
-    def symlink_to_lib(binary, bin_dir=None) -> None:
-        from archivebox.config.common import STORAGE_CONFIG
-        bin_dir = bin_dir or STORAGE_CONFIG.LIB_DIR / 'bin'
-        
-        if not (binary.abspath and os.access(binary.abspath, os.R_OK)):
-            return
-        
-        try:
-            bin_dir.mkdir(parents=True, exist_ok=True)
-            symlink = bin_dir / binary.name
-            symlink.unlink(missing_ok=True)
-            symlink.symlink_to(binary.abspath)
-            symlink.chmod(0o777)   # make sure its executable by everyone
-        except Exception as err:
-            # print(f'[red]:warning: Failed to symlink {symlink} -> {binary.abspath}[/red] {err}')
-            # not actually needed, we can just run without it
-            pass
-        
-    @validate_call
-    def load(self, fresh=False, **kwargs) -> Self:
-        from archivebox.config.common import STORAGE_CONFIG
-        if fresh:
-            binary = super().load(**kwargs)
-            self.symlink_to_lib(binary=binary, bin_dir=STORAGE_CONFIG.LIB_DIR / 'bin')
-        else:
-            # get cached binary from db
-            try:
-                from machine.models import InstalledBinary
-                installed_binary = InstalledBinary.objects.get_from_db_or_cache(self)    # type: ignore
-                binary = InstalledBinary.load_from_db(installed_binary)
-            except Exception:
-                # maybe we are not in a DATA dir so there is no db, fallback to reading from fs
-                # (e.g. when archivebox version is run outside of a DATA dir)
-                binary = super().load(**kwargs)
-        return cast(Self, binary)
-    
-    @validate_call
-    def install(self, **kwargs) -> Self:
-        from archivebox.config.common import STORAGE_CONFIG
-        binary = super().install(**kwargs)
-        self.symlink_to_lib(binary=binary, bin_dir=STORAGE_CONFIG.LIB_DIR / 'bin')
-        return binary
-    
-    @validate_call
-    def load_or_install(self, fresh=False, **kwargs) -> Self:
-        from archivebox.config.common import STORAGE_CONFIG
-        try:
-            binary = self.load(fresh=fresh)
-            if binary and binary.version:
-                self.symlink_to_lib(binary=binary, bin_dir=STORAGE_CONFIG.LIB_DIR / 'bin')
-                return binary
-        except Exception:
-            pass
-        return self.install(**kwargs)
-    
-    @property
-    def admin_url(self) -> str:
-        # e.g. /admin/environment/config/LdapConfig/
-        return f"/admin/environment/binaries/{self.name}/"
-
-
-class AptBinProvider(AptProvider, BaseBinProvider):
-    name: BinProviderName = "apt"
-    
-class BrewBinProvider(BrewProvider, BaseBinProvider):
-    name: BinProviderName = "brew"
-    
-class EnvBinProvider(EnvProvider, BaseBinProvider):
-    name: BinProviderName = "env"
-    
-    euid: Optional[int] = ARCHIVEBOX_USER
-
-apt = AptBinProvider()
-brew = BrewBinProvider()
-env = EnvBinProvider()

+ 0 - 219
archivebox/abx/archivebox/base_extractor.py

@@ -1,219 +0,0 @@
-__package__ = 'abx.archivebox'
-
-import json
-import os
-
-from typing import Optional, List, Literal, Annotated, Dict, Any, Tuple
-from typing_extensions import Self
-from pathlib import Path
-
-from pydantic import model_validator, AfterValidator
-from pydantic_pkgr import BinName
-from django.utils.functional import cached_property
-from django.utils import timezone
-
-import abx
-
-from .base_binary import BaseBinary
-
-
-def no_empty_args(args: List[str]) -> List[str]:
-    assert all(len(arg) for arg in args)
-    return args
-
-ExtractorName = Literal['wget', 'warc', 'media', 'singlefile'] | str
-
-HandlerFuncStr = Annotated[str, AfterValidator(lambda s: s.startswith('self.'))]
-CmdArgsList = Annotated[List[str] | Tuple[str, ...], AfterValidator(no_empty_args)]
-
-
-class BaseExtractor:
-    
-    name: ExtractorName
-    binary: BinName
-
-    output_path_func: HandlerFuncStr = 'self.get_output_path'
-    should_extract_func: HandlerFuncStr = 'self.should_extract'
-    extract_func: HandlerFuncStr = 'self.extract'
-    exec_func: HandlerFuncStr = 'self.exec'
-
-    default_args: CmdArgsList = []
-    extra_args: CmdArgsList = []
-    args: Optional[CmdArgsList] = None
-
-    @model_validator(mode='after')
-    def validate_model(self) -> Self:
-        if self.args is None:
-            self.args = [*self.default_args, *self.extra_args]
-        return self
-
-
-    def get_output_path(self, snapshot) -> Path:
-        return Path(self.__class__.__name__.lower())
-
-    def should_extract(self, uri: str, config: dict | None=None) -> bool:
-        try:
-            assert self.detect_installed_binary().version
-        except Exception:
-            raise
-            # could not load binary
-            return False
-        
-        # output_dir = self.get_output_path(snapshot)
-        # if output_dir.glob('*.*'):
-        #     return False
-        return True
-
-    @abx.hookimpl
-    def extract(self, snapshot_id: str) -> Dict[str, Any]:
-        from core.models import Snapshot
-        from archivebox import CONSTANTS
-        
-        snapshot = Snapshot.objects.get(id=snapshot_id)
-        
-        if not self.should_extract(snapshot):
-            return {}
-        
-        status = 'failed'
-        start_ts = timezone.now()
-        uplink = self.detect_network_interface()
-        installed_binary = self.detect_installed_binary()
-        machine = installed_binary.machine
-        assert uplink.machine == installed_binary.machine  # it would be *very* weird if this wasn't true
-        
-        output_dir = CONSTANTS.DATA_DIR / '.tmp' / 'extractors' / self.name / str(snapshot.abid)
-        output_dir.mkdir(parents=True, exist_ok=True)
-
-        # execute the extractor binary with the given args
-        args = [snapshot.url, *self.args] if self.args is not None else [snapshot.url, *self.default_args, *self.extra_args]
-        cmd = [str(installed_binary.abspath), *args]
-        proc = self.exec(installed_binary=installed_binary, args=args, cwd=output_dir)
-
-        # collect the output
-        end_ts = timezone.now()
-        output_files = list(str(path.relative_to(output_dir)) for path in output_dir.glob('**/*.*'))
-        stdout = proc.stdout.strip()
-        stderr = proc.stderr.strip()
-        output_json = None
-        output_text = stdout
-        try:
-            output_json = json.loads(stdout.strip())
-            output_text = None
-        except json.JSONDecodeError:
-            pass
-        
-        errors = []
-        if proc.returncode == 0:
-            status = 'success'
-        else:
-            errors.append(f'{installed_binary.name} returned non-zero exit code: {proc.returncode}')   
-
-        # increment health stats counters
-        if status == 'success':
-            machine.record_health_success()
-            uplink.record_health_success()
-            installed_binary.record_health_success()
-        else:
-            machine.record_health_failure()
-            uplink.record_health_failure()
-            installed_binary.record_health_failure()
-
-        return {
-            'extractor': self.name,
-            
-            'snapshot': {
-                'id': snapshot.id,
-                'abid': snapshot.abid,
-                'url': snapshot.url,
-                'created_by_id': snapshot.created_by_id,
-            },
-            
-            'machine': {
-                'id': machine.id,
-                'abid': machine.abid,
-                'guid': machine.guid,
-                'hostname': machine.hostname,
-                'hw_in_docker': machine.hw_in_docker,
-                'hw_in_vm': machine.hw_in_vm,
-                'hw_manufacturer': machine.hw_manufacturer,
-                'hw_product': machine.hw_product,
-                'hw_uuid': machine.hw_uuid,
-                'os_arch': machine.os_arch,
-                'os_family': machine.os_family,
-                'os_platform': machine.os_platform,
-                'os_release': machine.os_release,
-                'os_kernel': machine.os_kernel,
-            },
-            
-            'uplink': { 
-                'id': uplink.id,
-                'abid': uplink.abid,
-                'mac_address': uplink.mac_address,
-                'ip_public': uplink.ip_public,
-                'ip_local': uplink.ip_local,
-                'dns_server': uplink.dns_server,
-                'hostname': uplink.hostname,
-                'iface': uplink.iface,
-                'isp': uplink.isp,
-                'city': uplink.city,
-                'region': uplink.region,
-                'country': uplink.country,
-            },
-            
-            'binary': {
-                'id': installed_binary.id,
-                'abid': installed_binary.abid,
-                'name': installed_binary.name,
-                'binprovider': installed_binary.binprovider,
-                'abspath': installed_binary.abspath,
-                'version': installed_binary.version,
-                'sha256': installed_binary.sha256,
-            },
-
-            'cmd': cmd,
-            'stdout': stdout,
-            'stderr': stderr,
-            'returncode': proc.returncode,
-            'start_ts': start_ts,
-            'end_ts': end_ts,
-            
-            'status': status,
-            'errors': errors,
-            'output_dir': str(output_dir.relative_to(CONSTANTS.DATA_DIR)),
-            'output_files': output_files,
-            'output_json': output_json or {},
-            'output_text': output_text or '',
-        }
-
-    # TODO: move this to a hookimpl
-    def exec(self, args: CmdArgsList=(), cwd: Optional[Path]=None, installed_binary=None):
-        cwd = cwd or Path(os.getcwd())
-        binary = self.load_binary(installed_binary=installed_binary)
-        
-        return binary.exec(cmd=args, cwd=cwd)
-    
-    @cached_property
-    def BINARY(self) -> BaseBinary:
-        import abx.archivebox.reads
-        for binary in abx.archivebox.reads.get_BINARIES().values():
-            if binary.name == self.binary:
-                return binary
-        raise ValueError(f'Binary {self.binary} not found')
-    
-    def detect_installed_binary(self):
-        from machine.models import InstalledBinary
-        # hydrates binary from DB/cache if record of installed version is recent enough
-        # otherwise it finds it from scratch by detecting installed version/abspath/sha256 on host
-        return InstalledBinary.objects.get_from_db_or_cache(self.BINARY)
-
-    def load_binary(self, installed_binary=None) -> BaseBinary:
-        installed_binary = installed_binary or self.detect_installed_binary()
-        return installed_binary.load_from_db()
-    
-    def detect_network_interface(self):
-        from machine.models import NetworkInterface
-        return NetworkInterface.objects.current()
-
-    @abx.hookimpl
-    def get_EXTRACTORS(self):
-        return [self]

+ 0 - 25
archivebox/abx/archivebox/base_replayer.py

@@ -1,25 +0,0 @@
-__package__ = 'abx.archivebox'
-
-import abx
-
-
-class BaseReplayer:
-    """Describes how to render an ArchiveResult in several contexts"""
-    
-    url_pattern: str = '*'
-
-    row_template: str = 'plugins/generic_replayer/templates/row.html'
-    embed_template: str = 'plugins/generic_replayer/templates/embed.html'
-    fullpage_template: str = 'plugins/generic_replayer/templates/fullpage.html'
-
-    # row_view: LazyImportStr = 'plugins.generic_replayer.views.row_view'
-    # embed_view: LazyImportStr = 'plugins.generic_replayer.views.embed_view'
-    # fullpage_view: LazyImportStr = 'plugins.generic_replayer.views.fullpage_view'
-    # icon_view: LazyImportStr = 'plugins.generic_replayer.views.get_icon'
-    # thumbnail_view: LazyImportStr = 'plugins.generic_replayer.views.get_icon'
-
-    @abx.hookimpl
-    def get_REPLAYERS(self):
-        return [self]
-
-    # TODO: add hookimpl methods for get_row_template, get_embed_template, get_fullpage_template, etc...

+ 0 - 25
archivebox/abx/archivebox/base_searchbackend.py

@@ -1,25 +0,0 @@
-__package__ = 'abx.archivebox'
-
-from typing import Iterable, List
-import abc
-
-
-
-class BaseSearchBackend(abc.ABC):
-    name: str
-
-    @staticmethod
-    @abc.abstractmethod
-    def index(snapshot_id: str, texts: List[str]):
-        return
-
-    @staticmethod
-    @abc.abstractmethod
-    def flush(snapshot_ids: Iterable[str]):
-        return
-
-    @staticmethod
-    @abc.abstractmethod
-    def search(text: str) -> List[str]:
-        raise NotImplementedError("search method must be implemented by subclass")
-

+ 0 - 52
archivebox/abx/archivebox/hookspec.py

@@ -1,52 +0,0 @@
-__package__ = 'abx.archivebox'
-
-from typing import Dict, Any
-
-from .. import hookspec
-
-from .base_binary import BaseBinary, BaseBinProvider
-from .base_configset import BaseConfigSet
-from .base_extractor import BaseExtractor
-from .base_searchbackend import BaseSearchBackend
-
-
-@hookspec
-def get_PLUGIN() -> Dict[str, Dict[str, Any]]:
-    return {}
-
-@hookspec
-def get_CONFIG() -> Dict[str, BaseConfigSet]:
-    return {}
-
-
-
-@hookspec
-def get_EXTRACTORS() -> Dict[str, BaseExtractor]:
-    return {}
-
-@hookspec
-def get_SEARCHBACKENDS() -> Dict[str, BaseSearchBackend]:
-    return {}
-
-# @hookspec
-# def get_REPLAYERS() -> Dict[str, BaseReplayer]:
-#     return {}
-
-# @hookspec
-# def get_ADMINDATAVIEWS():
-#     return {}
-
-# @hookspec
-# def get_QUEUES():
-#     return {}
-
-
-##############################################################
-# provided by abx.pydantic_pkgr.hookspec:
-# @hookspec
-# def get_BINARIES() -> Dict[str, BaseBinary]:
-#     return {}
-
-# @hookspec
-# def get_BINPROVIDERS() -> Dict[str, BaseBinProvider]:
-#     return {}

+ 0 - 160
archivebox/abx/archivebox/reads.py

@@ -1,160 +0,0 @@
-__package__ = 'abx.archivebox'
-
-import importlib
-from typing import Dict, Set, Any, TYPE_CHECKING
-
-from benedict import benedict
-
-import abx
-from .. import pm
-
-if TYPE_CHECKING:
-    from .base_configset import BaseConfigSet
-    from .base_binary import BaseBinary, BaseBinProvider
-    from .base_extractor import BaseExtractor
-    from .base_searchbackend import BaseSearchBackend
-    # from .base_replayer import BaseReplayer
-    # from .base_queue import BaseQueue
-    # from .base_admindataview import BaseAdminDataView
-
-# API exposed to ArchiveBox code
-
-def get_PLUGINS() -> Dict[str, Dict[str, Any]]:
-    return benedict({
-        plugin_id: plugin
-        for plugin_dict in pm.hook.get_PLUGIN()
-            for plugin_id, plugin in plugin_dict.items()
-    })
-
-def get_PLUGIN(plugin_id: str) -> Dict[str, Any]:
-    plugin_info = get_PLUGINS().get(plugin_id, {})
-    package = plugin_info.get('package', plugin_info.get('PACKAGE', None))
-    if not package:
-        return {'id': plugin_id, 'hooks': {}}
-    module = importlib.import_module(package)
-    hooks = abx.get_plugin_hooks(module.__package__)
-    assert plugin_info and (plugin_info.get('id') or plugin_info.get('ID') or hooks)
-    
-    return benedict({
-        'id': plugin_id,
-        'label': getattr(module, '__label__', plugin_id),
-        'module': module,
-        'package': module.__package__,
-        'hooks': hooks,
-        'version': getattr(module, '__version__', '999.999.999'),
-        'author': getattr(module, '__author__', 'Unknown'),
-        'homepage': getattr(module, '__homepage__', 'https://github.com/ArchiveBox/ArchiveBox'),
-        'dependencies': getattr(module, '__dependencies__', []),
-        'source_code': module.__file__,
-        **plugin_info,
-    })
-    
-
-def get_HOOKS() -> Set[str]:
-    return {
-        hook_name
-        for plugin_id in get_PLUGINS().keys()
-            for hook_name in get_PLUGIN(plugin_id).hooks
-    }
-
-def get_CONFIGS() -> Dict[str, 'BaseConfigSet']:
-    return benedict({
-        config_id: configset
-        for plugin_configs in pm.hook.get_CONFIG()
-            for config_id, configset in plugin_configs.items()
-    })
-
-
-def get_FLAT_CONFIG() -> Dict[str, Any]:
-    return benedict({
-        key: value
-        for configset in get_CONFIGS().values()
-            for key, value in configset.model_dump().items()
-    })
-
-def get_BINPROVIDERS() -> Dict[str, 'BaseBinProvider']:
-    # TODO: move these to plugins
-    from abx.archivebox.base_binary import apt, brew, env
-    builtin_binproviders = {
-        'env': env,
-        'apt': apt,
-        'brew': brew,
-    }
-    
-    return benedict({
-        binprovider_id: binprovider
-        for plugin_binproviders in [builtin_binproviders, *pm.hook.get_BINPROVIDERS()]
-            for binprovider_id, binprovider in plugin_binproviders.items()
-    })
-
-def get_BINARIES() -> Dict[str, 'BaseBinary']:
-    return benedict({
-        binary_id: binary
-        for plugin_binaries in pm.hook.get_BINARIES()
-            for binary_id, binary in plugin_binaries.items()
-    })
-
-def get_EXTRACTORS() -> Dict[str, 'BaseExtractor']:
-    return benedict({
-        extractor_id: extractor
-        for plugin_extractors in pm.hook.get_EXTRACTORS()
-            for extractor_id, extractor in plugin_extractors.items()
-    })
-
-# def get_REPLAYERS() -> Dict[str, 'BaseReplayer']:
-#     return benedict({
-#         replayer.id: replayer
-#         for plugin_replayers in pm.hook.get_REPLAYERS()
-#             for replayer in plugin_replayers
-#     })
-
-# def get_ADMINDATAVIEWS() -> Dict[str, 'BaseAdminDataView']:
-#     return benedict({
-#         admin_dataview.id: admin_dataview
-#         for plugin_admin_dataviews in pm.hook.get_ADMINDATAVIEWS()
-#             for admin_dataview in plugin_admin_dataviews
-#     })
-
-# def get_QUEUES() -> Dict[str, 'BaseQueue']:
-#     return benedict({
-#         queue.id: queue
-#         for plugin_queues in pm.hook.get_QUEUES()
-#             for queue in plugin_queues
-#     })
-
-def get_SEARCHBACKENDS() -> Dict[str, 'BaseSearchBackend']:
-    return benedict({
-        searchbackend_id: searchbackend
-        for plugin_searchbackends in pm.hook.get_SEARCHBACKENDS()
-            for searchbackend_id,searchbackend in plugin_searchbackends.items()
-    })
-
-
-
-def get_scope_config(defaults: benedict | None = None, persona=None, seed=None, crawl=None, snapshot=None, archiveresult=None, extra_config=None):
-    """Get all the relevant config for the given scope, in correct precedence order"""
-    
-    from django.conf import settings
-    default_config: benedict = defaults or settings.CONFIG
-    
-    snapshot = snapshot or (archiveresult and archiveresult.snapshot)
-    crawl = crawl or (snapshot and snapshot.crawl)
-    seed = seed or (crawl and crawl.seed)
-    persona = persona or (crawl and crawl.persona)
-    
-    persona_config = persona.config if persona else {}
-    seed_config = seed.config if seed else {}
-    crawl_config = crawl.config if crawl else {}
-    snapshot_config = snapshot.config if snapshot else {}
-    archiveresult_config = archiveresult.config if archiveresult else {}
-    extra_config = extra_config or {}
-    
-    return {
-        **default_config,               # defaults / config file / environment variables
-        **persona_config,               # lowest precedence
-        **seed_config,
-        **crawl_config,
-        **snapshot_config,
-        **archiveresult_config,
-        **extra_config,                 # highest precedence
-    }

+ 0 - 1
archivebox/abx/django/__init__.py

@@ -1 +0,0 @@
-__package__ = 'abx.django'

+ 0 - 13
archivebox/abx/django/apps.py

@@ -1,13 +0,0 @@
-__package__ = 'abx.django'
-
-from django.apps import AppConfig
-
-
-class ABXConfig(AppConfig):
-    name = 'abx'
-
-    def ready(self):
-        import abx
-        from django.conf import settings
-        
-        abx.pm.hook.ready(settings=settings)

+ 0 - 125
archivebox/abx/django/hookspec.py

@@ -1,125 +0,0 @@
-__package__ = 'abx.django'
-
-from ..hookspec import hookspec
-
-
-###########################################################################################
-
-@hookspec
-def get_INSTALLED_APPS():
-    """Return a list of apps to add to INSTALLED_APPS"""
-    # e.g. ['your_plugin_type.plugin_name']
-    return []
-
-# @hookspec
-# def register_INSTALLED_APPS(INSTALLED_APPS):
-#     """Mutate INSTALLED_APPS in place to add your app in a specific position"""
-#     # idx_of_contrib = INSTALLED_APPS.index('django.contrib.auth')
-#     # INSTALLED_APPS.insert(idx_of_contrib + 1, 'your_plugin_type.plugin_name')
-#     pass
-
-
-@hookspec
-def get_TEMPLATE_DIRS():
-    return []     # e.g. ['your_plugin_type/plugin_name/templates']
-
-# @hookspec
-# def register_TEMPLATE_DIRS(TEMPLATE_DIRS):
-#     """Install django settings"""
-#     # e.g. TEMPLATE_DIRS.insert(0, 'your_plugin_type/plugin_name/templates')
-#     pass
-
-
-@hookspec
-def get_STATICFILES_DIRS():
-    return []     # e.g. ['your_plugin_type/plugin_name/static']
-
-# @hookspec
-# def register_STATICFILES_DIRS(STATICFILES_DIRS):
-#     """Mutate STATICFILES_DIRS in place to add your static dirs in a specific position"""
-#     # e.g. STATICFILES_DIRS.insert(0, 'your_plugin_type/plugin_name/static')
-#     pass
-
-
-@hookspec
-def get_MIDDLEWARE():
-    return []     # e.g. ['your_plugin_type.plugin_name.middleware.YourMiddleware']
-
-# @hookspec
-# def register_MIDDLEWARE(MIDDLEWARE):
-#     """Mutate MIDDLEWARE in place to add your middleware in a specific position"""
-#     # e.g. MIDDLEWARE.insert(0, 'your_plugin_type.plugin_name.middleware.YourMiddleware')
-#     pass
-
-
-@hookspec
-def get_AUTHENTICATION_BACKENDS():
-    return []     # e.g. ['django_auth_ldap.backend.LDAPBackend']
-
-# @hookspec
-# def register_AUTHENTICATION_BACKENDS(AUTHENTICATION_BACKENDS):
-#     """Mutate AUTHENTICATION_BACKENDS in place to add your auth backends in a specific position"""
-#     # e.g. AUTHENTICATION_BACKENDS.insert(0, 'your_plugin_type.plugin_name.backend.YourBackend')
-#     pass
-
-@hookspec
-def get_DJANGO_HUEY_QUEUES(QUEUE_DATABASE_NAME):
-    return []     # e.g. [{'name': 'your_plugin_type.plugin_name', 'HUEY': {...}}]
-
-# @hookspec
-# def register_DJANGO_HUEY(DJANGO_HUEY):
-#     """Mutate DJANGO_HUEY in place to add your huey queues in a specific position"""
-#     # e.g. DJANGO_HUEY['queues']['some_queue_name']['some_setting'] = 'some_value'
-#     pass
-
-
-@hookspec
-def get_ADMIN_DATA_VIEWS_URLS():
-    return []
-
-# @hookspec
-# def register_ADMIN_DATA_VIEWS(ADMIN_DATA_VIEWS):
-#     """Mutate ADMIN_DATA_VIEWS in place to add your admin data views in a specific position"""
-#     # e.g. ADMIN_DATA_VIEWS['URLS'].insert(0, 'your_plugin_type/plugin_name/admin_data_views.py')
-#     pass
-
-
-# @hookspec
-# def register_settings(settings):
-#     """Mutate settings in place to add your settings / modify existing settings"""
-#     # settings.SOME_KEY = 'some_value'
-#     pass
-
-
-###########################################################################################
-
-@hookspec
-def get_urlpatterns():
-    return []     # e.g. [path('your_plugin_type/plugin_name/url.py', your_view)]
-
-# @hookspec
-# def register_urlpatterns(urlpatterns):
-#     """Mutate urlpatterns in place to add your urlpatterns in a specific position"""
-#     # e.g. urlpatterns.insert(0, path('your_plugin_type/plugin_name/url.py', your_view))
-#     pass
-
-###########################################################################################
-
-@hookspec
-def register_checks():
-    """Register django checks with django system checks system"""
-    pass
-
-@hookspec
-def register_admin(admin_site):
-    """Register django admin views/models with the main django admin site instance"""
-    pass
-
-
-###########################################################################################
-
-
-@hookspec
-def ready():
-    """Called when Django apps app.ready() are triggered"""
-    pass

+ 0 - 101
archivebox/abx/django/use.py

@@ -1,101 +0,0 @@
-__package__ = 'abx.django'
-
-import itertools
-# from benedict import benedict
-
-from .. import pm
-
-
-def get_INSTALLED_APPS():
-    return itertools.chain(*reversed(pm.hook.get_INSTALLED_APPS()))
-
-# def register_INSTALLLED_APPS(INSTALLED_APPS):
-#     pm.hook.register_INSTALLED_APPS(INSTALLED_APPS=INSTALLED_APPS)
-
-
-def get_MIDDLEWARES():
-    return itertools.chain(*reversed(pm.hook.get_MIDDLEWARE()))
-
-# def register_MIDDLEWARES(MIDDLEWARE):
-#     pm.hook.register_MIDDLEWARE(MIDDLEWARE=MIDDLEWARE)
-
-
-def get_AUTHENTICATION_BACKENDS():
-    return itertools.chain(*reversed(pm.hook.get_AUTHENTICATION_BACKENDS()))
-
-# def register_AUTHENTICATION_BACKENDS(AUTHENTICATION_BACKENDS):
-#     pm.hook.register_AUTHENTICATION_BACKENDS(AUTHENTICATION_BACKENDS=AUTHENTICATION_BACKENDS)
-
-
-def get_STATICFILES_DIRS():
-    return itertools.chain(*reversed(pm.hook.get_STATICFILES_DIRS()))
-
-# def register_STATICFILES_DIRS(STATICFILES_DIRS):
-#     pm.hook.register_STATICFILES_DIRS(STATICFILES_DIRS=STATICFILES_DIRS)
-
-
-def get_TEMPLATE_DIRS():
-    return itertools.chain(*reversed(pm.hook.get_TEMPLATE_DIRS()))
-
-# def register_TEMPLATE_DIRS(TEMPLATE_DIRS):
-#     pm.hook.register_TEMPLATE_DIRS(TEMPLATE_DIRS=TEMPLATE_DIRS)
-
-def get_DJANGO_HUEY_QUEUES(QUEUE_DATABASE_NAME='queue.sqlite3'):
-    HUEY_QUEUES = {}
-    for plugin_result in pm.hook.get_DJANGO_HUEY_QUEUES(QUEUE_DATABASE_NAME=QUEUE_DATABASE_NAME):
-        HUEY_QUEUES.update(plugin_result)
-    return HUEY_QUEUES
-
-# def register_DJANGO_HUEY(DJANGO_HUEY):
-#     pm.hook.register_DJANGO_HUEY(DJANGO_HUEY=DJANGO_HUEY)
-
-def get_ADMIN_DATA_VIEWS_URLS():
-    return itertools.chain(*reversed(pm.hook.get_ADMIN_DATA_VIEWS_URLS()))
-
-# def register_ADMIN_DATA_VIEWS(ADMIN_DATA_VIEWS):
-#     pm.hook.register_ADMIN_DATA_VIEWS(ADMIN_DATA_VIEWS=ADMIN_DATA_VIEWS)
-
-
-# def register_settings(settings):
-#     # convert settings dict to an benedict so we can set values using settings.attr = xyz notation
-#     settings_as_obj = benedict(settings, keypath_separator=None)
-    
-#     # set default values for settings that are used by plugins
-#     # settings_as_obj.INSTALLED_APPS = settings_as_obj.get('INSTALLED_APPS', [])
-#     # settings_as_obj.MIDDLEWARE = settings_as_obj.get('MIDDLEWARE', [])
-#     # settings_as_obj.AUTHENTICATION_BACKENDS = settings_as_obj.get('AUTHENTICATION_BACKENDS', [])
-#     # settings_as_obj.STATICFILES_DIRS = settings_as_obj.get('STATICFILES_DIRS', [])
-#     # settings_as_obj.TEMPLATE_DIRS = settings_as_obj.get('TEMPLATE_DIRS', [])
-#     # settings_as_obj.DJANGO_HUEY = settings_as_obj.get('DJANGO_HUEY', {'queues': {}})
-#     # settings_as_obj.ADMIN_DATA_VIEWS = settings_as_obj.get('ADMIN_DATA_VIEWS', {'URLS': []})
-    
-#     # # call all the hook functions to mutate the settings values in-place
-#     # register_INSTALLLED_APPS(settings_as_obj.INSTALLED_APPS)
-#     # register_MIDDLEWARES(settings_as_obj.MIDDLEWARE)
-#     # register_AUTHENTICATION_BACKENDS(settings_as_obj.AUTHENTICATION_BACKENDS)
-#     # register_STATICFILES_DIRS(settings_as_obj.STATICFILES_DIRS)
-#     # register_TEMPLATE_DIRS(settings_as_obj.TEMPLATE_DIRS)
-#     # register_DJANGO_HUEY(settings_as_obj.DJANGO_HUEY)
-#     # register_ADMIN_DATA_VIEWS(settings_as_obj.ADMIN_DATA_VIEWS)
-    
-#     # calls Plugin.settings(settings) on each registered plugin
-#     pm.hook.register_settings(settings=settings_as_obj)
-    
-#     # then finally update the settings globals() object will all the new settings
-#     # settings.update(settings_as_obj)
-
-
-def get_urlpatterns():
-    return list(itertools.chain(*pm.hook.urlpatterns()))
-
-def register_urlpatterns(urlpatterns):
-    pm.hook.register_urlpatterns(urlpatterns=urlpatterns)
-
-
-def register_checks():
-    """register any django system checks"""
-    pm.hook.register_checks()
-
-def register_admin(admin_site):
-    """register any django admin models/views with the main django admin site instance"""
-    pm.hook.register_admin(admin_site=admin_site)

+ 0 - 22
archivebox/abx/hookspec.py

@@ -1,22 +0,0 @@
-from pathlib import Path
-
-from pluggy import HookimplMarker
-from pluggy import HookspecMarker
-
-spec = hookspec = HookspecMarker("abx")
-impl = hookimpl = HookimplMarker("abx")
-
-
-@hookspec
-@hookimpl
-def get_system_user() -> str:
-    # Beware $HOME may not match current EUID, UID, PUID, SUID, there are edge cases
-    # - sudo (EUD != UID != SUID)
-    # - running with an autodetected UID based on data dir ownership
-    #   but mapping of UID:username is broken because it was created
-    #   by a different host system, e.g. 911's $HOME outside of docker
-    #   might be /usr/lib/lxd instead of /home/archivebox
-    # - running as a user that doesn't have a home directory
-    # - home directory is set to a path that doesn't exist, or is inside a dir we can't read
-    return Path('~').expanduser().name
-

+ 0 - 30
archivebox/abx/manager.py

@@ -1,30 +0,0 @@
-import inspect
-
-import pluggy
-
-
-class PluginManager(pluggy.PluginManager):
-    """
-    Patch to fix pluggy's PluginManager to work with pydantic models.
-    See: https://github.com/pytest-dev/pluggy/pull/536
-    """
-    def parse_hookimpl_opts(self, plugin, name: str) -> pluggy.HookimplOpts | None:
-        # IMPORTANT: @property methods can have side effects, and are never hookimpl
-        # if attr is a property, skip it in advance
-        plugin_class = plugin if inspect.isclass(plugin) else type(plugin)
-        if isinstance(getattr(plugin_class, name, None), property):
-            return None
-
-        # pydantic model fields are like attrs and also can never be hookimpls
-        plugin_is_pydantic_obj = hasattr(plugin, "__pydantic_core_schema__")
-        if plugin_is_pydantic_obj and name in getattr(plugin, "model_fields", {}):
-            # pydantic models mess with the class and attr __signature__
-            # so inspect.isroutine(...) throws exceptions and cant be used
-            return None
-        
-        try:
-            return super().parse_hookimpl_opts(plugin, name)
-        except AttributeError:
-            return super().parse_hookimpl_opts(type(plugin), name)
-
-pm = PluginManager("abx")

+ 0 - 1
archivebox/abx/pydantic_pkgr/__init__.py

@@ -1 +0,0 @@
-__package__ = 'abx.pydantic_pkgr'

+ 0 - 13
archivebox/abx/pydantic_pkgr/hookspec.py

@@ -1,13 +0,0 @@
-
-from ..hookspec import hookspec
-
-###########################################################################################
-
-@hookspec
-def get_BINPROVIDERS():
-    return {}
-
-@hookspec
-def get_BINARIES():
-    return {}
-

+ 0 - 0
archivebox/plugins_auth/__init__.py → archivebox/actors/__init__.py


+ 313 - 0
archivebox/actors/actor.py

@@ -0,0 +1,313 @@
+__package__ = 'archivebox.actors'
+
+import os
+import time
+from abc import ABC, abstractmethod
+from typing import ClassVar, Generic, TypeVar, Any, cast, Literal, Type
+from django.utils.functional import classproperty
+
+from rich import print
+import psutil
+
+from django import db
+from django.db import models
+from django.db.models import QuerySet
+from multiprocessing import Process, cpu_count
+from threading import Thread, get_native_id
+
+# from archivebox.logging_util import TimedProgress
+
+LaunchKwargs = dict[str, Any]
+
+ModelType = TypeVar('ModelType', bound=models.Model)
+
+class ActorType(ABC, Generic[ModelType]):
+    """
+    Base class for all actors. Usage:
+    class FaviconActor(ActorType[ArchiveResult]):
+        QUERYSET: ClassVar[QuerySet] = ArchiveResult.objects.filter(status='queued', extractor='favicon')
+        CLAIM_WHERE: ClassVar[str] = 'status = "queued" AND extractor = "favicon"'
+        CLAIM_ORDER: ClassVar[str] = 'created_at DESC'
+        ATOMIC: ClassVar[bool] = True
+
+        def claim_sql_set(self, obj: ArchiveResult) -> str:
+            # SQL fields to update atomically while claiming an object from the queue
+            retry_at = datetime.now() + timedelta(seconds=self.MAX_TICK_TIME)
+            return f"status = 'started', locked_by = {self.pid}, retry_at = {retry_at}"
+
+        def tick(self, obj: ArchiveResult) -> None:
+            run_favicon_extractor(obj)
+            ArchiveResult.objects.filter(pk=obj.pk, status='started').update(status='success')
+    """
+    pid: int
+    idle_count: int = 0
+    launch_kwargs: LaunchKwargs = {}
+    mode: Literal['thread', 'process'] = 'process'
+    
+    MAX_CONCURRENT_ACTORS: ClassVar[int] = min(max(2, int(cpu_count() * 0.6)), 8)   # min 2, max 8, up to 60% of available cpu cores
+    MAX_TICK_TIME: ClassVar[int] = 60                          # maximum duration in seconds to process a single object
+    
+    QUERYSET: ClassVar[QuerySet]                      # the QuerySet to claim objects from
+    CLAIM_WHERE: ClassVar[str] = 'status = "queued"'  # the WHERE clause to filter the objects when atomically getting the next object from the queue
+    CLAIM_SET: ClassVar[str] = 'status = "started"'   # the SET clause to claim the object when atomically getting the next object from the queue
+    CLAIM_ORDER: ClassVar[str] = 'created_at DESC'    # the ORDER BY clause to sort the objects with when atomically getting the next object from the queue
+    CLAIM_FROM_TOP: ClassVar[int] = MAX_CONCURRENT_ACTORS * 10  # the number of objects to consider when atomically getting the next object from the queue
+    ATOMIC: ClassVar[bool] = True                     # whether to atomically fetch+claim the next object in one step, or fetch and lock it in two steps
+    
+    # model_type: Type[ModelType]
+    
+    _SPAWNED_ACTOR_PIDS: ClassVar[list[psutil.Process]] = []   # record all the pids of Actors spawned by this class
+    
+    def __init__(self, mode: Literal['thread', 'process']|None=None, **launch_kwargs: LaunchKwargs):
+        self.mode = mode or self.mode
+        self.launch_kwargs = launch_kwargs or dict(self.launch_kwargs)
+    
+    @classproperty
+    def name(cls) -> str:
+        return cls.__name__  # type: ignore
+    
+    def __str__(self) -> str:
+        return self.__repr__()
+    
+    def __repr__(self) -> str:
+        """FaviconActor[pid=1234]"""
+        label = 'pid' if self.mode == 'process' else 'tid'
+        return f'[underline]{self.name}[/underline]\\[{label}={self.pid}]'
+    
+    ### Class Methods: Called by Orchestrator on ActorType class before it has been spawned
+    
+    @classmethod
+    def get_running_actors(cls) -> list[int]:
+        """returns a list of pids of all running actors of this type"""
+        # WARNING: only works for process actors, not thread actors
+        if cls.mode == 'thread':
+            raise NotImplementedError('get_running_actors() is not implemented for thread actors')
+        return [
+            proc.pid for proc in cls._SPAWNED_ACTOR_PIDS
+            if proc.is_running() and proc.status() != 'zombie'
+        ]
+        
+    @classmethod
+    def get_actors_to_spawn(cls, queue: QuerySet, running_actors: list[int]) -> list[LaunchKwargs]:
+        """Get a list of launch kwargs for the number of actors to spawn based on the queue and currently running actors"""
+        queue_length = queue.count()
+        if not queue_length:                                      # queue is empty, spawn 0 actors
+            return []
+        
+        actors_to_spawn: list[LaunchKwargs] = []
+        max_spawnable = cls.MAX_CONCURRENT_ACTORS - len(running_actors)
+        
+        # spawning new actors is expensive, avoid spawning all the actors at once. To stagger them,
+        # let the next orchestrator tick handle starting another 2 on the next tick()
+        # if queue_length > 10:                                   # queue is long, spawn as many as possible
+        #   actors_to_spawn += max_spawnable * [{}]
+        
+        if queue_length > 4:                                    # queue is medium, spawn 1 or 2 actors
+            actors_to_spawn += min(2, max_spawnable) * [{**cls.launch_kwargs}]
+        else:                                                     # queue is short, spawn 1 actor
+            actors_to_spawn += min(1, max_spawnable) * [{**cls.launch_kwargs}]
+        return actors_to_spawn
+        
+    @classmethod
+    def start(cls, mode: Literal['thread', 'process']='process', **launch_kwargs: LaunchKwargs) -> int:
+        if mode == 'thread':
+            return cls.fork_actor_as_thread(**launch_kwargs)
+        elif mode == 'process':
+            return cls.fork_actor_as_process(**launch_kwargs)
+        raise ValueError(f'Invalid actor mode: {mode} must be "thread" or "process"')
+        
+    @classmethod
+    def fork_actor_as_thread(cls, **launch_kwargs: LaunchKwargs) -> int:
+        """Spawn a new background thread running the actor's runloop"""
+        actor = cls(mode='thread', **launch_kwargs)
+        bg_actor_thread = Thread(target=actor.runloop)
+        bg_actor_thread.start()
+        assert bg_actor_thread.native_id is not None
+        return bg_actor_thread.native_id
+    
+    @classmethod
+    def fork_actor_as_process(cls, **launch_kwargs: LaunchKwargs) -> int:
+        """Spawn a new background process running the actor's runloop"""
+        actor = cls(mode='process', **launch_kwargs)
+        bg_actor_process = Process(target=actor.runloop)
+        bg_actor_process.start()
+        assert bg_actor_process.pid is not None
+        cls._SPAWNED_ACTOR_PIDS.append(psutil.Process(pid=bg_actor_process.pid))
+        return bg_actor_process.pid
+    
+    @classmethod
+    def get_model(cls) -> Type[ModelType]:
+        # wish this was a @classproperty but Generic[ModelType] return type can't be statically inferred for @classproperty
+        return cls.QUERYSET.model
+    
+    @classmethod
+    def get_queue(cls) -> QuerySet:
+        """override this to provide your queryset as the queue"""
+        # return ArchiveResult.objects.filter(status='queued', extractor__in=('pdf', 'dom', 'screenshot'))
+        return cls.QUERYSET
+    
+    ### Instance Methods: Called by Actor after it has been spawned (i.e. forked as a thread or process)
+    
+    def runloop(self):
+        """The main runloop that starts running when the actor is spawned (as subprocess or thread) and exits when the queue is empty"""
+        self.on_startup()
+        try:
+            while True:
+                obj_to_process: ModelType | None = None
+                try:
+                    obj_to_process = cast(ModelType, self.get_next(atomic=self.ATOMIC))
+                except Exception:
+                    pass
+                
+                if obj_to_process:
+                    self.idle_count = 0   # reset idle count if we got an object
+                else:
+                    if self.idle_count >= 30:
+                        break             # stop looping and exit if queue is empty and we have idled for 30sec
+                    else:
+                        # print('Actor runloop()', f'pid={self.pid}', 'queue empty, rechecking...')
+                        self.idle_count += 1
+                        time.sleep(1)
+                        continue
+                
+                self.on_tick_start(obj_to_process)
+                
+                # Process the object
+                try:
+                    self.tick(obj_to_process)
+                except Exception as err:
+                    print(f'[red]🏃‍♂️ ERROR: {self}.tick()[/red]', err)
+                    db.connections.close_all()                         # always reset the db connection after an exception to clear any pending transactions
+                    self.on_tick_exception(obj_to_process, err)
+                finally:
+                    self.on_tick_end(obj_to_process)
+            
+            self.on_shutdown(err=None)
+        except BaseException as err:
+            if isinstance(err, KeyboardInterrupt):
+                print()
+            else:
+                print(f'\n[red]🏃‍♂️ {self}.runloop() FATAL:[/red]', err.__class__.__name__, err)
+            self.on_shutdown(err=err)
+    
+    def get_next(self, atomic: bool | None=None) -> ModelType | None:
+        """get the next object from the queue, atomically locking it if self.atomic=True"""
+        if atomic is None:
+            atomic = self.ATOMIC
+
+        if atomic:
+            # fetch and claim the next object from the queue in one atomic query
+            obj = self.get_next_atomic()
+        else:
+            # two-step claim: fetch the next object and lock it in a separate query
+            obj = self.get_queue().last()
+            assert obj and self.lock_next(obj), f'Unable to fetch+lock the next {self.get_model().__name__} object from {self}.QUERYSET'
+        return obj
+    
+    def lock_next(self, obj: ModelType) -> bool:
+        """override this to implement a custom two-step (non-atomic)lock mechanism"""
+        # For example:
+        # assert obj._model.objects.filter(pk=obj.pk, status='queued').update(status='started', locked_by=self.pid)
+        # Not needed if using get_next_and_lock() to claim the object atomically
+        # print(f'[blue]🏃‍♂️ {self}.lock()[/blue]', obj.abid or obj.id)
+        return True
+    
+    def claim_sql_where(self) -> str:
+        """override this to implement a custom WHERE clause for the atomic claim step e.g. "status = 'queued' AND locked_by = NULL" """
+        return self.CLAIM_WHERE
+    
+    def claim_sql_set(self) -> str:
+        """override this to implement a custom SET clause for the atomic claim step e.g. "status = 'started' AND locked_by = {self.pid}" """
+        return self.CLAIM_SET
+    
+    def claim_sql_order(self) -> str:
+        """override this to implement a custom ORDER BY clause for the atomic claim step e.g. "created_at DESC" """
+        return self.CLAIM_ORDER
+    
+    def claim_from_top(self) -> int:
+        """override this to implement a custom number of objects to consider when atomically claiming the next object from the top of the queue"""
+        return self.CLAIM_FROM_TOP
+        
+    def get_next_atomic(self, shallow: bool=True) -> ModelType | None:
+        """
+        claim a random object from among the top n=50 objects in the queue (atomically updates status=queued->started for the claimed object)
+        optimized to minimize contention on the queue with other actors selecting from the same list
+        slightly faster than claim_any_obj(), which selects randomly from the entire queue but needs to know the total count
+        """
+        Model = self.get_model()                                     # e.g. ArchiveResult
+        table = f'{Model._meta.app_label}_{Model._meta.model_name}'  # e.g. core_archiveresult
+        
+        where_sql = self.claim_sql_where()
+        set_sql = self.claim_sql_set()
+        order_by_sql = self.claim_sql_order()
+        choose_from_top = self.claim_from_top()
+        
+        with db.connection.cursor() as cursor:
+            # the subquery selects the pool of the top N (claim_from_top) candidates sorted by the claim order
+            # the main query atomically claims a random one from that pool
+            cursor.execute(f"""
+                UPDATE {table} 
+                SET {set_sql}
+                WHERE {where_sql} AND id = (
+                    SELECT id FROM (
+                        SELECT id FROM {table}
+                        WHERE {where_sql}
+                        ORDER BY {order_by_sql}
+                        LIMIT {choose_from_top}
+                    ) candidates
+                    ORDER BY RANDOM()
+                    LIMIT 1
+                )
+                RETURNING id;
+            """)
+            result = cursor.fetchone()
+            
+            if result is None:
+                return None           # If no rows were claimed, return None
+
+            if shallow:
+                # shallow: faster, but returns a potentially incomplete object instance missing some django auto-populated fields:
+                columns = [col[0] for col in cursor.description] if cursor.description else ['id']
+                return Model(**dict(zip(columns, result)))
+
+            # if not shallow do one extra query to get a more complete object instance (load it fully from scratch)
+            return Model.objects.get(id=result[0])
+
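+    # With SnapshotActor's default claim clauses (see core/actors.py) the query above renders roughly as:
+    #   UPDATE core_snapshot
+    #   SET status = "started", retry_at = '...'
+    #   WHERE status = "queued" AND id = (
+    #       SELECT id FROM (SELECT id FROM core_snapshot WHERE status = "queued"
+    #                       ORDER BY created_at DESC LIMIT 50) candidates
+    #       ORDER BY RANDOM() LIMIT 1)
+    #   RETURNING id;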
+    @abstractmethod
+    def tick(self, obj: ModelType) -> None:
+        """override this to process the object"""
+        print(f'[blue]🏃‍♂️ {self}.tick()[/blue]', obj.abid or obj.id)
+        # For example:
+        # do_some_task(obj)
+        # do_something_else(obj)
+        # obj._model.objects.filter(pk=obj.pk, status='started').update(status='success')
+        raise NotImplementedError('tick() must be implemented by the Actor subclass')
+    
+    def on_startup(self) -> None:
+        if self.mode == 'thread':
+            self.pid = get_native_id()  # thread id
+            print(f'[green]🏃‍♂️ {self}.on_startup() STARTUP (THREAD)[/green]')
+        else:
+            self.pid = os.getpid()      # process id
+            print(f'[green]🏃‍♂️ {self}.on_startup() STARTUP (PROCESS)[/green]')
+        # abx.pm.hook.on_actor_startup(self)
+        
+    def on_shutdown(self, err: BaseException | None=None) -> None:
+        print(f'[grey53]🏃‍♂️ {self}.on_shutdown() SHUTTING DOWN[/grey53]', err or '[green](gracefully)[/green]')
+        # abx.pm.hook.on_actor_shutdown(self)
+        
+    def on_tick_start(self, obj: ModelType) -> None:
+        # print(f'🏃‍♂️ {self}.on_tick_start()', obj.abid or obj.id)
+        # abx.pm.hook.on_actor_tick_start(self, obj_to_process)
+        # self.timer = TimedProgress(self.MAX_TICK_TIME, prefix='      ')
+        pass
+    
+    def on_tick_end(self, obj: ModelType) -> None:
+        # print(f'🏃‍♂️ {self}.on_tick_end()', obj.abid or obj.id)
+        # abx.pm.hook.on_actor_tick_end(self, obj_to_process)
+        # self.timer.end()
+        pass
+    
+    def on_tick_exception(self, obj: ModelType, err: BaseException) -> None:
+        print(f'[red]🏃‍♂️ {self}.on_tick_exception()[/red]', obj.abid or obj.id, err)
+        # abx.pm.hook.on_actor_tick_exception(self, obj_to_process, err)
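+
+
+# Minimal subclass sketch (hypothetical TitleActor; see FaviconActor in orchestrator.py for a real example):
+#
+#   class TitleActor(ActorType[ArchiveResult]):
+#       QUERYSET = ArchiveResult.objects.filter(status='queued', extractor='title')
+#       CLAIM_WHERE = 'status = "queued" AND extractor = "title"'
+#       CLAIM_SET = 'status = "started"'
+#
+#       def tick(self, obj: ArchiveResult) -> None:
+#           ...   # run the extractor, then mark obj succeeded/failed
+#
+#   TitleActor.start(mode='thread')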

+ 3 - 0
archivebox/actors/admin.py

@@ -0,0 +1,3 @@
+from django.contrib import admin
+
+# Register your models here.

+ 6 - 0
archivebox/actors/apps.py

@@ -0,0 +1,6 @@
+from django.apps import AppConfig
+
+
+class ActorsConfig(AppConfig):
+    default_auto_field = "django.db.models.BigAutoField"
+    name = "actors"

+ 0 - 0
archivebox/plugins_extractor/__init__.py → archivebox/actors/migrations/__init__.py


+ 3 - 0
archivebox/actors/models.py

@@ -0,0 +1,3 @@
+from django.db import models
+
+# Create your models here.

+ 244 - 0
archivebox/actors/orchestrator.py

@@ -0,0 +1,244 @@
+__package__ = 'archivebox.actors'
+
+import os
+import time
+import itertools
+from typing import Dict, Type, Literal, ClassVar
+from django.utils.functional import classproperty
+
+from multiprocessing import Process, cpu_count
+from threading import Thread, get_native_id
+
+
+from rich import print
+
+from django.db.models import QuerySet
+
+from django.apps import apps
+from .actor import ActorType
+
+class Orchestrator:
+    pid: int
+    idle_count: int = 0
+    actor_types: Dict[str, Type[ActorType]]
+    mode: Literal['thread', 'process'] = 'process'
+
+    def __init__(self, actor_types: Dict[str, Type[ActorType]] | None = None, mode: Literal['thread', 'process'] | None=None):
+        self.actor_types = actor_types or getattr(self, 'actor_types', None) or self.autodiscover_actor_types()
+        self.mode = mode or self.mode
+
+    def __repr__(self) -> str:
+        label = 'tid' if self.mode == 'thread' else 'pid'
+        return f'[underline]{self.name}[/underline]\\[{label}={self.pid}]'
+    
+    def __str__(self) -> str:
+        return self.__repr__()
+    
+    @classproperty
+    def name(cls) -> str:
+        return cls.__name__   # type: ignore
+    
+    def fork_as_thread(self):
+        self.thread = Thread(target=self.runloop)
+        self.thread.start()
+        assert self.thread.native_id is not None
+        return self.thread.native_id
+    
+    def fork_as_process(self):
+        self.process = Process(target=self.runloop)
+        self.process.start()
+        assert self.process.pid is not None
+        return self.process.pid
+
+    def start(self) -> int:
+        if self.mode == 'thread':
+            return self.fork_as_thread()
+        elif self.mode == 'process':
+            return self.fork_as_process()
+        raise ValueError(f'Invalid orchestrator mode: {self.mode}')
+    
+    @classmethod
+    def autodiscover_actor_types(cls) -> Dict[str, Type[ActorType]]:
+        # returns a Dict of all discovered {actor_type_id: ActorType} across the codebase
+        # override this method in a subclass to customize the actor types that are used
+        # return {'Snapshot': SnapshotActorType, 'ArchiveResult_chrome': ChromeActorType, ...}
+        return {
+            # look through all models and find all classes that inherit from ActorType
+            # actor_type.__name__: actor_type
+            # for actor_type in abx.pm.hook.get_all_ACTORS_TYPES().values()
+        }
+    
+    @classmethod
+    def get_orphaned_objects(cls, all_queues) -> list:
+        # returns a list of stale objects: anything whose retry_at has passed but that is not in any actor type's queue
+        all_queued_ids = itertools.chain(*[queue.values_list('id', flat=True) for queue in all_queues.values()])
+        orphaned_objects = []
+        for model in apps.get_models():
+            if hasattr(model, 'retry_at'):
+                orphaned_objects.extend(model.objects.filter(retry_at__lt=timezone.now()).exclude(id__in=all_queued_ids))
+        return orphaned_objects
+    
+    def on_startup(self):
+        if self.mode == 'thread':
+            self.pid = get_native_id()
+            print(f'[green]👨‍✈️ {self}.on_startup() STARTUP (THREAD)[/green]')
+        elif self.mode == 'process':
+            self.pid = os.getpid()
+            print(f'[green]👨‍✈️ {self}.on_startup() STARTUP (PROCESS)[/green]')
+        # abx.pm.hook.on_orchestrator_startup(self)
+    
+    def on_shutdown(self, err: BaseException | None = None):
+        print(f'[grey53]👨‍✈️ {self}.on_shutdown() SHUTTING DOWN[/grey53]', err or '[green](gracefully)[/green]')
+        # abx.pm.hook.on_orchestrator_shutdown(self)
+        
+    def on_tick_started(self, all_queues):
+        # total_pending = sum(queue.count() for queue in all_queues.values())
+        # print(f'👨‍✈️ {self}.on_tick_started()', f'total_pending={total_pending}')
+        # abx.pm.hook.on_orchestrator_tick_started(self, actor_types, all_queues)
+        pass
+    
+    def on_tick_finished(self, all_queues, all_existing_actors, all_spawned_actors):
+        if all_spawned_actors:
+            total_queue_length = sum(queue.count() for queue in all_queues.values())
+            print(f'[grey53]👨‍✈️ {self}.on_tick_finished() queue={total_queue_length} existing_actors={len(all_existing_actors)} spawned_actors={len(all_spawned_actors)}[/grey53]')
+        # abx.pm.hook.on_orchestrator_tick_finished(self, actor_types, all_queues)
+
+    def on_idle(self, all_queues):
+        # print(f'👨‍✈️ {self}.on_idle()')
+        # abx.pm.hook.on_orchestrator_idle(self)
+        # check for orphaned objects left behind
+        if self.idle_count == 60:
+            orphaned_objects = self.get_orphaned_objects(all_queues)
+            if orphaned_objects:
+                print('[red]👨‍✈️ WARNING: some objects may not be processed, no actor has claimed them after 60s:[/red]', orphaned_objects)
+
+    def runloop(self):
+        self.on_startup()
+        try:
+            while True:
+                all_queues = {
+                    actor_type: actor_type.get_queue()
+                    for actor_type in self.actor_types.values()
+                }
+                if not all_queues:
+                    raise Exception('Failed to find any actor_types to process')
+
+                self.on_tick_started(all_queues)
+
+                all_existing_actors = []
+                all_spawned_actors = []
+
+                for actor_type, queue in all_queues.items():
+                    try:
+                        existing_actors = actor_type.get_running_actors()
+                        all_existing_actors.extend(existing_actors)
+                        actors_to_spawn = actor_type.get_actors_to_spawn(queue, existing_actors)
+                        for launch_kwargs in actors_to_spawn:
+                            new_actor_pid = actor_type.start(mode='process', **launch_kwargs)
+                            all_spawned_actors.append(new_actor_pid)
+                    except Exception as err:
+                        print(f'[red]👨‍✈️ ERROR: {self} Failed to get {actor_type} queue & running actors[/red]', err)
+                    except BaseException:
+                        raise
+
+                if not any(queue.exists() for queue in all_queues.values()):
+                    self.on_idle(all_queues)
+                    self.idle_count += 1
+                    time.sleep(1)
+                else:
+                    self.idle_count = 0
+                    
+                self.on_tick_finished(all_queues, all_existing_actors, all_spawned_actors)
+                time.sleep(1)
+
+        except BaseException as err:
+            if isinstance(err, KeyboardInterrupt):
+                print()
+            else:
+                print(f'\n[red]👨‍✈️ {self}.runloop() FATAL:[/red]', err.__class__.__name__, err)
+            self.on_shutdown(err=err)
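+
+# Usage sketch (assuming the actor types registered below):
+#   orchestrator = Orchestrator(actor_types={'FaviconActor': FaviconActor}, mode='thread')
+#   orchestrator.start()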
+
+
+
+from archivebox.config.django import setup_django
+
+setup_django()
+
+from core.models import ArchiveResult, Snapshot
+
+from django.utils import timezone
+
+from django import db
+from django.db import connection
+
+
+from crawls.actors import CrawlActor
+from .actor_snapshot import SnapshotActor
+
+from abx_plugin_singlefile.actors import SinglefileActor
+
+
+class FaviconActor(ActorType[ArchiveResult]):
+    CLAIM_ORDER: ClassVar[str] = 'created_at DESC'
+    CLAIM_WHERE: ClassVar[str] = 'status = "queued" AND extractor = "favicon"'
+    CLAIM_SET: ClassVar[str] = 'status = "started"'
+    
+    @classproperty
+    def QUERYSET(cls) -> QuerySet:
+        return ArchiveResult.objects.filter(status='failed', extractor='favicon')
+
+    def tick(self, obj: ArchiveResult):
+        print(f'[grey53]{self}.tick({obj.abid or obj.id}, status={obj.status}) remaining:[/grey53]', self.get_queue().count())
+        updated = ArchiveResult.objects.filter(id=obj.id, status='started').update(status='succeeded') == 1
+        if not updated:
+            raise Exception(f'Failed to update {obj.abid or obj.id}, interrupted by another actor writing to the same object')
+        obj.refresh_from_db()
+        obj.save()
+
+
+class ExtractorsOrchestrator(Orchestrator):
+    actor_types = {
+        'CrawlActor': CrawlActor,
+        'SnapshotActor': SnapshotActor,
+        'FaviconActor': FaviconActor,
+        'SinglefileActor': SinglefileActor,
+    }
+
+
+if __name__ == '__main__':    
+    orchestrator = ExtractorsOrchestrator()
+    orchestrator.start()
+    
+    snap = Snapshot.objects.last()
+    assert snap is not None
+    created = 0
+    while True:
+        time.sleep(0.05)
+        # try:
+        #     ArchiveResult.objects.bulk_create([
+        #         ArchiveResult(
+        #             id=uuid.uuid4(),
+        #             snapshot=snap,
+        #             status='failed',
+        #             extractor='favicon',
+        #             cmd=['echo', '"hello"'],
+        #             cmd_version='1.0',
+        #             pwd='.',
+        #             start_ts=timezone.now(),
+        #             end_ts=timezone.now(),
+        #             created_at=timezone.now(),
+        #             modified_at=timezone.now(),
+        #             created_by_id=1,
+        #         )
+        #         for _ in range(100)
+        #     ])
+        #     created += 100
+        #     if created % 1000 == 0:
+        #         print(f'[blue]Created {created} ArchiveResults...[/blue]')
+        #         time.sleep(25)
+        # except Exception as err:
+        #     print(err)
+        #     db.connections.close_all()
+        # except BaseException as err:
+        #     print(err)
+        #     break

+ 286 - 0
archivebox/actors/statemachine.py

@@ -0,0 +1,286 @@
+from statemachine import State, StateMachine
+from django.db import models, transaction
+from django.db.models import Q
+from django.utils import timezone
+from datetime import timedelta
+from multiprocessing import Process
+import abx
+import psutil
+import time
+import os
+
+# State Machine Definitions
+#################################################
+
+class SnapshotMachine(StateMachine):
+    """State machine for managing Snapshot lifecycle."""
+    
+    # States
+    queued = State(initial=True)
+    started = State()
+    sealed = State(final=True)
+    
+    # Transitions
+    start = queued.to(started, cond='can_start')
+    seal = started.to(sealed, cond='is_finished')
+    
+    # Events
+    tick = (
+        queued.to.itself(unless='can_start') |
+        queued.to(started, cond='can_start') |
+        started.to.itself(unless='is_finished') |
+        started.to(sealed, cond='is_finished')
+    )
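+    # python-statemachine tries these tick transitions in declaration order and fires the
+    # first one whose cond/unless guards pass; the .itself() fallbacks keep tick() from
+    # raising TransitionNotAllowed while the snapshot isn't ready to advance yet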
+    
+    def __init__(self, snapshot):
+        self.snapshot = snapshot
+        super().__init__()
+        
+    def can_start(self):
+        return True
+        
+    def is_finished(self):
+        return not self.snapshot.has_pending_archiveresults()
+        
+    def before_start(self):
+        """Pre-start validation and setup."""
+        self.snapshot.cleanup_dir()
+        
+    def after_start(self):
+        """Post-start side effects."""
+        self.snapshot.create_pending_archiveresults()
+        self.snapshot.update_indices()
+        self.snapshot.bump_retry_at(seconds=10)
+        
+    def before_seal(self):
+        """Pre-seal validation and cleanup."""
+        self.snapshot.cleanup_dir()
+        
+    def after_seal(self):
+        """Post-seal actions."""
+        self.snapshot.update_indices()
+        self.snapshot.seal_dir()
+        self.snapshot.upload_dir()
+        self.snapshot.retry_at = None
+        self.snapshot.save()
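+
+# Usage sketch (assuming `snapshot` is an instance of the Snapshot model below):
+#   machine = SnapshotMachine(snapshot)
+#   machine.tick()    # queued -> started (runs before_start/after_start side effects)
+#   machine.tick()    # started -> sealed once has_pending_archiveresults() is False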
+
+
+class ArchiveResultMachine(StateMachine):
+    """State machine for managing ArchiveResult lifecycle."""
+    
+    # States
+    queued = State(initial=True)
+    started = State()
+    succeeded = State(final=True)
+    backoff = State()
+    failed = State(final=True)
+    
+    # Transitions
+    start = queued.to(started, cond='can_start')
+    succeed = started.to(succeeded, cond='extractor_succeeded')
+    enter_backoff = started.to(backoff, unless='extractor_succeeded', event='backoff')  # attr renamed so it doesn't shadow the backoff State above
+    retry = backoff.to(queued, cond='can_retry')
+    fail = backoff.to(failed, unless='can_retry')
+    
+    # Events
+    tick = (
+        queued.to.itself(unless='can_start') |
+        queued.to(started, cond='can_start') |
+        started.to.itself(cond='extractor_still_running') |
+        started.to(succeeded, cond='extractor_succeeded') |
+        started.to(backoff, unless='extractor_succeeded') |
+        backoff.to.itself(cond='still_waiting_to_retry') |
+        backoff.to(queued, cond='can_retry') |
+        backoff.to(failed, unless='can_retry')
+    )
+    
+    def __init__(self, archiveresult):
+        self.archiveresult = archiveresult
+        super().__init__()
+    
+    def can_start(self):
+        return True
+    
+    def extractor_still_running(self):
+        return self.archiveresult.start_ts > timezone.now() - timedelta(seconds=5)
+    
+    def extractor_succeeded(self):
+        # return check_if_extractor_succeeded(self.archiveresult)
+        return self.archiveresult.start_ts < timezone.now() - timedelta(seconds=5)
+    
+    def can_retry(self):
+        return self.archiveresult.retries < self.archiveresult.max_retries
+    
+    def still_waiting_to_retry(self):
+        # guard referenced by the tick event above; assumes retry_at is set while in backoff
+        return self.archiveresult.retry_at and self.archiveresult.retry_at > timezone.now()
+        
+    def before_start(self):
+        """Pre-start initialization."""
+        self.archiveresult.retries += 1
+        self.archiveresult.start_ts = timezone.now()
+        self.archiveresult.output = None
+        self.archiveresult.error = None
+        
+    def after_start(self):
+        """Post-start execution."""
+        self.archiveresult.bump_retry_at(seconds=self.archiveresult.timeout + 5)
+        execute_extractor(self.archiveresult)
+        self.archiveresult.snapshot.bump_retry_at(seconds=5)
+        
+    def before_succeed(self):
+        """Pre-success validation."""
+        self.archiveresult.output = get_archiveresult_output(self.archiveresult)
+        
+    def after_succeed(self):
+        """Post-success cleanup."""
+        self.archiveresult.end_ts = timezone.now()
+        self.archiveresult.retry_at = None
+        self.archiveresult.update_indices()
+        
+    def before_backoff(self):
+        """Pre-backoff error capture."""
+        self.archiveresult.error = get_archiveresult_error(self.archiveresult)
+        
+    def after_backoff(self):
+        """Post-backoff retry scheduling."""
+        self.archiveresult.end_ts = timezone.now()
+        self.archiveresult.bump_retry_at(
+            seconds=self.archiveresult.timeout * self.archiveresult.retries
+        )
+        self.archiveresult.update_indices()
+        
+    def before_fail(self):
+        """Pre-failure finalization."""
+        self.archiveresult.retry_at = None
+        
+    def after_fail(self):
+        """Post-failure cleanup."""
+        self.archiveresult.update_indices()
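+
+# Lifecycle sketch: queued -> started -> succeeded,
+# or queued -> started -> backoff -> queued (retry) -> ... -> failed once retries >= max_retries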
+
+# Models
+#################################################
+
+class Snapshot(models.Model):
+    status = models.CharField(max_length=32, default='queued')
+    retry_at = models.DateTimeField(null=True)
+    
+    @property
+    def sm(self):
+        """Get the state machine for this snapshot."""
+        return SnapshotMachine(self)
+    
+    def get_machine(self):
+        # alias matching ArchiveResult.get_machine() so BaseActor.tick() can handle both models
+        return SnapshotMachine(self)
+    
+    def has_pending_archiveresults(self):
+        return self.archiveresult_set.exclude(
+            status__in=['succeeded', 'failed']
+        ).exists()
+    
+    def bump_retry_at(self, seconds):
+        self.retry_at = timezone.now() + timedelta(seconds=seconds)
+        self.save()
+        
+    def cleanup_dir(self):
+        cleanup_snapshot_dir(self)
+        
+    def create_pending_archiveresults(self):
+        create_snapshot_pending_archiveresults(self)
+        
+    def update_indices(self):
+        update_snapshot_index_json(self)
+        update_snapshot_index_html(self)
+        
+    def seal_dir(self):
+        seal_snapshot_dir(self)
+        
+    def upload_dir(self):
+        upload_snapshot_dir(self)
+
+
+class ArchiveResult(models.Model):
+    snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE)
+    status = models.CharField(max_length=32, default='queued')
+    retry_at = models.DateTimeField(null=True)
+    retries = models.IntegerField(default=0)
+    max_retries = models.IntegerField(default=3)
+    timeout = models.IntegerField(default=60)
+    start_ts = models.DateTimeField(null=True)
+    end_ts = models.DateTimeField(null=True)
+    output = models.TextField(null=True)
+    error = models.TextField(null=True)
+    
+    def get_machine(self):
+        return ArchiveResultMachine(self)
+    
+    def bump_retry_at(self, seconds):
+        self.retry_at = timezone.now() + timedelta(seconds=seconds)
+        self.save()
+        
+    def update_indices(self):
+        update_archiveresult_index_json(self)
+        update_archiveresult_index_html(self)
+
+
+# Actor System
+#################################################
+
+class BaseActor:
+    MAX_TICK_TIME = 60
+    
+    def tick(self, obj):
+        """Process a single object through its state machine."""
+        machine = obj.get_machine()
+        
+        if machine.current_state.id == 'queued':
+            if machine.can_start():
+                machine.start()
+                
+        elif machine.current_state.id == 'started':
+            if machine.is_finished():
+                machine.seal()
+                
+        elif machine.current_state.id == 'backoff':
+            if machine.can_retry():
+                machine.retry()
+            else:
+                machine.fail()
+
+
+class Orchestrator:
+    """Main orchestrator that manages all actors."""
+    
+    def __init__(self):
+        self.pid = None
+        
+    @classmethod
+    def spawn(cls):
+        orchestrator = cls()
+        proc = Process(target=orchestrator.runloop)
+        proc.start()
+        return proc.pid
+        
+    def runloop(self):
+        self.pid = os.getpid()
+        abx.pm.hook.on_orchestrator_startup(self)
+        
+        try:
+            while True:
+                self.process_queue(Snapshot)
+                self.process_queue(ArchiveResult)
+                time.sleep(0.1)
+                
+        except (KeyboardInterrupt, SystemExit):
+            abx.pm.hook.on_orchestrator_shutdown(self)
+            
+    def process_queue(self, model):
+        retry_at_reached = Q(retry_at__isnull=True) | Q(retry_at__lte=timezone.now())
+        queue = model.objects.filter(retry_at_reached)
+        
+        if queue.exists():
+            actor = BaseActor()
+            for obj in queue:
+                try:
+                    with transaction.atomic():
+                        actor.tick(obj)
+                except Exception as e:
+                    abx.pm.hook.on_actor_tick_exception(actor, obj, e)
+
+
+# Periodic Tasks
+#################################################
+
+# NOTE: this sketch assumes a djhuey wrapper module exposing huey's periodic-task and crontab helpers
[email protected]_task(schedule=djhuey.crontab(minute='*'))
+def ensure_orchestrator_running():
+    """Ensure orchestrator is running, start if not."""
+    if not any(p.name().startswith('Orchestrator') for p in psutil.process_iter()):
+        Orchestrator.spawn()

+ 3 - 0
archivebox/actors/tests.py

@@ -0,0 +1,3 @@
+from django.test import TestCase
+
+# Create your tests here.

+ 3 - 0
archivebox/actors/views.py

@@ -0,0 +1,3 @@
+from django.shortcuts import render
+
+# Create your views here.

+ 24 - 30
archivebox/config/__init__.py

@@ -1,4 +1,5 @@
-__package__ = 'archivebox.config'
+__package__ = 'config'
+__order__ = 200
 
 from .paths import (
     PACKAGE_DIR,                                    # noqa
@@ -8,35 +9,28 @@ from .paths import (
 from .constants import CONSTANTS, CONSTANTS_CONFIG, PACKAGE_DIR, DATA_DIR, ARCHIVE_DIR      # noqa
 from .version import VERSION                        # noqa
 
-
-import abx
-
+# import abx
 
 # @abx.hookimpl
-# def get_INSTALLED_APPS():
-#     return ['config']
-
+# def get_CONFIG():
+#     from .common import (
+#         SHELL_CONFIG,
+#         STORAGE_CONFIG,
+#         GENERAL_CONFIG,
+#         SERVER_CONFIG,
+#         ARCHIVING_CONFIG,
+#         SEARCH_BACKEND_CONFIG,
+#     )
+#     return {
+#         'SHELL_CONFIG': SHELL_CONFIG,
+#         'STORAGE_CONFIG': STORAGE_CONFIG,
+#         'GENERAL_CONFIG': GENERAL_CONFIG,
+#         'SERVER_CONFIG': SERVER_CONFIG,
+#         'ARCHIVING_CONFIG': ARCHIVING_CONFIG,
+#         'SEARCHBACKEND_CONFIG': SEARCH_BACKEND_CONFIG,
+#     }
 
[email protected]
-def get_CONFIG():
-    from .common import (
-        SHELL_CONFIG,
-        STORAGE_CONFIG,
-        GENERAL_CONFIG,
-        SERVER_CONFIG,
-        ARCHIVING_CONFIG,
-        SEARCH_BACKEND_CONFIG,
-    )
-    return {
-        'SHELL_CONFIG': SHELL_CONFIG,
-        'STORAGE_CONFIG': STORAGE_CONFIG,
-        'GENERAL_CONFIG': GENERAL_CONFIG,
-        'SERVER_CONFIG': SERVER_CONFIG,
-        'ARCHIVING_CONFIG': ARCHIVING_CONFIG,
-        'SEARCHBACKEND_CONFIG': SEARCH_BACKEND_CONFIG,
-    }
-
[email protected]
-def ready():
-    for config in get_CONFIG().values():
-        config.validate()
+# @abx.hookimpl
+# def ready():
+#     for config in get_CONFIG().values():
+#         config.validate()

+ 10 - 10
archivebox/config/configfile.py → archivebox/config/collection.py

@@ -9,16 +9,18 @@ from configparser import ConfigParser
 
 from benedict import benedict
 
+import archivebox
+
 from archivebox.config.constants import CONSTANTS
 
 from archivebox.misc.logging import stderr
 
 
 def get_real_name(key: str) -> str:
-    """get the current canonical name for a given deprecated config key"""
-    from django.conf import settings
+    """get the up-to-date canonical name for a given old alias or current key"""
+    CONFIGS = archivebox.pm.hook.get_CONFIGS()
     
-    for section in settings.CONFIGS.values():
+    for section in CONFIGS.values():
         try:
             return section.aliases[key]
         except KeyError:
@@ -115,17 +117,15 @@ def load_config_file() -> Optional[benedict]:
 
 
 def section_for_key(key: str) -> Any:
-    from django.conf import settings
-    for config_section in settings.CONFIGS.values():
+    for config_section in archivebox.pm.hook.get_CONFIGS().values():
         if hasattr(config_section, key):
             return config_section
-    return None
+    raise ValueError(f'No config section found for key: {key}')
 
 
 def write_config_file(config: Dict[str, str]) -> benedict:
     """load the ini-formatted config file from DATA_DIR/Archivebox.conf"""
 
-    import abx.archivebox.reads
     from archivebox.misc.system import atomic_write
 
     CONFIG_HEADER = (
@@ -175,7 +175,7 @@ def write_config_file(config: Dict[str, str]) -> benedict:
     updated_config = {}
     try:
         # validate the updated_config by attempting to re-parse it
-        updated_config = {**load_all_config(), **abx.archivebox.reads.get_FLAT_CONFIG()}
+        updated_config = {**load_all_config(), **archivebox.pm.hook.get_FLAT_CONFIG()}
     except BaseException:                                                       # lgtm [py/catch-base-exception]
         # something went horribly wrong, revert to the previous version
         with open(f'{config_path}.bak', 'r', encoding='utf-8') as old:
@@ -233,11 +233,11 @@ def load_config(defaults: Dict[str, Any],
     return benedict(extended_config)
 
 def load_all_config():
-    import abx.archivebox.reads
+    import abx
     
     flat_config = benedict()
     
-    for config_section in abx.archivebox.reads.get_CONFIGS().values():
+    for config_section in abx.pm.hook.get_CONFIGS().values():
         config_section.__init__()
         flat_config.update(config_section.model_dump())
         

+ 1 - 3
archivebox/config/common.py

@@ -10,7 +10,7 @@ from rich import print
 from pydantic import Field, field_validator
 from django.utils.crypto import get_random_string
 
-from abx.archivebox.base_configset import BaseConfigSet
+from abx_spec_config.base_configset import BaseConfigSet
 
 from .constants import CONSTANTS
 from .version import get_COMMIT_HASH, get_BUILD_TIME, VERSION
@@ -45,8 +45,6 @@ class ShellConfig(BaseConfigSet):
     def BUILD_TIME(self) -> str:
         return get_BUILD_TIME()
  
-    # def VERSIONS_AVAILABLE() -> bool             # .check_for_update.get_versions_available_on_github(c)},
-    # def CAN_UPGRADE() -> bool                    # .check_for_update.can_upgrade(c)},
 
 SHELL_CONFIG = ShellConfig()
 

+ 17 - 2
archivebox/config/constants.py

@@ -1,3 +1,15 @@
+"""
+Constants are for things that never change at runtime.
+(but they can change from run-to-run or machine-to-machine)
+
+DATA_DIR will never change at runtime, but you can run
+archivebox from inside a different DATA_DIR on the same machine.
+
+This is loaded very early in the archivebox startup flow, so nothing in this file 
+or imported from this file should import anything from archivebox.config.common, 
+django, other INSTALLED_APPS, or anything else that is not in the standard library.
+"""
+
 __package__ = 'archivebox.config'
 
 import re
@@ -197,10 +209,12 @@ class ConstantsDict(Mapping):
 
     @classmethod
     def __getitem__(cls, key: str):
+        # so that CONSTANTS['KEY'] behaves the same as CONSTANTS.KEY attribute access
         return getattr(cls, key)
     
     @classmethod
     def __benedict__(cls):
+        # when casting to benedict, only include uppercase keys that don't start with an underscore
         return benedict({key: value for key, value in cls.__dict__.items() if key.isupper() and not key.startswith('_')})
     
     @classmethod
@@ -214,5 +228,6 @@ class ConstantsDict(Mapping):
 CONSTANTS = ConstantsDict()
 CONSTANTS_CONFIG = CONSTANTS.__benedict__()
 
-# add all key: values to globals() for easier importing
-globals().update(CONSTANTS)
+# add all key: values to globals() for easier importing, e.g.:
+# from archivebox.config.constants import IS_ROOT, PERSONAS_DIR, ...
+# globals().update(CONSTANTS)

+ 2 - 2
archivebox/config/django.py

@@ -60,7 +60,7 @@ def setup_django(check_db=False, in_memory_db=False) -> None:
         return
 
     with Progress(transient=True, expand=True, console=STDERR) as INITIAL_STARTUP_PROGRESS:
-        INITIAL_STARTUP_PROGRESS_TASK = INITIAL_STARTUP_PROGRESS.add_task("[green]Loading modules...", total=25)
+        INITIAL_STARTUP_PROGRESS_TASK = INITIAL_STARTUP_PROGRESS.add_task("[green]Loading modules...", total=25, visible=False)
         
         from archivebox.config.permissions import IS_ROOT, ARCHIVEBOX_USER, ARCHIVEBOX_GROUP, SudoPermission
     
@@ -97,7 +97,7 @@ def setup_django(check_db=False, in_memory_db=False) -> None:
                 except Exception as e:
                     bump_startup_progress_bar(advance=1000)
                     
-                    is_using_meta_cmd = any(ignored_subcommand in sys.argv for ignored_subcommand in ('help', 'version', '--help', '--version', 'init'))
+                    is_using_meta_cmd = any(ignored_subcommand in sys.argv for ignored_subcommand in ('help', 'version', '--help', '--version'))
                     if not is_using_meta_cmd:
                         # show error message to user only if they're not running a meta command / just trying to get help
                         STDERR.print()

+ 8 - 4
archivebox/config/version.py

@@ -45,7 +45,7 @@ def detect_installed_version(PACKAGE_DIR: Path=PACKAGE_DIR):
 @cache
 def get_COMMIT_HASH() -> Optional[str]:
     try:
-        git_dir = PACKAGE_DIR / '../.git'
+        git_dir = PACKAGE_DIR.parent / '.git'
         ref = (git_dir / 'HEAD').read_text().strip().split(' ')[-1]
         commit_hash = git_dir.joinpath(ref).read_text().strip()
         return commit_hash
@@ -53,7 +53,7 @@ def get_COMMIT_HASH() -> Optional[str]:
         pass
 
     try:
-        return list((PACKAGE_DIR / '../.git/refs/heads/').glob('*'))[0].read_text().strip()
+        return list((PACKAGE_DIR.parent / '.git/refs/heads/').glob('*'))[0].read_text().strip()
     except Exception:
         pass
     
@@ -62,8 +62,12 @@ def get_COMMIT_HASH() -> Optional[str]:
 @cache
 def get_BUILD_TIME() -> str:
     if IN_DOCKER:
-        docker_build_end_time = Path('/VERSION.txt').read_text().rsplit('BUILD_END_TIME=')[-1].split('\n', 1)[0]
-        return docker_build_end_time
+        try:
+            # if we're in the archivebox official docker image, /VERSION.txt will contain the build time
+            docker_build_end_time = Path('/VERSION.txt').read_text().rsplit('BUILD_END_TIME=')[-1].split('\n', 1)[0]
+            return docker_build_end_time
+        except Exception:
+            pass
 
     src_last_modified_unix_timestamp = (PACKAGE_DIR / 'README.md').stat().st_mtime
     return datetime.fromtimestamp(src_last_modified_unix_timestamp).strftime('%Y-%m-%d %H:%M:%S %s')

+ 12 - 16
archivebox/config/views.py

@@ -14,8 +14,8 @@ from django.utils.html import format_html, mark_safe
 from admin_data_views.typing import TableContext, ItemContext
 from admin_data_views.utils import render_with_table_view, render_with_item_view, ItemLink
 
-import abx.archivebox.reads
-
+import abx
+import archivebox
 from archivebox.config import CONSTANTS
 from archivebox.misc.util import parse_date
 
@@ -65,7 +65,7 @@ def obj_to_yaml(obj: Any, indent: int=0) -> str:
 
 @render_with_table_view
 def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
-
+    FLAT_CONFIG = archivebox.pm.hook.get_FLAT_CONFIG()
     assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
 
     rows = {
@@ -81,12 +81,11 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
 
     relevant_configs = {
         key: val
-        for key, val in settings.FLAT_CONFIG.items()
+        for key, val in FLAT_CONFIG.items()
         if '_BINARY' in key or '_VERSION' in key
     }
 
-    for plugin_id, plugin in abx.archivebox.reads.get_PLUGINS().items():
-        plugin = abx.archivebox.reads.get_PLUGIN(plugin_id)
+    for plugin_id, plugin in abx.get_all_plugins().items():
         if not plugin.hooks.get('get_BINARIES'):
             continue
         
@@ -131,17 +130,16 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
 @render_with_item_view
 def binary_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
 
-    assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
+    assert request.user and request.user.is_superuser, 'Must be a superuser to view configuration settings.'
 
     binary = None
     plugin = None
-    for plugin_id in abx.archivebox.reads.get_PLUGINS().keys():
-        loaded_plugin = abx.archivebox.reads.get_PLUGIN(plugin_id)
+    for plugin_id, plugin in abx.get_all_plugins().items():
         try:
-            for loaded_binary in loaded_plugin.hooks.get_BINARIES().values():
+            for loaded_binary in plugin['hooks'].get_BINARIES().values():
                 if loaded_binary.name == key:
                     binary = loaded_binary
-                    plugin = loaded_plugin
+                    # plugin is already bound to the matching plugin by the loop variable
                     # break  # last write wins
         except Exception as e:
             print(e)
@@ -161,7 +159,7 @@ def binary_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
                 "name": binary.name,
                 "description": binary.abspath,
                 "fields": {
-                    'plugin': plugin.package,
+                    'plugin': plugin['package'],
                     'binprovider': binary.loaded_binprovider,
                     'abspath': binary.loaded_abspath,
                     'version': binary.loaded_version,
@@ -215,9 +213,7 @@ def plugins_list_view(request: HttpRequest, **kwargs) -> TableContext:
                 return color
         return 'black'
 
-    for plugin_id in settings.PLUGINS.keys():
-        
-        plugin = abx.archivebox.reads.get_PLUGIN(plugin_id)
+    for plugin_id, plugin in abx.get_all_plugins().items():
         plugin.hooks.get_BINPROVIDERS = plugin.hooks.get('get_BINPROVIDERS', lambda: {})
         plugin.hooks.get_BINARIES = plugin.hooks.get('get_BINARIES', lambda: {})
         plugin.hooks.get_CONFIG = plugin.hooks.get('get_CONFIG', lambda: {})
@@ -263,7 +259,7 @@ def plugin_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
 
     assert plugin_id, f'Could not find a plugin matching the specified name: {key}'
 
-    plugin = abx.archivebox.reads.get_PLUGIN(plugin_id)
+    plugin = abx.get_plugin(plugin_id)
 
     return ItemContext(
         slug=key,

+ 29 - 0
archivebox/core/__init__.py

@@ -1,2 +1,31 @@
 __package__ = 'archivebox.core'
 
+import abx
+
[email protected]
+def register_admin(admin_site):
+    """Register the core.models views (Snapshot, ArchiveResult, Tag, etc.) with the admin site"""
+    from core.admin import register_admin
+    register_admin(admin_site)
+
+
+
[email protected]
+def get_CONFIG():
+    from archivebox.config.common import (
+        SHELL_CONFIG,
+        STORAGE_CONFIG,
+        GENERAL_CONFIG,
+        SERVER_CONFIG,
+        ARCHIVING_CONFIG,
+        SEARCH_BACKEND_CONFIG,
+    )
+    return {
+        'SHELL_CONFIG': SHELL_CONFIG,
+        'STORAGE_CONFIG': STORAGE_CONFIG,
+        'GENERAL_CONFIG': GENERAL_CONFIG,
+        'SERVER_CONFIG': SERVER_CONFIG,
+        'ARCHIVING_CONFIG': ARCHIVING_CONFIG,
+        'SEARCHBACKEND_CONFIG': SEARCH_BACKEND_CONFIG,
+    }
+

+ 73 - 0
archivebox/core/actors.py

@@ -0,0 +1,73 @@
+__package__ = 'archivebox.core'
+
+from typing import ClassVar
+
+from rich import print
+
+from django.db.models import QuerySet
+from django.utils import timezone
+from datetime import timedelta
+from core.models import Snapshot
+
+from actors.actor import ActorType
+
+
+class SnapshotActor(ActorType[Snapshot]):
+    
+    QUERYSET: ClassVar[QuerySet] = Snapshot.objects.filter(status='queued')
+    CLAIM_WHERE: ClassVar[str] = 'status = "queued"'  # the WHERE clause to filter the objects when atomically getting the next object from the queue
+    CLAIM_SET: ClassVar[str] = 'status = "started"'   # the SET clause to claim the object when atomically getting the next object from the queue
+    CLAIM_ORDER: ClassVar[str] = 'created_at DESC'    # the ORDER BY clause to sort the objects with when atomically getting the next object from the queue
+    CLAIM_FROM_TOP: ClassVar[int] = 50                # the number of objects to consider when atomically getting the next object from the queue
+    
+    # model_type: Type[ModelType]
+    MAX_CONCURRENT_ACTORS: ClassVar[int] = 4               # min 2, max 8, up to 60% of available cpu cores
+    MAX_TICK_TIME: ClassVar[int] = 60                          # maximum duration in seconds to process a single object
+    
+    def claim_sql_where(self) -> str:
+        """override this to implement a custom WHERE clause for the atomic claim step e.g. "status = 'queued' AND locked_by = NULL" """
+        return self.CLAIM_WHERE
+    
+    def claim_sql_set(self) -> str:
+        """override this to implement a custom SET clause for the atomic claim step e.g. "status = 'started' AND locked_by = {self.pid}" """
+        retry_at = timezone.now() + timedelta(seconds=self.MAX_TICK_TIME)
+        # format as 2024-10-31 10:14:33.240903
+        retry_at_str = retry_at.strftime('%Y-%m-%d %H:%M:%S.%f')
+        return f'{self.CLAIM_SET}, retry_at = {retry_at_str}'
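+        # e.g. renders as: status = "started", retry_at = '2024-10-31 10:14:33.240903'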
+    
+    def claim_sql_order(self) -> str:
+        """override this to implement a custom ORDER BY clause for the atomic claim step e.g. "created_at DESC" """
+        return self.CLAIM_ORDER
+    
+    def claim_from_top(self) -> int:
+        """override this to implement a custom number of objects to consider when atomically claiming the next object from the top of the queue"""
+        return self.CLAIM_FROM_TOP
+        
+    def tick(self, obj: Snapshot) -> None:
+        """override this to process the object"""
+        print(f'[blue]🏃‍♂️ {self}.tick()[/blue]', obj.abid or obj.id)
+        # For example:
+        # do_some_task(obj)
+        # do_something_else(obj)
+        # obj._model.objects.filter(pk=obj.pk, status='started').update(status='success')
+        # raise NotImplementedError('tick() must be implemented by the Actor subclass')
+    
+    def on_shutdown(self, err: BaseException | None=None) -> None:
+        print(f'[grey53]🏃‍♂️ {self}.on_shutdown() SHUTTING DOWN[/grey53]', err or '[green](gracefully)[/green]')
+        # abx.pm.hook.on_actor_shutdown(self)
+        
+    def on_tick_start(self, obj: Snapshot) -> None:
+        # print(f'🏃‍♂️ {self}.on_tick_start()', obj.abid or obj.id)
+        # abx.pm.hook.on_actor_tick_start(self, obj_to_process)
+        # self.timer = TimedProgress(self.MAX_TICK_TIME, prefix='      ')
+        pass
+    
+    def on_tick_end(self, obj: Snapshot) -> None:
+        # print(f'🏃‍♂️ {self}.on_tick_end()', obj.abid or obj.id)
+        # abx.pm.hook.on_actor_tick_end(self, obj_to_process)
+        # self.timer.end()
+        pass
+    
+    def on_tick_exception(self, obj: Snapshot, err: BaseException) -> None:
+        print(f'[red]🏃‍♂️ {self}.on_tick_exception()[/red]', obj.abid or obj.id, err)
+        # abx.pm.hook.on_actor_tick_exception(self, obj_to_process, err)

+ 2 - 2
archivebox/core/admin_archiveresults.py

@@ -8,7 +8,7 @@ from django.utils.html import format_html, mark_safe
 from django.core.exceptions import ValidationError
 from django.urls import reverse, resolve
 from django.utils import timezone
-from django.forms import forms
+from django_jsonform.forms.fields import JSONFormField
 
 from huey_monitor.admin import TaskModel
 
@@ -83,7 +83,7 @@ class ArchiveResultInline(admin.TabularInline):
         formset.form.base_fields['cmd_version'].initial = '-'
         formset.form.base_fields['pwd'].initial = str(snapshot.link_dir)
         formset.form.base_fields['created_by'].initial = request.user
-        formset.form.base_fields['cmd'] = forms.JSONField(initial=['-'])
+        formset.form.base_fields['cmd'] = JSONFormField(initial=['-'])
         formset.form.base_fields['output'].initial = 'Manually recorded cmd output...'
         
         if obj is not None:

+ 2 - 2
archivebox/core/admin_site.py

@@ -2,7 +2,7 @@ __package__ = 'archivebox.core'
 
 from django.contrib import admin
 
-import abx.django.use
+import archivebox
 
 class ArchiveBoxAdmin(admin.AdminSite):
     site_header = 'ArchiveBox'
@@ -37,6 +37,6 @@ def register_admin_site():
     sites.site = archivebox_admin
     
     # register all plugins admin classes
-    abx.django.use.register_admin(archivebox_admin)
+    archivebox.pm.hook.register_admin(admin_site=archivebox_admin)
     
     return archivebox_admin

+ 4 - 9
archivebox/core/apps.py

@@ -2,7 +2,7 @@ __package__ = 'archivebox.core'
 
 from django.apps import AppConfig
 
-import abx
+import archivebox
 
 
 class CoreConfig(AppConfig):
@@ -10,16 +10,11 @@ class CoreConfig(AppConfig):
 
     def ready(self):
         """Register the archivebox.core.admin_site as the main django admin site"""
+        from django.conf import settings
+        archivebox.pm.hook.ready(settings=settings)
+        
         from core.admin_site import register_admin_site
         register_admin_site()
         
-        abx.pm.hook.ready()
-
-
 
 
[email protected]
-def register_admin(admin_site):
-    """Register the core.models views (Snapshot, ArchiveResult, Tag, etc.) with the admin site"""
-    from core.admin import register_admin
-    register_admin(admin_site)

+ 61 - 10
archivebox/core/models.py

@@ -8,21 +8,25 @@ import os
 import json
 
 from pathlib import Path
+from datetime import timedelta
 
 from django.db import models
 from django.utils.functional import cached_property
 from django.utils.text import slugify
+from django.utils import timezone
 from django.core.cache import cache
 from django.urls import reverse, reverse_lazy
 from django.db.models import Case, When, Value, IntegerField
 from django.contrib import admin
 from django.conf import settings
 
+from statemachine.mixins import MachineMixin
+
 from archivebox.config import CONSTANTS
 
 from abid_utils.models import ABIDModel, ABIDField, AutoDateTimeField
 from queues.tasks import bg_archive_snapshot
-# from crawls.models import Crawl
+from crawls.models import Crawl
 # from machine.models import Machine, NetworkInterface
 
 from archivebox.misc.system import get_dir_size
@@ -152,7 +156,7 @@ class SnapshotManager(models.Manager):
         return super().get_queryset().prefetch_related('tags', 'archiveresult_set')  # .annotate(archiveresult_count=models.Count('archiveresult')).distinct()
 
 
-class Snapshot(ABIDModel):
+class Snapshot(ABIDModel, MachineMixin):
     abid_prefix = 'snp_'
     abid_ts_src = 'self.created_at'
     abid_uri_src = 'self.url'
@@ -160,6 +164,17 @@ class Snapshot(ABIDModel):
     abid_rand_src = 'self.id'
     abid_drift_allowed = True
 
+    state_field_name = 'status'
+    state_machine_name = 'core.statemachines.SnapshotMachine'
+    state_machine_attr = 'sm'
+    
+    class SnapshotStatus(models.TextChoices):
+        QUEUED = 'queued', 'Queued'
+        STARTED = 'started', 'Started'
+        SEALED = 'sealed', 'Sealed'
+        
+    status = models.CharField(max_length=15, choices=SnapshotStatus.choices, default=SnapshotStatus.QUEUED, null=False, blank=False)
+
     id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
     abid = ABIDField(prefix=abid_prefix)
 
@@ -171,7 +186,7 @@ class Snapshot(ABIDModel):
     bookmarked_at = AutoDateTimeField(default=None, null=False, editable=True, db_index=True)
     downloaded_at = models.DateTimeField(default=None, null=True, editable=False, db_index=True, blank=True)
 
-    # crawl = models.ForeignKey(Crawl, on_delete=models.CASCADE, default=None, null=True, blank=True, related_name='snapshot_set')
+    crawl = models.ForeignKey(Crawl, on_delete=models.CASCADE, default=None, null=True, blank=True, related_name='snapshot_set')
 
     url = models.URLField(unique=True, db_index=True)
     timestamp = models.CharField(max_length=32, unique=True, db_index=True, editable=False)
@@ -396,6 +411,25 @@ class Snapshot(ABIDModel):
                 tags_id.append(Tag.objects.get_or_create(name=tag)[0].pk)
         self.tags.clear()
         self.tags.add(*tags_id)
+        
+    def has_pending_archiveresults(self) -> bool:
+        pending_statuses = [ArchiveResult.ArchiveResultStatus.QUEUED, ArchiveResult.ArchiveResultStatus.STARTED]
+        pending_archiveresults = self.archiveresult_set.filter(status__in=pending_statuses)
+        return pending_archiveresults.exists()
+    
+    def create_pending_archiveresults(self) -> list['ArchiveResult']:
+        archiveresults = []
+        for extractor in EXTRACTORS:
+            archiveresult, _created = ArchiveResult.objects.get_or_create(
+                snapshot=self,
+                extractor=extractor,
+                status=ArchiveResult.ArchiveResultStatus.QUEUED,
+            )
+            archiveresults.append(archiveresult)
+        return archiveresults
+    
+    def bump_retry_at(self, seconds: int = 10):
+        self.retry_at = timezone.now() + timedelta(seconds=seconds)
 
 
     # def get_storage_dir(self, create=True, symlink=True) -> Path:
@@ -452,6 +486,20 @@ class ArchiveResult(ABIDModel):
     abid_subtype_src = 'self.extractor'
     abid_rand_src = 'self.id'
     abid_drift_allowed = True
+    
+    state_field_name = 'status'
+    state_machine_name = 'core.statemachines.ArchiveResultMachine'
+    state_machine_attr = 'sm'
+
+    class ArchiveResultStatus(models.TextChoices):
+        QUEUED = 'queued', 'Queued'
+        STARTED = 'started', 'Started'
+        SUCCEEDED = 'succeeded', 'Succeeded'
+        FAILED = 'failed', 'Failed'
+        SKIPPED = 'skipped', 'Skipped'
+        BACKOFF = 'backoff', 'Waiting to retry'
+        
+    status = models.CharField(max_length=15, choices=ArchiveResultStatus.choices, default=ArchiveResultStatus.QUEUED, null=False, blank=False)
 
     EXTRACTOR_CHOICES = (
         ('htmltotext', 'htmltotext'),
@@ -469,11 +517,7 @@ class ArchiveResult(ABIDModel):
         ('title', 'title'),
         ('wget', 'wget'),
     )
-    STATUS_CHOICES = [
-        ("succeeded", "succeeded"),
-        ("failed", "failed"),
-        ("skipped", "skipped")
-    ]
+
 
     id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
     abid = ABIDField(prefix=abid_prefix)
@@ -491,7 +535,6 @@ class ArchiveResult(ABIDModel):
     output = models.CharField(max_length=1024)
     start_ts = models.DateTimeField(db_index=True)
     end_ts = models.DateTimeField()
-    status = models.CharField(max_length=16, choices=STATUS_CHOICES)
 
     # the network interface that was used to download this result
     # uplink = models.ForeignKey(NetworkInterface, on_delete=models.SET_NULL, null=True, blank=True, verbose_name='Network Interface Used')
@@ -552,7 +595,15 @@ class ArchiveResult(ABIDModel):
         return link.canonical_outputs().get(f'{self.extractor}_path')
 
     def output_exists(self) -> bool:
-        return os.access(self.output_path(), os.R_OK)
+        return os.path.exists(self.output_path())
+    
+    def bump_retry_at(self, seconds: int = 10):
+        self.retry_at = timezone.now() + timedelta(seconds=seconds)
+        
+    def create_output_dir(self):
+        snap_dir = self.snapshot_dir
+        snap_dir.mkdir(parents=True, exist_ok=True)
+        return snap_dir / self.output_path()
 
 
     # def get_storage_dir(self, create=True, symlink=True):

+ 19 - 58
archivebox/core/settings.py

@@ -9,13 +9,12 @@ from pathlib import Path
 from django.utils.crypto import get_random_string
 
 import abx
-import abx.archivebox
-import abx.archivebox.reads
-import abx.django.use
+import archivebox
 
-from archivebox.config import DATA_DIR, PACKAGE_DIR, ARCHIVE_DIR, CONSTANTS
+from archivebox.config import DATA_DIR, PACKAGE_DIR, ARCHIVE_DIR, CONSTANTS  # noqa
 from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG      # noqa
 
+
 IS_MIGRATING = 'makemigrations' in sys.argv[:3] or 'migrate' in sys.argv[:3]
 IS_TESTING = 'test' in sys.argv[:3] or 'PYTEST_CURRENT_TEST' in os.environ
 IS_SHELL = 'shell' in sys.argv[:3] or 'shell_plus' in sys.argv[:3]
@@ -25,45 +24,8 @@ IS_GETTING_VERSION_OR_HELP = 'version' in sys.argv or 'help' in sys.argv or '--v
 ### ArchiveBox Plugin Settings
 ################################################################################
 
-PLUGIN_HOOKSPECS = [
-    'abx.django.hookspec',
-    'abx.pydantic_pkgr.hookspec',
-    'abx.archivebox.hookspec',
-]
-abx.register_hookspecs(PLUGIN_HOOKSPECS)
-
-BUILTIN_PLUGIN_DIRS = {
-    'archivebox':              PACKAGE_DIR,
-    'plugins_pkg':             PACKAGE_DIR / 'plugins_pkg',
-    'plugins_auth':            PACKAGE_DIR / 'plugins_auth',
-    'plugins_search':          PACKAGE_DIR / 'plugins_search',
-    'plugins_extractor':       PACKAGE_DIR / 'plugins_extractor',
-}
-USER_PLUGIN_DIRS = {
-    # 'user_plugins':            DATA_DIR / 'user_plugins',
-}
-
-# Discover ArchiveBox plugins
-BUILTIN_PLUGINS = abx.get_plugins_in_dirs(BUILTIN_PLUGIN_DIRS)
-PIP_PLUGINS = abx.get_pip_installed_plugins(group='archivebox')
-USER_PLUGINS = abx.get_plugins_in_dirs(USER_PLUGIN_DIRS)
-ALL_PLUGINS = {**BUILTIN_PLUGINS, **PIP_PLUGINS, **USER_PLUGINS}
-
-# Load ArchiveBox plugins
-PLUGIN_MANAGER = abx.pm
-abx.archivebox.load_archivebox_plugins(PLUGIN_MANAGER, ALL_PLUGINS)
-PLUGINS = abx.archivebox.reads.get_PLUGINS()
-
-# Load ArchiveBox config from plugins
-CONFIGS = abx.archivebox.reads.get_CONFIGS()
-CONFIG = FLAT_CONFIG = abx.archivebox.reads.get_FLAT_CONFIG()
-BINPROVIDERS = abx.archivebox.reads.get_BINPROVIDERS()
-BINARIES = abx.archivebox.reads.get_BINARIES()
-EXTRACTORS = abx.archivebox.reads.get_EXTRACTORS()
-SEARCHBACKENDS = abx.archivebox.reads.get_SEARCHBACKENDS()
-# REPLAYERS = abx.archivebox.reads.get_REPLAYERS()
-# ADMINDATAVIEWS = abx.archivebox.reads.get_ADMINDATAVIEWS()
-
+ALL_PLUGINS = archivebox.ALL_PLUGINS
+LOADED_PLUGINS = archivebox.LOADED_PLUGINS
 
 ################################################################################
 ### Django Core Settings
@@ -102,7 +64,8 @@ INSTALLED_APPS = [
     # 'abid_utils',                # handles ABID ID creation, handling, and models
     'config',                    # ArchiveBox config settings (loaded as a plugin, don't need to add it here) 
     'machine',                   # handles collecting and storing information about the host machine, network interfaces, installed binaries, etc.
-    'queues',                    # handles starting and managing background workers and processes
+    'actors',                    # handles starting and managing background workers and processes (orchestrators and actors)
+    'queues',                    # handles starting and managing background workers and processes (supervisord)
     'seeds',                     # handles Seed model and URL source management
     'crawls',                    # handles Crawl and CrawlSchedule models and management
     'personas',                  # handles Persona and session management
@@ -110,7 +73,7 @@ INSTALLED_APPS = [
     'api',                       # Django-Ninja-based Rest API interfaces, config, APIToken model, etc.
 
     # ArchiveBox plugins
-    *abx.django.use.get_INSTALLED_APPS(),  # all plugin django-apps found in archivebox/plugins_* and data/user_plugins,
+    *abx.as_list(abx.pm.hook.get_INSTALLED_APPS()),  # all plugin django-apps found in archivebox/plugins_* and data/user_plugins,
 
     # 3rd-party apps from PyPI that need to be loaded last
     'admin_data_views',          # handles rendering some convenient automatic read-only views of data in Django admin
@@ -125,6 +88,7 @@ INSTALLED_APPS = [
 
 
 
+
 MIDDLEWARE = [
     'core.middleware.TimezoneMiddleware',
     'django.middleware.security.SecurityMiddleware',
@@ -135,7 +99,7 @@ MIDDLEWARE = [
     'core.middleware.ReverseProxyAuthMiddleware',
     'django.contrib.messages.middleware.MessageMiddleware',
     'core.middleware.CacheControlMiddleware',
-    *abx.django.use.get_MIDDLEWARES(),
+    *abx.as_list(abx.pm.hook.get_MIDDLEWARES()),
 ]
 
 
@@ -148,7 +112,7 @@ MIDDLEWARE = [
 AUTHENTICATION_BACKENDS = [
     'django.contrib.auth.backends.RemoteUserBackend',
     'django.contrib.auth.backends.ModelBackend',
-    *abx.django.use.get_AUTHENTICATION_BACKENDS(),
+    *abx.as_list(abx.pm.hook.get_AUTHENTICATION_BACKENDS()),
 ]
 
 
@@ -169,7 +133,7 @@ AUTHENTICATION_BACKENDS = [
 
 STATIC_URL = '/static/'
 TEMPLATES_DIR_NAME = 'templates'
-CUSTOM_TEMPLATES_ENABLED = os.access(CONSTANTS.CUSTOM_TEMPLATES_DIR, os.R_OK) and CONSTANTS.CUSTOM_TEMPLATES_DIR.is_dir()
+CUSTOM_TEMPLATES_ENABLED = os.path.isdir(CONSTANTS.CUSTOM_TEMPLATES_DIR) and os.access(CONSTANTS.CUSTOM_TEMPLATES_DIR, os.R_OK)
 STATICFILES_DIRS = [
     *([str(CONSTANTS.CUSTOM_TEMPLATES_DIR / 'static')] if CUSTOM_TEMPLATES_ENABLED else []),
     # *[
@@ -177,7 +141,7 @@ STATICFILES_DIRS = [
     #     for plugin_dir in PLUGIN_DIRS.values()
     #     if (plugin_dir / 'static').is_dir()
     # ],
-    *abx.django.use.get_STATICFILES_DIRS(),
+    *abx.as_list(abx.pm.hook.get_STATICFILES_DIRS()),
     str(PACKAGE_DIR / TEMPLATES_DIR_NAME / 'static'),
 ]
 
@@ -188,7 +152,7 @@ TEMPLATE_DIRS = [
     #     for plugin_dir in PLUGIN_DIRS.values()
     #     if (plugin_dir / 'templates').is_dir()
     # ],
-    *abx.django.use.get_TEMPLATE_DIRS(),
+    *abx.as_list(abx.pm.hook.get_TEMPLATE_DIRS()),
     str(PACKAGE_DIR / TEMPLATES_DIR_NAME / 'core'),
     str(PACKAGE_DIR / TEMPLATES_DIR_NAME / 'admin'),
     str(PACKAGE_DIR / TEMPLATES_DIR_NAME),
@@ -228,7 +192,7 @@ SQLITE_CONNECTION_OPTIONS = {
         # https://gcollazo.com/optimal-sqlite-settings-for-django/
         # https://litestream.io/tips/#busy-timeout
         # https://docs.djangoproject.com/en/5.1/ref/databases/#setting-pragma-options
-        "timeout": 5,
+        "timeout": 10,
         "check_same_thread": False,
         "transaction_mode": "IMMEDIATE",
         "init_command": (
@@ -267,7 +231,7 @@ if not IS_GETTING_VERSION_OR_HELP:             # dont create queue.sqlite3 file
     HUEY = {
         "huey_class": "huey.SqliteHuey",
         "filename": CONSTANTS.QUEUE_DATABASE_FILENAME,
-        "name": "system_tasks",
+        "name": "commands",
         "results": True,
         "store_none": True,
         "immediate": False,
@@ -288,11 +252,11 @@ if not IS_GETTING_VERSION_OR_HELP:             # dont create queue.sqlite3 file
     # https://huey.readthedocs.io/en/latest/contrib.html#setting-things-up
     # https://github.com/gaiacoop/django-huey
     DJANGO_HUEY = {
-        "default": "system_tasks",
+        "default": "commands",
         "queues": {
             HUEY["name"]: HUEY.copy(),
             # more registered here at plugin import-time by BaseQueue.register()
-            **abx.django.use.get_DJANGO_HUEY_QUEUES(QUEUE_DATABASE_NAME=CONSTANTS.QUEUE_DATABASE_FILENAME),
+            **abx.as_dict(abx.pm.hook.get_DJANGO_HUEY_QUEUES(QUEUE_DATABASE_NAME=CONSTANTS.QUEUE_DATABASE_FILENAME)),
         },
     }
 
@@ -517,7 +481,7 @@ ADMIN_DATA_VIEWS = {
                 "name": "log",
             },
         },
-        *abx.django.use.get_ADMIN_DATA_VIEWS_URLS(),
+        *abx.as_list(abx.pm.hook.get_ADMIN_DATA_VIEWS_URLS()),
     ],
 }
 
@@ -611,7 +575,4 @@ if DEBUG_REQUESTS_TRACKER:
 # JET_TOKEN = 'some-api-token-here'
 
 
-abx.django.use.register_checks()
-# abx.archivebox.reads.register_all_hooks(globals())
-
 # import ipdb; ipdb.set_trace()
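
Side note on the new hook-call style above: pluggy's `pm.hook.<name>()` returns one result per registered plugin hookimpl, so the settings now flatten or merge those per-plugin results with `abx.as_list()`/`abx.as_dict()`. A minimal sketch of the assumed behavior of these helpers (illustrative only; the real implementations live in the abx package):

    import itertools

    def as_list(hook_results):
        # [['app_a'], ['app_b', 'app_c']] -> ['app_a', 'app_b', 'app_c']
        return list(itertools.chain.from_iterable(hook_results))

    def as_dict(hook_results):
        # [{'q1': {...}}, {'q2': {...}}] -> {'q1': {...}, 'q2': {...}}
        merged = {}
        for result in hook_results:
            merged.update(result)
        return merged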

+ 0 - 5
archivebox/core/settings_logging.py

@@ -163,11 +163,6 @@ SETTINGS_LOGGING = {
             "level": "DEBUG",
             "propagate": False,
         },
-        "plugins_extractor": {
-            "handlers": ["default", "logfile"],
-            "level": "DEBUG",
-            "propagate": False,
-        },
         "httpx": {
             "handlers": ["outbound_webhooks"],
             "level": "INFO",

+ 115 - 0
archivebox/core/statemachines.py

@@ -0,0 +1,115 @@
+__package__ = 'archivebox.core'
+
+from django.utils import timezone
+
+from statemachine import State, StateMachine
+
+from core.models import Snapshot, ArchiveResult
+
+# State Machine Definitions
+#################################################
+
+
+class SnapshotMachine(StateMachine, strict_states=True):
+    """State machine for managing Snapshot lifecycle."""
+    
+    model: Snapshot
+    
+    # States
+    queued = State(value=Snapshot.SnapshotStatus.QUEUED, initial=True)
+    started = State(value=Snapshot.SnapshotStatus.STARTED)
+    sealed = State(value=Snapshot.SnapshotStatus.SEALED, final=True)
+    
+    # Tick Event
+    tick = (
+        queued.to.itself(unless='can_start', internal=True) |
+        queued.to(started, cond='can_start') |
+        started.to.itself(unless='is_finished', internal=True) |
+        started.to(sealed, cond='is_finished')
+    )
+    
+    def __init__(self, snapshot, *args, **kwargs):
+        self.snapshot = snapshot
+        super().__init__(snapshot, *args, **kwargs)
+        
+    def can_start(self) -> bool:
+        return bool(self.snapshot.seed and self.snapshot.seed.uri)
+        
+    def is_finished(self) -> bool:
+        return not self.snapshot.has_pending_archiveresults()
+        
+    def on_started(self):
+        self.snapshot.create_pending_archiveresults()
+        self.snapshot.bump_retry_at(seconds=60)
+        self.snapshot.save()
+        
+    def on_sealed(self):
+        self.snapshot.retry_at = None
+        self.snapshot.save()
+
+class ArchiveResultMachine(StateMachine, strict_states=True):
+    """State machine for managing ArchiveResult lifecycle."""
+    
+    model: ArchiveResult
+    
+    # States
+    queued = State(value=ArchiveResult.ArchiveResultStatus.QUEUED, initial=True)
+    started = State(value=ArchiveResult.ArchiveResultStatus.STARTED)
+    backoff = State(value=ArchiveResult.ArchiveResultStatus.BACKOFF)
+    succeeded = State(value=ArchiveResult.ArchiveResultStatus.SUCCEEDED, final=True)
+    failed = State(value=ArchiveResult.ArchiveResultStatus.FAILED, final=True)
+    
+    # Tick Event
+    tick = (
+        queued.to.itself(unless='can_start', internal=True) |
+        queued.to(started, cond='can_start') |
+        started.to.itself(unless='is_finished', internal=True) |
+        started.to(succeeded, cond='is_succeeded') |
+        started.to(failed, cond='is_failed') |
+        started.to(backoff, cond='is_backoff') |
+        backoff.to.itself(unless='can_start', internal=True) |
+        backoff.to(started, cond='can_start') |
+        backoff.to(succeeded, cond='is_succeeded') |
+        backoff.to(failed, cond='is_failed')
+    )
+
+    def __init__(self, archiveresult, *args, **kwargs):
+        self.archiveresult = archiveresult
+        super().__init__(archiveresult, *args, **kwargs)
+        
+    def can_start(self) -> bool:
+        return bool(self.archiveresult.snapshot and self.archiveresult.snapshot.is_started())
+    
+    def is_succeeded(self) -> bool:
+        return self.archiveresult.output_exists()
+    
+    def is_failed(self) -> bool:
+        return not self.archiveresult.output_exists()
+    
+    def is_backoff(self) -> bool:
+        return self.archiveresult.status == ArchiveResult.ArchiveResultStatus.BACKOFF
+
+    def on_started(self):
+        self.archiveresult.start_ts = timezone.now()
+        self.archiveresult.create_output_dir()
+        self.archiveresult.bump_retry_at(seconds=60)
+        self.archiveresult.save()
+
+    def on_backoff(self):
+        self.archiveresult.bump_retry_at(seconds=60)
+        self.archiveresult.save()
+
+    def on_succeeded(self):
+        self.archiveresult.end_ts = timezone.now()
+        self.archiveresult.save()
+
+    def on_failed(self):
+        self.archiveresult.end_ts = timezone.now()
+        self.archiveresult.save()
+        
+    def after_transition(self, event: str, source: State, target: State):
+        print(f"after '{event}' from '{source.id}' to '{target.id}'")
+        # self.archiveresult.save_merkle_index()
+        # self.archiveresult.save_html_index()
+        # self.archiveresult.save_json_index()
+        return "after_transition"

+ 25 - 18
archivebox/core/views.py

@@ -12,7 +12,6 @@ from django.views import View
 from django.views.generic.list import ListView
 from django.views.generic import FormView
 from django.db.models import Q
-from django.conf import settings
 from django.contrib import messages
 from django.contrib.auth.mixins import UserPassesTestMixin
 from django.views.decorators.csrf import csrf_exempt
@@ -21,6 +20,7 @@ from django.utils.decorators import method_decorator
 from admin_data_views.typing import TableContext, ItemContext
 from admin_data_views.utils import render_with_table_view, render_with_item_view, ItemLink
 
+import archivebox
 
 from core.models import Snapshot
 from core.forms import AddLinkForm
@@ -32,9 +32,8 @@ from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG
 from archivebox.misc.util import base_url, htmlencode, ts_to_date_str
 from archivebox.misc.serve_static import serve_static_with_byterange_support
 
-from ..plugins_extractor.archivedotorg.config import ARCHIVEDOTORG_CONFIG
-from ..logging_util import printable_filesize
-from ..search import query_search_index
+from archivebox.logging_util import printable_filesize
+from archivebox.search import query_search_index
 
 
 class HomepageView(View):
@@ -69,7 +68,7 @@ class SnapshotView(View):
                 and embed_path
                 and os.access(abs_path, os.R_OK)
                 and abs_path.exists()):
-                if abs_path.is_dir() and not any(abs_path.glob('*.*')):
+                if os.path.isdir(abs_path) and not any(abs_path.glob('*.*')):
                     continue
 
                 result_info = {
@@ -103,7 +102,7 @@ class SnapshotView(View):
 
         # iterate through all the files in the snapshot dir and add the biggest ones to the result list
         snap_dir = Path(snapshot.link_dir)
-        assert os.access(snap_dir, os.R_OK) and os.access(snap_dir, os.X_OK)
+        assert os.path.isdir(snap_dir) and os.access(snap_dir, os.R_OK)
         
         for result_file in (*snap_dir.glob('*'), *snap_dir.glob('*/*')):
             extension = result_file.suffix.lstrip('.').lower()
@@ -154,7 +153,7 @@ class SnapshotView(View):
             'status_color': 'success' if link.is_archived else 'danger',
             'oldest_archive_date': ts_to_date_str(link.oldest_archive_date),
             'warc_path': warc_path,
-            'SAVE_ARCHIVE_DOT_ORG': ARCHIVEDOTORG_CONFIG.SAVE_ARCHIVE_DOT_ORG,
+            'SAVE_ARCHIVE_DOT_ORG': archivebox.pm.hook.get_FLAT_CONFIG().SAVE_ARCHIVE_DOT_ORG,
             'PREVIEW_ORIGINALS': SERVER_CONFIG.PREVIEW_ORIGINALS,
             'archiveresults': sorted(archiveresults.values(), key=lambda r: all_types.index(r['name']) if r['name'] in all_types else -r['size']),
             'best_result': best_result,
@@ -500,21 +499,25 @@ class HealthCheckView(View):
 
 
 def find_config_section(key: str) -> str:
+    CONFIGS = archivebox.pm.hook.get_CONFIGS()
+    
     if key in CONSTANTS_CONFIG:
         return 'CONSTANT'
     matching_sections = [
-        section_id for section_id, section in settings.CONFIGS.items() if key in section.model_fields
+        section_id for section_id, section in CONFIGS.items() if key in section.model_fields
     ]
     section = matching_sections[0] if matching_sections else 'DYNAMIC'
     return section
 
 def find_config_default(key: str) -> str:
+    CONFIGS = archivebox.pm.hook.get_CONFIGS()
+    
     if key in CONSTANTS_CONFIG:
         return str(CONSTANTS_CONFIG[key])
     
     default_val = None
 
-    for config in settings.CONFIGS.values():
+    for config in CONFIGS.values():
         if key in config.model_fields:
             default_val = config.model_fields[key].default
             break
@@ -530,7 +533,9 @@ def find_config_default(key: str) -> str:
     return default_val
 
 def find_config_type(key: str) -> str:
-    for config in settings.CONFIGS.values():
+    CONFIGS = archivebox.pm.hook.get_CONFIGS()
+    
+    for config in CONFIGS.values():
         if hasattr(config, key):
             type_hints = get_type_hints(config)
             try:
@@ -547,7 +552,8 @@ def key_is_safe(key: str) -> bool:
 
 @render_with_table_view
 def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext:
-
+    CONFIGS = archivebox.pm.hook.get_CONFIGS()
+    
     assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
 
     rows = {
@@ -560,7 +566,7 @@ def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext:
         # "Aliases": [],
     }
 
-    for section_id, section in reversed(list(settings.CONFIGS.items())):
+    for section_id, section in reversed(list(CONFIGS.items())):
         for key, field in section.model_fields.items():
             rows['Section'].append(section_id)   # section.replace('_', ' ').title().replace(' Config', '')
             rows['Key'].append(ItemLink(key, key=key))
@@ -570,7 +576,6 @@ def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext:
             # rows['Documentation'].append(mark_safe(f'Wiki: <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#{key.lower()}">{key}</a>'))
             # rows['Aliases'].append(', '.join(find_config_aliases(key)))
 
-   
     section = 'CONSTANT'
     for key in CONSTANTS_CONFIG.keys():
         rows['Section'].append(section)   # section.replace('_', ' ').title().replace(' Config', '')
@@ -589,7 +594,9 @@ def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext:
 
 @render_with_item_view
 def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
-
+    CONFIGS = archivebox.pm.hook.get_CONFIGS()
+    FLAT_CONFIG = archivebox.pm.hook.get_FLAT_CONFIG()
+    
     assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
 
     # aliases = USER_CONFIG.get(key, {}).get("aliases", [])
@@ -597,7 +604,7 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont
 
     if key in CONSTANTS_CONFIG:
         section_header = mark_safe(f'[CONSTANTS]   &nbsp; <b><code style="color: lightgray">{key}</code></b> &nbsp; <small>(read-only, hardcoded by ArchiveBox)</small>')
-    elif key in settings.FLAT_CONFIG:
+    elif key in FLAT_CONFIG:
         section_header = mark_safe(f'data / ArchiveBox.conf &nbsp; [{find_config_section(key)}]  &nbsp; <b><code style="color: lightgray">{key}</code></b>')
     else:
         section_header = mark_safe(f'[DYNAMIC CONFIG]   &nbsp; <b><code style="color: lightgray">{key}</code></b> &nbsp; <small>(read-only, calculated at runtime)</small>')
@@ -613,7 +620,7 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont
                 "fields": {
                     'Key': key,
                     'Type': find_config_type(key),
-                    'Value': settings.FLAT_CONFIG.get(key, settings.CONFIGS.get(key, None)) if key_is_safe(key) else '********',
+                    'Value': FLAT_CONFIG.get(key, CONFIGS.get(key, None)) if key_is_safe(key) else '********',
                 },
                 "help_texts": {
                     'Key': mark_safe(f'''
@@ -635,13 +642,13 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont
                             <code>{find_config_default(key) or '↗️ See in ArchiveBox source code...'}</code>
                         </a>
                         <br/><br/>
-                        <p style="display: {"block" if key in settings.FLAT_CONFIG else "none"}">
+                        <p style="display: {"block" if key in FLAT_CONFIG else "none"}">
                             <i>To change this value, edit <code>data/ArchiveBox.conf</code> or run:</i>
                             <br/><br/>
                             <code>archivebox config --set {key}="{
                                 val.strip("'")
                                 if (val := find_config_default(key)) else
-                                (repr(settings.FLAT_CONFIG[key] if key_is_safe(key) else '********')).strip("'")
+                                (repr(FLAT_CONFIG[key] if key_is_safe(key) else '********')).strip("'")
                             }"</code>
                         </p>
                     '''),

+ 69 - 0
archivebox/crawls/actors.py

@@ -0,0 +1,69 @@
+__package__ = 'archivebox.crawls'
+
+from typing import ClassVar
+
+from rich import print
+
+from django.db.models import QuerySet
+
+from crawls.models import Crawl
+
+from actors.actor import ActorType
+
+
+class CrawlActor(ActorType[Crawl]):
+    
+    QUERYSET: ClassVar[QuerySet] = Crawl.objects.filter(status='queued')
+    CLAIM_WHERE: ClassVar[str] = 'status = "queued"'  # the WHERE clause to filter the objects when atomically getting the next object from the queue
+    CLAIM_SET: ClassVar[str] = 'status = "started"'   # the SET clause to claim the object when atomically getting the next object from the queue
+    CLAIM_ORDER: ClassVar[str] = 'created_at DESC'    # the ORDER BY clause to sort the objects with when atomically getting the next object from the queue
+    CLAIM_FROM_TOP: ClassVar[int] = 50                # the number of objects to consider when atomically getting the next object from the queue
+    
+    # model_type: Type[ModelType]
+    MAX_CONCURRENT_ACTORS: ClassVar[int] = 4          # min 2, max 8, up to 60% of available cpu cores
+    MAX_TICK_TIME: ClassVar[int] = 60                 # maximum duration in seconds to process a single object
+    
+    def claim_sql_where(self) -> str:
+        """override this to implement a custom WHERE clause for the atomic claim step e.g. "status = 'queued' AND locked_by = NULL" """
+        return self.CLAIM_WHERE
+    
+    def claim_sql_set(self) -> str:
+        """override this to implement a custom SET clause for the atomic claim step e.g. "status = 'started' AND locked_by = {self.pid}" """
+        return self.CLAIM_SET
+    
+    def claim_sql_order(self) -> str:
+        """override this to implement a custom ORDER BY clause for the atomic claim step e.g. "created_at DESC" """
+        return self.CLAIM_ORDER
+    
+    def claim_from_top(self) -> int:
+        """override this to implement a custom number of objects to consider when atomically claiming the next object from the top of the queue"""
+        return self.CLAIM_FROM_TOP
+        
+    def tick(self, obj: Crawl) -> None:
+        """override this to process the object"""
+        print(f'[blue]🏃‍♂️ {self}.tick()[/blue]', obj.abid or obj.id)
+        # For example:
+        # do_some_task(obj)
+        # do_something_else(obj)
+        # obj._model.objects.filter(pk=obj.pk, status='started').update(status='success')
+        # raise NotImplementedError('tick() must be implemented by the Actor subclass')
+    
+    def on_shutdown(self, err: BaseException | None=None) -> None:
+        print(f'[grey53]🏃‍♂️ {self}.on_shutdown() SHUTTING DOWN[/grey53]', err or '[green](gracefully)[/green]')
+        # abx.pm.hook.on_actor_shutdown(self)
+        
+    def on_tick_start(self, obj: Crawl) -> None:
+        # print(f'🏃‍♂️ {self}.on_tick_start()', obj.abid or obj.id)
+        # abx.pm.hook.on_actor_tick_start(self, obj_to_process)
+        # self.timer = TimedProgress(self.MAX_TICK_TIME, prefix='      ')
+        pass
+    
+    def on_tick_end(self, obj: Crawl) -> None:
+        # print(f'🏃‍♂️ {self}.on_tick_end()', obj.abid or obj.id)
+        # abx.pm.hook.on_actor_tick_end(self, obj_to_process)
+        # self.timer.end()
+        pass
+    
+    def on_tick_exception(self, obj: Crawl, err: BaseException) -> None:
+        print(f'[red]🏃‍♂️ {self}.on_tick_exception()[/red]', obj.abid or obj.id, err)
+        # abx.pm.hook.on_actor_tick_exception(self, obj_to_process, err)
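
One plausible shape for the atomic claim step that the CLAIM_* knobs above feed into (hypothetical; the real query lives in actors/actor.py, not shown in this listing): pick a random row among the top CLAIM_FROM_TOP matches so concurrent actors rarely contend for the same object:

    # 'table' is a placeholder for the model's db_table name
    claim_sql = f'''
        UPDATE {table}
        SET {self.claim_sql_set()}
        WHERE id = (
            SELECT id FROM (
                SELECT id FROM {table}
                WHERE {self.claim_sql_where()}
                ORDER BY {self.claim_sql_order()}
                LIMIT {self.claim_from_top()}
            )
            ORDER BY RANDOM()
            LIMIT 1
        )
        RETURNING id
    '''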

+ 48 - 5
archivebox/crawls/models.py

@@ -1,13 +1,20 @@
 __package__ = 'archivebox.crawls'
 
+from typing import TYPE_CHECKING
 from django_stubs_ext.db.models import TypedModelMeta
 
+from datetime import timedelta
+
 from django.db import models
-from django.db.models import Q
 from django.core.validators import MaxValueValidator, MinValueValidator 
 from django.conf import settings
-from django.utils import timezone
 from django.urls import reverse_lazy
+from django.utils import timezone
+
+from statemachine.mixins import MachineMixin
+
+if TYPE_CHECKING:
+    from core.models import Snapshot
 
 from seeds.models import Seed
 
@@ -41,8 +48,9 @@ class CrawlSchedule(ABIDModel, ModelWithHealthStats):
         """The base crawl that each new scheduled job should copy as a template"""
         return self.crawl_set.first()
 
+    
 
-class Crawl(ABIDModel, ModelWithHealthStats):
+class Crawl(ABIDModel, ModelWithHealthStats, MachineMixin):
     """
     A single session of URLs to archive starting from a given Seed and expanding outwards. An "archiving session" so to speak.
 
@@ -55,16 +63,29 @@ class Crawl(ABIDModel, ModelWithHealthStats):
     abid_prefix = 'crl_'
     abid_ts_src = 'self.created_at'
     abid_uri_src = 'self.seed.uri'
-    abid_subtype_src = 'self.persona_id'
+    abid_subtype_src = 'self.persona'
     abid_rand_src = 'self.id'
     abid_drift_allowed = True
+    
+    state_field_name = 'status'
+    state_machine_name = 'crawls.statemachines.CrawlMachine'
+    state_machine_attr = 'sm'
+    bind_events_as_methods = True
+
+    class CrawlStatus(models.TextChoices):
+        QUEUED = 'queued', 'Queued'
+        STARTED = 'started', 'Started'
+        SEALED = 'sealed', 'Sealed'
 
+    status = models.CharField(choices=CrawlStatus.choices, max_length=15, default=CrawlStatus.QUEUED, null=False, blank=False)
+    
     id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
     abid = ABIDField(prefix=abid_prefix)
 
     created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False, related_name='crawl_set')
     created_at = AutoDateTimeField(default=None, null=False, db_index=True)
     modified_at = models.DateTimeField(auto_now=True)
+    
 
     seed = models.ForeignKey(Seed, on_delete=models.PROTECT, related_name='crawl_set', null=False, blank=False)
     max_depth = models.PositiveSmallIntegerField(default=0, validators=[MinValueValidator(0), MaxValueValidator(4)])
@@ -79,7 +100,7 @@ class Crawl(ABIDModel, ModelWithHealthStats):
     # schedule = models.JSONField()
     # config = models.JSONField()
     
-    # snapshot_set: models.Manager['Snapshot']
+    snapshot_set: models.Manager['Snapshot']
     
 
     class Meta(TypedModelMeta):
@@ -102,6 +123,28 @@ class Crawl(ABIDModel, ModelWithHealthStats):
     @property
     def api_docs_url(self) -> str:
         return '/api/v1/docs#/Core%20Models/api_v1_core_get_crawl'
+    
+    def has_pending_archiveresults(self) -> bool:
+        from core.models import ArchiveResult
+        
+        pending_statuses = [ArchiveResult.ArchiveResultStatus.QUEUED, ArchiveResult.ArchiveResultStatus.STARTED]
+        
+        snapshot_ids = self.snapshot_set.values_list('id', flat=True)
+        pending_archiveresults = ArchiveResult.objects.filter(snapshot_id__in=snapshot_ids, status__in=pending_statuses)
+        return pending_archiveresults.exists()
+    
+    def create_root_snapshot(self) -> 'Snapshot':
+        from core.models import Snapshot
+        
+        root_snapshot, _ = Snapshot.objects.get_or_create(
+            crawl=self,
+            url=self.seed.uri,
+        )
+        return root_snapshot
+    
+    def bump_retry_at(self, seconds: int = 10):
+        self.retry_at = timezone.now() + timedelta(seconds=seconds)
+        self.save()
 
 
 class Outlink(models.Model):

+ 48 - 0
archivebox/crawls/statemachines.py

@@ -0,0 +1,48 @@
+__package__ = 'archivebox.crawls'
+
+from statemachine import State, StateMachine
+
+from crawls.models import Crawl
+
+# State Machine Definitions
+#################################################
+
+
+class CrawlMachine(StateMachine, strict_states=True):
+    """State machine for managing Crawl lifecycle."""
+    
+    model: Crawl
+    
+    # States
+    queued = State(value=Crawl.CrawlStatus.QUEUED, initial=True)
+    started = State(value=Crawl.CrawlStatus.STARTED)
+    sealed = State(value=Crawl.CrawlStatus.SEALED, final=True)
+    
+    # Tick Event
+    tick = (
+        queued.to.itself(unless='can_start', internal=True) |
+        queued.to(started, cond='can_start') |
+        started.to.itself(unless='is_finished', internal=True) |
+        started.to(sealed, cond='is_finished')
+    )
+    
+    def __init__(self, crawl, *args, **kwargs):
+        self.crawl = crawl
+        super().__init__(crawl, *args, **kwargs)
+        
+    def can_start(self) -> bool:
+        return bool(self.crawl.seed and self.crawl.seed.uri)
+        
+    def is_finished(self) -> bool:
+        return not self.crawl.has_pending_archiveresults()
+
+    def on_started(self):
+        self.crawl.create_root_snapshot()
+        self.crawl.bump_retry_at(seconds=10)
+        self.crawl.save()
+        
+    def on_sealed(self):
+        self.crawl.retry_at = None
+        self.crawl.save()
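
Given the MachineMixin config on the Crawl model above (state_field_name='status', state_machine_attr='sm', bind_events_as_methods=True), driving a crawl is presumably as simple as this illustrative sketch:

    crawl = Crawl.objects.filter(status=Crawl.CrawlStatus.QUEUED).first()
    crawl.tick()           # bound event method, delegates to crawl.sm.tick()
    print(crawl.status)    # 'started' once seed.uri is set; on_started() created the root Snapshot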

+ 16 - 30
archivebox/extractors/__init__.py

@@ -27,43 +27,29 @@ from ..logging_util import (
     log_archive_method_finished,
 )
 
-from .title import should_save_title, save_title
-from .favicon import should_save_favicon, save_favicon
-from .wget import should_save_wget, save_wget
-from .singlefile import should_save_singlefile, save_singlefile
-from .readability import should_save_readability, save_readability
-from .mercury import should_save_mercury, save_mercury
-from .htmltotext import should_save_htmltotext, save_htmltotext
-from .pdf import should_save_pdf, save_pdf
-from .screenshot import should_save_screenshot, save_screenshot
-from .dom import should_save_dom, save_dom
-from .git import should_save_git, save_git
-from .media import should_save_media, save_media
-from .archive_org import should_save_archive_dot_org, save_archive_dot_org
-from .headers import should_save_headers, save_headers
-
 
 ShouldSaveFunction = Callable[[Link, Optional[Path], Optional[bool]], bool]
 SaveFunction = Callable[[Link, Optional[Path], int], ArchiveResult]
 ArchiveMethodEntry = tuple[str, ShouldSaveFunction, SaveFunction]
 
 def get_default_archive_methods() -> List[ArchiveMethodEntry]:
+    # TODO: move to abx.pm.hook.get_EXTRACTORS()
     return [
-        ('favicon', should_save_favicon, save_favicon),
-        ('headers', should_save_headers, save_headers),
-        ('singlefile', should_save_singlefile, save_singlefile),
-        ('pdf', should_save_pdf, save_pdf),
-        ('screenshot', should_save_screenshot, save_screenshot),
-        ('dom', should_save_dom, save_dom),
-        ('wget', should_save_wget, save_wget),
-        # keep title, readability, and htmltotext below wget and singlefile, as they depend on them
-        ('title', should_save_title, save_title),
-        ('readability', should_save_readability, save_readability),
-        ('mercury', should_save_mercury, save_mercury),
-        ('htmltotext', should_save_htmltotext, save_htmltotext),
-        ('git', should_save_git, save_git),
-        ('media', should_save_media, save_media),
-        ('archive_org', should_save_archive_dot_org, save_archive_dot_org),
+        # ('favicon', should_save_favicon, save_favicon),
+        # ('headers', should_save_headers, save_headers),
+        # ('singlefile', should_save_singlefile, save_singlefile),
+        # ('pdf', should_save_pdf, save_pdf),
+        # ('screenshot', should_save_screenshot, save_screenshot),
+        # ('dom', should_save_dom, save_dom),
+        # ('wget', should_save_wget, save_wget),
+        # # keep title, readability, and htmltotext below wget and singlefile, as they depend on them
+        # ('title', should_save_title, save_title),
+        # ('readability', should_save_readability, save_readability),
+        # ('mercury', should_save_mercury, save_mercury),
+        # ('htmltotext', should_save_htmltotext, save_htmltotext),
+        # ('git', should_save_git, save_git),
+        # ('media', should_save_media, save_media),
+        # ('archive_org', should_save_archive_dot_org, save_archive_dot_org),
     ]
 
 ARCHIVE_METHODS_INDEXING_PRECEDENCE = [
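
A hypothetical sketch of the hook-based replacement the TODO above points at, assuming get_EXTRACTORS() returns a {name: extractor} mapping per plugin and each extractor exposes should_extract/extract callables (names illustrative, not confirmed by this commit):

    import abx

    def get_default_archive_methods():
        extractors = abx.as_dict(abx.pm.hook.get_EXTRACTORS())
        return [
            (name, extractor.should_extract, extractor.extract)
            for name, extractor in extractors.items()
        ]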

+ 7 - 4
archivebox/index/html.py

@@ -8,6 +8,8 @@ from typing import List, Optional, Iterator, Mapping
 from django.utils.html import format_html, mark_safe   # type: ignore
 from django.core.cache import cache
 
+import abx
+
 from archivebox.misc.system import atomic_write
 from archivebox.misc.util import (
     enforce_types,
@@ -19,7 +21,6 @@ from archivebox.misc.util import (
 from archivebox.config import CONSTANTS, DATA_DIR, VERSION
 from archivebox.config.common import SERVER_CONFIG
 from archivebox.config.version import get_COMMIT_HASH
-from archivebox.plugins_extractor.archivedotorg.config import ARCHIVEDOTORG_CONFIG
 
 from .schema import Link
 from ..logging_util import printable_filesize
@@ -79,8 +80,10 @@ def write_html_link_details(link: Link, out_dir: Optional[str]=None) -> None:
 
 @enforce_types
 def link_details_template(link: Link) -> str:
-
-    from ..extractors.wget import wget_output_path
+    
+    from abx_plugin_wget.wget import wget_output_path
+    
+    SAVE_ARCHIVE_DOT_ORG = abx.pm.hook.get_FLAT_CONFIG().SAVE_ARCHIVE_DOT_ORG
 
     link_info = link._asdict(extended=True)
 
@@ -102,7 +105,7 @@ def link_details_template(link: Link) -> str:
         'status': 'archived' if link.is_archived else 'not yet archived',
         'status_color': 'success' if link.is_archived else 'danger',
         'oldest_archive_date': ts_to_date_str(link.oldest_archive_date),
-        'SAVE_ARCHIVE_DOT_ORG': ARCHIVEDOTORG_CONFIG.SAVE_ARCHIVE_DOT_ORG,
+        'SAVE_ARCHIVE_DOT_ORG': SAVE_ARCHIVE_DOT_ORG,
         'PREVIEW_ORIGINALS': SERVER_CONFIG.PREVIEW_ORIGINALS,
     })
 

+ 3 - 4
archivebox/index/json.py

@@ -8,6 +8,8 @@ from pathlib import Path
 from datetime import datetime, timezone
 from typing import List, Optional, Iterator, Any, Union
 
+import abx
+
 from archivebox.config import VERSION, DATA_DIR, CONSTANTS
 from archivebox.config.common import SERVER_CONFIG, SHELL_CONFIG
 
@@ -19,8 +21,6 @@ from archivebox.misc.util import enforce_types
 
 @enforce_types
 def generate_json_index_from_links(links: List[Link], with_headers: bool):
-    from django.conf import settings
-    
     MAIN_INDEX_HEADER = {
         'info': 'This is an index of site data archived by ArchiveBox: The self-hosted web archive.',
         'schema': 'archivebox.index.json',
@@ -33,11 +33,10 @@ def generate_json_index_from_links(links: List[Link], with_headers: bool):
             'docs': 'https://github.com/ArchiveBox/ArchiveBox/wiki',
             'source': 'https://github.com/ArchiveBox/ArchiveBox',
             'issues': 'https://github.com/ArchiveBox/ArchiveBox/issues',
-            'dependencies': settings.BINARIES,
+            'dependencies': dict(abx.pm.hook.get_BINARIES()),
         },
     }
     
-    
     if with_headers:
         output = {
             **MAIN_INDEX_HEADER,

+ 6 - 3
archivebox/index/schema.py

@@ -17,9 +17,9 @@ from dataclasses import dataclass, asdict, field, fields
 
 from django.utils.functional import cached_property
 
-from archivebox.config import ARCHIVE_DIR, CONSTANTS
+import abx
 
-from plugins_extractor.favicon.config import FAVICON_CONFIG
+from archivebox.config import ARCHIVE_DIR, CONSTANTS
 
 from archivebox.misc.system import get_dir_size
 from archivebox.misc.util import ts_to_date_str, parse_date
@@ -426,7 +426,10 @@ class Link:
     def canonical_outputs(self) -> Dict[str, Optional[str]]:
         """predict the expected output paths that should be present after archiving"""
 
-        from ..extractors.wget import wget_output_path
+        from abx_plugin_wget.wget import wget_output_path
+        
+        FAVICON_CONFIG = abx.pm.hook.get_CONFIGS().favicon
+        
         # TODO: banish this awful duplication from the codebase and import these
         # from their respective extractor files
         canonical = {

+ 15 - 14
archivebox/machine/models.py

@@ -8,9 +8,10 @@ from django.db import models
 from django.utils import timezone
 from django.utils.functional import cached_property
 
-import abx.archivebox.reads
+import abx
+import archivebox
 
-from abx.archivebox.base_binary import BaseBinary, BaseBinProvider
+from pydantic_pkgr import Binary, BinProvider
 from archivebox.abid_utils.models import ABIDModel, ABIDField, AutoDateTimeField, ModelWithHealthStats
 
 from .detect import get_host_guid, get_os_info, get_vm_info, get_host_network, get_host_stats
@@ -180,7 +181,7 @@ class NetworkInterface(ABIDModel, ModelWithHealthStats):
 
 
 class InstalledBinaryManager(models.Manager):
-    def get_from_db_or_cache(self, binary: BaseBinary) -> 'InstalledBinary':
+    def get_from_db_or_cache(self, binary: Binary) -> 'InstalledBinary':
         """Get or create an InstalledBinary record for a Binary on the local machine"""
         
         global _CURRENT_BINARIES
@@ -216,7 +217,7 @@ class InstalledBinaryManager(models.Manager):
             # if binary was not yet loaded from filesystem, do it now
             # this is expensive, we have to find its abspath, version, and sha256, but it's necessary
             # to make sure we have a good, up-to-date record of it in the DB & in-memory cache
-            binary = binary.load(fresh=True)
+            binary = archivebox.pm.hook.binary_load(binary=binary, fresh=True)
 
         assert binary.loaded_binprovider and binary.loaded_abspath and binary.loaded_version and binary.loaded_sha256, f'Failed to load binary {binary.name} abspath, version, and sha256'
         
@@ -291,8 +292,8 @@ class InstalledBinary(ABIDModel, ModelWithHealthStats):
         if not hasattr(self, 'machine'):
             self.machine = Machine.objects.current()
         if not self.binprovider:
-            all_known_binproviders = list(abx.archivebox.reads.get_BINPROVIDERS().values())
-            binary = BaseBinary(name=self.name, binproviders=all_known_binproviders).load(fresh=True)
+            all_known_binproviders = list(abx.as_dict(archivebox.pm.hook.get_BINPROVIDERS()).values())
+            binary = archivebox.pm.hook.binary_load(binary=Binary(name=self.name, binproviders=all_known_binproviders), fresh=True)
             self.binprovider = binary.loaded_binprovider.name if binary.loaded_binprovider else None
         if not self.abspath:
             self.abspath = self.BINPROVIDER.get_abspath(self.name)
@@ -304,16 +305,16 @@ class InstalledBinary(ABIDModel, ModelWithHealthStats):
         super().clean(*args, **kwargs)
 
     @cached_property
-    def BINARY(self) -> BaseBinary:
-        for binary in abx.archivebox.reads.get_BINARIES().values():
+    def BINARY(self) -> Binary:
+        for binary in abx.as_dict(archivebox.pm.hook.get_BINARIES()).values():
             if binary.name == self.name:
                 return binary
         raise Exception(f'Orphaned InstalledBinary {self.name} {self.binprovider} was found in DB, could not find any plugin that defines it')
         # TODO: we could technically reconstruct it from scratch, but why would we ever want to do that?
 
     @cached_property
-    def BINPROVIDER(self) -> BaseBinProvider:
-        for binprovider in abx.archivebox.reads.get_BINPROVIDERS().values():
+    def BINPROVIDER(self) -> BinProvider:
+        for binprovider in abx.as_dict(archivebox.pm.hook.get_BINPROVIDERS()).values():
             if binprovider.name == self.binprovider:
                 return binprovider
         raise Exception(f'Orphaned InstalledBinary(name={self.name}) was found in DB, could not find any plugin that defines BinProvider(name={self.binprovider})')
@@ -321,7 +322,7 @@ class InstalledBinary(ABIDModel, ModelWithHealthStats):
     # maybe not a good idea to provide this? Binary in DB is a record of the binary's config
     # whereas a loaded binary is a not-yet saved instance that may not have the same config
     # why would we want to load a binary record from the db when it could be freshly loaded?
-    def load_from_db(self) -> BaseBinary:
+    def load_from_db(self) -> Binary:
         # TODO: implement defaults arg in pydantic_pkgr
         # return self.BINARY.load(defaults={
         #     'binprovider': self.BINPROVIDER,
@@ -330,7 +331,7 @@ class InstalledBinary(ABIDModel, ModelWithHealthStats):
         #     'sha256': self.sha256,
         # })
         
-        return BaseBinary.model_validate({
+        return Binary.model_validate({
             **self.BINARY.model_dump(),
             'abspath': self.abspath and Path(self.abspath),
             'version': self.version,
@@ -340,5 +341,5 @@ class InstalledBinary(ABIDModel, ModelWithHealthStats):
             'overrides': self.BINARY.overrides,
         })
 
-    def load_fresh(self) -> BaseBinary:
-        return self.BINARY.load(fresh=True)
+    def load_fresh(self) -> Binary:
+        return archivebox.pm.hook.binary_load(binary=self.BINARY, fresh=True)
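
The binary_load / binary_load_or_install calls above assume a pluggy hookimpl somewhere in the abx plugin wrapping pydantic_pkgr, roughly of this shape (a sketch, names inferred from the call sites):

    import abx
    from pydantic_pkgr import Binary

    @abx.hookimpl
    def binary_load(binary: Binary, **kwargs) -> Binary:
        # delegate to pydantic_pkgr's own loader; the hookspec is presumably
        # declared firstresult=True since call sites use the return value directly
        return binary.load(**kwargs)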

+ 33 - 30
archivebox/main.py

@@ -14,6 +14,10 @@ from crontab import CronTab, CronSlices
 from django.db.models import QuerySet
 from django.utils import timezone
 
+from pydantic_pkgr import Binary
+
+import abx
+import archivebox
 from archivebox.misc.checks import check_data_folder
 from archivebox.misc.util import enforce_types                         # type: ignore
 from archivebox.misc.system import get_dir_size, dedupe_cron_jobs, CRON_COMMENT
@@ -22,7 +26,7 @@ from archivebox.misc.logging import stderr, hint
 from archivebox.config import CONSTANTS, VERSION, DATA_DIR, ARCHIVE_DIR
 from archivebox.config.common import SHELL_CONFIG, SEARCH_BACKEND_CONFIG, STORAGE_CONFIG, SERVER_CONFIG, ARCHIVING_CONFIG
 from archivebox.config.permissions import SudoPermission, IN_DOCKER
-from archivebox.config.configfile import (
+from archivebox.config.collection import (
     write_config_file,
     load_all_config,
     get_real_name,
@@ -195,15 +199,13 @@ def version(quiet: bool=False,
     console = Console()
     prnt = console.print
     
-    from django.conf import settings
-    
-    from abx.archivebox.base_binary import BaseBinary, apt, brew, env
+    from abx_plugin_default_binproviders import apt, brew, env
     
     from archivebox.config.version import get_COMMIT_HASH, get_BUILD_TIME
     from archivebox.config.permissions import ARCHIVEBOX_USER, ARCHIVEBOX_GROUP, RUNNING_AS_UID, RUNNING_AS_GID
     from archivebox.config.paths import get_data_locations, get_code_locations
     
-    from plugins_auth.ldap.config import LDAP_CONFIG
+    LDAP_ENABLED = archivebox.pm.hook.get_SCOPE_CONFIG().LDAP_ENABLED
 
 
     # 0.7.1
@@ -242,7 +244,7 @@ def version(quiet: bool=False,
         f'SUDO={CONSTANTS.IS_ROOT}',
         f'ID={CONSTANTS.MACHINE_ID}:{CONSTANTS.COLLECTION_ID}',
         f'SEARCH_BACKEND={SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE}',
-        f'LDAP={LDAP_CONFIG.LDAP_ENABLED}',
+        f'LDAP={LDAP_ENABLED}',
         #f'DB=django.db.backends.sqlite3 (({CONFIG["SQLITE_JOURNAL_MODE"]})',  # add this if we have more useful info to show eventually
     )
     prnt()
@@ -264,7 +266,8 @@ def version(quiet: bool=False,
 
     prnt('[pale_green1][i] Binary Dependencies:[/pale_green1]')
     failures = []
-    for name, binary in list(settings.BINARIES.items()):
+    BINARIES = abx.as_dict(archivebox.pm.hook.get_BINARIES())
+    for name, binary in list(BINARIES.items()):
         if binary.name == 'archivebox':
             continue
         
@@ -295,14 +298,15 @@ def version(quiet: bool=False,
             
     prnt()
     prnt('[gold3][i] Package Managers:[/gold3]')
-    for name, binprovider in list(settings.BINPROVIDERS.items()):
+    BINPROVIDERS = abx.as_dict(archivebox.pm.hook.get_BINPROVIDERS())
+    for name, binprovider in list(BINPROVIDERS.items()):
         err = None
         
         if binproviders and binprovider.name not in binproviders:
             continue
         
         # TODO: implement a BinProvider.BINARY() method that gets the loaded binary for a binprovider's INSTALLER_BIN
-        loaded_bin = binprovider.INSTALLER_BINARY or BaseBinary(name=binprovider.INSTALLER_BIN, binproviders=[env, apt, brew])
+        loaded_bin = binprovider.INSTALLER_BINARY or Binary(name=binprovider.INSTALLER_BIN, binproviders=[env, apt, brew])
         
         abspath = None
         if loaded_bin.abspath:
@@ -1050,9 +1054,7 @@ def install(out_dir: Path=DATA_DIR, binproviders: Optional[List[str]]=None, bina
     #    - recommend user re-run with sudo if any deps need to be installed as root
 
     from rich import print
-    from django.conf import settings
     
-    from archivebox import CONSTANTS
     from archivebox.config.permissions import IS_ROOT, ARCHIVEBOX_USER, ARCHIVEBOX_GROUP
     from archivebox.config.paths import get_or_create_working_lib_dir
 
@@ -1075,11 +1077,11 @@ def install(out_dir: Path=DATA_DIR, binproviders: Optional[List[str]]=None, bina
     
     package_manager_names = ', '.join(
         f'[yellow]{binprovider.name}[/yellow]'
-        for binprovider in list(settings.BINPROVIDERS.values())
+        for binprovider in reversed(list(abx.as_dict(abx.pm.hook.get_BINPROVIDERS()).values()))
         if not binproviders or (binproviders and binprovider.name in binproviders)
     )
     print(f'[+] Setting up package managers {package_manager_names}...')
-    for binprovider in list(settings.BINPROVIDERS.values()):
+    for binprovider in reversed(list(abx.as_dict(abx.pm.hook.get_BINPROVIDERS()).values())):
         if binproviders and binprovider.name not in binproviders:
             continue
         try:
@@ -1092,7 +1094,7 @@ def install(out_dir: Path=DATA_DIR, binproviders: Optional[List[str]]=None, bina
     
     print()
     
-    for binary in list(settings.BINARIES.values()):
+    for binary in reversed(list(abx.as_dict(abx.pm.hook.get_BINARIES()).values())):
         if binary.name in ('archivebox', 'django', 'sqlite', 'python'):
             # obviously must already be installed if we are running
             continue
@@ -1122,7 +1124,8 @@ def install(out_dir: Path=DATA_DIR, binproviders: Optional[List[str]]=None, bina
                                 result = binary.install(binproviders=[binprovider_name], dry_run=dry_run).model_dump(exclude={'overrides', 'bin_dir', 'hook_type'})
                                 sys.stderr.write("\033[00m\n")     # reset
                             else:
-                                result = binary.load_or_install(binproviders=[binprovider_name], fresh=True, dry_run=dry_run, quiet=False).model_dump(exclude={'overrides', 'bin_dir', 'hook_type'})
+                                loaded_binary = archivebox.pm.hook.binary_load_or_install(binary=binary, binproviders=[binprovider_name], fresh=True, dry_run=dry_run, quiet=False)
+                                result = loaded_binary.model_dump(exclude={'overrides', 'bin_dir', 'hook_type'})
                             if result and result['loaded_version']:
                                 break
                         except Exception as e:
@@ -1133,7 +1136,8 @@ def install(out_dir: Path=DATA_DIR, binproviders: Optional[List[str]]=None, bina
                         binary.install(dry_run=dry_run).model_dump(exclude={'overrides', 'bin_dir', 'hook_type'})
                         sys.stderr.write("\033[00m\n")  # reset
                     else:
-                        binary.load_or_install(fresh=True, dry_run=dry_run).model_dump(exclude={'overrides', 'bin_dir', 'hook_type'})
+                        loaded_binary = archivebox.pm.hook.binary_load_or_install(binary=binary, fresh=True, dry_run=dry_run)
+                        result = loaded_binary.model_dump(exclude={'overrides', 'bin_dir', 'hook_type'})
             if IS_ROOT and LIB_DIR:
                 with SudoPermission(uid=0):
                     if ARCHIVEBOX_USER == 0:
@@ -1157,7 +1161,7 @@ def install(out_dir: Path=DATA_DIR, binproviders: Optional[List[str]]=None, bina
     
     print('\n[green][√] Set up ArchiveBox and its dependencies successfully.[/green]\n', file=sys.stderr)
     
-    from plugins_pkg.pip.binaries import ARCHIVEBOX_BINARY
+    from abx_plugin_pip.binaries import ARCHIVEBOX_BINARY
     
     extra_args = []
     if binproviders:
@@ -1183,8 +1187,6 @@ def config(config_options_str: Optional[str]=None,
            out_dir: Path=DATA_DIR) -> None:
     """Get and set your ArchiveBox project configuration values"""
 
-    import abx.archivebox.reads
-
     from rich import print
 
     check_data_folder()
@@ -1198,7 +1200,8 @@ def config(config_options_str: Optional[str]=None,
     elif config_options_str:
         config_options = config_options_str.split('\n')
 
-    from django.conf import settings
+    FLAT_CONFIG = archivebox.pm.hook.get_FLAT_CONFIG()
+    CONFIGS = archivebox.pm.hook.get_CONFIGS()
     
     config_options = config_options or []
 
@@ -1208,8 +1211,8 @@ def config(config_options_str: Optional[str]=None,
     if search:
         if config_options:
             config_options = [get_real_name(key) for key in config_options]
-            matching_config = {key: settings.FLAT_CONFIG[key] for key in config_options if key in settings.FLAT_CONFIG}
-            for config_section in settings.CONFIGS.values():
+            matching_config = {key: FLAT_CONFIG[key] for key in config_options if key in FLAT_CONFIG}
+            for config_section in CONFIGS.values():
                 aliases = config_section.aliases
                 
                 for search_key in config_options:
@@ -1228,15 +1231,15 @@ def config(config_options_str: Optional[str]=None,
     elif get or no_args:
         if config_options:
             config_options = [get_real_name(key) for key in config_options]
-            matching_config = {key: settings.FLAT_CONFIG[key] for key in config_options if key in settings.FLAT_CONFIG}
-            failed_config = [key for key in config_options if key not in settings.FLAT_CONFIG]
+            matching_config = {key: FLAT_CONFIG[key] for key in config_options if key in FLAT_CONFIG}
+            failed_config = [key for key in config_options if key not in FLAT_CONFIG]
             if failed_config:
                 stderr()
                 stderr('[X] These options failed to get', color='red')
                 stderr('    {}'.format('\n    '.join(config_options)))
                 raise SystemExit(1)
         else:
-            matching_config = settings.FLAT_CONFIG
+            matching_config = FLAT_CONFIG
         
         print(printable_config(matching_config))
         raise SystemExit(not matching_config)
@@ -1257,20 +1260,20 @@ def config(config_options_str: Optional[str]=None,
             if key != raw_key:
                 stderr(f'[i] Note: The config option {raw_key} has been renamed to {key}, please use the new name going forwards.', color='lightyellow')
 
-            if key in settings.FLAT_CONFIG:
+            if key in FLAT_CONFIG:
                 new_config[key] = val.strip()
             else:
                 failed_options.append(line)
 
         if new_config:
-            before = settings.FLAT_CONFIG
+            before = FLAT_CONFIG
             matching_config = write_config_file(new_config)
-            after = {**load_all_config(), **abx.archivebox.reads.get_FLAT_CONFIG()}
+            after = {**load_all_config(), **archivebox.pm.hook.get_FLAT_CONFIG()}
             print(printable_config(matching_config))
 
             side_effect_changes = {}
             for key, val in after.items():
-                if key in settings.FLAT_CONFIG and (str(before[key]) != str(after[key])) and (key not in matching_config):
+                if key in FLAT_CONFIG and (str(before[key]) != str(after[key])) and (key not in matching_config):
                     side_effect_changes[key] = after[key]
                     # import ipdb; ipdb.set_trace()
 
@@ -1312,7 +1315,7 @@ def schedule(add: bool=False,
     """Set ArchiveBox to regularly import URLs at specific times using cron"""
     
     check_data_folder()
-    from archivebox.plugins_pkg.pip.binaries import ARCHIVEBOX_BINARY
+    from abx_plugin_pip.binaries import ARCHIVEBOX_BINARY
     from archivebox.config.permissions import USER
 
     Path(CONSTANTS.LOGS_DIR).mkdir(exist_ok=True)

+ 3 - 0
archivebox/misc/checks.py

@@ -201,6 +201,7 @@ def check_tmp_dir(tmp_dir=None, throw=False, quiet=False, must_exist=True):
 
 
 def check_lib_dir(lib_dir: Path | None = None, throw=False, quiet=False, must_exist=True):
+    import archivebox
     from archivebox.config.permissions import ARCHIVEBOX_USER, ARCHIVEBOX_GROUP
     from archivebox.misc.logging import STDERR
     from archivebox.config.paths import dir_is_writable, get_or_create_working_lib_dir
@@ -209,6 +210,8 @@ def check_lib_dir(lib_dir: Path | None = None, throw=False, quiet=False, must_ex
     
     lib_dir = lib_dir or STORAGE_CONFIG.LIB_DIR
     
+    assert lib_dir == archivebox.pm.hook.get_LIB_DIR(), "lib_dir does not match the LIB_DIR reported by the abx get_LIB_DIR hook"
+    
     if not must_exist and not os.path.isdir(lib_dir):
         return True
     

+ 1 - 2
archivebox/misc/shell_welcome_message.py

@@ -23,7 +23,7 @@ from archivebox import CONSTANTS           # noqa
 from ..main import *                       # noqa
 from ..cli import CLI_SUBCOMMANDS
 
-CONFIG = settings.FLAT_CONFIG
+CONFIG = archivebox.pm.hook.get_FLAT_CONFIG()
 CLI_COMMAND_NAMES = ", ".join(CLI_SUBCOMMANDS.keys())
 
 if __name__ == '__main__':
@@ -55,6 +55,5 @@ if __name__ == '__main__':
     prnt('    add[blink][deep_sky_blue4]?[/deep_sky_blue4][/blink]                                                                        [grey53]# add ? after anything to get help[/]')
     prnt('    add("https://example.com/some/new/url")                                     [grey53]# call CLI methods from the shell[/]')
     prnt('    snap = Snapshot.objects.filter(url__contains="https://example.com").last()  [grey53]# query for individual snapshots[/]')
-    prnt('    archivebox.plugins_extractor.wget.apps.WGET_EXTRACTOR.extract(snap.id)      [grey53]# call an extractor directly[/]')
     prnt('    snap.archiveresult_set.all()                                                [grey53]# see extractor results[/]')
     prnt('    bool(re.compile(CONFIG.URL_DENYLIST).search("https://example.com/abc.exe")) [grey53]# test out a config change[/]')

+ 20 - 10
archivebox/misc/util.py

@@ -5,7 +5,7 @@ import requests
 import json as pyjson
 import http.cookiejar
 
-from typing import List, Optional, Any
+from typing import List, Optional, Any, Callable
 from pathlib import Path
 from inspect import signature
 from functools import wraps
@@ -19,14 +19,13 @@ from requests.exceptions import RequestException, ReadTimeout
 from base32_crockford import encode as base32_encode                            # type: ignore
 from w3lib.encoding import html_body_declared_encoding, http_content_type_encoding
 try:
-    import chardet
+    import chardet    # type:ignore
     detect_encoding = lambda rawdata: chardet.detect(rawdata)["encoding"]
 except ImportError:
     detect_encoding = lambda rawdata: "utf-8"
 
 
-from archivebox.config import CONSTANTS
-from archivebox.config.common import ARCHIVING_CONFIG
+from archivebox.config.constants import CONSTANTS
 
 from .logging import COLOR_DICT
 
@@ -126,6 +125,7 @@ def is_static_file(url: str):
 def enforce_types(func):
     """
     Enforce function arg and kwarg types at runtime using its python3 type hints
+    A simpler version of pydantic's @validate_call decorator
     """
     # TODO: check return type as well
 
@@ -186,11 +186,11 @@ def str_between(string: str, start: str, end: str=None) -> str:
 
 
 @enforce_types
-def parse_date(date: Any) -> Optional[datetime]:
+def parse_date(date: Any) -> datetime:
     """Parse unix timestamps, iso format, and human-readable strings"""
     
     if date is None:
-        return None
+        return None    # type: ignore
 
     if isinstance(date, datetime):
         if date.tzinfo is None:
@@ -212,6 +212,8 @@ def parse_date(date: Any) -> Optional[datetime]:
 def download_url(url: str, timeout: int=None) -> str:
     """Download the contents of a remote url and return the text"""
 
+    from archivebox.config.common import ARCHIVING_CONFIG
+
     timeout = timeout or ARCHIVING_CONFIG.TIMEOUT
     session = requests.Session()
 
@@ -241,8 +243,12 @@ def download_url(url: str, timeout: int=None) -> str:
         return url.rsplit('/', 1)[-1]
 
 @enforce_types
-def get_headers(url: str, timeout: int=None) -> str:
+def get_headers(url: str, timeout: int | None=None) -> str:
     """Download the contents of a remote url and return the headers"""
+    # TODO: get rid of this and use an abx pluggy hook instead
+    
+    from archivebox.config.common import ARCHIVING_CONFIG
+    
     timeout = timeout or ARCHIVING_CONFIG.TIMEOUT
 
     try:
@@ -283,6 +289,7 @@ def get_headers(url: str, timeout: int=None) -> str:
 def ansi_to_html(text: str) -> str:
     """
     Based on: https://stackoverflow.com/questions/19212665/python-converting-ansi-color-codes-to-html
+    Simple way to render colored CLI stdout/stderr in HTML properly; Textual/rich is probably better though.
     """
 
     TEMPLATE = '<span style="color: rgb{}"><br>'
@@ -306,13 +313,13 @@ def ansi_to_html(text: str) -> str:
 @enforce_types
 def dedupe(options: List[str]) -> List[str]:
     """
-    Deduplicates the given options. Options that come later clobber earlier
-    conflicting options.
+    Deduplicates the given CLI args by key. Options that come later override earlier ones with the same key.
     """
     deduped = {}
 
     for option in options:
-        deduped[option.split('=')[0]] = option
+        key = option.split('=')[0]
+        deduped[key] = option
 
     return list(deduped.values())
 
@@ -344,6 +351,9 @@ class ExtendedEncoder(pyjson.JSONEncoder):
         
         elif cls_name in ('dict_items', 'dict_keys', 'dict_values'):
             return tuple(obj)
+        
+        elif isinstance(obj, Callable):
+            return str(obj)
 
         return pyjson.JSONEncoder.default(self, obj)
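
For example, the clarified dedupe() above keeps the last value for each repeated CLI flag while preserving first-seen key order:

    dedupe(['--timeout=5', '--depth=1', '--timeout=10'])
    # -> ['--timeout=10', '--depth=1']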
 

+ 2 - 5
archivebox/parsers/generic_jsonl.py

@@ -1,14 +1,11 @@
 __package__ = 'archivebox.parsers'
 
 import json
-
 from typing import IO, Iterable
 
-from ..index.schema import Link
-from archivebox.misc.util import (
-    enforce_types,
-)
+from archivebox.misc.util import enforce_types
 
+from ..index.schema import Link
 from .generic_json import jsonObjectToLink
 
 def parse_line(line: str):

+ 6 - 5
archivebox/parsers/pocket_api.py

@@ -6,8 +6,7 @@ import re
 from typing import IO, Iterable, Optional
 from configparser import ConfigParser
 
-from pocket import Pocket
-
+import archivebox
 from archivebox.config import CONSTANTS
 from archivebox.misc.util import enforce_types
 from archivebox.misc.system import atomic_write
@@ -22,7 +21,7 @@ API_DB_PATH = CONSTANTS.SOURCES_DIR / 'pocket_api.db'
 _BROKEN_PROTOCOL_RE = re.compile('^(http[s]?)(:/(?!/))')
 
 
-def get_pocket_articles(api: Pocket, since=None, page=0):
+def get_pocket_articles(api, since=None, page=0):
     body, headers = api.get(
         state='archive',
         sort='oldest',
@@ -94,7 +93,9 @@ def should_parse_as_pocket_api(text: str) -> bool:
 def parse_pocket_api_export(input_buffer: IO[str], **_kwargs) -> Iterable[Link]:
     """Parse bookmarks from the Pocket API"""
 
-    from archivebox.plugins_extractor.pocket.config import POCKET_CONFIG
+    from pocket import Pocket
+
+    FLAT_CONFIG = archivebox.pm.hook.get_FLAT_CONFIG()
 
     input_buffer.seek(0)
     pattern = re.compile(r"^pocket:\/\/(\w+)")
@@ -102,7 +103,7 @@ def parse_pocket_api_export(input_buffer: IO[str], **_kwargs) -> Iterable[Link]:
         if should_parse_as_pocket_api(line):
             
             username = pattern.search(line).group(1)
-            api = Pocket(POCKET_CONFIG.POCKET_CONSUMER_KEY, POCKET_CONFIG.POCKET_ACCESS_TOKENS[username])
+            api = Pocket(FLAT_CONFIG.POCKET_CONSUMER_KEY, FLAT_CONFIG.POCKET_ACCESS_TOKENS[username])
             api.last_since = None
     
             for article in get_pocket_articles(api, since=read_since(username)):

+ 16 - 9
archivebox/parsers/readwise_reader_api.py

@@ -8,9 +8,10 @@ from datetime import datetime
 from typing import IO, Iterable, Optional
 from configparser import ConfigParser
 
+import abx
+
 from archivebox.misc.util import enforce_types
 from archivebox.misc.system import atomic_write
-from archivebox.plugins_extractor.readwise.config import READWISE_CONFIG
 
 from ..index.schema import Link
 
@@ -62,26 +63,30 @@ def link_from_article(article: dict, sources: list):
 
 
 def write_cursor(username: str, since: str):
-    if not READWISE_CONFIG.READWISE_DB_PATH.exists():
-        atomic_write(READWISE_CONFIG.READWISE_DB_PATH, "")
+    READWISE_DB_PATH = abx.pm.hook.get_CONFIG().READWISE_DB_PATH
+    
+    if not READWISE_DB_PATH.exists():
+        atomic_write(READWISE_DB_PATH, "")
 
     since_file = ConfigParser()
     since_file.optionxform = str
-    since_file.read(READWISE_CONFIG.READWISE_DB_PATH)
+    since_file.read(READWISE_DB_PATH)
 
     since_file[username] = {"since": since}
 
-    with open(READWISE_CONFIG.READWISE_DB_PATH, "w+") as new:
+    with open(READWISE_DB_PATH, "w+") as new:
         since_file.write(new)
 
 
 def read_cursor(username: str) -> Optional[str]:
-    if not READWISE_CONFIG.READWISE_DB_PATH.exists():
-        atomic_write(READWISE_CONFIG.READWISE_DB_PATH, "")
+    READWISE_DB_PATH = abx.pm.hook.get_CONFIG().READWISE_DB_PATH
+    
+    if not READWISE_DB_PATH.exists():
+        atomic_write(READWISE_DB_PATH, "")
 
     config_file = ConfigParser()
     config_file.optionxform = str
-    config_file.read(READWISE_CONFIG.READWISE_DB_PATH)
+    config_file.read(READWISE_DB_PATH)
 
     return config_file.get(username, "since", fallback=None)
 
@@ -97,12 +102,14 @@ def should_parse_as_readwise_reader_api(text: str) -> bool:
 def parse_readwise_reader_api_export(input_buffer: IO[str], **_kwargs) -> Iterable[Link]:
     """Parse bookmarks from the Readwise Reader API"""
 
+    READWISE_READER_TOKENS = abx.pm.hook.get_CONFIG().READWISE_READER_TOKENS
+
     input_buffer.seek(0)
     pattern = re.compile(r"^readwise-reader:\/\/(\w+)")
     for line in input_buffer:
         if should_parse_as_readwise_reader_api(line):
             username = pattern.search(line).group(1)
-            api = ReadwiseReaderAPI(READWISE_CONFIG.READWISE_READER_TOKENS[username], cursor=read_cursor(username))
+            api = ReadwiseReaderAPI(READWISE_READER_TOKENS[username], cursor=read_cursor(username))
 
             for article in get_readwise_reader_articles(api):
                 yield link_from_article(article, sources=[line])
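
The cursor store that `write_cursor()`/`read_cursor()` manage is just an INI file with one section per username. A standalone round-trip sketch of the same ConfigParser logic (the /tmp path is a hypothetical stand-in for READWISE_DB_PATH):

```python
from configparser import ConfigParser
from pathlib import Path

db_path = Path('/tmp/readwise_reader_api.db')   # stand-in for READWISE_DB_PATH
db_path.touch(exist_ok=True)

# equivalent of write_cursor('alice', ...):
since_file = ConfigParser()
since_file.optionxform = str                    # keep option keys case-sensitive
since_file.read(db_path)
since_file['alice'] = {'since': '2024-10-28T00:00:00Z'}
with open(db_path, 'w+') as f:
    since_file.write(f)

# equivalent of read_cursor('alice'):
config_file = ConfigParser()
config_file.optionxform = str
config_file.read(db_path)
assert config_file.get('alice', 'since', fallback=None) == '2024-10-28T00:00:00Z'
```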

+ 39 - 0
archivebox/pkgs/__init__.py

@@ -0,0 +1,39 @@
+import sys
+import importlib
+from pathlib import Path
+
+PKGS_DIR = Path(__file__).parent
+
+VENDORED_PKGS = [
+    'abx',
+    # 'pydantic-pkgr',
+]
+
+# scan ./pkgs and add all dirs present to list of available VENDORED_PKGS
+for subdir in reversed(sorted(PKGS_DIR.iterdir())):
+    if subdir.is_dir() and subdir.name not in VENDORED_PKGS and not subdir.name.startswith('_'):
+        VENDORED_PKGS.append(subdir.name)
+
+
+def load_vendored_pkgs():
+    """Add archivebox/vendor to sys.path and import all vendored libraries present within"""
+    if str(PKGS_DIR) not in sys.path:
+        sys.path.append(str(PKGS_DIR))
+    
+    for pkg_name in VENDORED_PKGS:
+        pkg_dir = PKGS_DIR / pkg_name
+        assert pkg_dir.is_dir(), f'Required vendored pkg {pkg_name} could not be found in {pkg_dir}'
+
+        try:
+            lib = importlib.import_module(pkg_name)
+            # print(f"Successfully imported lib from environment {pkg_name}")
+        except ImportError:
+            sys.path.append(str(pkg_dir))
+            try:
+                lib = importlib.import_module(pkg_name)
+                # print(f"Successfully imported lib from vendored fallback {pkg_name}: {inspect.getfile(lib)}")
+            except ImportError as e:
+                print(f"Failed to import lib from environment or vendored fallback {pkg_name}: {e}", file=sys.stderr)
+                sys.exit(1)
+        
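
Because `load_vendored_pkgs()` prefers a pip-installed copy of each package and only falls back to the vendored source tree, it has to run before anything imports `abx` or the `abx_plugin_*` packages. A sketch of the intended call order (assuming this module is imported as `archivebox.pkgs`):

```python
from archivebox.pkgs import load_vendored_pkgs

# run this before `import abx` or any plugin import, otherwise those
# imports fail whenever the packages aren't installed into site-packages
load_vendored_pkgs()

import abx  # noqa: E402  # now resolvable from site-packages or the vendored fallback
```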
+

+ 0 - 0
archivebox/plugins_pkg/__init__.py → archivebox/pkgs/abx-plugin-archivedotorg/README.md


+ 21 - 0
archivebox/pkgs/abx-plugin-archivedotorg/abx_plugin_archivedotorg/__init__.py

@@ -0,0 +1,21 @@
+__label__ = 'Archive.org'
+__homepage__ = 'https://archive.org'
+
+import abx
+
+@abx.hookimpl
+def get_CONFIG():
+    from .config import ARCHIVEDOTORG_CONFIG
+    
+    return {
+        'ARCHIVEDOTORG_CONFIG': ARCHIVEDOTORG_CONFIG
+    }
+
+
+# @abx.hookimpl
+# def get_EXTRACTORS():
+#     from .extractors import ARCHIVEDOTORG_EXTRACTOR
+#
+#     return {
+#         'archivedotorg': ARCHIVEDOTORG_EXTRACTOR,
+#     }
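
Every plugin package in this PR follows this shape: module-level metadata plus `@abx.hookimpl` functions whose imports are deferred until the hook actually fires. Roughly how a consumer sees it, assuming `abx.pm` is the shared pluggy PluginManager (a sketch of pluggy mechanics, not the exact ABX helper API):

```python
import abx
import abx_plugin_archivedotorg

# register manually only if it wasn't already loaded via entry points
abx.pm.register(abx_plugin_archivedotorg)

# pluggy returns one result per registered hookimpl; merge them into one mapping
CONFIGS = {}
for result in abx.pm.hook.get_CONFIG():
    CONFIGS.update(result)

print(CONFIGS['ARCHIVEDOTORG_CONFIG'])
```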

+ 0 - 0
archivebox/extractors/archive_org.py → archivebox/pkgs/abx-plugin-archivedotorg/abx_plugin_archivedotorg/archive_org.py


+ 1 - 4
archivebox/plugins_extractor/archivedotorg/config.py → archivebox/pkgs/abx-plugin-archivedotorg/abx_plugin_archivedotorg/config.py

@@ -1,7 +1,4 @@
-__package__ = 'plugins_extractor.archivedotorg'
-
-
-from abx.archivebox.base_configset import BaseConfigSet
+from abx_spec_config.base_configset import BaseConfigSet
 
 
 class ArchivedotorgConfig(BaseConfigSet):

+ 18 - 0
archivebox/pkgs/abx-plugin-archivedotorg/pyproject.toml

@@ -0,0 +1,18 @@
+[project]
+name = "abx-plugin-archivedotorg"
+version = "2024.10.28"
+description = "Add your description here"
+readme = "README.md"
+requires-python = ">=3.10"
+dependencies = [
+    "abx>=0.1.0",
+    "abx-spec-config>=0.1.0",
+    "abx-plugin-curl>=2024.10.24",
+]
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[project.entry-points.abx]
+abx_plugin_archivedotorg = "abx_plugin_archivedotorg"
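
The `[project.entry-points.abx]` table is what makes each plugin discoverable without being hard-coded into ArchiveBox: pluggy can scan installed distributions for that entry-point group. A sketch using pluggy's stock loader (assuming ABX registers its hookspecs under the `abx` project name):

```python
import pluggy

pm = pluggy.PluginManager('abx')
# imports every installed package that declares a [project.entry-points.abx]
# entry and registers its @abx.hookimpl functions
pm.load_setuptools_entrypoints('abx')
print([name for name, _plugin in pm.list_name_plugin()])
```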

+ 0 - 0
archivebox/plugins_search/__init__.py → archivebox/pkgs/abx-plugin-chrome/README.md


+ 34 - 0
archivebox/pkgs/abx-plugin-chrome/abx_plugin_chrome/__init__.py

@@ -0,0 +1,34 @@
+__label__ = 'Chrome'
+__author__ = 'ArchiveBox'
+
+import abx
+
+@abx.hookimpl
+def get_CONFIG():
+    from .config import CHROME_CONFIG
+    
+    return {
+        'CHROME_CONFIG': CHROME_CONFIG
+    }
+
+@abx.hookimpl
+def get_BINARIES():
+    from .binaries import CHROME_BINARY
+    
+    return {
+        'chrome': CHROME_BINARY,
+    }
+
+@abx.hookimpl
+def ready():
+    from .config import CHROME_CONFIG
+    CHROME_CONFIG.validate()
+
+
+# @abx.hookimpl
+# def get_EXTRACTORS():
+#     return {
+#         'pdf': PDF_EXTRACTOR,
+#         'screenshot': SCREENSHOT_EXTRACTOR,
+#         'dom': DOM_EXTRACTOR,
+#     }

+ 23 - 21
archivebox/plugins_extractor/chrome/binaries.py → archivebox/pkgs/abx-plugin-chrome/abx_plugin_chrome/binaries.py

@@ -1,5 +1,3 @@
-__package__ = 'plugins_extractor.chrome'
-
 import os
 import platform
 from pathlib import Path
@@ -7,21 +5,22 @@ from typing import List, Optional
 
 from pydantic import InstanceOf
 from pydantic_pkgr import (
+    Binary,
     BinProvider,
     BinName,
     BinaryOverrides,
     bin_abspath,
 )
 
-from abx.archivebox.base_binary import BaseBinary, env, apt, brew
+import abx
 
-# Depends on Other Plugins:
-from archivebox.config.common import SHELL_CONFIG
-from plugins_pkg.puppeteer.binproviders import PUPPETEER_BINPROVIDER
-from plugins_pkg.playwright.binproviders import PLAYWRIGHT_BINPROVIDER
+from abx_plugin_default_binproviders import apt, brew, env
+from abx_plugin_puppeteer.binproviders import PUPPETEER_BINPROVIDER
+from abx_plugin_playwright.binproviders import PLAYWRIGHT_BINPROVIDER
 
 
 from .config import CHROME_CONFIG
+
 CHROMIUM_BINARY_NAMES_LINUX = [
     "chromium",
     "chromium-browser",
@@ -48,12 +47,13 @@ CHROME_BINARY_NAMES_MACOS = [
 ]
 CHROME_BINARY_NAMES = CHROME_BINARY_NAMES_LINUX + CHROME_BINARY_NAMES_MACOS
 
-APT_DEPENDENCIES = [
-    'apt-transport-https', 'at-spi2-common', 'chromium-browser',
+CHROME_APT_DEPENDENCIES = [
+    'apt-transport-https', 'at-spi2-common',
     'fontconfig', 'fonts-freefont-ttf', 'fonts-ipafont-gothic', 'fonts-kacst', 'fonts-khmeros', 'fonts-liberation', 'fonts-noto', 'fonts-noto-color-emoji', 'fonts-symbola', 'fonts-thai-tlwg', 'fonts-tlwg-loma-otf', 'fonts-unifont', 'fonts-wqy-zenhei',
     'libasound2', 'libatk-bridge2.0-0', 'libatk1.0-0', 'libatspi2.0-0', 'libavahi-client3', 'libavahi-common-data', 'libavahi-common3', 'libcairo2', 'libcups2',
     'libdbus-1-3', 'libdrm2', 'libfontenc1', 'libgbm1', 'libglib2.0-0', 'libice6', 'libnspr4', 'libnss3', 'libsm6', 'libunwind8', 'libx11-6', 'libxaw7', 'libxcb1',
     'libxcomposite1', 'libxdamage1', 'libxext6', 'libxfixes3', 'libxfont2', 'libxkbcommon0', 'libxkbfile1', 'libxmu6', 'libxpm4', 'libxrandr2', 'libxt6', 'x11-utils', 'x11-xkb-utils', 'xfonts-encodings',
+    'chromium-browser',
 ]
 
 
@@ -80,7 +80,7 @@ def create_macos_app_symlink(target: Path, shortcut: Path):
 ###################### Config ##########################
 
 
-class ChromeBinary(BaseBinary):
+class ChromeBinary(Binary):
     name: BinName = CHROME_CONFIG.CHROME_BINARY
     binproviders_supported: List[InstanceOf[BinProvider]] = [PUPPETEER_BINPROVIDER, env, PLAYWRIGHT_BINPROVIDER, apt, brew]
     
@@ -95,7 +95,7 @@ class ChromeBinary(BaseBinary):
             'packages': ['chromium'],                   # playwright install chromium
         },
         apt.name: {
-            'packages': APT_DEPENDENCIES,
+            'packages': CHROME_APT_DEPENDENCIES,
         },
         brew.name: {
             'packages': ['--cask', 'chromium'] if platform.system().lower() == 'darwin' else [],
@@ -104,10 +104,9 @@ class ChromeBinary(BaseBinary):
 
     @staticmethod
     def symlink_to_lib(binary, bin_dir=None) -> None:
-        from archivebox.config.common import STORAGE_CONFIG
-        bin_dir = bin_dir or STORAGE_CONFIG.LIB_DIR / 'bin'
+        bin_dir = bin_dir or abx.pm.hook.get_BIN_DIR()
         
-        if not (binary.abspath and os.access(binary.abspath, os.F_OK)):
+        if not (binary.abspath and os.path.isfile(binary.abspath)):
             return
         
         bin_dir.mkdir(parents=True, exist_ok=True)
@@ -121,7 +120,7 @@ class ChromeBinary(BaseBinary):
                 # otherwise on linux we can symlink directly to binary executable
                 symlink.unlink(missing_ok=True)
                 symlink.symlink_to(binary.abspath)
-        except Exception as err:
+        except Exception:
             # print(f'[red]:warning: Failed to symlink {symlink} -> {binary.abspath}[/red] {err}')
             # not actually needed, we can just run without it
             pass
@@ -132,14 +131,17 @@ class ChromeBinary(BaseBinary):
         Cleans up any state or runtime files that chrome leaves behind when killed by
         a timeout or other error
         """
-        lock_file = Path("~/.config/chromium/SingletonLock").expanduser()
-
-        if SHELL_CONFIG.IN_DOCKER and os.access(lock_file, os.F_OK):
-            lock_file.unlink()
+        try:
+            linux_lock_file = Path("~/.config/chromium/SingletonLock").expanduser()
+            linux_lock_file.unlink(missing_ok=True)
+        except Exception:
+            pass
         
         if CHROME_CONFIG.CHROME_USER_DATA_DIR:
-            if os.access(CHROME_CONFIG.CHROME_USER_DATA_DIR / 'SingletonLock', os.F_OK):
-                lock_file.unlink()
+            try:
+                (CHROME_CONFIG.CHROME_USER_DATA_DIR / 'SingletonLock').unlink(missing_ok=True)
+            except Exception:
+                pass
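
With `ChromeBinary` now a plain `pydantic_pkgr.Binary`, resolution just walks `binproviders_supported` in order. A rough usage sketch (hedged: the exact pydantic_pkgr method names may vary by version):

```python
from abx_plugin_chrome.binaries import CHROME_BINARY, ChromeBinary

# tries puppeteer, env, playwright, apt, brew in order until one
# can find (or install) a working chrome/chromium binary
chrome = CHROME_BINARY.load_or_install()
print(chrome.abspath, chrome.version)

# then expose the resolved binary under the lib bin dir for other tools
ChromeBinary.symlink_to_lib(chrome)
```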
 
 
 

+ 13 - 13
archivebox/plugins_extractor/chrome/config.py → archivebox/pkgs/abx-plugin-chrome/abx_plugin_chrome/config.py

@@ -1,5 +1,3 @@
-__package__ = 'plugins_extractor.chrome'
-
 import os
 from pathlib import Path
 from typing import List, Optional
@@ -7,8 +5,8 @@ from typing import List, Optional
 from pydantic import Field
 from pydantic_pkgr import bin_abspath
 
-from abx.archivebox.base_configset import BaseConfigSet
-from abx.archivebox.base_binary import env
+from abx_spec_config.base_configset import BaseConfigSet
+from abx_plugin_default_binproviders import env
 
 from archivebox.config import CONSTANTS
 from archivebox.config.common import ARCHIVING_CONFIG, SHELL_CONFIG
@@ -81,15 +79,16 @@ class ChromeConfig(BaseConfigSet):
     # Chrome Binary
     CHROME_BINARY: str                      = Field(default='chrome')
     CHROME_DEFAULT_ARGS: List[str]          = Field(default=[
-        '--virtual-time-budget=15000',
-        '--disable-features=DarkMode',
-        "--run-all-compositor-stages-before-draw",
-        "--hide-scrollbars",
-        "--autoplay-policy=no-user-gesture-required",
-        "--no-first-run",
-        "--use-fake-ui-for-media-stream",
-        "--use-fake-device-for-media-stream",
-        "--simulate-outdated-no-au='Tue, 31 Dec 2099 23:59:59 GMT'",
+        "--no-first-run",                                              # dont show any first run ui / setup prompts
+        '--virtual-time-budget=15000',                                 # accellerate any animations on the page by 15s into the future
+        '--disable-features=DarkMode',                                 # disable dark mode for archiving
+        "--run-all-compositor-stages-before-draw",                     # dont draw partially rendered content, wait until everything is ready
+        "--hide-scrollbars",                                           # hide scrollbars to prevent layout shift / scrollbar visible in screenshots
+        "--autoplay-policy=no-user-gesture-required",                  # allow media autoplay without user gesture (e.g. on mobile)
+        "--use-fake-ui-for-media-stream",                              # provide fake camera if site tries to request camera access
+        "--use-fake-device-for-media-stream",                          # provide fake camera if site tries to request camera access
+        "--simulate-outdated-no-au='Tue, 31 Dec 2099 23:59:59 GMT'",   # ignore chrome updates
+        "--force-gpu-mem-available-mb=4096",                           # allows for longer full page screenshots https://github.com/puppeteer/puppeteer/issues/5530
     ])
     CHROME_EXTRA_ARGS: List[str]           = Field(default=[])
     
@@ -196,6 +195,7 @@ class ChromeConfig(BaseConfigSet):
             cmd_args.append('--user-data-dir={}'.format(options.CHROME_USER_DATA_DIR))
             cmd_args.append('--profile-directory={}'.format(options.CHROME_PROFILE_NAME or 'Default'))
         
+            # if CHROME_USER_DATA_DIR is set but folder is empty, create a new profile inside it
             if not os.path.isfile(options.CHROME_USER_DATA_DIR / options.CHROME_PROFILE_NAME / 'Preferences'):
                 STDERR.print(f'[green]        + creating new Chrome profile in: {pretty_path(options.CHROME_USER_DATA_DIR / options.CHROME_PROFILE_NAME)}[/green]')
                 cmd_args.remove('--no-first-run')
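
Since args are merged per flag key, `CHROME_EXTRA_ARGS` can override any entry in `CHROME_DEFAULT_ARGS` rather than only append. A sketch of the merge using `dedupe()` from misc/util.py (assuming defaults are combined before extras, as the field names suggest):

```python
from archivebox.misc.util import dedupe

defaults = ['--virtual-time-budget=15000', '--hide-scrollbars']
extras   = ['--virtual-time-budget=60000']    # e.g. user-set CHROME_EXTRA_ARGS

# later values win per flag key, so the user's budget replaces the default
assert dedupe([*defaults, *extras]) == ['--virtual-time-budget=60000', '--hide-scrollbars']
```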

+ 0 - 0
archivebox/extractors/dom.py → archivebox/pkgs/abx-plugin-chrome/abx_plugin_chrome/dom.py


+ 0 - 0
archivebox/extractors/pdf.py → archivebox/pkgs/abx-plugin-chrome/abx_plugin_chrome/pdf.py


+ 0 - 0
archivebox/extractors/screenshot.py → archivebox/pkgs/abx-plugin-chrome/abx_plugin_chrome/screenshot.py


+ 18 - 0
archivebox/pkgs/abx-plugin-chrome/pyproject.toml

@@ -0,0 +1,18 @@
+[project]
+name = "abx-plugin-chrome"
+version = "2024.10.28"
+description = "Add your description here"
+readme = "README.md"
+requires-python = ">=3.10"
+dependencies = [
+    "abx>=0.1.0",
+    "abx-spec-config>=0.1.0",
+    "abx-spec-pydantic-pkgr>=0.1.0",
+]
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[project.entry-points.abx]
+abx_plugin_chrome = "abx_plugin_chrome"

+ 0 - 0
archivebox/pkgs/abx-plugin-curl/README.md


+ 18 - 0
archivebox/pkgs/abx-plugin-curl/abx_plugin_curl/__init__.py

@@ -0,0 +1,18 @@
+import abx
+
+
+@abx.hookimpl
+def get_CONFIG():
+    from .config import CURL_CONFIG
+    
+    return {
+        'curl': CURL_CONFIG
+    }
+
+@abx.hookimpl
+def get_BINARIES():
+    from .binaries import CURL_BINARY
+    
+    return {
+        'curl': CURL_BINARY,
+    }

+ 4 - 4
archivebox/plugins_extractor/curl/binaries.py → archivebox/pkgs/abx-plugin-curl/abx_plugin_curl/binaries.py

@@ -1,17 +1,17 @@
-__package__ = 'plugins_extractor.curl'
+__package__ = 'abx_plugin_curl'
 
 from typing import List
 
 from pydantic import InstanceOf
-from pydantic_pkgr import BinProvider, BinName
+from pydantic_pkgr import BinProvider, BinName, Binary
 
-from abx.archivebox.base_binary import BaseBinary, env, apt, brew
+from abx_plugin_default_binproviders import apt, brew, env
 
 
 from .config import CURL_CONFIG
 
 
-class CurlBinary(BaseBinary):
+class CurlBinary(Binary):
     name: BinName = CURL_CONFIG.CURL_BINARY
     binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
 

+ 2 - 2
archivebox/plugins_extractor/curl/config.py → archivebox/pkgs/abx-plugin-curl/abx_plugin_curl/config.py

@@ -1,11 +1,11 @@
-__package__ = 'plugins_extractor.curl'
+__package__ = 'abx_plugin_curl'
 
 from typing import List, Optional
 from pathlib import Path
 
 from pydantic import Field
 
-from abx.archivebox.base_configset import BaseConfigSet
+from abx_spec_config.base_configset import BaseConfigSet
 
 from archivebox.config.common import ARCHIVING_CONFIG
 

+ 0 - 0
archivebox/extractors/headers.py → archivebox/pkgs/abx-plugin-curl/abx_plugin_curl/headers.py


+ 18 - 0
archivebox/pkgs/abx-plugin-curl/pyproject.toml

@@ -0,0 +1,18 @@
+[project]
+name = "abx-plugin-curl"
+version = "2024.10.24"
+description = "Add your description here"
+readme = "README.md"
+requires-python = ">=3.10"
+dependencies = [
+    "abx>=0.1.0",
+    "abx-spec-config>=0.1.0",
+    "abx-spec-pydantic-pkgr>=0.1.0",
+]
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[project.entry-points.abx]
+abx_plugin_curl = "abx_plugin_curl"

+ 0 - 0
archivebox/pkgs/abx-plugin-default-binproviders/README.md


+ 23 - 0
archivebox/pkgs/abx-plugin-default-binproviders/abx_plugin_default_binproviders.py

@@ -0,0 +1,23 @@
+
+import abx
+
+from typing import Dict
+
+from pydantic_pkgr import (
+    AptProvider,
+    BrewProvider,
+    EnvProvider,
+    BinProvider,
+)
+apt = APT_BINPROVIDER = AptProvider()
+brew = BREW_BINPROVIDER = BrewProvider()
+env = ENV_BINPROVIDER = EnvProvider()
+
+
+@abx.hookimpl(tryfirst=True)
+def get_BINPROVIDERS() -> Dict[str, BinProvider]:
+    return {
+        'apt': APT_BINPROVIDER,
+        'brew': BREW_BINPROVIDER,
+        'env': ENV_BINPROVIDER,
+    }
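
`tryfirst=True` makes this hookimpl run before other `get_BINPROVIDERS` implementations, so the stock apt/brew/env providers land first in pluggy's results list and plugin-provided providers can override them when the results are merged in order. A sketch of that merge (assuming `abx.pm` exposes this hook like the others above):

```python
import abx

BINPROVIDERS = {}
for result in abx.pm.hook.get_BINPROVIDERS():   # defaults come first thanks to tryfirst=True
    BINPROVIDERS.update(result)                 # same-named providers from plugins override them

apt = BINPROVIDERS['apt']
```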

+ 18 - 0
archivebox/pkgs/abx-plugin-default-binproviders/pyproject.toml

@@ -0,0 +1,18 @@
+[project]
+name = "abx-plugin-default-binproviders"
+version = "2024.10.24"
+description = "Default BinProviders for ABX (apt, brew, env)"
+readme = "README.md"
+requires-python = ">=3.10"
+dependencies = [
+    "abx>=0.1.0",
+    "pydantic-pkgr>=0.5.4",
+    "abx-spec-pydantic-pkgr>=0.1.0",
+]
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[project.entry-points.abx]
+abx_plugin_default_binproviders = "abx_plugin_default_binproviders"

+ 0 - 0
archivebox/pkgs/abx-plugin-favicon/README.md


+ 29 - 0
archivebox/pkgs/abx-plugin-favicon/abx_plugin_favicon/__init__.py

@@ -0,0 +1,29 @@
+__label__ = 'Favicon'
+__version__ = '2024.10.24'
+__author__ = 'ArchiveBox'
+__homepage__ = 'https://github.com/ArchiveBox/archivebox'
+__dependencies__ = [
+    'abx>=0.1.0',
+    'abx-spec-config>=0.1.0',
+    'abx-plugin-curl>=2024.10.24',
+]
+
+import abx
+
+
+@abx.hookimpl
+def get_CONFIG():
+    from .config import FAVICON_CONFIG
+    
+    return {
+        'FAVICON_CONFIG': FAVICON_CONFIG
+    }
+
+
+# @abx.hookimpl
+# def get_EXTRACTORS():
+#     from .extractors import FAVICON_EXTRACTOR
+    
+#     return {
+#         'favicon': FAVICON_EXTRACTOR,
+#     }

+ 1 - 4
archivebox/plugins_extractor/favicon/config.py → archivebox/pkgs/abx-plugin-favicon/abx_plugin_favicon/config.py

@@ -1,7 +1,4 @@
-__package__ = 'plugins_extractor.favicon'
-
-
-from abx.archivebox.base_configset import BaseConfigSet
+from abx_spec_config.base_configset import BaseConfigSet
 
 
 class FaviconConfig(BaseConfigSet):

+ 0 - 0
archivebox/extractors/favicon.py → archivebox/pkgs/abx-plugin-favicon/abx_plugin_favicon/favicon.py


+ 18 - 0
archivebox/pkgs/abx-plugin-favicon/pyproject.toml

@@ -0,0 +1,18 @@
+[project]
+name = "abx-plugin-favicon"
+version = "2024.10.28"
+description = "Add your description here"
+readme = "README.md"
+requires-python = ">=3.10"
+dependencies = [
+    "abx>=0.1.0",
+    "abx-spec-config>=0.1.0",
+    "abx-plugin-curl>=2024.10.28",
+]
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[project.entry-points.abx]
+abx_plugin_favicon = "abx_plugin_favicon"

+ 0 - 0
archivebox/pkgs/abx-plugin-git/README.md


+ 29 - 0
archivebox/pkgs/abx-plugin-git/abx_plugin_git/__init__.py

@@ -0,0 +1,29 @@
+__package__ = 'abx_plugin_git'
+__label__ = 'Git'
+
+import abx
+
+
+@abx.hookimpl
+def get_CONFIG():
+    from .config import GIT_CONFIG
+    
+    return {
+        'GIT_CONFIG': GIT_CONFIG
+    }
+
+@abx.hookimpl
+def get_BINARIES():
+    from .binaries import GIT_BINARY
+    
+    return {
+        'git': GIT_BINARY,
+    }
+
+@abx.hookimpl
+def get_EXTRACTORS():
+    from .extractors import GIT_EXTRACTOR
+    
+    return {
+        'git': GIT_EXTRACTOR,
+    }

+ 4 - 4
archivebox/plugins_extractor/git/binaries.py → archivebox/pkgs/abx-plugin-git/abx_plugin_git/binaries.py

@@ -1,17 +1,17 @@
-__package__ = 'plugins_extractor.git'
+__package__ = 'abx_plugin_git'
 
 from typing import List
 
 from pydantic import InstanceOf
-from pydantic_pkgr import BinProvider, BinName
+from pydantic_pkgr import BinProvider, BinName, Binary
 
-from abx.archivebox.base_binary import BaseBinary, env, apt, brew
+from abx_plugin_default_binproviders import apt, brew, env
 
 from .config import GIT_CONFIG
 
 
 
-class GitBinary(BaseBinary):
+class GitBinary(Binary):
     name: BinName = GIT_CONFIG.GIT_BINARY
     binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
 

+ 2 - 2
archivebox/plugins_extractor/git/config.py → archivebox/pkgs/abx-plugin-git/abx_plugin_git/config.py

@@ -1,10 +1,10 @@
-__package__ = 'plugins_extractor.git'
+__package__ = 'abx_plugin_git'
 
 from typing import List
 
 from pydantic import Field
 
-from abx.archivebox.base_configset import BaseConfigSet
+from abx_spec_config.base_configset import BaseConfigSet
 
 from archivebox.config.common import ARCHIVING_CONFIG
 

+ 15 - 0
archivebox/pkgs/abx-plugin-git/abx_plugin_git/extractors.py

@@ -0,0 +1,15 @@
+__package__ = 'abx_plugin_git'
+
+# from pathlib import Path
+
+# from .binaries import GIT_BINARY
+
+
+# class GitExtractor(BaseExtractor):
+#     name: ExtractorName = 'git'
+#     binary: str = GIT_BINARY.name
+
+#     def get_output_path(self, snapshot) -> Path | None:
+#         return snapshot.as_link() / 'git'
+
+# GIT_EXTRACTOR = GitExtractor()

+ 2 - 2
archivebox/extractors/git.py → archivebox/pkgs/abx-plugin-git/abx_plugin_git/git.py

@@ -16,8 +16,8 @@ from archivebox.misc.util import (
 from ..logging_util import TimedProgress
 from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
 
-from archivebox.plugins_extractor.git.config import GIT_CONFIG
-from archivebox.plugins_extractor.git.binaries import GIT_BINARY
+from abx_plugin_git.config import GIT_CONFIG
+from abx_plugin_git.binaries import GIT_BINARY
 
 
 def get_output_path():

+ 19 - 0
archivebox/pkgs/abx-plugin-git/pyproject.toml

@@ -0,0 +1,19 @@
+[project]
+name = "abx-plugin-git"
+version = "2024.10.28"
+description = "Add your description here"
+readme = "README.md"
+requires-python = ">=3.10"
+dependencies = [
+    "abx>=0.1.0",
+    "abx-spec-config>=0.1.0",
+    "abx-spec-pydantic-pkgr>=0.1.0",
+    "abx-plugin-default-binproviders>=2024.10.24",
+]
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[project.entry-points.abx]
+abx_plugin_git = "abx_plugin_git"

+ 0 - 0
archivebox/pkgs/abx-plugin-htmltotext/README.md


+ 22 - 0
archivebox/pkgs/abx-plugin-htmltotext/abx_plugin_htmltotext/__init__.py

@@ -0,0 +1,22 @@
+__package__ = 'abx_plugin_htmltotext'
+__label__ = 'HTML-to-Text'
+
+import abx
+
+
+@abx.hookimpl
+def get_CONFIG():
+    from .config import HTMLTOTEXT_CONFIG
+    
+    return {
+        'HTMLTOTEXT_CONFIG': HTMLTOTEXT_CONFIG
+    }
+
+
+# @abx.hookimpl
+# def get_EXTRACTORS():
+#     from .extractors import HTMLTOTEXT_EXTRACTOR
+    
+#     return {
+#         'htmltotext': HTMLTOTEXT_EXTRACTOR,
+#     }

Some files were not shown because too many files changed in this diff