v0.8.+: Massive speed improvements for Admin UI & common queries, v3 plugins progress, and bugfixes (#1498)

Nick Sweeting 1 year ago
parent
commit
43e87ef437
66 changed files with 1150 additions and 1100 deletions
  1. +3 -0     .gitmodules
  2. +18 -1    archivebox/abid_utils/models.py
  3. +0 -0     archivebox/builtin_plugins/__init__.py
  4. +0 -0     archivebox/builtin_plugins/base/__init__.py
  5. +3 -0     archivebox/builtin_plugins/base/admin.py
  6. +83 -0    archivebox/builtin_plugins/base/apps.py
  7. +0 -0     archivebox/builtin_plugins/base/migrations/__init__.py
  8. +3 -0     archivebox/builtin_plugins/base/models.py
  9. +3 -0     archivebox/builtin_plugins/base/tests.py
  10. +3 -0    archivebox/builtin_plugins/base/views.py
  11. +0 -0    archivebox/builtin_plugins/singlefile/__init__.py
  12. +113 -0  archivebox/builtin_plugins/singlefile/apps.py
  13. +66 -0   archivebox/builtin_plugins/singlefile/config.yaml
  14. +3 -0    archivebox/builtin_plugins/singlefile/tests.py
  15. +28 -15  archivebox/config.py
  16. +108 -25 archivebox/core/admin.py
  17. +1 -1    archivebox/core/forms.py
  18. +2 -2    archivebox/core/migrations/0027_update_snapshot_ids.py
  19. +1 -1    archivebox/core/migrations/0034_alter_archiveresult_old_id_alter_archiveresult_uuid.py
  20. +1 -1    archivebox/core/migrations/0051_snapshottag_snapshot_alter_snapshottag_snapshot_old.py
  21. +1 -1    archivebox/core/migrations/0059_tag_id.py
  22. +1 -1    archivebox/core/migrations/0063_snapshottag_tag_alter_snapshottag_old_tag.py
  23. +35 -0   archivebox/core/migrations/0069_alter_archiveresult_created_alter_snapshot_added_and_more.py
  24. +68 -36  archivebox/core/models.py
  25. +126 -80 archivebox/core/settings.py
  26. +1 -1    archivebox/extractors/__init__.py
  27. +12 -1   archivebox/extractors/wget.py
  28. +6 -6    archivebox/index/__init__.py
  29. +9 -1    archivebox/index/html.py
  30. +3 -1    archivebox/index/sql.py
  31. +6 -5    archivebox/main.py
  32. +10 -10  archivebox/package-lock.json
  33. +1 -1    archivebox/parsers/pocket_api.py
  34. +0 -0    archivebox/pkg/__init__.py
  35. +3 -0    archivebox/pkg/admin.py
  36. +14 -0   archivebox/pkg/apps.py
  37. +0 -0    archivebox/pkg/management/__init__.py
  38. +0 -0    archivebox/pkg/management/commands/__init__.py
  39. +75 -0   archivebox/pkg/management/commands/pkg.py
  40. +0 -0    archivebox/pkg/migrations/__init__.py
  41. +3 -0    archivebox/pkg/models.py
  42. +86 -0   archivebox/pkg/settings.py
  43. +3 -0    archivebox/pkg/tests.py
  44. +3 -0    archivebox/pkg/views.py
  45. +0 -1    archivebox/plugantic/__init__.py
  46. +11 -0   archivebox/plugantic/apps.py
  47. +22 -280 archivebox/plugantic/binaries.py
  48. +0 -561  archivebox/plugantic/binproviders.py
  49. +1 -1    archivebox/plugantic/extractors.py
  50. +0 -12   archivebox/plugantic/plugins.py
  51. +0 -1    archivebox/plugantic/replayers.py
  52. +48 -7   archivebox/plugantic/views.py
  53. +1 -2    archivebox/system.py
  54. +4 -8    archivebox/util.py
  55. +34 -0   archivebox/vendor/__init__.py
  56. +0 -1    archivebox/vendor/atomicwrites.py
  57. +0 -1    archivebox/vendor/base32_crockford.py
  58. +0 -1    archivebox/vendor/package-lock.json
  59. +0 -1    archivebox/vendor/package.json
  60. +0 -1    archivebox/vendor/pocket.py
  61. +1 -0    archivebox/vendor/pydantic-pkgr
  62. +0 -1    archivebox/vendor/taggit_utils.py
  63. +9 -9    package-lock.json
  64. +92 -11  pdm.lock
  65. +14 -8   pyproject.toml
  66. +8 -3    requirements.txt

+ 3 - 0
.gitmodules

@@ -26,3 +26,6 @@
 [submodule "archivebox/vendor/python-atomicwrites"]
 	path = archivebox/vendor/python-atomicwrites
 	url = https://github.com/untitaker/python-atomicwrites
+[submodule "archivebox/vendor/pydantic-pkgr"]
+	path = archivebox/vendor/pydantic-pkgr
+	url = https://github.com/ArchiveBox/pydantic-pkgr

+ 18 - 1
archivebox/abid_utils/models.py

@@ -61,6 +61,11 @@ def get_or_create_system_user_pk(username='system'):
     return user.pk


+class AutoDateTimeField(models.DateTimeField):
+    def pre_save(self, model_instance, add):
+        return timezone.now()
+
+
 class ABIDModel(models.Model):
     """
     Abstract Base Model for other models to depend on. Provides ArchiveBox ID (ABID) interface.
@@ -76,13 +81,16 @@ class ABIDModel(models.Model):
     abid = ABIDField(prefix=abid_prefix)

     created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk)
-    created = models.DateTimeField(auto_now_add=True)
+    created = AutoDateTimeField(default=timezone.now, db_index=True)
     modified = models.DateTimeField(auto_now=True)

     class Meta(TypedModelMeta):
         abstract = True

     def save(self, *args: Any, **kwargs: Any) -> None:
+        if self._state.adding or not self.created:
+            self.created = timezone.now()
+
         # when first creating a row, self.ABID is the source of truth
         # overwrite default prefilled self.id & self.abid with generated self.ABID value
         if self._state.adding or not self.id:
@@ -93,6 +101,7 @@ class ABIDModel(models.Model):
         super().save(*args, **kwargs)
         assert str(self.id) == str(self.ABID.uuid), f'self.id {self.id} does not match self.ABID {self.ABID.uuid}'
         assert str(self.abid) == str(self.ABID), f'self.abid {self.id} does not match self.ABID {self.ABID.uuid}'
+        assert str(self.uuid) == str(self.ABID.uuid), f'self.uuid ({self.uuid}) does not match .ABID.uuid ({self.ABID.uuid})'

     @property
     def abid_values(self) -> Dict[str, Any]:
@@ -186,6 +195,14 @@ class ABIDModel(models.Model):
         Get a uuid.UUID (v4) representation of the object's ABID.
         """
         return self.ABID.uuid
+
+    @property
+    def uuid(self) -> str:
+        """
+        Get a str uuid.UUID (v4) representation of the object's ABID.
+        """
+        assert str(self.id) == str(self.ABID.uuid)
+        return str(self.id)

     @property
     def TypeID(self) -> TypeID:
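Note: the new uuid property and the extra save() assertion pin down one invariant: id, abid, and uuid are all derived from the same ABID. A minimal sketch of what any concrete ABIDModel subclass (Snapshot is used here hypothetically) now guarantees after a save:

    # hypothetical usage, assuming a concrete ABIDModel subclass like Snapshot
    snap = Snapshot.objects.create(url='https://example.com')
    assert snap.uuid == str(snap.id) == str(snap.ABID.uuid)
    assert str(snap.abid) == str(snap.ABID)
    # `created` is stamped once at insert time; `modified` refreshes on every save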

+ 0 - 0
archivebox/builtin_plugins/__init__.py


+ 0 - 0
archivebox/builtin_plugins/base/__init__.py


+ 3 - 0
archivebox/builtin_plugins/base/admin.py

@@ -0,0 +1,3 @@
+from django.contrib import admin
+
+# Register your models here.

+ 83 - 0
archivebox/builtin_plugins/base/apps.py

@@ -0,0 +1,83 @@
+import sys
+import inspect
+from typing import List, Dict, Any, Optional
+from pathlib import Path
+
+import django
+from django.apps import AppConfig
+from django.core.checks import Tags, Warning, register
+from django.db.backends.sqlite3.base import Database as sqlite3
+
+from pydantic import (
+    Field,
+    SerializeAsAny,
+)
+
+from pydantic_pkgr import SemVer, BinProvider, BinProviderName, ProviderLookupDict, BinName, Binary, EnvProvider, NpmProvider
+
+from plugantic.extractors import Extractor, ExtractorName
+from plugantic.plugins import Plugin
+from plugantic.configs import ConfigSet, ConfigSectionName
+from plugantic.replayers import Replayer
+
+
+class PythonBinary(Binary):
+    name: BinName = 'python'
+
+    providers_supported: List[BinProvider] = [EnvProvider()]
+    provider_overrides: Dict[str, Any] = {
+        'env': {
+            'subdeps': \
+                lambda: 'python3 python3-minimal python3-pip python3-virtualenv',
+            'abspath': \
+                lambda: sys.executable,
+            'version': \
+                lambda: '{}.{}.{}'.format(*sys.version_info[:3]),
+        },
+    }
+
+class SqliteBinary(Binary):
+    name: BinName = 'sqlite'
+    providers_supported: List[BinProvider] = [EnvProvider()]
+    provider_overrides:  Dict[BinProviderName, ProviderLookupDict] = {
+        'env': {
+            'abspath': \
+                lambda: Path(inspect.getfile(sqlite3)),
+            'version': \
+                lambda: SemVer(sqlite3.version),
+        },
+    }
+
+
+class DjangoBinary(Binary):
+    name: BinName = 'django'
+
+    providers_supported: List[BinProvider] = [EnvProvider()]
+    provider_overrides:  Dict[BinProviderName, ProviderLookupDict] = {
+        'env': {
+            'abspath': \
+                lambda: inspect.getfile(django),
+            'version': \
+                lambda: django.VERSION[:3],
+        },
+    }
+
+
+class BasicReplayer(Replayer):
+    name: str = 'basic'
+
+
+class BasePlugin(Plugin):
+    name: str = 'base'
+    configs: List[SerializeAsAny[ConfigSet]] = []
+    binaries: List[SerializeAsAny[Binary]] = [PythonBinary(), SqliteBinary(), DjangoBinary()]
+    extractors: List[SerializeAsAny[Extractor]] = []
+    replayers: List[SerializeAsAny[Replayer]] = [BasicReplayer()]
+
+
+PLUGINS = [BasePlugin()]
+
+
+class BaseConfig(AppConfig):
+    default_auto_field = 'django.db.models.BigAutoField'
+    name = 'builtin_plugins.base'
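Note: each Binary here is declarative, and the provider override lambdas are plain callables, so a plugin's metadata can be inspected without shelling out. A small sketch based only on the fields defined above:

    python_bin = PythonBinary()
    assert python_bin.name == 'python'
    env_overrides = python_bin.provider_overrides['env']
    print(env_overrides['abspath']())    # e.g. /usr/bin/python3 (sys.executable)
    print(env_overrides['version']())    # e.g. '3.11.9' (from sys.version_info)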

+ 0 - 0
archivebox/builtin_plugins/base/migrations/__init__.py


+ 3 - 0
archivebox/builtin_plugins/base/models.py

@@ -0,0 +1,3 @@
+from django.db import models
+
+# Create your models here.

+ 3 - 0
archivebox/builtin_plugins/base/tests.py

@@ -0,0 +1,3 @@
+from django.test import TestCase
+
+# Create your tests here.

+ 3 - 0
archivebox/builtin_plugins/base/views.py

@@ -0,0 +1,3 @@
+from django.shortcuts import render
+
+# Create your views here.

+ 0 - 0
archivebox/builtin_plugins/singlefile/__init__.py


+ 113 - 0
archivebox/builtin_plugins/singlefile/apps.py

@@ -0,0 +1,113 @@
+from typing import List, Optional, Dict
+from pathlib import Path
+
+from django.apps import AppConfig
+from django.core.checks import Tags, Warning, register
+
+from pydantic import (
+    Field,
+    SerializeAsAny,
+)
+
+from pydantic_pkgr import BinProvider, BinName, Binary, EnvProvider, NpmProvider
+from pydantic_pkgr.binprovider import bin_abspath
+from pydantic_pkgr.binary import BinProviderName, ProviderLookupDict
+
+from plugantic.extractors import Extractor, ExtractorName
+from plugantic.plugins import Plugin
+from plugantic.configs import ConfigSet, ConfigSectionName
+
+from pkg.settings import env
+
+
+###################### Config ##########################
+
+class SinglefileToggleConfig(ConfigSet):
+    section: ConfigSectionName = 'ARCHIVE_METHOD_TOGGLES'
+
+    SAVE_SINGLEFILE: bool = True
+
+
+class SinglefileDependencyConfig(ConfigSet):
+    section: ConfigSectionName = 'DEPENDENCY_CONFIG'
+
+    SINGLEFILE_BINARY: str = Field(default='wget')
+    SINGLEFILE_ARGS: Optional[List[str]] = Field(default=None)
+    SINGLEFILE_EXTRA_ARGS: List[str] = []
+    SINGLEFILE_DEFAULT_ARGS: List[str] = ['--timeout={TIMEOUT-10}']
+
+class SinglefileOptionsConfig(ConfigSet):
+    section: ConfigSectionName = 'ARCHIVE_METHOD_OPTIONS'
+
+    # loaded from shared config
+    SINGLEFILE_USER_AGENT: str = Field(default='', alias='USER_AGENT')
+    SINGLEFILE_TIMEOUT: int = Field(default=60, alias='TIMEOUT')
+    SINGLEFILE_CHECK_SSL_VALIDITY: bool = Field(default=True, alias='CHECK_SSL_VALIDITY')
+    SINGLEFILE_RESTRICT_FILE_NAMES: str = Field(default='windows', alias='RESTRICT_FILE_NAMES')
+    SINGLEFILE_COOKIES_FILE: Optional[Path] = Field(default=None, alias='COOKIES_FILE')
+
+
+
+DEFAULT_CONFIG = {
+    'CHECK_SSL_VALIDITY': False,
+    'SAVE_SINGLEFILE': True,
+    'TIMEOUT': 120,
+}
+
+PLUGIN_CONFIG = [
+    SinglefileToggleConfig(**DEFAULT_CONFIG),
+    SinglefileDependencyConfig(**DEFAULT_CONFIG),
+    SinglefileOptionsConfig(**DEFAULT_CONFIG),
+]
+
+###################### Binaries ############################
+
+min_version: str = "1.1.54"
+max_version: str = "2.0.0"
+
+class SinglefileBinary(Binary):
+    name: BinName = 'single-file'
+    providers_supported: List[BinProvider] = [NpmProvider()]
+
+
+    provider_overrides: Dict[BinProviderName, ProviderLookupDict] ={
+        'env': {
+            'abspath': lambda: bin_abspath('single-file-node.js', PATH=env.PATH) or bin_abspath('single-file', PATH=env.PATH),
+        },
+        'npm': {
+            # 'abspath': lambda: bin_abspath('single-file', PATH=NpmProvider().PATH) or bin_abspath('single-file', PATH=env.PATH),
+            'subdeps': lambda: f'single-file-cli@>={min_version} <{max_version}',
+        },
+    }
+
+
+###################### Extractors ##########################
+
+class SinglefileExtractor(Extractor):
+    name: ExtractorName = 'singlefile'
+    binary: Binary = SinglefileBinary()
+
+    def get_output_path(self, snapshot) -> Path:
+        return Path(snapshot.link_dir) / 'singlefile.html'
+
+
+###################### Plugins #############################
+
+
+class SinglefilePlugin(Plugin):
+    name: str = 'singlefile'
+    configs: List[SerializeAsAny[ConfigSet]] = [*PLUGIN_CONFIG]
+    binaries: List[SerializeAsAny[Binary]] = [SinglefileBinary()]
+    extractors: List[SerializeAsAny[Extractor]] = [SinglefileExtractor()]
+
+PLUGINS = [SinglefilePlugin()]
+
+###################### Django Apps #########################
+
+class SinglefileConfig(AppConfig):
+    name = 'builtin_plugins.singlefile'
+    verbose_name = 'SingleFile'
+
+    def ready(self):
+        pass
+        # print('Loaded singlefile plugin')
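Note: the alias= fields above are how shared config keys fan out to per-plugin options. Since ConfigSet fields are declared with pydantic Field(alias=...), passing the shared key populates the prefixed field, which is exactly what the DEFAULT_CONFIG splat does. A sketch:

    opts = SinglefileOptionsConfig(**{'TIMEOUT': 120, 'CHECK_SSL_VALIDITY': False})
    assert opts.SINGLEFILE_TIMEOUT == 120            # populated via alias='TIMEOUT'
    assert opts.SINGLEFILE_CHECK_SSL_VALIDITY is False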

+ 66 - 0
archivebox/builtin_plugins/singlefile/config.yaml

@@ -0,0 +1,66 @@
+name: singlefile
+plugin_version: '0.0.1'
+plugin_spec: '0.0.1'
+
+binaries:
+    singlefile:
+        providers:
+            - env
+            - npm
+
+commands:
+    - singlefile.exec
+    - singlefile.extract
+    - singlefile.should_extract
+    - singlefile.get_output_path
+
+extractors:
+    singlefile:
+        binary: singlefile
+        test: singlefile.should_extract
+        extract: singlefile.extract
+        output_files:
+            - singlefile.html
+
+configs:
+    ARCHIVE_METHOD_TOGGLES:
+        SAVE_SINGLEFILE:
+            type: bool
+            default: true
+
+    DEPENDENCY_CONFIG:
+        SINGLEFILE_BINARY:
+            type: str
+            default: wget
+        SINGLEFILE_ARGS:
+            type: Optional[List[str]]
+            default: null
+        SINGLEFILE_EXTRA_ARGS:
+            type: List[str]
+            default: []
+        SINGLEFILE_DEFAULT_ARGS:
+            type: List[str]
+            default: 
+            - "--timeout={TIMEOUT-10}"
+
+    ARCHIVE_METHOD_OPTIONS:
+        SINGLEFILE_USER_AGENT:
+            type: str
+            default: ""
+            alias: USER_AGENT
+        SINGLEFILE_TIMEOUT:
+            type: int
+            default: 60
+            alias: TIMEOUT
+        SINGLEFILE_CHECK_SSL_VALIDITY:
+            type: bool
+            default: true
+            alias: CHECK_SSL_VALIDITY
+        SINGLEFILE_RESTRICT_FILE_NAMES:
+            type: str
+            default: windows
+            alias: RESTRICT_FILE_NAMES
+        SINGLEFILE_COOKIES_FILE:
+            type: Optional[Path]
+            default: null
+            alias: COOKIES_FILE
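Note: this YAML is a declarative mirror of the pydantic classes in apps.py above. A hypothetical sanity check (assuming PyYAML is available) showing how the structure parses:

    import yaml

    with open('archivebox/builtin_plugins/singlefile/config.yaml') as f:
        spec = yaml.safe_load(f)

    assert spec['binaries']['singlefile']['providers'] == ['env', 'npm']
    assert spec['configs']['DEPENDENCY_CONFIG']['SINGLEFILE_BINARY']['default'] == 'wget'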

+ 3 - 0
archivebox/builtin_plugins/singlefile/tests.py

@@ -0,0 +1,3 @@
+from django.test import TestCase
+
+# Create your tests here.

+ 28 - 15
archivebox/config.py

@@ -31,8 +31,6 @@ import getpass
 import platform
 import shutil
 import requests
-import django
-from sqlite3 import dbapi2 as sqlite3

 from hashlib import md5
 from pathlib import Path
@@ -43,6 +41,11 @@ from configparser import ConfigParser
 from collections import defaultdict
 import importlib.metadata

+from pydantic_pkgr import SemVer
+
+import django
+from django.db.backends.sqlite3.base import Database as sqlite3
+
 from .config_stubs import (
     AttrDict,
     SimpleConfigValueDict,
@@ -52,6 +55,11 @@ from .config_stubs import (
     ConfigDefaultDict,
 )

+# load fallback libraries from vendor dir
+from .vendor import load_vendored_libs
+load_vendored_libs()
+
+

 ############################### Config Schema ##################################

@@ -89,13 +97,13 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
         'SECRET_KEY':                {'type': str,   'default': None},
         'BIND_ADDR':                 {'type': str,   'default': lambda c: ['127.0.0.1:8000', '0.0.0.0:8000'][c['IN_DOCKER']]},
         'ALLOWED_HOSTS':             {'type': str,   'default': '*'},     # e.g. archivebox.example.com,archivebox2.example.com
-        'CSRF_TRUSTED_ORIGINS':      {'type': str,   'default': ''},      # e.g. https://archivebox.example.com,https://archivebox2.example.com:8080
+        'CSRF_TRUSTED_ORIGINS':      {'type': str,   'default': lambda c: 'http://localhost:8000,http://127.0.0.1:8000,http://0.0.0.0:8000,http://{}'.format(c['BIND_ADDR'])},   # e.g. https://archivebox.example.com,https://archivebox2.example.com:8080
         'DEBUG':                     {'type': bool,  'default': False},
         'PUBLIC_INDEX':              {'type': bool,  'default': True},
         'PUBLIC_SNAPSHOTS':          {'type': bool,  'default': True},
         'PUBLIC_ADD_VIEW':           {'type': bool,  'default': False},
         'FOOTER_INFO':               {'type': str,   'default': 'Content is hosted for personal archiving purposes only.  Contact server owner for any takedown requests.'},
-        'SNAPSHOTS_PER_PAGE':        {'type': int,   'default': 40},
+        'SNAPSHOTS_PER_PAGE':        {'type': int,   'default': 100},
         'CUSTOM_TEMPLATES_DIR':      {'type': str,   'default': None},
         'TIME_ZONE':                 {'type': str,   'default': 'UTC'},
         'TIMEZONE':                  {'type': str,   'default': 'UTC'},
@@ -565,7 +573,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
     'PYTHON_VERSION':           {'default': lambda c: '{}.{}.{}'.format(*sys.version_info[:3])},

     'DJANGO_BINARY':            {'default': lambda c: inspect.getfile(django)},
-    'DJANGO_VERSION':           {'default': lambda c: '{}.{}.{} {} ({})'.format(*django.VERSION)},
+    'DJANGO_VERSION':           {'default': lambda c: '{}.{}.{}'.format(*django.VERSION[:3])},

     'SQLITE_BINARY':            {'default': lambda c: inspect.getfile(sqlite3)},
     'SQLITE_VERSION':           {'default': lambda c: sqlite3.version},
@@ -902,16 +910,9 @@ def bin_version(binary: Optional[str], cmd: Optional[str]=None) -> Optional[str]
             version_str = run(cmd or [abspath, "--version"], shell=is_cmd_str, stdout=PIPE, stderr=STDOUT).stdout.strip().decode()

         # take first 3 columns of first line of version info
-        version_ptn = re.compile(r"\d+?\.\d+?\.?\d*", re.MULTILINE)
-        try:
-            version_nums = version_ptn.findall(version_str.split('\n')[0])[0]
-            if version_nums:
-                return version_nums
-            else:
-                raise IndexError
-        except IndexError:
-            # take first 3 columns of first line of version info
-            return ' '.join(version_str.split('\n')[0].strip().split()[:3])
+        semver = SemVer.parse(version_str)
+        if semver:
+            return str(semver)
     except OSError:
         pass
         # stderr(f'[X] Unable to find working version of dependency: {binary}', color='red')
@@ -1524,5 +1525,17 @@ def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG,
             assert sql_index_path.exists(), (
                 f'No database file {SQL_INDEX_FILENAME} found in: {config["OUTPUT_DIR"]} (Are you in an ArchiveBox collection directory?)')

+
+            # https://docs.pydantic.dev/logfire/integrations/django/ Logfire Debugging
+            if settings.DEBUG_LOGFIRE:
+                from opentelemetry.instrumentation.sqlite3 import SQLite3Instrumentor
+                SQLite3Instrumentor().instrument()
+
+                import logfire
+
+                logfire.configure()
+                logfire.instrument_django(is_sql_commentor_enabled=True)
+                logfire.info(f'Started ArchiveBox v{CONFIG.VERSION}', argv=sys.argv)
+
     except KeyboardInterrupt:
         raise SystemExit(2)
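Note: bin_version() now delegates version sniffing to pydantic-pkgr's SemVer instead of a hand-rolled regex. A sketch of the intended behavior (the exact parsed value shown is an assumption based on the old regex's purpose):

    from pydantic_pkgr import SemVer

    # pulls the first x.y.z-looking token out of raw --version output
    print(SemVer.parse('GNU Wget 1.21.4 built on linux-gnu.'))   # -> 1.21.4 (assumed)
    print(SemVer('2.0.1') >= SemVer('1.1.54'))                   # comparable like a tuple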

+ 108 - 25
archivebox/core/admin.py

@@ -10,12 +10,15 @@ from datetime import datetime, timezone
 from typing import Dict, Any

 from django.contrib import admin
-from django.db.models import Count, Q
-from django.urls import path, reverse
+from django.db.models import Count, Q, Prefetch
+from django.urls import path, reverse, resolve
+from django.utils import timezone
+from django.utils.functional import cached_property
 from django.utils.html import format_html
 from django.utils.safestring import mark_safe
 from django.shortcuts import render, redirect
 from django.contrib.auth import get_user_model
+from django.core.paginator import Paginator
 from django.core.exceptions import ValidationError
 from django.conf import settings
 from django import forms
@@ -126,22 +129,99 @@ archivebox_admin.get_admin_data_urls = get_admin_data_urls.__get__(archivebox_ad
 archivebox_admin.get_urls = get_urls(archivebox_admin.get_urls).__get__(archivebox_admin, ArchiveBoxAdmin)


+class AccelleratedPaginator(Paginator):
+    """
+    AccelleratedPaginator ignores DISTINCT when counting the total number of rows.
+    Speeds up SELECT Count(*) on Admin views by >20x.
+    https://hakibenita.com/optimizing-the-django-admin-paginator
+    """
+
+    @cached_property
+    def count(self):
+        if self.object_list._has_filters():                             # type: ignore
+            # fallback to normal count method on filtered queryset
+            return super().count
+        else:
+            # otherwise count total rows in a separate fast query
+            return self.object_list.model.objects.count()
+
+        # Alternative approach for PostgreSQL: fallback count takes > 200ms
+        # from django.db import connection, transaction, OperationalError
+        # with transaction.atomic(), connection.cursor() as cursor:
+        #     cursor.execute('SET LOCAL statement_timeout TO 200;')
+        #     try:
+        #         return super().count
+        #     except OperationalError:
+        #         return 9999999999999
+
+
 class ArchiveResultInline(admin.TabularInline):
     name = 'Archive Results Log'
     model = ArchiveResult
+    parent_model = Snapshot
     # fk_name = 'snapshot'
-    extra = 1
-    readonly_fields = ('result_id', 'start_ts', 'end_ts', 'extractor', 'command', 'cmd_version')
-    fields = ('id', *readonly_fields, 'status', 'output')
+    extra = 0
+    sort_fields = ('end_ts', 'extractor', 'output', 'status', 'cmd_version')
+    readonly_fields = ('result_id', 'completed', 'extractor', 'command', 'version')
+    fields = ('id', 'start_ts', 'end_ts', *readonly_fields, 'cmd', 'cmd_version', 'pwd', 'created_by', 'status', 'output')
+    # exclude = ('id',)
+    ordering = ('end_ts',)
     show_change_link = True
     # # classes = ['collapse']
     # # list_display_links = ['abid']

+    def get_parent_object_from_request(self, request):
+        resolved = resolve(request.path_info)
+        return self.parent_model.objects.get(pk=resolved.kwargs['object_id'])
+
+    @admin.display(
+        description='Completed',
+        ordering='end_ts',
+    )
+    def completed(self, obj):
+        return format_html('<p style="white-space: nowrap">{}</p>', obj.end_ts.strftime('%Y-%m-%d %H:%M:%S'))
+
     def result_id(self, obj):
-        return format_html('<a href="{}"><small><code>[{}]</code></small></a>', reverse('admin:core_archiveresult_change', args=(obj.id,)), obj.abid)
+        return format_html('<a href="{}"><code style="font-size: 10px">[{}]</code></a>', reverse('admin:core_archiveresult_change', args=(obj.id,)), obj.abid)

     def command(self, obj):
         return format_html('<small><code>{}</code></small>', " ".join(obj.cmd or []))
+
+    def version(self, obj):
+        return format_html('<small><code>{}</code></small>', obj.cmd_version or '-')
+
+    def get_formset(self, request, obj=None, **kwargs):
+        formset = super().get_formset(request, obj, **kwargs)
+        snapshot = self.get_parent_object_from_request(request)
+
+        # import ipdb; ipdb.set_trace()
+        formset.form.base_fields['id'].widget = formset.form.base_fields['id'].hidden_widget()
+
+        # default values for new entries
+        formset.form.base_fields['status'].initial = 'succeeded'
+        formset.form.base_fields['start_ts'].initial = timezone.now()
+        formset.form.base_fields['end_ts'].initial = timezone.now()
+        formset.form.base_fields['cmd_version'].initial = '-'
+        formset.form.base_fields['pwd'].initial = str(snapshot.link_dir)
+        formset.form.base_fields['created_by'].initial = request.user
+        formset.form.base_fields['cmd'] = forms.JSONField(initial=['-'])
+        formset.form.base_fields['output'].initial = 'Manually recorded cmd output...'
+
+        if obj is not None:
+            # hidden values for existing entries and new entries
+            formset.form.base_fields['start_ts'].widget = formset.form.base_fields['start_ts'].hidden_widget()
+            formset.form.base_fields['end_ts'].widget = formset.form.base_fields['end_ts'].hidden_widget()
+            formset.form.base_fields['cmd'].widget = formset.form.base_fields['cmd'].hidden_widget()
+            formset.form.base_fields['pwd'].widget = formset.form.base_fields['pwd'].hidden_widget()
+            formset.form.base_fields['created_by'].widget = formset.form.base_fields['created_by'].hidden_widget()
+            formset.form.base_fields['cmd_version'].widget = formset.form.base_fields['cmd_version'].hidden_widget()
+        return formset
+
+    def get_readonly_fields(self, request, obj=None):
+        if obj is not None:
+            return self.readonly_fields
+        else:
+            return []


 class TagInline(admin.TabularInline):
@@ -222,25 +302,22 @@ def get_abid_info(self, obj):
 
 
 @admin.register(Snapshot, site=archivebox_admin)
 class SnapshotAdmin(SearchResultsAdminMixin, ABIDModelAdmin):
-    class Meta:
-        model = Snapshot
-
     list_display = ('added', 'title_str', 'files', 'size', 'url_str')
-    # list_editable = ('title',)
     sort_fields = ('title_str', 'url_str', 'added', 'files')
-    readonly_fields = ('tags', 'timestamp', 'admin_actions', 'status_info', 'bookmarked', 'added', 'updated', 'created', 'modified', 'API', 'link_dir')
+    readonly_fields = ('tags_str', 'timestamp', 'admin_actions', 'status_info', 'bookmarked', 'added', 'updated', 'created', 'modified', 'API', 'link_dir')
     search_fields = ('id', 'url', 'abid', 'old_id', 'timestamp', 'title', 'tags__name')
-    list_filter = ('added', 'updated', 'archiveresult__status', 'created_by', 'tags')
+    list_filter = ('added', 'updated', 'archiveresult__status', 'created_by', 'tags__name')
     fields = ('url', 'created_by', 'title', *readonly_fields)
     ordering = ['-added']
     actions = ['add_tags', 'remove_tags', 'update_titles', 'update_snapshots', 'resnapshot_snapshot', 'overwrite_snapshots', 'delete_snapshots']
-    autocomplete_fields = ['tags']
     inlines = [TagInline, ArchiveResultInline]
-    list_per_page = CONFIG.SNAPSHOTS_PER_PAGE
+    list_per_page = min(max(5, CONFIG.SNAPSHOTS_PER_PAGE), 5000)

     action_form = SnapshotActionForm
+    paginator = AccelleratedPaginator

     save_on_top = True
+    show_full_result_count = False

     def changelist_view(self, request, extra_context=None):
         extra_context = extra_context or {}
@@ -286,12 +363,15 @@ class SnapshotAdmin(SearchResultsAdminMixin, ABIDModelAdmin):
         ]
         return custom_urls + urls

-    def get_queryset(self, request):
-        self.request = request
-        return super().get_queryset(request).prefetch_related('tags', 'archiveresult_set').annotate(archiveresult_count=Count('archiveresult'))
+    # def get_queryset(self, request):
+    #     # tags_qs = SnapshotTag.objects.all().select_related('tag')
+    #     # prefetch = Prefetch('snapshottag_set', queryset=tags_qs)
+
+    #     self.request = request
+    #     return super().get_queryset(request).prefetch_related('archiveresult_set').distinct()  # .annotate(archiveresult_count=Count('archiveresult'))

     def tag_list(self, obj):
-        return ', '.join(obj.tags.values_list('name', flat=True))
+        return ', '.join(tag.name for tag in obj.tags.all())

     # TODO: figure out a different way to do this, you cant nest forms so this doenst work
     # def action(self, obj):
@@ -360,21 +440,20 @@ class SnapshotAdmin(SearchResultsAdminMixin, ABIDModelAdmin):
         ordering='title',
     )
     def title_str(self, obj):
-        canon = obj.as_link().canonical_outputs()
         tags = ''.join(
-            format_html('<a href="/admin/core/snapshot/?tags__id__exact={}"><span class="tag">{}</span></a> ', tag.id, tag)
+            format_html('<a href="/admin/core/snapshot/?tags__id__exact={}"><span class="tag">{}</span></a> ', tag.pk, tag.name)
             for tag in obj.tags.all()
-            if str(tag).strip()
+            if str(tag.name).strip()
         )
         return format_html(
             '<a href="/{}">'
-                '<img src="/{}/{}" class="favicon" onerror="this.remove()">'
+                '<img src="/{}/favicon.ico" class="favicon" onerror="this.remove()">'
             '</a>'
             '<a href="/{}/index.html">'
                 '<b class="status-{}">{}</b>'
             '</a>',
             obj.archive_path,
-            obj.archive_path, canon['favicon_path'],
+            obj.archive_path,
             obj.archive_path,
             'fetched' if obj.latest_title or obj.title else 'pending',
             urldecode(htmldecode(obj.latest_title or obj.title or ''))[:128] or 'Pending...'
@@ -382,14 +461,14 @@ class SnapshotAdmin(SearchResultsAdminMixin, ABIDModelAdmin):
 
 
     @admin.display(
         description='Files Saved',
-        ordering='archiveresult_count',
+        # ordering='archiveresult_count',
     )
     def files(self, obj):
         return snapshot_icons(obj)


     @admin.display(
-        ordering='archiveresult_count'
+        # ordering='archiveresult_count'
     )
     def size(self, obj):
         archive_size = (Path(obj.link_dir) / 'index.html').exists() and obj.archive_size
@@ -536,6 +615,8 @@ class TagAdmin(ABIDModelAdmin):
     actions = ['delete_selected']
     ordering = ['-created']

+    paginator = AccelleratedPaginator
+
     def API(self, obj):
         try:
             return get_abid_info(self, obj)
@@ -574,6 +655,8 @@ class ArchiveResultAdmin(ABIDModelAdmin):
     list_filter = ('status', 'extractor', 'start_ts', 'cmd_version')
     ordering = ['-start_ts']
     list_per_page = CONFIG.SNAPSHOTS_PER_PAGE
+
+    paginator = AccelleratedPaginator

     @admin.display(
         description='Snapshot Info'
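Note: the AccelleratedPaginator + show_full_result_count pattern generalizes to any large-table admin; both are standard Django ModelAdmin hooks. A sketch of opting another model in (MyLargeTableAdmin is hypothetical):

    class MyLargeTableAdmin(admin.ModelAdmin):       # hypothetical admin class
        paginator = AccelleratedPaginator            # fast COUNT(*) when unfiltered
        show_full_result_count = False               # skip the second "X total" COUNT(*)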

+ 1 - 1
archivebox/core/forms.py

@@ -4,7 +4,7 @@ from django import forms
 
 
 from ..util import URL_REGEX
 from ..parsers import PARSERS
-from ..vendor.taggit_utils import edit_string_for_tags, parse_tags
+from taggit.utils import edit_string_for_tags, parse_tags

 PARSER_CHOICES = [
     (parser_key, parser[0])

+ 2 - 2
archivebox/core/migrations/0027_update_snapshot_ids.py

@@ -52,7 +52,7 @@ def update_snapshot_ids(apps, schema_editor):
     Snapshot = apps.get_model("core", "Snapshot")
     num_total = Snapshot.objects.all().count()
     print(f'   Updating {num_total} Snapshot.id, Snapshot.uuid values in place...')
-    for idx, snapshot in enumerate(Snapshot.objects.all().only('abid').iterator()):
+    for idx, snapshot in enumerate(Snapshot.objects.all().only('abid').iterator(chunk_size=500)):
         assert snapshot.abid
         snapshot.abid_prefix = 'snp_'
         snapshot.abid_ts_src = 'self.added'
@@ -72,7 +72,7 @@ def update_archiveresult_ids(apps, schema_editor):
     ArchiveResult = apps.get_model("core", "ArchiveResult")
     num_total = ArchiveResult.objects.all().count()
     print(f'   Updating {num_total} ArchiveResult.id, ArchiveResult.uuid values in place... (may take an hour or longer for large collections...)')
-    for idx, result in enumerate(ArchiveResult.objects.all().only('abid', 'snapshot_id').iterator()):
+    for idx, result in enumerate(ArchiveResult.objects.all().only('abid', 'snapshot_id').iterator(chunk_size=500)):
         assert result.abid
         result.abid_prefix = 'res_'
         result.snapshot = Snapshot.objects.get(pk=result.snapshot_id)
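Note: adding chunk_size=500 to these migration loops (here and in the four migrations below) keeps Django from materializing the whole table in memory; rows stream in batches of 500 as the loop advances. A sketch of the pattern:

    # without iterator(), the queryset caches every row before the loop starts
    for result in ArchiveResult.objects.only('abid').iterator(chunk_size=500):
        migrate_row(result)   # hypothetical per-row work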

+ 1 - 1
archivebox/core/migrations/0034_alter_archiveresult_old_id_alter_archiveresult_uuid.py

@@ -11,7 +11,7 @@ def update_archiveresult_ids(apps, schema_editor):
     ArchiveResult = apps.get_model("core", "ArchiveResult")
     num_total = ArchiveResult.objects.all().count()
     print(f'   Updating {num_total} ArchiveResult.id, ArchiveResult.uuid values in place... (may take an hour or longer for large collections...)')
-    for idx, result in enumerate(ArchiveResult.objects.all().only('abid').iterator()):
+    for idx, result in enumerate(ArchiveResult.objects.all().only('abid').iterator(chunk_size=500)):
         assert result.abid
         result.uuid = ABID.parse(result.abid).uuid
         result.save(update_fields=["uuid"])

+ 1 - 1
archivebox/core/migrations/0051_snapshottag_snapshot_alter_snapshottag_snapshot_old.py

@@ -9,7 +9,7 @@ def update_snapshottag_ids(apps, schema_editor):
     SnapshotTag = apps.get_model("core", "SnapshotTag")
     num_total = SnapshotTag.objects.all().count()
     print(f'   Updating {num_total} SnapshotTag.snapshot_id values in place... (may take an hour or longer for large collections...)')
-    for idx, snapshottag in enumerate(SnapshotTag.objects.all().only('snapshot_old_id').iterator()):
+    for idx, snapshottag in enumerate(SnapshotTag.objects.all().only('snapshot_old_id').iterator(chunk_size=500)):
         assert snapshottag.snapshot_old_id
         snapshot = Snapshot.objects.get(old_id=snapshottag.snapshot_old_id)
         snapshottag.snapshot_id = snapshot.id

+ 1 - 1
archivebox/core/migrations/0059_tag_id.py

@@ -49,7 +49,7 @@ def update_archiveresult_ids(apps, schema_editor):
     Tag = apps.get_model("core", "Tag")
     num_total = Tag.objects.all().count()
     print(f'   Updating {num_total} Tag.id, ArchiveResult.uuid values in place...')
-    for idx, tag in enumerate(Tag.objects.all().iterator()):
+    for idx, tag in enumerate(Tag.objects.all().iterator(chunk_size=500)):
         if not tag.slug:
             tag.slug = tag.name.lower().replace(' ', '_')
         if not tag.name:

+ 1 - 1
archivebox/core/migrations/0063_snapshottag_tag_alter_snapshottag_old_tag.py

@@ -9,7 +9,7 @@ def update_snapshottag_ids(apps, schema_editor):
     SnapshotTag = apps.get_model("core", "SnapshotTag")
     num_total = SnapshotTag.objects.all().count()
     print(f'   Updating {num_total} SnapshotTag.tag_id values in place... (may take an hour or longer for large collections...)')
-    for idx, snapshottag in enumerate(SnapshotTag.objects.all().only('old_tag_id').iterator()):
+    for idx, snapshottag in enumerate(SnapshotTag.objects.all().only('old_tag_id').iterator(chunk_size=500)):
         assert snapshottag.old_tag_id
         tag = Tag.objects.get(old_id=snapshottag.old_tag_id)
         snapshottag.tag_id = tag.id

+ 35 - 0
archivebox/core/migrations/0069_alter_archiveresult_created_alter_snapshot_added_and_more.py

@@ -0,0 +1,35 @@
+# Generated by Django 5.1 on 2024-08-28 09:40
+
+import abid_utils.models
+import django.utils.timezone
+from django.db import migrations
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('core', '0068_alter_archiveresult_options'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='archiveresult',
+            name='created',
+            field=abid_utils.models.AutoDateTimeField(db_index=True, default=django.utils.timezone.now),
+        ),
+        migrations.AlterField(
+            model_name='snapshot',
+            name='added',
+            field=abid_utils.models.AutoDateTimeField(db_index=True, default=django.utils.timezone.now),
+        ),
+        migrations.AlterField(
+            model_name='snapshot',
+            name='created',
+            field=abid_utils.models.AutoDateTimeField(db_index=True, default=django.utils.timezone.now),
+        ),
+        migrations.AlterField(
+            model_name='tag',
+            name='created',
+            field=abid_utils.models.AutoDateTimeField(db_index=True, default=django.utils.timezone.now),
+        ),
+    ]

+ 68 - 36
archivebox/core/models.py

@@ -12,6 +12,7 @@ from uuid import uuid4
 from pathlib import Path

 from django.db import models
+from django.utils import timezone
 from django.utils.functional import cached_property
 from django.utils.text import slugify
 from django.core.cache import cache
@@ -19,7 +20,7 @@ from django.urls import reverse, reverse_lazy
 from django.db.models import Case, When, Value, IntegerField
 from django.conf import settings

-from abid_utils.models import ABIDModel, ABIDField
+from abid_utils.models import ABIDModel, ABIDField, AutoDateTimeField

 from ..system import get_dir_size
 from ..util import parse_date, base_url
@@ -50,7 +51,7 @@ class Tag(ABIDModel):
     Based on django-taggit model + ABID base.
     """
     abid_prefix = 'tag_'
-    abid_ts_src = 'self.created'          # TODO: add created/modified time
+    abid_ts_src = 'self.created'
     abid_uri_src = 'self.slug'
     abid_subtype_src = '"03"'
     abid_rand_src = 'self.old_id'
@@ -60,7 +61,6 @@ class Tag(ABIDModel):
     id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False, unique=True)
     abid = ABIDField(prefix=abid_prefix)

-
     name = models.CharField(unique=True, blank=False, max_length=100)
     slug = models.SlugField(unique=True, blank=False, max_length=100, editable=False)
     # slug is autoset on save from name, never set it manually
@@ -125,6 +125,12 @@ class SnapshotTag(models.Model):
         db_table = 'core_snapshot_tags'
         unique_together = [('snapshot', 'tag')]

+
+class SnapshotManager(models.Manager):
+    def get_queryset(self):
+        return super().get_queryset().prefetch_related('tags', 'archiveresult_set')  # .annotate(archiveresult_count=models.Count('archiveresult')).distinct()
+
+
 class Snapshot(ABIDModel):
     abid_prefix = 'snp_'
     abid_ts_src = 'self.added'
@@ -143,16 +149,15 @@ class Snapshot(ABIDModel):
     
     
     tags = models.ManyToManyField(Tag, blank=True, through=SnapshotTag, related_name='snapshot_set', through_fields=('snapshot', 'tag'))

-    added = models.DateTimeField(auto_now_add=True, db_index=True)
+    added = AutoDateTimeField(default=timezone.now, db_index=True)
     updated = models.DateTimeField(auto_now=True, blank=True, null=True, db_index=True)

     keys = ('url', 'timestamp', 'title', 'tags', 'updated')

     archiveresult_set: models.Manager['ArchiveResult']

-    @property
-    def uuid(self):
-        return self.id
+    objects = SnapshotManager()
+

     def __repr__(self) -> str:
         title = (self.title_stripped or '-')[:64]
@@ -162,13 +167,6 @@ class Snapshot(ABIDModel):
         title = (self.title_stripped or '-')[:64]
         return f'[{self.timestamp}] {self.url[:64]} ({title})'

-    def save(self, *args, **kwargs):
-        super().save(*args, **kwargs)
-        try:
-            assert str(self.id) == str(self.ABID.uuid) == str(self.uuid), f'Snapshot.id ({self.id}) does not match .ABID.uuid ({self.ABID.uuid})'
-        except AssertionError as e:
-            print(e)
-
     @classmethod
     def from_json(cls, info: dict):
         info = {k: v for k, v in info.items() if k in cls.keys}
@@ -177,8 +175,7 @@ class Snapshot(ABIDModel):
     def as_json(self, *args) -> dict:
         args = args or self.keys
         return {
-            key: getattr(self, key)
-            if key != 'tags' else self.tags_str()
+            key: getattr(self, key) if key != 'tags' else self.tags_str(nocache=False)
             for key in args
         }

@@ -190,8 +187,14 @@ class Snapshot(ABIDModel):
         return load_link_details(self.as_link())

     def tags_str(self, nocache=True) -> str | None:
+        calc_tags_str = lambda: ','.join(sorted(tag.name for tag in self.tags.all()))
         cache_key = f'{self.pk}-{(self.updated or self.added).timestamp()}-tags'
-        calc_tags_str = lambda: ','.join(self.tags.order_by('name').values_list('name', flat=True))
+
+        if hasattr(self, '_prefetched_objects_cache') and 'tags' in self._prefetched_objects_cache:
+            # tags are pre-fetched already, use them directly (best because db is always freshest)
+            tags_str = calc_tags_str()
+            return tags_str
+
         if nocache:
             tags_str = calc_tags_str()
             cache.set(cache_key, tags_str)
@@ -234,7 +237,10 @@ class Snapshot(ABIDModel):
 
 
     @cached_property
     def num_outputs(self) -> int:
-        return self.archiveresult_set.filter(status='succeeded').count()
+        # DONT DO THIS: it will trigger a separate query for every snapshot
+        # return self.archiveresult_set.filter(status='succeeded').count()
+        # this is better:
+        return sum((1 for result in self.archiveresult_set.all() if result.status == 'succeeded'))

     @cached_property
     def base_url(self):
@@ -262,10 +268,21 @@ class Snapshot(ABIDModel):
 
 
     @cached_property
     def thumbnail_url(self) -> Optional[str]:
-        result = self.archiveresult_set.filter(
-            extractor='screenshot',
-            status='succeeded'
-        ).only('output').last()
+        if hasattr(self, '_prefetched_objects_cache') and 'archiveresult_set' in self._prefetched_objects_cache:
+            result = (sorted(
+                (
+                    result
+                    for result in self.archiveresult_set.all()
+                    if result.extractor == 'screenshot' and result.status =='succeeded' and result.output
+                ),
+                key=lambda result: result.created,
+            ) or [None])[-1]
+        else:
+            result = self.archiveresult_set.filter(
+                extractor='screenshot',
+                status='succeeded'
+            ).only('output').last()
+
         if result:
             return reverse('Snapshot', args=[f'{str(self.timestamp)}/{result.output}'])
         return None
@@ -292,6 +309,21 @@ class Snapshot(ABIDModel):
         if self.title:
             return self.title   # whoopdedoo that was easy

+        # check if ArchiveResult set has already been prefetched, if so use it instead of fetching it from db again
+        if hasattr(self, '_prefetched_objects_cache') and 'archiveresult_set' in self._prefetched_objects_cache:
+            try:
+                return (sorted(
+                    (
+                        result.output.strip()
+                        for result in self.archiveresult_set.all()
+                        if result.extractor == 'title' and result.status =='succeeded' and result.output
+                    ),
+                    key=lambda title: len(title),
+                ) or [None])[-1]
+            except IndexError:
+                pass
+
         try:
             # take longest successful title from ArchiveResult db history
             return sorted(
@@ -355,12 +387,23 @@ class Snapshot(ABIDModel):
 
 
 class ArchiveResultManager(models.Manager):
     def indexable(self, sorted: bool = True):
+        """Return only ArchiveResults containing text suitable for full-text search (sorted in order of typical result quality)"""
+
         INDEXABLE_METHODS = [ r[0] for r in ARCHIVE_METHODS_INDEXING_PRECEDENCE ]
-        qs = self.get_queryset().filter(extractor__in=INDEXABLE_METHODS,status='succeeded')
+        qs = self.get_queryset().filter(extractor__in=INDEXABLE_METHODS, status='succeeded')

         if sorted:
-            precedence = [ When(extractor=method, then=Value(precedence)) for method, precedence in ARCHIVE_METHODS_INDEXING_PRECEDENCE ]
-            qs = qs.annotate(indexing_precedence=Case(*precedence, default=Value(1000),output_field=IntegerField())).order_by('indexing_precedence')
+            precedence = [
+                When(extractor=method, then=Value(precedence))
+                for method, precedence in ARCHIVE_METHODS_INDEXING_PRECEDENCE
+            ]
+            qs = qs.annotate(
+                indexing_precedence=Case(
+                    *precedence,
+                    default=Value(1000),
+                    output_field=IntegerField()
+                )
+            ).order_by('indexing_precedence')
         return qs

 class ArchiveResult(ABIDModel):
@@ -418,17 +461,6 @@ class ArchiveResult(ABIDModel):
     def __str__(self):
     def __str__(self):
         return self.extractor
         return self.extractor
 
 
-    def save(self, *args, **kwargs):
-        super().save(*args, **kwargs)
-        try:
-            assert str(self.id) == str(self.ABID.uuid) == str(self.uuid), f'ArchiveResult.id ({self.id}) does not match .ABID.uuid ({self.ABID.uuid})'
-        except AssertionError as e:
-            print(e)
-
-    @property
-    def uuid(self):
-        return self.id
-
     @cached_property
     def snapshot_dir(self):
         return Path(self.snapshot.link_dir)
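
A note on the prefetch-aware branches above (mirrored in snapshot_icons() in index/html.py below): they only pay off if the caller fills the prefetch cache up front. A minimal sketch of the intended access pattern, variable names hypothetical:

    # one query for the snapshots plus one for all their ArchiveResults,
    # instead of one ArchiveResult query per row in the admin list
    snapshots = Snapshot.objects.prefetch_related('archiveresult_set')
    for snapshot in snapshots:
        results = snapshot.archiveresult_set.all()   # served from _prefetched_objects_cache, no extra query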

+ 126 - 80
archivebox/core/settings.py

@@ -4,7 +4,9 @@ import os
 import sys
 import re
 import logging
+import inspect
 import tempfile
+from typing import Any, Dict
 
 from pathlib import Path
 from django.utils.crypto import get_random_string
@@ -33,22 +35,20 @@ APPEND_SLASH = True
 DEBUG = CONFIG.DEBUG or ('--debug' in sys.argv)
 
 
-# add plugins folders to system path, and load plugins in installed_apps
-BUILTIN_PLUGINS_DIR = CONFIG.PACKAGE_DIR / 'plugins'
-USER_PLUGINS_DIR = CONFIG.OUTPUT_DIR / 'plugins'
-sys.path.insert(0, str(BUILTIN_PLUGINS_DIR))
-sys.path.insert(0, str(USER_PLUGINS_DIR))
+BUILTIN_PLUGINS_DIR = CONFIG.PACKAGE_DIR / 'builtin_plugins'
+USER_PLUGINS_DIR = CONFIG.OUTPUT_DIR / 'user_plugins'
 
-def find_plugins(plugins_dir):
-    return {
-        # plugin_entrypoint.parent.name: import_module(plugin_entrypoint.parent.name).METADATA
-        plugin_entrypoint.parent.name: plugin_entrypoint.parent
+def find_plugins(plugins_dir, prefix: str) -> Dict[str, Any]:
+    plugins = {
+        f'{prefix}.{plugin_entrypoint.parent.name}': plugin_entrypoint.parent
         for plugin_entrypoint in plugins_dir.glob('*/apps.py')
     }
+    # print(f'Found {prefix} plugins:\n', '\n    '.join(plugins.keys()))
+    return plugins
 
 INSTALLED_PLUGINS = {
-    **find_plugins(BUILTIN_PLUGINS_DIR),
-    **find_plugins(USER_PLUGINS_DIR),
+    **find_plugins(BUILTIN_PLUGINS_DIR, prefix='builtin_plugins'),
+    **find_plugins(USER_PLUGINS_DIR, prefix='user_plugins'),
 }
 
 
@@ -66,11 +66,11 @@ INSTALLED_APPS = [
     'plugantic',
     'core',
     'api',
+    'pkg',
 
     *INSTALLED_PLUGINS.keys(),
 
     'admin_data_views',
-
     'django_extensions',
 ]
 
@@ -144,64 +144,6 @@ if CONFIG.LDAP:
         # sys.exit(1)
 
 
-################################################################################
-### Debug Settings
-################################################################################
-
-# only enable debug toolbar when in DEBUG mode with --nothreading (it doesnt work in multithreaded mode)
-DEBUG_TOOLBAR = DEBUG and ('--nothreading' in sys.argv) and ('--reload' not in sys.argv)
-if DEBUG_TOOLBAR:
-    try:
-        import debug_toolbar   # noqa
-        DEBUG_TOOLBAR = True
-    except ImportError:
-        DEBUG_TOOLBAR = False
-
-if DEBUG_TOOLBAR:
-    INSTALLED_APPS = [*INSTALLED_APPS, 'debug_toolbar']
-    INTERNAL_IPS = ['0.0.0.0', '127.0.0.1', '*']
-    DEBUG_TOOLBAR_CONFIG = {
-        "SHOW_TOOLBAR_CALLBACK": lambda request: True,
-        "RENDER_PANELS": True,
-    }
-    DEBUG_TOOLBAR_PANELS = [
-        'debug_toolbar.panels.history.HistoryPanel',
-        'debug_toolbar.panels.versions.VersionsPanel',
-        'debug_toolbar.panels.timer.TimerPanel',
-        'debug_toolbar.panels.settings.SettingsPanel',
-        'debug_toolbar.panels.headers.HeadersPanel',
-        'debug_toolbar.panels.request.RequestPanel',
-        'debug_toolbar.panels.sql.SQLPanel',
-        'debug_toolbar.panels.staticfiles.StaticFilesPanel',
-        # 'debug_toolbar.panels.templates.TemplatesPanel',
-        'debug_toolbar.panels.cache.CachePanel',
-        'debug_toolbar.panels.signals.SignalsPanel',
-        'debug_toolbar.panels.logging.LoggingPanel',
-        'debug_toolbar.panels.redirects.RedirectsPanel',
-        'debug_toolbar.panels.profiling.ProfilingPanel',
-        'djdt_flamegraph.FlamegraphPanel',
-    ]
-    MIDDLEWARE = [*MIDDLEWARE, 'debug_toolbar.middleware.DebugToolbarMiddleware']
-
-if DEBUG:
-    from django_autotyping.typing import AutotypingSettingsDict
-
-    INSTALLED_APPS += ['django_autotyping']
-    AUTOTYPING: AutotypingSettingsDict = {
-        "STUBS_GENERATION": {
-            "LOCAL_STUBS_DIR": Path(CONFIG.PACKAGE_DIR) / "typings",
-        }
-    }
-
-# https://github.com/bensi94/Django-Requests-Tracker (improved version of django-debug-toolbar)
-# Must delete archivebox/templates/admin to use because it relies on some things we override
-# visit /__requests_tracker__/ to access
-DEBUG_REQUESTS_TRACKER = False
-if DEBUG_REQUESTS_TRACKER:
-    INSTALLED_APPS += ["requests_tracker"]
-    MIDDLEWARE += ["requests_tracker.middleware.requests_tracker_middleware"]
-    INTERNAL_IPS = ["127.0.0.1", "10.0.2.2", "0.0.0.0", "*"]
-
 
 
 ################################################################################
 ################################################################################
 ### Staticfile and Template Settings
 ### Staticfile and Template Settings
@@ -317,13 +259,15 @@ STORAGES = {
 SECRET_KEY = CONFIG.SECRET_KEY or get_random_string(50, 'abcdefghijklmnopqrstuvwxyz0123456789_')
 
 ALLOWED_HOSTS = CONFIG.ALLOWED_HOSTS.split(',')
-CSRF_TRUSTED_ORIGINS = CONFIG.CSRF_TRUSTED_ORIGINS.split(',')
+CSRF_TRUSTED_ORIGINS = list(set(CONFIG.CSRF_TRUSTED_ORIGINS.split(',')))
 
 # automatically fix case when user sets ALLOWED_HOSTS (e.g. to archivebox.example.com)
 # but forgets to add https://archivebox.example.com to CSRF_TRUSTED_ORIGINS
-if CONFIG.ALLOWED_HOSTS != '*' and (not CSRF_TRUSTED_ORIGINS):
-    for hostname in ALLOWED_HOSTS:
-        CSRF_TRUSTED_ORIGINS.append(f'https://{hostname}')
+for hostname in ALLOWED_HOSTS:
+    https_endpoint = f'https://{hostname}'
+    if hostname != '*' and https_endpoint not in CSRF_TRUSTED_ORIGINS:
+        print(f'[!] WARNING: {https_endpoint} from ALLOWED_HOSTS should be added to CSRF_TRUSTED_ORIGINS')
+        CSRF_TRUSTED_ORIGINS.append(https_endpoint)
 
 SECURE_BROWSER_XSS_FILTER = True
 SECURE_CONTENT_TYPE_NOSNIFF = True
@@ -345,6 +289,8 @@ AUTH_PASSWORD_VALIDATORS = [
     {'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator'},
 ]
 
+DATA_UPLOAD_MAX_NUMBER_FIELDS = None
+
 ################################################################################
 ### Shell Settings
 ################################################################################
@@ -385,6 +331,10 @@ IGNORABLE_404_URLS = [
     re.compile(r'robots\.txt$'),
     re.compile(r'.*\.(css|js)\.map$'),
 ]
+IGNORABLE_200_URLS = [
+    re.compile(r'^"GET /static/.* HTTP/.*" (200|30.) .+', re.I | re.M),
+    re.compile(r'^"GET /admin/jsi18n/ HTTP/.*" (200|30.) .+', re.I | re.M),
+]
 
 class NoisyRequestsFilter(logging.Filter):
     def filter(self, record) -> bool:
@@ -396,19 +346,26 @@ class NoisyRequestsFilter(logging.Filter):
             if ignorable_log_pattern.match(logline):
                 return False
 
-        # ignore staticfile requests that 200 or 30*
-        ignoreable_200_log_pattern = re.compile(r'"GET /static/.* HTTP/.*" (200|30.) .+', re.I | re.M)
-        if ignoreable_200_log_pattern.match(logline):
-            return False
+            ignorable_log_pattern = re.compile(f'^Not Found: /.*/?{ignorable_url_pattern.pattern}', re.I | re.M)
+            if ignorable_log_pattern.match(logline):
+                return False
 
 
+        # ignore staticfile requests that 200 or 30*
+        for ignorable_url_pattern in IGNORABLE_200_URLS:
+            if ignorable_url_pattern.match(logline):
+                return False
+            
         return True
 
+
+ERROR_LOG = tempfile.NamedTemporaryFile().name
+
 if CONFIG.LOGS_DIR.exists():
     ERROR_LOG = (CONFIG.LOGS_DIR / 'errors.log')
 else:
     # historically too many edge cases here around creating log dir w/ correct permissions early on
     # if there's an issue on startup, we trash the log and let user figure it out via stdout/stderr
-    ERROR_LOG = tempfile.NamedTemporaryFile().name
+    print(f'[!] WARNING: data/logs dir does not exist. Logging to temp file: {ERROR_LOG}')
 
 LOGGING = {
     'version': 1,
@@ -445,6 +402,10 @@ LOGGING = {
 }
 
 
+################################################################################
+### REST API Outbound Webhooks settings
+################################################################################
+
 # Add default webhook configuration to the User model
 SIGNAL_WEBHOOKS_CUSTOM_MODEL = 'api.models.OutboundWebhook'
 SIGNAL_WEBHOOKS = {
@@ -458,7 +419,9 @@ SIGNAL_WEBHOOKS = {
     },
 }
 
-DATA_UPLOAD_MAX_NUMBER_FIELDS = None
+################################################################################
+### Admin Data View Settings
+################################################################################
 
 ADMIN_DATA_VIEWS = {
     "NAME": "Environment",
@@ -495,3 +458,86 @@ ADMIN_DATA_VIEWS = {
         },
     ],
 }
+
+
+################################################################################
+### Debug Settings
+################################################################################
+
+# only enable debug toolbar when in DEBUG mode with --nothreading (it doesnt work in multithreaded mode)
+DEBUG_TOOLBAR = False
+DEBUG_TOOLBAR = DEBUG_TOOLBAR and DEBUG and ('--nothreading' in sys.argv) and ('--reload' not in sys.argv)
+if DEBUG_TOOLBAR:
+    try:
+        import debug_toolbar   # noqa
+        DEBUG_TOOLBAR = True
+    except ImportError:
+        DEBUG_TOOLBAR = False
+
+if DEBUG_TOOLBAR:
+    INSTALLED_APPS = [*INSTALLED_APPS, 'debug_toolbar']
+    INTERNAL_IPS = ['0.0.0.0', '127.0.0.1', '*']
+    DEBUG_TOOLBAR_CONFIG = {
+        "SHOW_TOOLBAR_CALLBACK": lambda request: True,
+        "RENDER_PANELS": True,
+    }
+    DEBUG_TOOLBAR_PANELS = [
+        'debug_toolbar.panels.history.HistoryPanel',
+        'debug_toolbar.panels.versions.VersionsPanel',
+        'debug_toolbar.panels.timer.TimerPanel',
+        'debug_toolbar.panels.settings.SettingsPanel',
+        'debug_toolbar.panels.headers.HeadersPanel',
+        'debug_toolbar.panels.request.RequestPanel',
+        'debug_toolbar.panels.sql.SQLPanel',
+        'debug_toolbar.panels.staticfiles.StaticFilesPanel',
+        # 'debug_toolbar.panels.templates.TemplatesPanel',
+        'debug_toolbar.panels.cache.CachePanel',
+        'debug_toolbar.panels.signals.SignalsPanel',
+        'debug_toolbar.panels.logging.LoggingPanel',
+        'debug_toolbar.panels.redirects.RedirectsPanel',
+        'debug_toolbar.panels.profiling.ProfilingPanel',
+        'djdt_flamegraph.FlamegraphPanel',
+    ]
+    MIDDLEWARE = [*MIDDLEWARE, 'debug_toolbar.middleware.DebugToolbarMiddleware']
+
+if DEBUG:
+    from django_autotyping.typing import AutotypingSettingsDict
+
+    INSTALLED_APPS += ['django_autotyping']
+    AUTOTYPING: AutotypingSettingsDict = {
+        "STUBS_GENERATION": {
+            "LOCAL_STUBS_DIR": Path(CONFIG.PACKAGE_DIR) / "typings",
+        }
+    }
+
+# https://github.com/bensi94/Django-Requests-Tracker (improved version of django-debug-toolbar)
+# Must delete archivebox/templates/admin to use because it relies on some things we override
+# visit /__requests_tracker__/ to access
+DEBUG_REQUESTS_TRACKER = True
+DEBUG_REQUESTS_TRACKER = DEBUG_REQUESTS_TRACKER and DEBUG
+if DEBUG_REQUESTS_TRACKER:
+    import requests_tracker
+
+    INSTALLED_APPS += ["requests_tracker"]
+    MIDDLEWARE += ["requests_tracker.middleware.requests_tracker_middleware"]
+    INTERNAL_IPS = ["127.0.0.1", "10.0.2.2", "0.0.0.0", "*"]
+
+    TEMPLATE_DIRS.insert(0, str(Path(inspect.getfile(requests_tracker)).parent / "templates"))
+
+    REQUESTS_TRACKER_CONFIG = {
+        "TRACK_SQL": True,
+        "ENABLE_STACKTRACES": False,
+        "IGNORE_PATHS_PATTERNS": (
+            r".*/favicon\.ico",
+            r".*\.png",
+            r"/admin/jsi18n/",
+        ),
+        "IGNORE_SQL_PATTERNS": (
+            r"^SELECT .* FROM django_migrations WHERE app = 'requests_tracker'",
+            r"^SELECT .* FROM django_migrations WHERE app = 'auth'",
+        ),
+    }
+
+# https://docs.pydantic.dev/logfire/integrations/django/ (similar to DataDog / NewRelic / etc.)
+DEBUG_LOGFIRE = False
+DEBUG_LOGFIRE = DEBUG_LOGFIRE and (Path(CONFIG.OUTPUT_DIR) / '.logfire').is_dir()
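
With the discovery logic above, a plugin is just a package under builtin_plugins/ (shipped) or data/user_plugins/ (user-provided) whose directory contains an apps.py. A minimal sketch, plugin name hypothetical:

    # data/user_plugins/myplugin/apps.py
    from django.apps import AppConfig

    class MyPluginConfig(AppConfig):
        default_auto_field = 'django.db.models.BigAutoField'
        name = 'user_plugins.myplugin'   # must match the 'user_plugins.<dirname>' key that find_plugins() builds

find_plugins() only globs for */apps.py, so a plugin directory without one is silently skipped.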

+ 1 - 1
archivebox/extractors/__init__.py

@@ -218,7 +218,7 @@ def archive_links(all_links: Union[Iterable[Link], QuerySet], overwrite: bool=Fa
     if type(all_links) is QuerySet:
         num_links: int = all_links.count()
         get_link = lambda x: x.as_link_with_details()
-        all_links = all_links.iterator()
+        all_links = all_links.iterator(chunk_size=500)
     else:
         num_links: int = len(all_links)
         get_link = lambda x: x
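
The chunk_size=500 added here (and throughout archivebox/index/__init__.py below) bounds memory usage by fetching rows from the database in fixed batches instead of materializing the whole QuerySet. A sketch of the difference, with process() hypothetical:

    qs = Snapshot.objects.all()
    links = [s.as_link() for s in qs]               # loads every row into memory at once
    for snapshot in qs.iterator(chunk_size=500):    # streams rows in batches of 500
        process(snapshot.as_link())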

+ 12 - 1
archivebox/extractors/wget.py

@@ -197,7 +197,7 @@ def unsafe_wget_output_path(link: Link) -> Optional[str]:
 
 
 @enforce_types
-def wget_output_path(link: Link) -> Optional[str]:
+def wget_output_path(link: Link, nocache: bool=False) -> Optional[str]:
     """calculate the path to the wgetted .html file, since wget may
     """calculate the path to the wgetted .html file, since wget may
     adjust some paths to be different than the base_url path.
     adjust some paths to be different than the base_url path.
 
 
@@ -245,6 +245,15 @@ def wget_output_path(link: Link) -> Optional[str]:
     #    https://example.com/abc/test/?v=zzVa_tX1OiI
     #       > example.com/abc/test/index.html@v=zzVa_tX1OiI.html
 
+    cache_key = f'{link.url_hash}:{link.timestamp}-{link.updated and link.updated.timestamp()}-wget-output-path'
+    
+    if not nocache:
+        from django.core.cache import cache
+        cached_result = cache.get(cache_key)
+        if cached_result:
+            return cached_result
+
+
     # There's also lots of complexity around how the urlencoding and renaming
     # is done for pages with query and hash fragments, extensions like shtml / htm / php / etc,
     # unicode escape sequences, punycode domain names, unicode double-width characters, extensions longer than
@@ -271,6 +280,8 @@ def wget_output_path(link: Link) -> Optional[str]:
         output_path = None
 
     if output_path:
+        if not nocache:
+            cache.set(cache_key, output_path)
         return output_path
 
     # fallback to just the domain dir
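
Note the cache key bakes in url_hash, timestamp, and the updated-at time, so re-archiving a snapshot produces a new key and the stale entry simply ages out rather than needing explicit invalidation. The general shape of the pattern, names hypothetical:

    from django.core.cache import cache

    def cached_lookup(key: str):
        hit = cache.get(key)               # returns None on a miss
        if hit:
            return hit
        value = slow_filesystem_walk()     # hypothetical expensive lookup
        if value:
            cache.set(key, value)          # no timeout arg -> backend default TTL applies
        return value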

+ 6 - 6
archivebox/index/__init__.py

@@ -407,7 +407,7 @@ def snapshot_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type
 
 def get_indexed_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """indexed links without checking archive status or data directory validity"""
-    links = (snapshot.as_link() for snapshot in snapshots.iterator())
+    links = (snapshot.as_link() for snapshot in snapshots.iterator(chunk_size=500))
     return {
         link.link_dir: link
         for link in links
@@ -415,7 +415,7 @@ def get_indexed_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Option
 
 def get_archived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """indexed links that are archived with a valid data directory"""
-    links = (snapshot.as_link() for snapshot in snapshots.iterator())
+    links = (snapshot.as_link() for snapshot in snapshots.iterator(chunk_size=500))
     return {
         link.link_dir: link
         for link in filter(is_archived, links)
@@ -423,7 +423,7 @@ def get_archived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optio
 
 def get_unarchived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """indexed links that are unarchived with no data directory or an empty data directory"""
-    links = (snapshot.as_link() for snapshot in snapshots.iterator())
+    links = (snapshot.as_link() for snapshot in snapshots.iterator(chunk_size=500))
     return {
         link.link_dir: link
         for link in filter(is_unarchived, links)
@@ -448,7 +448,7 @@ def get_present_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Option
 
 def get_valid_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """dirs with a valid index matched to the main index and archived content"""
-    links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()]
+    links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator(chunk_size=500)]
     return {
         link.link_dir: link
         for link in filter(is_valid, links)
@@ -475,7 +475,7 @@ def get_duplicate_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Opti
             if entry.is_dir() and not snapshots.filter(timestamp=entry.name).exists()
     )
 
-    for path in chain(snapshots.iterator(), data_folders):
+    for path in chain(snapshots.iterator(chunk_size=500), data_folders):
         link = None
         if type(path) is not str:
             path = path.as_link().link_dir
@@ -518,7 +518,7 @@ def get_orphaned_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optio
 def get_corrupted_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """dirs that don't contain a valid index and aren't listed in the main index"""
     corrupted = {}
-    for snapshot in snapshots.iterator():
+    for snapshot in snapshots.iterator(chunk_size=500):
         link = snapshot.as_link()
         if is_corrupt(link):
             corrupted[link.link_dir] = link

+ 9 - 1
archivebox/index/html.py

@@ -124,7 +124,15 @@ def snapshot_icons(snapshot) -> str:
         from core.models import ArchiveResult
         # start = datetime.now(timezone.utc)
 
-        archive_results = snapshot.archiveresult_set.filter(status="succeeded", output__isnull=False)
+        if hasattr(snapshot, '_prefetched_objects_cache') and 'archiveresult_set' in snapshot._prefetched_objects_cache:
+            archive_results = [
+                result
+                for result in snapshot.archiveresult_set.all()
+                if result.status == "succeeded" and result.output
+            ]
+        else:
+            archive_results = snapshot.archiveresult_set.filter(status="succeeded", output__isnull=False)
+
         link = snapshot.as_link()
         path = link.archive_path
         canon = link.canonical_outputs()

+ 3 - 1
archivebox/index/sql.py

@@ -37,9 +37,11 @@ def remove_from_sql_main_index(snapshots: QuerySet, atomic: bool=False, out_dir:
 @enforce_types
 def write_link_to_sql_index(link: Link, created_by_id: int | None=None):
     from core.models import Snapshot, ArchiveResult
+    from abid_utils.models import get_or_create_system_user_pk
+
     info = {k: v for k, v in link._asdict().items() if k in Snapshot.keys}
 
-    info['created_by_id'] = created_by_id
+    info['created_by_id'] = created_by_id or get_or_create_system_user_pk()
 
     tag_list = list(dict.fromkeys(
         tag.strip() for tag in re.split(TAG_SEPARATOR_PATTERN, link.tags or '')

+ 6 - 5
archivebox/main.py

@@ -960,7 +960,8 @@ def setup(out_dir: Path=OUTPUT_DIR) -> None:
         run_subcommand('init', stdin=None, pwd=out_dir)
 
     setup_django(out_dir=out_dir, check_db=True)
-    from core.models import User
+    from django.contrib.auth import get_user_model
+    User = get_user_model()
 
     if not User.objects.filter(is_superuser=True).exists():
         stderr('\n[+] Creating new admin user for the Web UI...', color='green')
@@ -979,16 +980,16 @@ def setup(out_dir: Path=OUTPUT_DIR) -> None:
                 '--upgrade',
                 '--no-cache-dir',
                 '--no-warn-script-location',
-                'youtube_dl',
+                'yt-dlp',
             ], capture_output=False, cwd=out_dir)
             pkg_path = run_shell([
                 PYTHON_BINARY, '-m', 'pip',
                 'show',
-                'youtube_dl',
+                'yt-dlp',
             ], capture_output=True, text=True, cwd=out_dir).stdout.decode().split('Location: ')[-1].split('\n', 1)[0]
-            NEW_YOUTUBEDL_BINARY = Path(pkg_path) / 'youtube_dl' / '__main__.py'
+            NEW_YOUTUBEDL_BINARY = Path(pkg_path) / 'yt_dlp' / '__main__.py'
             os.chmod(NEW_YOUTUBEDL_BINARY, 0o777)
-            assert NEW_YOUTUBEDL_BINARY.exists(), f'youtube_dl must exist inside {pkg_path}'
+            assert NEW_YOUTUBEDL_BINARY.exists(), f'yt-dlp must exist inside {pkg_path}'
             config(f'YOUTUBEDL_BINARY={NEW_YOUTUBEDL_BINARY}', set=True, out_dir=out_dir)
         except BaseException as e:                                              # lgtm [py/catch-base-exception]
             stderr(f'[X] Failed to install python packages: {e}', color='red')

+ 10 - 10
archivebox/package-lock.json

@@ -11,7 +11,7 @@
       "dependencies": {
       "dependencies": {
         "@postlight/parser": "^2.2.3",
         "@postlight/parser": "^2.2.3",
         "readability-extractor": "github:ArchiveBox/readability-extractor",
         "readability-extractor": "github:ArchiveBox/readability-extractor",
-        "single-file-cli": "^1.1.54"
+        "single-file-cli": "^2.0.58"
       }
     },
     "node_modules/@asamuzakjp/dom-selector": {
@@ -236,9 +236,9 @@
       "license": "MIT"
       "license": "MIT"
     },
     },
     "node_modules/@types/node": {
     "node_modules/@types/node": {
-      "version": "22.5.0",
-      "resolved": "https://registry.npmjs.org/@types/node/-/node-22.5.0.tgz",
-      "integrity": "sha512-DkFrJOe+rfdHTqqMg0bSNlGlQ85hSoh2TPzZyhHsXnMtligRWpxUySiyw8FY14ITt24HVCiQPWxS3KO/QlGmWg==",
+      "version": "22.5.1",
+      "resolved": "https://registry.npmjs.org/@types/node/-/node-22.5.1.tgz",
+      "integrity": "sha512-KkHsxej0j9IW1KKOOAA/XBA0z08UFSrRQHErzEfA3Vgq57eXIMYboIlHJuYIfd+lwCQjtKqUu3UnmKbtUc9yRw==",
       "license": "MIT",
       "license": "MIT",
       "optional": true,
       "optional": true,
       "dependencies": {
       "dependencies": {
@@ -353,9 +353,9 @@
       }
     },
     "node_modules/aws4": {
-      "version": "1.13.1",
-      "resolved": "https://registry.npmjs.org/aws4/-/aws4-1.13.1.tgz",
-      "integrity": "sha512-u5w79Rd7SU4JaIlA/zFqG+gOiuq25q5VLyZ8E+ijJeILuTxVzZgp2CaGw/UTw6pXYN9XMO9yiqj/nEHmhTG5CA==",
+      "version": "1.13.2",
+      "resolved": "https://registry.npmjs.org/aws4/-/aws4-1.13.2.tgz",
+      "integrity": "sha512-lHe62zvbTB5eEABUVi/AwVh0ZKY9rMMDhmm+eeyuuUQbQ3+J+fONVQOZyj+DdrvD4BY33uYniyRJ4UJIaSKAfw==",
       "license": "MIT"
       "license": "MIT"
     },
     },
     "node_modules/b4a": {
     "node_modules/b4a": {
@@ -2376,9 +2376,9 @@
       }
     },
     "node_modules/tslib": {
-      "version": "2.6.3",
-      "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.6.3.tgz",
-      "integrity": "sha512-xNvxJEOUiWPGhUuUdQgAJPKOOJfGnIyKySOc09XkKsgdUV/3E2zvwZYdejjmRgPCgcym1juLH3226yA7sEFJKQ==",
+      "version": "2.7.0",
+      "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.7.0.tgz",
+      "integrity": "sha512-gLXCKdN1/j47AiHiOkJN69hJmcbGTHI0ImLmbYLHykhgeN0jVGola9yVjFgzCUklsZQMW55o+dW7IXv3RCXDzA==",
       "license": "0BSD"
       "license": "0BSD"
     },
     },
     "node_modules/turndown": {
     "node_modules/turndown": {

+ 1 - 1
archivebox/parsers/pocket_api.py

@@ -7,7 +7,7 @@ from typing import IO, Iterable, Optional
 from configparser import ConfigParser
 
 from pathlib import Path
-from ..vendor.pocket import Pocket
+from pocket import Pocket
 
 from ..index.schema import Link
 from ..util import enforce_types

+ 0 - 0
archivebox/pkg/__init__.py


+ 3 - 0
archivebox/pkg/admin.py

@@ -0,0 +1,3 @@
+from django.contrib import admin
+
+# Register your models here.

+ 14 - 0
archivebox/pkg/apps.py

@@ -0,0 +1,14 @@
+__package__ = 'archivebox.pkg'
+
+from django.apps import AppConfig
+
+
+class PkgsConfig(AppConfig):
+    default_auto_field = 'django.db.models.BigAutoField'
+    name = 'pkg'
+
+    def ready(self):
+        from .settings import LOADED_DEPENDENCIES
+
+        # print(LOADED_DEPENDENCIES)
+        

+ 0 - 0
archivebox/pkg/management/__init__.py


+ 0 - 0
archivebox/pkg/management/commands/__init__.py


+ 75 - 0
archivebox/pkg/management/commands/pkg.py

@@ -0,0 +1,75 @@
+__package__ = 'archivebox.pkg.management.commands'
+
+from django.core.management.base import BaseCommand
+from django.conf import settings
+
+from pydantic_pkgr import Binary, BinProvider, BrewProvider, EnvProvider, SemVer
+from pydantic_pkgr.binprovider import bin_abspath
+
+from ....config import NODE_BIN_PATH, bin_path
+
+from plugantic.plugins import LOADED_PLUGINS
+
+from pkg.settings import env
+
+
+class Command(BaseCommand):
+    def handle(self, *args, method, **options):
+        method(*args, **options)
+
+    def add_arguments(self, parser):
+        subparsers = parser.add_subparsers(title="sub-commands", required=True)
+
+        list_parser = subparsers.add_parser("list", help="List archivebox runtime dependencies.")
+        list_parser.set_defaults(method=self.list)
+
+        install_parser = subparsers.add_parser("install", help="Install archivebox runtime dependencies.")
+        install_parser.add_argument("--update", action="store_true", help="Update dependencies to latest versions.")
+        install_parser.add_argument("package_names", nargs="+", type=str)
+        install_parser.set_defaults(method=self.install)
+
+    def list(self, *args, **options):
+        self.stdout.write('################# PLUGINS ####################')
+        for plugin in LOADED_PLUGINS:
+            self.stdout.write(f'{plugin.name}:')
+            for binary in plugin.binaries:
+                try:
+                    binary = binary.install()
+                except Exception as e:
+                    # import ipdb; ipdb.set_trace()
+                    raise
+                self.stdout.write(f'    {binary.name.ljust(14)} {str(binary.version).ljust(11)} {binary.binprovider.INSTALLER_BIN.ljust(5)}  {binary.abspath}')
+
+        self.stdout.write('\n################# LEGACY ####################')
+        for bin_key, dependency in settings.CONFIG.DEPENDENCIES.items():
+            bin_name = settings.CONFIG[bin_key]
+
+            self.stdout.write(f'{bin_key}:     {bin_name}')
+
+            # binary = Binary(name=package_name, providers=[env])
+            # print(binary)
+
+            # try:
+            #     loaded_bin = binary.load()
+            #     self.stdout.write(
+            #         self.style.SUCCESS(f'Successfully loaded {package_name}:') + str(loaded_bin)
+            #     )
+            # except Exception as e:
+            #     self.stderr.write(
+            #         self.style.ERROR(f"Error loading {package_name}: {e}")
+            #     )
+
+    def install(self, *args, update=False, **options):
+        for package_name in options["package_names"]:
+            binary = Binary(name=package_name, providers=[env])
+            print(binary)
+
+            try:
+                loaded_bin = binary.load()
+                self.stdout.write(
+                    self.style.SUCCESS(f'Successfully loaded {package_name}:') + str(loaded_bin)
+                )
+            except Exception as e:
+                self.stderr.write(
+                    self.style.ERROR(f"Error loading {package_name}: {e}")
+                )
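
Because this file lives at pkg/management/commands/pkg.py, Django registers it as a pkg management command, so it should be reachable as archivebox manage pkg list or archivebox manage pkg install <package> (assuming ArchiveBox's usual manage passthrough to django-admin).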

+ 0 - 0
archivebox/pkg/migrations/__init__.py


+ 3 - 0
archivebox/pkg/models.py

@@ -0,0 +1,3 @@
+from django.db import models
+
+# Create your models here.

+ 86 - 0
archivebox/pkg/settings.py

@@ -0,0 +1,86 @@
+__package__ = 'archivebox.pkg'
+
+import os
+import sys
+import shutil
+import inspect
+from pathlib import Path
+
+import django
+from django.conf import settings
+from django.db.backends.sqlite3.base import Database as sqlite3
+
+from pydantic_pkgr import Binary, BinProvider, BrewProvider, EnvProvider, SemVer
+from pydantic_pkgr.binprovider import bin_abspath
+
+from ..config import NODE_BIN_PATH, bin_path
+
+env = EnvProvider(PATH=NODE_BIN_PATH + ':' + os.environ.get('PATH', '/bin'))
+
+
+LOADED_DEPENDENCIES = {}
+
+for bin_key, dependency in settings.CONFIG.DEPENDENCIES.items():
+    # 'PYTHON_BINARY': {
+    #     'path': bin_path(config['PYTHON_BINARY']),
+    #     'version': config['PYTHON_VERSION'],
+    #     'hash': bin_hash(config['PYTHON_BINARY']),
+    #     'enabled': True,
+    #     'is_valid': bool(config['PYTHON_VERSION']),
+    # },
+    
+
+    bin_name = settings.CONFIG[bin_key]
+
+    if bin_name.endswith('django/__init__.py'):
+        binary_spec = Binary(name='django', providers=[env], provider_overrides={
+            'env': {
+                'abspath': lambda: Path(inspect.getfile(django)),
+                'version': lambda: SemVer('{}.{}.{} {} ({})'.format(*django.VERSION)),
+            }
+        })
+    elif bin_name.endswith('sqlite3/dbapi2.py'):
+        binary_spec = Binary(name='sqlite3', providers=[env], provider_overrides={
+            'env': {
+                'abspath': lambda: Path(inspect.getfile(sqlite3)),
+                'version': lambda: SemVer(sqlite3.version),
+            }
+        })
+    elif bin_name.endswith('archivebox'):
+        binary_spec = Binary(name='archivebox', providers=[env], provider_overrides={
+            'env': {
+                'abspath': lambda: shutil.which(str(Path('archivebox').expanduser())),
+                'version': lambda: settings.CONFIG.VERSION,
+            }
+        })
+    elif bin_name.endswith('postlight/parser/cli.js'):
+        binary_spec = Binary(name='postlight-parser', providers=[env], provider_overrides={
+            'env': {
+                'abspath': lambda: bin_path('postlight-parser'),
+                'version': lambda: SemVer('1.0.0'),
+            }
+        })
+    else:
+        binary_spec = Binary(name=bin_name, providers=[env])
+    
+    try:
+        binary = binary_spec.load()
+    except Exception as e:
+        # print(f"- ❌ Binary {bin_name} failed to load with error: {e}")
+        continue
+
+    assert isinstance(binary.loaded_version, SemVer)
+
+    try:
+        assert str(binary.loaded_version) == dependency['version'], f"Expected {bin_name} version {dependency['version']}, got {binary.loaded_version}"
+        assert str(binary.loaded_respath) == str(bin_abspath(dependency['path']).resolve()), f"Expected {bin_name} abspath {bin_abspath(dependency['path']).resolve()}, got {binary.loaded_respath}"
+        assert binary.is_valid == dependency['is_valid'], f"Expected {bin_name} is_valid={dependency['is_valid']}, got {binary.is_valid}"
+    except Exception as e:
+        pass
+        # print(f"WARNING: Error loading {bin_name}: {e}")
+        # import ipdb; ipdb.set_trace()
+    
+    # print(f"- ✅ Binary {bin_name} loaded successfully")
+    LOADED_DEPENDENCIES[bin_key] = binary
+
+
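
The provider_overrides mechanism used throughout this file lets a Binary report its abspath/version via plain callables instead of shelling out. A minimal sketch against the pydantic_pkgr API as it appears in this commit, names hypothetical:

    from pydantic_pkgr import Binary, EnvProvider, SemVer

    env = EnvProvider()
    mylib = Binary(name='mylib', providers=[env], provider_overrides={
        'env': {
            'abspath': lambda: '/usr/lib/mylib/__init__.py',   # hypothetical module path
            'version': lambda: SemVer('1.2.3'),
        },
    })
    loaded = mylib.load()   # returns a copy with loaded_abspath/loaded_version populated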

+ 3 - 0
archivebox/pkg/tests.py

@@ -0,0 +1,3 @@
+from django.test import TestCase
+
+# Create your tests here.

+ 3 - 0
archivebox/pkg/views.py

@@ -0,0 +1,3 @@
+from django.shortcuts import render
+
+# Create your views here.

+ 0 - 1
archivebox/plugantic/__init__.py

@@ -1,6 +1,5 @@
 __package__ = 'archivebox.plugantic'
 
-from .binproviders import BinProvider
 from .binaries import Binary
 from .extractors import Extractor
 from .replayers import Replayer

+ 11 - 0
archivebox/plugantic/apps.py

@@ -1,6 +1,17 @@
+import importlib
 from django.apps import AppConfig
 
 
 class PluganticConfig(AppConfig):
     default_auto_field = 'django.db.models.BigAutoField'
     name = 'plugantic'
+
+    def ready(self) -> None:
+        from django.conf import settings
+        from .plugins import PLUGINS
+
+        for plugin_name in settings.INSTALLED_PLUGINS.keys():
+            lib = importlib.import_module(f'{plugin_name}.apps')
+            if hasattr(lib, 'PLUGINS'):
+                for plugin_instance in lib.PLUGINS:
+                    PLUGINS.append(plugin_instance)
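
The loop above defines the entire registration contract: a plugin's apps.py only needs a module-level PLUGINS list, which gets appended to plugantic's global registry at startup. For example (instance type hypothetical, per whatever plugantic.plugins exports):

    # inside builtin_plugins/<name>/apps.py or user_plugins/<name>/apps.py
    PLUGINS = [MyPlugin()]   # consumed by PluganticConfig.ready() above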

+ 22 - 280
archivebox/plugantic/binaries.py

@@ -10,285 +10,17 @@ from typing import Any, Optional, Dict, List
 from typing_extensions import Self
 from subprocess import run, PIPE
 
+from pydantic_pkgr import Binary, SemVer, BinName, BinProvider, EnvProvider, AptProvider, BrewProvider, PipProvider, BinProviderName, ProviderLookupDict
 
-from pydantic_core import ValidationError
+import django
+from django.db.backends.sqlite3.base import Database as sqlite3
 
-from pydantic import BaseModel, Field, model_validator, computed_field, field_validator, validate_call, field_serializer
 
-from .binproviders import (
-    SemVer,
-    BinName,
-    BinProviderName,
-    HostBinPath,
-    BinProvider,
-    EnvProvider,
-    AptProvider,
-    BrewProvider,
-    PipProvider,
-    ProviderLookupDict,
-    bin_name,
-    bin_abspath,
-    path_is_script,
-    path_is_executable,
-)
 
 
-class Binary(BaseModel):
-    name: BinName
-    description: str = Field(default='')
-
-    providers_supported: List[BinProvider] = Field(default=[EnvProvider()], alias='providers')
-    provider_overrides: Dict[BinProviderName, ProviderLookupDict] = Field(default={}, alias='overrides')
-    
-    loaded_provider: Optional[BinProviderName] = Field(default=None, alias='provider')
-    loaded_abspath: Optional[HostBinPath] = Field(default=None, alias='abspath')
-    loaded_version: Optional[SemVer] = Field(default=None, alias='version')
-    
-    # bin_filename:  see below
-    # is_executable: see below
-    # is_script
-    # is_valid: see below
-
-
-    @model_validator(mode='after')
-    def validate(self):
-        self.loaded_abspath = bin_abspath(self.name) or self.name
-        self.description = self.description or self.name
-        
-        assert self.providers_supported, f'No providers were given for package {self.name}'
-
-        # pull in any overrides from the binproviders
-        for provider in self.providers_supported:
-            overrides_by_provider = provider.get_providers_for_bin(self.name)
-            if overrides_by_provider:
-                self.provider_overrides[provider.name] = {
-                    **overrides_by_provider,
-                    **self.provider_overrides.get(provider.name, {}),
-                }
-        return self
-
-    @field_validator('loaded_abspath', mode='before')
-    def parse_abspath(cls, value: Any):
-        return bin_abspath(value)
-
-    @field_validator('loaded_version', mode='before')
-    def parse_version(cls, value: Any):
-        return value and SemVer(value)
-
-    @field_serializer('provider_overrides', when_used='json')
-    def serialize_overrides(self, provider_overrides: Dict[BinProviderName, ProviderLookupDict]) -> Dict[BinProviderName, Dict[str, str]]:
-        return {
-            provider_name: {
-                key: str(val)
-                for key, val in overrides.items()
-            }
-            for provider_name, overrides in provider_overrides.items()
-        }
-
-    @computed_field                                                                                           # type: ignore[misc]  # see mypy issue #1362
-    @property
-    def bin_filename(self) -> BinName:
-        if self.is_script:
-            # e.g. '.../Python.framework/Versions/3.11/lib/python3.11/sqlite3/__init__.py' -> sqlite
-            name = self.name
-        elif self.loaded_abspath:
-            # e.g. '/opt/homebrew/bin/wget' -> wget
-            name = bin_name(self.loaded_abspath)
-        else:
-            # e.g. 'ytdlp' -> 'yt-dlp'
-            name = bin_name(self.name)
-        return name
-
-    @computed_field                                                                                           # type: ignore[misc]  # see mypy issue #1362
-    @property
-    def is_executable(self) -> bool:
-        try:
-            assert self.loaded_abspath and path_is_executable(self.loaded_abspath)
-            return True
-        except (ValidationError, AssertionError):
-            return False
-
-    @computed_field                                                                                           # type: ignore[misc]  # see mypy issue #1362
-    @property
-    def is_script(self) -> bool:
-        try:
-            assert self.loaded_abspath and path_is_script(self.loaded_abspath)
-            return True
-        except (ValidationError, AssertionError):
-            return False
-
-    @computed_field                                                                                           # type: ignore[misc]  # see mypy issue #1362
-    @property
-    def is_valid(self) -> bool:
-        return bool(
-            self.name
-            and self.loaded_abspath
-            and self.loaded_version
-            and (self.is_executable or self.is_script)
-        )
-
-    @validate_call
-    def install(self) -> Self:
-        if not self.providers_supported:
-            return self
-
-        exc = Exception('No providers were able to install binary', self.name, self.providers_supported)
-        for provider in self.providers_supported:
-            try:
-                installed_bin = provider.install(self.name, overrides=self.provider_overrides.get(provider.name))
-                if installed_bin:
-                    # print('INSTALLED', self.name, installed_bin)
-                    return self.model_copy(update={
-                        'loaded_provider': provider.name,
-                        'loaded_abspath': installed_bin.abspath,
-                        'loaded_version': installed_bin.version,
-                    })
-            except Exception as err:
-                print(err)
-                exc = err
-        raise exc
-
-    @validate_call
-    def load(self, cache=True) -> Self:
-        if self.is_valid:
-            return self
-
-        if not self.providers_supported:
-            return self
-
-        exc = Exception('No providers were able to install binary', self.name, self.providers_supported)
-        for provider in self.providers_supported:
-            try:
-                installed_bin = provider.load(self.name, cache=cache, overrides=self.provider_overrides.get(provider.name))
-                if installed_bin:
-                    # print('LOADED', provider, self.name, installed_bin)
-                    return self.model_copy(update={
-                        'loaded_provider': provider.name,
-                        'loaded_abspath': installed_bin.abspath,
-                        'loaded_version': installed_bin.version,
-                    })
-            except Exception as err:
-                print(err)
-                exc = err
-        raise exc
-
-    @validate_call
-    def load_or_install(self, cache=True) -> Self:
-        if self.is_valid:
-            return self
-
-        if not self.providers_supported:
-            return self
-
-        exc = Exception('No providers were able to install binary', self.name, self.providers_supported)
-        for provider in self.providers_supported:
-            try:
-                installed_bin = provider.load_or_install(self.name, overrides=self.provider_overrides.get(provider.name), cache=cache)
-                if installed_bin:
-                    # print('LOADED_OR_INSTALLED', self.name, installed_bin)
-                    return self.model_copy(update={
-                        'loaded_provider': provider.name,
-                        'loaded_abspath': installed_bin.abspath,
-                        'loaded_version': installed_bin.version,
-                    })
-            except Exception as err:
-                print(err)
-                exc = err
-        raise exc
-
-    @validate_call
-    def exec(self, args=(), pwd='.'):
-        assert self.loaded_abspath
-        assert self.loaded_version
-        return run([self.loaded_abspath, *args], stdout=PIPE, stderr=PIPE, pwd=pwd)
-
-
-
-
-class SystemPythonHelpers:
-    @staticmethod
-    def get_subdeps() -> str:
-        return 'python3 python3-minimal python3-pip python3-virtualenv'
-
-    @staticmethod
-    def get_abspath() -> str:
-        return sys.executable
-    
-    @staticmethod
-    def get_version() -> str:
-        return '{}.{}.{}'.format(*sys.version_info[:3])
-
-
-class SqliteHelpers:
-    @staticmethod
-    def get_abspath() -> Path:
-        import sqlite3
-        importlib.reload(sqlite3)
-        return Path(inspect.getfile(sqlite3))
-
-    @staticmethod
-    def get_version() -> SemVer:
-        import sqlite3
-        importlib.reload(sqlite3)
-        version = sqlite3.version
-        assert version
-        return SemVer(version)
-
-class DjangoHelpers:
-    @staticmethod
-    def get_django_abspath() -> str:
-        import django
-        return inspect.getfile(django)
-    
-
-    @staticmethod
-    def get_django_version() -> str:
-        import django
-        return '{}.{}.{} {} ({})'.format(*django.VERSION)
-
-class YtdlpHelpers:
-    @staticmethod
-    def get_ytdlp_subdeps() -> str:
-        return 'yt-dlp ffmpeg'
-
-    @staticmethod
-    def get_ytdlp_version() -> str:
-        import yt_dlp
-        importlib.reload(yt_dlp)
-
-        version = yt_dlp.version.__version__
-        assert version
-        return version
-
-class PythonBinary(Binary):
-    name: BinName = 'python'
-
-    providers_supported: List[BinProvider] = [
-        EnvProvider(
-            subdeps_provider={'python': 'plugantic.binaries.SystemPythonHelpers.get_subdeps'},
-            abspath_provider={'python': 'plugantic.binaries.SystemPythonHelpers.get_abspath'},
-            version_provider={'python': 'plugantic.binaries.SystemPythonHelpers.get_version'},
-        ),
-    ]
-
-class SqliteBinary(Binary):
-    name: BinName = 'sqlite'
-    providers_supported: List[BinProvider] = [
-        EnvProvider(
-            version_provider={'sqlite': 'plugantic.binaries.SqliteHelpers.get_version'},
-            abspath_provider={'sqlite': 'plugantic.binaries.SqliteHelpers.get_abspath'},
-        ),
-    ]
-
-class DjangoBinary(Binary):
-    name: BinName = 'django'
-    providers_supported: List[BinProvider] = [
-        EnvProvider(
-            abspath_provider={'django': 'plugantic.binaries.DjangoHelpers.get_django_abspath'},
-            version_provider={'django': 'plugantic.binaries.DjangoHelpers.get_django_version'},
-        ),
-    ]
-
+def get_ytdlp_version() -> str:
+    import yt_dlp
+    return yt_dlp.version.__version__
 
 
 
@@ -296,16 +28,26 @@ class DjangoBinary(Binary):
 class YtdlpBinary(Binary):
     name: BinName = 'yt-dlp'
     providers_supported: List[BinProvider] = [
-        # EnvProvider(),
-        PipProvider(version_provider={'yt-dlp': 'plugantic.binaries.YtdlpHelpers.get_ytdlp_version'}),
-        BrewProvider(subdeps_provider={'yt-dlp': 'plugantic.binaries.YtdlpHelpers.get_ytdlp_subdeps'}),
-        # AptProvider(subdeps_provider={'yt-dlp': lambda: 'yt-dlp ffmpeg'}),
+        EnvProvider(),
+        PipProvider(),
+        BrewProvider(),
+        AptProvider(),
     ]
-
+    provider_overrides: Dict[BinProviderName, ProviderLookupDict] = {
+        'pip': {
+            'version': get_ytdlp_version,
+        },
+        'brew': {
+            'subdeps': lambda: 'yt-dlp ffmpeg',
+        },
+        'apt': {
+            'subdeps': lambda: 'yt-dlp ffmpeg',
+        }
+    }
 
 class WgetBinary(Binary):
     name: BinName = 'wget'
-    providers_supported: List[BinProvider] = [EnvProvider(), AptProvider()]
+    providers_supported: List[BinProvider] = [EnvProvider(), AptProvider(), BrewProvider()]
 
 
 # if __name__ == '__main__':
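
With the broader provider list, resolving (or installing) a binary collapses to a single call. A sketch using the load/install API this commit switches to, attribute names as in the removed vendored implementation:

    ytdlp = YtdlpBinary().load_or_install()    # checks $PATH first, then falls back to pip/brew/apt
    print(ytdlp.loaded_abspath, ytdlp.loaded_version)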

+ 0 - 561
archivebox/plugantic/binproviders.py

@@ -1,561 +0,0 @@
-__package__ = 'archivebox.plugantic'
-
-import os
-import shutil
-import operator
-
-from typing import Callable, Any, Optional, Type, Dict, Annotated, ClassVar, Literal, cast, TYPE_CHECKING
-from typing_extensions import Self
-from abc import ABC, abstractmethod
-from collections import namedtuple
-from pathlib import Path
-from subprocess import run, PIPE
-
-from pydantic_core import core_schema, ValidationError
-from pydantic import BaseModel, Field, TypeAdapter, AfterValidator, validate_call, GetCoreSchemaHandler
-
-
-
-def func_takes_args_or_kwargs(lambda_func: Callable[..., Any]) -> bool:
-    """returns True if a lambda func takes args/kwargs of any kind, otherwise false if it's pure/argless"""
-    code = lambda_func.__code__
-    has_args = code.co_argcount > 0
-    has_varargs = code.co_flags & 0x04 != 0
-    has_varkw = code.co_flags & 0x08 != 0
-    return has_args or has_varargs or has_varkw
-
-
-def is_semver_str(semver: Any) -> bool:
-    if isinstance(semver, str):
-        return (semver.count('.') == 2 and semver.replace('.', '').isdigit())
-    return False
-
-def semver_to_str(semver: tuple[int, int, int] | str) -> str:
-    if isinstance(semver, (list, tuple)):
-        return '.'.join(str(chunk) for chunk in semver)
-    if is_semver_str(semver):
-        return semver
-    raise ValidationError('Tried to convert invalid SemVer: {}'.format(semver))
-
-
-SemVerTuple = namedtuple('SemVerTuple', ('major', 'minor', 'patch'), defaults=(0, 0, 0))
-SemVerParsableTypes = str | tuple[str | int, ...] | list[str | int]
-
-class SemVer(SemVerTuple):
-    major: int
-    minor: int = 0
-    patch: int = 0
-
-    if TYPE_CHECKING:
-        full_text: str | None = ''
-
-    def __new__(cls, *args, full_text=None, **kwargs):
-        # '1.1.1'
-        if len(args) == 1 and is_semver_str(args[0]):
-            result = SemVer.parse(args[0])
-
-        # ('1', '2', '3')
-        elif len(args) == 1 and isinstance(args[0], (tuple, list)):
-            result = SemVer.parse(args[0])
-
-        # (1, '2', None)
-        elif not all(isinstance(arg, (int, type(None))) for arg in args):
-            result = SemVer.parse(args)
-
-        # (None)
-        elif all(chunk in ('', 0, None) for chunk in (*args, *kwargs.values())):
-            result = None
-
-        # 1, 2, 3
-        else:
-            result = SemVerTuple.__new__(cls, *args, **kwargs)
-
-        if result is not None:
-            # add first line as extra hidden metadata so it can be logged without having to re-run version cmd
-            result.full_text = full_text or str(result)
-        return result
-
-    @classmethod
-    def parse(cls, version_stdout: SemVerParsableTypes) -> Self | None:
-        """
-        parses a version tag string formatted like into (major, minor, patch) ints
-        'Google Chrome 124.0.6367.208'             -> (124, 0, 6367)
-        'GNU Wget 1.24.5 built on darwin23.2.0.'   -> (1, 24, 5)
-        'curl 8.4.0 (x86_64-apple-darwin23.0) ...' -> (8, 4, 0)
-        '2024.04.09'                               -> (2024, 4, 9)
-
-        """
-        # print('INITIAL_VALUE', type(version_stdout).__name__, version_stdout)
-
-        if isinstance(version_stdout, (tuple, list)):
-            version_stdout = '.'.join(str(chunk) for chunk in version_stdout)
-        elif isinstance(version_stdout, bytes):
-            version_stdout = version_stdout.decode()
-        elif not isinstance(version_stdout, str):
-            version_stdout = str(version_stdout)
-        
-        # no text to work with, return None immediately
-        if not version_stdout.strip():
-            # raise Exception('Tried to parse semver from empty version output (is binary installed and available?)')
-            return None
-
-        just_numbers = lambda col: col.lower().strip('v').split('+')[0].split('-')[0].split('_')[0]
-        contains_semver = lambda col: (
-            col.count('.') in (1, 2, 3)
-            and all(chunk.isdigit() for chunk in col.split('.')[:3])  # first 3 chunks can only be nums
-        )
-
-        full_text = version_stdout.split('\n')[0].strip()
-        first_line_columns = full_text.split()[:4]
-        version_columns = list(filter(contains_semver, map(just_numbers, first_line_columns)))
-        
-        # could not find any column of first line that looks like a version number, despite there being some text
-        if not version_columns:
-            # raise Exception('Failed to parse semver from version command output: {}'.format(' '.join(first_line_columns)))
-            return None
-
-        # take first col containing a semver, and truncate it to 3 chunks (e.g. 2024.04.09.91) -> (2024, 04, 09)
-        first_version_tuple = version_columns[0].split('.', 3)[:3]
-
-        # print('FINAL_VALUE', first_version_tuple)
-
-        return cls(*(int(chunk) for chunk in first_version_tuple), full_text=full_text)
-
-    def __str__(self):
-        return '.'.join(str(chunk) for chunk in self)
-
-    # @classmethod
-    # def __get_pydantic_core_schema__(cls, source: Type[Any], handler: GetCoreSchemaHandler) -> core_schema.CoreSchema:
-    #     default_schema = handler(source)
-    #     return core_schema.no_info_after_validator_function(
-    #         cls.parse,
-    #         default_schema,
-    #         serialization=core_schema.plain_serializer_function_ser_schema(
-    #             lambda semver: str(semver),
-    #             info_arg=False,
-    #             return_schema=core_schema.str_schema(),
-    #         ),
-    #     )
-
-assert SemVer(None) == None
-assert SemVer('') == None
-assert SemVer.parse('') == None
-assert SemVer(1) == (1, 0, 0)
-assert SemVer(1, 2) == (1, 2, 0)
-assert SemVer('1.2+234234') == (1, 2, 0)
-assert SemVer((1, 2, 3)) == (1, 2, 3)
-assert getattr(SemVer((1, 2, 3)), 'full_text') == '1.2.3'
-assert SemVer(('1', '2', '3')) == (1, 2, 3)
-assert SemVer.parse('5.6.7') == (5, 6, 7)
-assert SemVer.parse('124.0.6367.208') == (124, 0, 6367)
-assert SemVer.parse('Google Chrome 124.1+234.234') == (124, 1, 0)
-assert SemVer.parse('Google Ch1rome 124.0.6367.208') == (124, 0, 6367)
-assert SemVer.parse('Google Chrome 124.0.6367.208+beta_234. 234.234.123\n123.456.324') == (124, 0, 6367)
-assert getattr(SemVer.parse('Google Chrome 124.0.6367.208+beta_234. 234.234.123\n123.456.324'), 'full_text') == 'Google Chrome 124.0.6367.208+beta_234. 234.234.123'
-assert SemVer.parse('Google Chrome') == None
-
-@validate_call
-def bin_name(bin_path_or_name: str | Path) -> str:
-    name = Path(bin_path_or_name).name
-    assert len(name) > 1
-    assert name.replace('-', '').replace('_', '').replace('.', '').isalnum(), (
-        f'Binary name can only contain a-Z0-9-_.: {name}')
-    return name
-
-BinName = Annotated[str, AfterValidator(bin_name)]
-
-@validate_call
-def path_is_file(path: Path | str) -> Path:
-    path = Path(path) if isinstance(path, str) else path
-    assert path.is_file(), f'Path is not a file: {path}'
-    return path
-
-HostExistsPath = Annotated[Path, AfterValidator(path_is_file)]
-
-@validate_call
-def path_is_executable(path: HostExistsPath) -> HostExistsPath:
-    assert os.access(path, os.X_OK), f'Path is not executable (fix by running chmod +x {path})'
-    return path
-
-@validate_call
-def path_is_script(path: HostExistsPath) -> HostExistsPath:
-    SCRIPT_EXTENSIONS = ('.py', '.js', '.sh')
-    assert path.suffix.lower() in SCRIPT_EXTENSIONS, 'Path is not a script (does not end in {})'.format(', '.join(SCRIPT_EXTENSIONS))
-    return path
-
-HostExecutablePath = Annotated[HostExistsPath, AfterValidator(path_is_executable)]
-
-@validate_call
-def path_is_abspath(path: Path) -> Path:
-    return path.resolve()
-
-HostAbsPath = Annotated[HostExistsPath, AfterValidator(path_is_abspath)]
-HostBinPath = Annotated[Path, AfterValidator(path_is_abspath), AfterValidator(path_is_file)]
-
-
-@validate_call
-def bin_abspath(bin_path_or_name: BinName | Path) -> HostBinPath | None:
-    assert bin_path_or_name
-
-    if str(bin_path_or_name).startswith('/'):
-        # already a path, get its absolute form
-        abspath = Path(bin_path_or_name).resolve()
-    else:
-        # not a path yet, get path using os.which
-        binpath = shutil.which(bin_path_or_name)
-        if not binpath:
-            return None
-        abspath = Path(binpath).resolve()
-
-    try:
-        return TypeAdapter(HostBinPath).validate_python(abspath)
-    except ValidationError:
-        return None
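A quick sketch of the two lookup branches above, assuming a typical Linux host with wget installed at /usr/bin/wget:

    # hypothetical usage: bare names resolve via shutil.which, absolute paths resolve directly
    assert bin_abspath('wget') == Path('/usr/bin/wget')
    assert bin_abspath('/usr/bin/wget') == Path('/usr/bin/wget')
    assert bin_abspath('not-an-installed-binary') is None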
-
-
-@validate_call
-def bin_version(bin_path: HostBinPath, args=('--version',)) -> SemVer | None:
-    return SemVer(run([bin_path, *args], stdout=PIPE).stdout.strip().decode())
-
-
-class InstalledBin(BaseModel):
-    abspath: HostBinPath
-    version: SemVer
-
-
-def is_valid_install_string(pkgs_str: str) -> str:
-    """Make sure a string is a valid install string for a package manager, e.g. 'yt-dlp ffmpeg'"""
-    assert pkgs_str
-    assert all(len(pkg) > 1 for pkg in pkgs_str.split(' '))
-    return pkgs_str
-
-def is_valid_python_dotted_import(import_str: str) -> str:
-    assert import_str and import_str.replace('.', '').replace('_', '').isalnum()
-    return import_str
-
-InstallStr = Annotated[str, AfterValidator(is_valid_install_string)]
-
-LazyImportStr = Annotated[str, AfterValidator(is_valid_python_dotted_import)]
-
-ProviderHandler = Callable[..., Any] | Callable[[], Any]                               # must take no args [], or [bin_name: str, **kwargs]
-#ProviderHandlerStr = Annotated[str, AfterValidator(lambda s: s.startswith('self.'))]
-ProviderHandlerRef = LazyImportStr | ProviderHandler
-ProviderLookupDict = Dict[str, LazyImportStr]
-ProviderType = Literal['abspath', 'version', 'subdeps', 'install']
-
-
-# class Host(BaseModel):
-#     machine: str
-#     system: str
-#     platform: str
-#     in_docker: bool
-#     in_qemu: bool
-#     python: str
-
-BinProviderName = Literal['env', 'pip', 'apt', 'brew', 'npm', 'vendor']
-
-
-class BinProvider(ABC, BaseModel):
-    name: BinProviderName
-    
-    abspath_provider: ProviderLookupDict = Field(default={'*': 'self.on_get_abspath'}, exclude=True)
-    version_provider: ProviderLookupDict = Field(default={'*': 'self.on_get_version'}, exclude=True)
-    subdeps_provider: ProviderLookupDict = Field(default={'*': 'self.on_get_subdeps'}, exclude=True)
-    install_provider: ProviderLookupDict = Field(default={'*': 'self.on_install'}, exclude=True)
-
-    _abspath_cache: ClassVar = {}
-    _version_cache: ClassVar = {}
-    _install_cache: ClassVar = {}
-
-    # def provider_version(self) -> SemVer | None:
-    #     """Version of the actual underlying package manager (e.g. pip v20.4.1)"""
-    #     if self.name in ('env', 'vendor'):
-    #         return SemVer('0.0.0')
-    #     installer_binpath = Path(shutil.which(self.name)).resolve()
-    #     return bin_version(installer_binpath)
-
-    # def provider_host(self) -> Host:
-    #     """Information about the host env, archictecture, and OS needed to select & build packages"""
-    #     p = platform.uname()
-    #     return Host(
-    #         machine=p.machine,
-    #         system=p.system,
-    #         platform=platform.platform(),
-    #         python=sys.implementation.name,
-    #         in_docker=os.environ.get('IN_DOCKER', '').lower() == 'true',
-    #         in_qemu=os.environ.get('IN_QEMU', '').lower() == 'true',
-    #     )
-
-    def get_default_providers(self):
-        return self.get_providers_for_bin('*')
-
-    def resolve_provider_func(self, provider_func: ProviderHandlerRef | None) -> ProviderHandler | None:
-        if provider_func is None:
-            return None
-
-        # if provider_func is a dotted path to a function on self, swap it for the actual function
-        if isinstance(provider_func, str) and provider_func.startswith('self.'):
-            provider_func = getattr(self, provider_func.split('self.', 1)[-1])
-
-        # if provider_func is a dot-formatted import string, import the function
-        if isinstance(provider_func, str):
-            from django.utils.module_loading import import_string
-
-            package_name, module_name, classname, path = provider_func.split('.', 3)   # 'abc.def.ghi.jkl' -> ('abc', 'def', 'ghi', 'jkl')
-
-            # import the module abc.def.ghi, then get the (possibly nested) .jkl attr from it
-            imported_module = import_string(f'{package_name}.{module_name}.{classname}')
-            provider_func = operator.attrgetter(path)(imported_module)
-
-            # # abc.def.ghi.jkl  -> 1, 2, 3
-            # for idx in range(1, len(path)):
-            #     parent_path = '.'.join(path[:-idx])  # abc.def.ghi
-            #     try:
-            #         parent_module = import_string(parent_path)
-            #         provider_func = getattr(parent_module, path[-idx])
-            #     except (AttributeError, ImportError):
-            #         continue
-
-        assert TypeAdapter(ProviderHandler).validate_python(provider_func), (
-            f'{self.__class__.__name__} provider func was not a function or dotted-import path: {provider_func}')
-
-        return provider_func
-
-    @validate_call
-    def get_providers_for_bin(self, bin_name: str) -> ProviderLookupDict:
-        providers_for_bin = {
-            'abspath': self.abspath_provider.get(bin_name),
-            'version': self.version_provider.get(bin_name),
-            'subdeps': self.subdeps_provider.get(bin_name),
-            'install': self.install_provider.get(bin_name),
-        }
-        only_set_providers_for_bin = {k: v for k, v in providers_for_bin.items() if v is not None}
-        
-        return only_set_providers_for_bin
-
-    @validate_call
-    def get_provider_for_action(self, bin_name: BinName, provider_type: ProviderType, default_provider: Optional[ProviderHandlerRef]=None, overrides: Optional[ProviderLookupDict]=None) -> ProviderHandler:
-        """
-        Get the provider func for a given bin_name + provider_type, checking overrides, then per-bin providers, then the fallback default provider.
-        e.g. get_provider_for_action(bin_name='yt-dlp', provider_type='install', default_provider=self.on_install) -> Callable
-        """
-
-        provider_func_ref = (
-            (overrides or {}).get(provider_type)
-            or self.get_providers_for_bin(bin_name).get(provider_type)
-            or self.get_default_providers().get(provider_type)
-            or default_provider
-        )
-        # print('getting provider for action', bin_name, provider_type, provider_func)
-
-        provider_func = self.resolve_provider_func(provider_func_ref)
-
-        assert provider_func, f'No {self.name} provider func was found for {bin_name} in: {self.__class__.__name__}.'
-
-        return provider_func
-
-    @validate_call
-    def call_provider_for_action(self, bin_name: BinName, provider_type: ProviderType, default_provider: Optional[ProviderHandlerRef]=None, overrides: Optional[ProviderLookupDict]=None, **kwargs) -> Any:
-        provider_func: ProviderHandler = self.get_provider_for_action(
-            bin_name=bin_name,
-            provider_type=provider_type,
-            default_provider=default_provider,
-            overrides=overrides,
-        )
-        if not func_takes_args_or_kwargs(provider_func):
-            # if it's a pure argless lambda, don't pass bin_name or other **kwargs
-            provider_func_without_args = cast(Callable[[], Any], provider_func)
-            return provider_func_without_args()
-
-        provider_func = cast(Callable[..., Any], provider_func)
-        return provider_func(bin_name, **kwargs)
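Taken together, handlers resolve in the order: caller overrides, then the per-bin provider dicts, then the '*' defaults, then the default_provider argument. A minimal sketch using the EnvProvider defined further down (assuming wget is on $PATH):

    # hypothetical walk through the default chain: no overrides, so the on_* handlers run
    env = EnvProvider()
    abspath = env.get_abspath('wget')   # -> on_get_abspath -> bin_abspath('wget')
    version = env.get_version('wget')   # -> on_get_version -> bin_version(abspath)
    print(abspath, version)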
-
-
-
-    def on_get_abspath(self, bin_name: BinName, **_) -> HostBinPath | None:
-        print(f'[*] {self.__class__.__name__}: Getting abspath for {bin_name}...')
-        try:
-            return bin_abspath(bin_name)
-        except ValidationError:
-            return None
-
-    def on_get_version(self, bin_name: BinName, abspath: Optional[HostBinPath]=None, **_) -> SemVer | None:
-        abspath = abspath or self._abspath_cache.get(bin_name) or self.get_abspath(bin_name)
-        if not abspath: return None
-
-        print(f'[*] {self.__class__.__name__}: Getting version for {bin_name}...')
-        try:
-            return bin_version(abspath)
-        except ValidationError:
-            return None
-
-    def on_get_subdeps(self, bin_name: BinName, **_) -> InstallStr:
-        print(f'[*] {self.__class__.__name__}: Getting subdependencies for {bin_name}')
-        # ... subdependency calculation logic here
-        return TypeAdapter(InstallStr).validate_python(bin_name)
-
-    @abstractmethod
-    def on_install(self, bin_name: BinName, subdeps: Optional[InstallStr]=None, **_):
-        subdeps = subdeps or self.get_subdeps(bin_name)
-        print(f'[*] {self.__class__.__name__}: Installing subdependencies for {bin_name} ({subdeps})')
-        # ... install logic here
-        assert True
-
-
-    @validate_call
-    def get_abspath(self, bin_name: BinName, overrides: Optional[ProviderLookupDict]=None) -> HostBinPath | None:
-        abspath = self.call_provider_for_action(
-            bin_name=bin_name,
-            provider_type='abspath',
-            default_provider=self.on_get_abspath,
-            overrides=overrides,
-        )
-        if not abspath:
-            return None
-        result = TypeAdapter(HostBinPath).validate_python(abspath)
-        self._abspath_cache[bin_name] = result
-        return result
-
-    @validate_call
-    def get_version(self, bin_name: BinName, abspath: Optional[HostBinPath]=None, overrides: Optional[ProviderLookupDict]=None) -> SemVer | None:
-        version = self.call_provider_for_action(
-            bin_name=bin_name,
-            provider_type='version',
-            default_provider=self.on_get_version,
-            overrides=overrides,
-            abspath=abspath,
-        )
-        if not version:
-            return None
-        result = SemVer(version)
-        self._version_cache[bin_name] = result
-        return result
-
-    @validate_call
-    def get_subdeps(self, bin_name: BinName, overrides: Optional[ProviderLookupDict]=None) -> InstallStr:
-        subdeps = self.call_provider_for_action(
-            bin_name=bin_name,
-            provider_type='subdeps',
-            default_provider=self.on_get_subdeps,
-            overrides=overrides,
-        )
-        if not subdeps:
-            subdeps = bin_name
-        result = TypeAdapter(InstallStr).validate_python(subdeps)
-        return result
-
-    @validate_call
-    def install(self, bin_name: BinName, overrides: Optional[ProviderLookupDict]=None) -> InstalledBin | None:
-        subdeps = self.get_subdeps(bin_name, overrides=overrides)
-
-        self.call_provider_for_action(
-            bin_name=bin_name,
-            provider_type='install',
-            default_provider=self.on_install,
-            overrides=overrides,
-            subdeps=subdeps,
-        )
-
-        installed_abspath = self.get_abspath(bin_name)
-        assert installed_abspath, f'Unable to find {bin_name} abspath after installing with {self.name}'
-
-        installed_version = self.get_version(bin_name, abspath=installed_abspath)
-        assert installed_version, f'Unable to find {bin_name} version after installing with {self.name}'
-        
-        result = InstalledBin(abspath=installed_abspath, version=installed_version)
-        self._install_cache[bin_name] = result
-        return result
-
-    @validate_call
-    def load(self, bin_name: BinName, overrides: Optional[ProviderLookupDict]=None, cache: bool=False) -> InstalledBin | None:
-        installed_abspath = None
-        installed_version = None
-
-        if cache:
-            installed_bin = self._install_cache.get(bin_name)
-            if installed_bin:
-                return installed_bin
-            installed_abspath = self._abspath_cache.get(bin_name)
-            installed_version = self._version_cache.get(bin_name)
-
-
-        installed_abspath = installed_abspath or self.get_abspath(bin_name, overrides=overrides)
-        if not installed_abspath:
-            return None
-
-        installed_version = installed_version or self.get_version(bin_name, abspath=installed_abspath, overrides=overrides)
-        if not installed_version:
-            return None
-
-        return InstalledBin(abspath=installed_abspath, version=installed_version)
-
-    @validate_call
-    def load_or_install(self, bin_name: BinName, overrides: Optional[ProviderLookupDict]=None, cache: bool=True) -> InstalledBin | None:
-        installed = self.load(bin_name, overrides=overrides, cache=cache)
-        if not installed:
-            installed = self.install(bin_name, overrides=overrides)
-        return installed
-
-
-class PipProvider(BinProvider):
-    name: BinProviderName = 'pip'
-
-    def on_install(self, bin_name: str, subdeps: Optional[InstallStr]=None, **_):
-        subdeps = subdeps or self.on_get_subdeps(bin_name)
-        print(f'[*] {self.__class__.__name__}: Installing subdependencies for {bin_name} ({subdeps})')
-        
-        proc = run(['pip', 'install', '--upgrade', *subdeps.split(' ')], stdout=PIPE, stderr=PIPE)
-        
-        if proc.returncode != 0:
-            print(proc.stdout.strip().decode())
-            print(proc.stderr.strip().decode())
-            raise Exception(f'{self.__class__.__name__}: install got returncode {proc.returncode} while installing: {subdeps}')
-
-
-class AptProvider(BinProvider):
-    name: BinProviderName = 'apt'
-    
-    subdeps_provider: ProviderLookupDict = {
-        'yt-dlp': lambda: 'yt-dlp ffmpeg',
-    }
-
-    def on_install(self, bin_name: BinName, subdeps: Optional[InstallStr]=None, **_):
-        subdeps = subdeps or self.on_get_subdeps(bin_name)
-        print(f'[*] {self.__class__.__name__}: Installing subdependencies for {bin_name} ({subdeps})')
-        
-        run(['apt-get', 'update', '-qq'])
-        proc = run(['apt-get', 'install', '-y', *subdeps.split(' ')], stdout=PIPE, stderr=PIPE)
-        
-        if proc.returncode != 0:
-            print(proc.stdout.strip().decode())
-            print(proc.stderr.strip().decode())
-            raise Exception(f'{self.__class__.__name__} install got returncode {proc.returncode} while installing: {subdeps}')
-
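The subdeps_provider mapping above makes apt pull in ffmpeg alongside yt-dlp, while every other binary falls back to on_get_subdeps; a sketch, assuming the lookup dict accepts inline callables as written:

    # hypothetical: per-binary subdependency expansion on the apt provider
    apt = AptProvider()
    assert apt.get_subdeps('yt-dlp') == 'yt-dlp ffmpeg'
    assert apt.get_subdeps('wget') == 'wget'   # default: just the bin name itself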
-class BrewProvider(BinProvider):
-    name: BinProviderName = 'brew'
-
-    def on_install(self, bin_name: str, subdeps: Optional[InstallStr]=None, **_):
-        subdeps = subdeps or self.on_get_subdeps(bin_name)
-        print(f'[*] {self.__class__.__name__}: Installing subdependencies for {bin_name} ({subdeps})')
-        
-        proc = run(['brew', 'install', *subdeps.split(' ')], stdout=PIPE, stderr=PIPE)
-        
-        if proc.returncode != 0:
-            print(proc.stdout.strip().decode())
-            print(proc.stderr.strip().decode())
-            raise Exception(f'{self.__class__.__name__} install got returncode {proc.returncode} while installing: {subdeps}')
-
-
-class EnvProvider(BinProvider):
-    name: BinProviderName = 'env'
-
-    abspath_provider: ProviderLookupDict = {
-        # 'python': lambda: Path('/opt/homebrew/Cellar/python@3.10/3.10.14/Frameworks/Python.framework/Versions/3.10/bin/python3.10'),
-    }
-    version_provider: ProviderLookupDict = {
-        # 'python': lambda: '{}.{}.{}'.format(*sys.version_info[:3]),
-    }
-
-    def on_install(self, bin_name: BinName, subdeps: Optional[InstallStr]=None, **_):
-        """The env provider is read-only and does not install any packages, so this is a no-op"""
-        pass
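All of this BinProvider machinery is deleted here; the pydantic-pkgr dependency vendored below appears to provide the equivalent API. A hedged sketch of the high-level flow the classes above expose (binary name and providers are illustrative):

    # hypothetical: use wget if already on $PATH, otherwise install it via apt
    installed = EnvProvider().load('wget') or AptProvider().load_or_install('wget')
    if installed:
        print(installed.abspath, installed.version)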

+ 1 - 1
archivebox/plugantic/extractors.py

@@ -31,7 +31,7 @@ def no_empty_args(args: List[str]) -> List[str]:
     assert all(len(arg) for arg in args)
     return args
 
-ExtractorName = Literal['wget', 'warc', 'media']
+ExtractorName = Literal['wget', 'warc', 'media', 'singlefile'] | str
 
 HandlerFuncStr = Annotated[str, AfterValidator(lambda s: s.startswith('self.'))]
 CmdArgsList = Annotated[List[str], AfterValidator(no_empty_args)]

+ 0 - 12
archivebox/plugantic/plugins.py

@@ -14,9 +14,6 @@ from pydantic import (
 
 from .binaries import (
     Binary,
-    PythonBinary,
-    SqliteBinary,
-    DjangoBinary,
     WgetBinary,
     YtdlpBinary,
 )
@@ -28,7 +25,6 @@ from .extractors import (
 )
 from .replayers import (
     Replayer,
-    GENERIC_REPLAYER,
     MEDIA_REPLAYER,
 )
 from .configs import (
@@ -80,12 +76,6 @@ class Plugin(BaseModel):
         })
 
 
-class CorePlugin(Plugin):
-    name: str = 'core'
-    configs: List[SerializeAsAny[ConfigSet]] = []
-    binaries: List[SerializeAsAny[Binary]] = [PythonBinary(), SqliteBinary(), DjangoBinary()]
-    extractors: List[SerializeAsAny[Extractor]] = []
-    replayers: List[SerializeAsAny[Replayer]] = [GENERIC_REPLAYER]
 
 class YtdlpPlugin(Plugin):
     name: str = 'ytdlp'
@@ -101,11 +91,9 @@ class WgetPlugin(Plugin):
     extractors: List[SerializeAsAny[Extractor]] = [WgetExtractor(), WarcExtractor()]
 
 
-CORE_PLUGIN = CorePlugin()
 YTDLP_PLUGIN = YtdlpPlugin()
 WGET_PLUGIN = WgetPlugin()
 PLUGINS = [
-    CORE_PLUGIN,
     YTDLP_PLUGIN,
     WGET_PLUGIN,
 ]

+ 0 - 1
archivebox/plugantic/replayers.py

@@ -22,5 +22,4 @@ class Replayer(BaseModel):
     # thumbnail_view: LazyImportStr = 'plugins.generic_replayer.views.get_icon'
 
 
-GENERIC_REPLAYER = Replayer(name='generic')
 MEDIA_REPLAYER = Replayer(name='media')

+ 48 - 7
archivebox/plugantic/views.py

@@ -1,5 +1,8 @@
 __package__ = 'archivebox.plugantic'
 
+import inspect
+from typing import Any
+
 from django.http import HttpRequest
 from django.utils.html import format_html, mark_safe
 
@@ -10,6 +13,44 @@ from admin_data_views.utils import render_with_table_view, render_with_item_view
 from plugantic.plugins import LOADED_PLUGINS
 from django.conf import settings
 
+def obj_to_yaml(obj: Any, indent: int=0) -> str:
+    indent_str = "  " * indent
+    
+    if isinstance(obj, dict):
+        if not obj:
+            return "{}"
+        result = "\n"
+        for key, value in obj.items():
+            result += f"{indent_str}{key}:{obj_to_yaml(value, indent + 1)}\n"
+        return result
+    
+    elif isinstance(obj, list):
+        if not obj:
+            return "[]"
+        result = "\n"
+        for item in obj:
+            result += f"{indent_str}- {obj_to_yaml(item, indent + 1).lstrip()}\n"
+        return result.rstrip()
+    
+    elif isinstance(obj, str):
+        if "\n" in obj:
+            return f" |\n{indent_str}  " + obj.replace("\n", f"\n{indent_str}  ")
+        else:
+            return f" {obj}"
+    
+    elif isinstance(obj, (int, float, bool)):
+        return f" {str(obj)}"
+    
+    elif callable(obj):
+        source = '\n'.join(
+            '' if 'def ' in line else line
+            for line in inspect.getsource(obj).split('\n')
+            if line.strip()
+        ).split('lambda: ')[-1].rstrip(',')
+        return f" {indent_str}  " + source.replace("\n", f"\n{indent_str}  ")
+    
+    else:
+        return f" {str(obj)}"
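For reference, a sketch of the YAML-ish text this helper renders (input values are illustrative):

    # hypothetical input -> rendered output
    obj_to_yaml({'apt': {'packages': ['wget', 'ffmpeg']}})
    # apt:
    #   packages:
    #     - wget
    #     - ffmpeg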
 
 @render_with_table_view
 def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
@@ -18,13 +59,13 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
 
     rows = {
         "Binary": [],
-        "From Plugin": [],
         "Found Version": [],
+        "From Plugin": [],
         "Provided By": [],
         "Found Abspath": [],
         "Related Configuration": [],
         "Overrides": [],
-        "Description": [],
+        # "Description": [],
     }
 
     relevant_configs = {
@@ -38,8 +79,8 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
             binary = binary.load_or_install()
 
             rows['Binary'].append(ItemLink(binary.name, key=binary.name))
-            rows['From Plugin'].append(plugin.name)
             rows['Found Version'].append(binary.loaded_version)
+            rows['From Plugin'].append(plugin.name)
             rows['Provided By'].append(binary.loaded_provider)
             rows['Found Abspath'].append(binary.loaded_abspath)
             rows['Related Configuration'].append(mark_safe(', '.join(
@@ -48,8 +89,8 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
                     if binary.name.lower().replace('-', '').replace('_', '').replace('ytdlp', 'youtubedl') in config_key.lower()
                     # or binary.name.lower().replace('-', '').replace('_', '') in str(config_value).lower()
             )))
-            rows['Overrides'].append(str(binary.provider_overrides))
-            rows['Description'].append(binary.description)
+            rows['Overrides'].append(obj_to_yaml(binary.provider_overrides))
+            # rows['Description'].append(binary.description)
 
     return TableContext(
         title="Binaries",
@@ -85,8 +126,8 @@ def binary_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
                     'binprovider': binary.loaded_provider,
                     'abspath': binary.loaded_abspath,
                     'version': binary.loaded_version,
-                    'overrides': str(binary.provider_overrides),
-                    'providers': str(binary.providers_supported),
+                    'overrides': obj_to_yaml(binary.provider_overrides),
+                    'providers': obj_to_yaml(binary.providers_supported),
                 },
                 "help_texts": {
                     # TODO

+ 1 - 2
archivebox/system.py

@@ -11,13 +11,12 @@ from typing import Optional, Union, Set, Tuple
 from subprocess import _mswindows, PIPE, Popen, CalledProcessError, CompletedProcess, TimeoutExpired
 
 from crontab import CronTab
-from .vendor.atomicwrites import atomic_write as lib_atomic_write
+from atomicwrites import atomic_write as lib_atomic_write
 
 from .util import enforce_types, ExtendedEncoder
 from .config import PYTHON_BINARY, OUTPUT_PERMISSIONS, DIR_OUTPUT_PERMISSIONS, ENFORCE_ATOMIC_WRITES
 
 
-
 def run(cmd, *args, input=None, capture_output=True, timeout=None, check=False, text=False, start_new_session=True, **kwargs):
     """Patched version of subprocess.run to kill forked child subprocesses and fix blocking io making timeout= ineffective
         Mostly copied from https://github.com/python/cpython/blob/master/Lib/subprocess.py

+ 4 - 8
archivebox/util.py

@@ -16,7 +16,7 @@ from datetime import datetime, timezone
 from dateparser import parse as dateparser
 from requests.exceptions import RequestException, ReadTimeout
 
-from .vendor.base32_crockford import encode as base32_encode                            # type: ignore
+from base32_crockford import encode as base32_encode                            # type: ignore
 from w3lib.encoding import html_body_declared_encoding, http_content_type_encoding
 from os.path import lexists
 from os import remove as remove_file
@@ -273,8 +273,8 @@ def get_headers(url: str, timeout: int=None) -> str:
         {
             'URL': url,
             'Status-Code': response.status_code,
-            'Elapsed': response.elapsed,
-            'Encoding': response.encoding,
+            'Elapsed': response.elapsed.total_seconds()*1000,
+            'Encoding': str(response.encoding),
             'Apparent-Encoding': response.apparent_encoding,
             **dict(response.headers),
         },
@@ -304,11 +304,7 @@ def chrome_args(**options) -> List[str]:
     cmd_args += CHROME_EXTRA_ARGS
 
     if options['CHROME_HEADLESS']:
-        chrome_major_version = int(re.search(r'\s(\d+)\.\d', CHROME_VERSION)[1])
-        if chrome_major_version >= 111:
-            cmd_args += ("--headless=new",)
-        else:
-            cmd_args += ('--headless',)
+        cmd_args += ("--headless=new",)   # expects chrome version >= 111
 
     if not options['CHROME_SANDBOX']:
         # assume this means we are running inside a docker container

+ 34 - 0
archivebox/vendor/__init__.py

@@ -0,0 +1,34 @@
+import sys
+import inspect
+import importlib
+from pathlib import Path
+
+VENDOR_DIR = Path(__file__).parent
+
+VENDORED_LIBS = {
+    # sys.path dir:         library name
+    'python-atomicwrites':  'atomicwrites',
+    'django-taggit':        'taggit',
+    'pydantic-pkgr':        'pydantic_pkgr',
+    'pocket':               'pocket',
+    'base32-crockford':     'base32_crockford',
+}
+
+def load_vendored_libs():
+    for lib_subdir, lib_name in VENDORED_LIBS.items():
+        lib_dir = VENDOR_DIR / lib_subdir
+        assert lib_dir.is_dir(), f'Expected vendor library {lib_name} could not be found in {lib_dir}'
+
+        try:
+            lib = importlib.import_module(lib_name)
+            # print(f"Successfully imported lib from environment {lib_name}: {inspect.getfile(lib)}")
+        except ImportError:
+            sys.path.append(str(lib_dir))
+            try:
+                lib = importlib.import_module(lib_name)
+                # print(f"Successfully imported lib from vendored fallback {lib_name}: {inspect.getfile(lib)}")
+            except ImportError as e:
+                print(f"Failed to import lib from environment or vendored fallback {lib_name}: {e}", file=sys.stderr)
+                sys.exit(1)
+        
+
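Intended usage is presumably a single call at startup, before any of these dual-sourced imports run; a minimal sketch:

    # hypothetical call site: prefer pip-installed copies, fall back to archivebox/vendor/*
    from archivebox.vendor import load_vendored_libs
    load_vendored_libs()
    import atomicwrites   # resolves from the environment, or from the vendored fallback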

+ 0 - 1
archivebox/vendor/atomicwrites.py

@@ -1 +0,0 @@
-python-atomicwrites/atomicwrites/__init__.py

+ 0 - 1
archivebox/vendor/base32_crockford.py

@@ -1 +0,0 @@
-base32-crockford/base32_crockford.py

+ 0 - 1
archivebox/vendor/package-lock.json

@@ -1 +0,0 @@
-../../package-lock.json

+ 0 - 1
archivebox/vendor/package.json

@@ -1 +0,0 @@
-../../package.json

+ 0 - 1
archivebox/vendor/pocket.py

@@ -1 +0,0 @@
-pocket/pocket.py

+ 1 - 0
archivebox/vendor/pydantic-pkgr

@@ -0,0 +1 @@
+Subproject commit 2cd844533d888ce29b9bf32b8363510dd0d76166

+ 0 - 1
archivebox/vendor/taggit_utils.py

@@ -1 +0,0 @@
-django-taggit/taggit/utils.py

+ 9 - 9
package-lock.json

@@ -236,9 +236,9 @@
       "license": "MIT"
     },
     "node_modules/@types/node": {
-      "version": "22.5.0",
-      "resolved": "https://registry.npmjs.org/@types/node/-/node-22.5.0.tgz",
-      "integrity": "sha512-DkFrJOe+rfdHTqqMg0bSNlGlQ85hSoh2TPzZyhHsXnMtligRWpxUySiyw8FY14ITt24HVCiQPWxS3KO/QlGmWg==",
+      "version": "22.5.1",
+      "resolved": "https://registry.npmjs.org/@types/node/-/node-22.5.1.tgz",
+      "integrity": "sha512-KkHsxej0j9IW1KKOOAA/XBA0z08UFSrRQHErzEfA3Vgq57eXIMYboIlHJuYIfd+lwCQjtKqUu3UnmKbtUc9yRw==",
       "license": "MIT",
       "optional": true,
       "dependencies": {
@@ -353,9 +353,9 @@
       }
     },
     "node_modules/aws4": {
-      "version": "1.13.1",
-      "resolved": "https://registry.npmjs.org/aws4/-/aws4-1.13.1.tgz",
-      "integrity": "sha512-u5w79Rd7SU4JaIlA/zFqG+gOiuq25q5VLyZ8E+ijJeILuTxVzZgp2CaGw/UTw6pXYN9XMO9yiqj/nEHmhTG5CA==",
+      "version": "1.13.2",
+      "resolved": "https://registry.npmjs.org/aws4/-/aws4-1.13.2.tgz",
+      "integrity": "sha512-lHe62zvbTB5eEABUVi/AwVh0ZKY9rMMDhmm+eeyuuUQbQ3+J+fONVQOZyj+DdrvD4BY33uYniyRJ4UJIaSKAfw==",
       "license": "MIT"
     },
     "node_modules/b4a": {
@@ -2376,9 +2376,9 @@
       }
     },
     "node_modules/tslib": {
-      "version": "2.6.3",
-      "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.6.3.tgz",
-      "integrity": "sha512-xNvxJEOUiWPGhUuUdQgAJPKOOJfGnIyKySOc09XkKsgdUV/3E2zvwZYdejjmRgPCgcym1juLH3226yA7sEFJKQ==",
+      "version": "2.7.0",
+      "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.7.0.tgz",
+      "integrity": "sha512-gLXCKdN1/j47AiHiOkJN69hJmcbGTHI0ImLmbYLHykhgeN0jVGola9yVjFgzCUklsZQMW55o+dW7IXv3RCXDzA==",
       "license": "0BSD"
     },
     "node_modules/turndown": {

+ 92 - 11
pdm.lock

@@ -5,7 +5,7 @@
 groups = ["default", "ldap", "sonic"]
 strategy = ["inherit_metadata"]
 lock_version = "4.5.0"
-content_hash = "sha256:f2f7ca01f2e18a1ef07d59b7a8985d89785a4b8a2a4e66452f1f9e8e8ad529ad"
+content_hash = "sha256:c6aa1f436032d18d079a4c2e9d9b95a5110579eb96a449751bfaf4d472eba401"
 
 [[metadata.targets]]
 requires_python = "==3.10.*"
@@ -78,6 +78,29 @@ files = [
     {file = "asttokens-2.4.1.tar.gz", hash = "sha256:b03869718ba9a6eb027e134bfdf69f38a236d681c83c160d510768af11254ba0"},
 ]
 
+[[package]]
+name = "atomicwrites"
+version = "1.4.0"
+requires_python = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
+summary = "Atomic file writes."
+groups = ["default"]
+marker = "python_version == \"3.10\""
+files = [
+    {file = "atomicwrites-1.4.0-py2.py3-none-any.whl", hash = "sha256:6d1784dea7c0c8d4a5172b6c620f40b6e4cbfdf96d783691f2e1302a7b88e197"},
+    {file = "atomicwrites-1.4.0.tar.gz", hash = "sha256:ae70396ad1a434f9c7046fd2dd196fc04b12f9e91ffb859164193be8b6168a7a"},
+]
+
+[[package]]
+name = "base32-crockford"
+version = "0.3.0"
+summary = "A Python implementation of Douglas Crockford's base32 encoding scheme"
+groups = ["default"]
+marker = "python_version == \"3.10\""
+files = [
+    {file = "base32-crockford-0.3.0.tar.gz", hash = "sha256:115f5bd32ae32b724035cb02eb65069a8824ea08c08851eb80c8b9f63443a969"},
+    {file = "base32_crockford-0.3.0-py2.py3-none-any.whl", hash = "sha256:295ef5ffbf6ed96b6e739ffd36be98fa7e90a206dd18c39acefb15777eedfe6e"},
+]
+
 [[package]]
 name = "brotli"
 version = "1.1.0"
@@ -407,6 +430,21 @@ files = [
     {file = "django_stubs_ext-5.0.4.tar.gz", hash = "sha256:85da065224204774208be29c7d02b4482d5a69218a728465c2fbe41725fdc819"},
 ]
 
+[[package]]
+name = "django-taggit"
+version = "1.3.0"
+requires_python = ">=3.5"
+summary = "django-taggit is a reusable Django application for simple tagging."
+groups = ["default"]
+marker = "python_version == \"3.10\""
+dependencies = [
+    "Django>=1.11",
+]
+files = [
+    {file = "django-taggit-1.3.0.tar.gz", hash = "sha256:4a833bf71f4c2deddd9745924eee53be1c075d7f0020a06f12e29fa3d752732d"},
+    {file = "django_taggit-1.3.0-py3-none-any.whl", hash = "sha256:609b0223d8a652f3fae088b7fd29f294fdadaca2d7931d45c27d6c59b02fdf31"},
+]
+
 [[package]]
 name = "exceptiongroup"
 version = "1.2.2"
@@ -479,7 +517,7 @@ files = [
 
 [[package]]
 name = "httpx"
-version = "0.27.0"
+version = "0.27.2"
 requires_python = ">=3.8"
 summary = "The next generation HTTP client."
 groups = ["default"]
@@ -492,20 +530,20 @@ dependencies = [
     "sniffio",
 ]
 files = [
-    {file = "httpx-0.27.0-py3-none-any.whl", hash = "sha256:71d5465162c13681bff01ad59b2cc68dd838ea1f10e51574bac27103f00c91a5"},
-    {file = "httpx-0.27.0.tar.gz", hash = "sha256:a0cb88a46f32dc874e04ee956e4c2764aba2aa228f650b06788ba6bda2962ab5"},
+    {file = "httpx-0.27.2-py3-none-any.whl", hash = "sha256:7bb2708e112d8fdd7829cd4243970f0c223274051cb35ee80c03301ee29a3df0"},
+    {file = "httpx-0.27.2.tar.gz", hash = "sha256:f7c2be1d2f3c3c3160d441802406b206c2b76f5947b11115e6df10c6c65e66c2"},
 ]
 
 [[package]]
 name = "idna"
-version = "3.7"
-requires_python = ">=3.5"
+version = "3.8"
+requires_python = ">=3.6"
 summary = "Internationalized Domain Names in Applications (IDNA)"
 groups = ["default"]
 marker = "python_version == \"3.10\""
 files = [
-    {file = "idna-3.7-py3-none-any.whl", hash = "sha256:82fee1fc78add43492d3a1898bfa6d8a904cc97d8427f683ed8e798d07761aa0"},
-    {file = "idna-3.7.tar.gz", hash = "sha256:028ff3aadf0609c1fd278d8ea3089299412a7a8b9bd005dd08b9f8285bcb5cfc"},
+    {file = "idna-3.8-py3-none-any.whl", hash = "sha256:050b4e5baadcd44d760cedbd2b8e639f2ff89bbc7a5730fcc662954303377aac"},
+    {file = "idna-3.8.tar.gz", hash = "sha256:d838c2c0ed6fced7693d5e8ab8e734d5f8fda53a039c0164afb0b82e771e3603"},
 ]
 
 [[package]]
@@ -613,6 +651,32 @@ files = [
     {file = "pexpect-4.9.0.tar.gz", hash = "sha256:ee7d41123f3c9911050ea2c2dac107568dc43b2d3b0c7557a33212c398ead30f"},
 ]
 
+[[package]]
+name = "pocket"
+version = "0.3.7"
+git = "https://github.com/tapanpandita/pocket.git"
+ref = "v0.3.7"
+revision = "5a144438cc89bfc0ec94db960718ccf1f76468c1"
+summary = "api wrapper for getpocket.com"
+groups = ["default"]
+marker = "python_version == \"3.10\""
+dependencies = [
+    "requests",
+]
+
 [[package]]
 name = "prompt-toolkit"
 version = "3.0.47"
@@ -739,6 +803,23 @@ files = [
     {file = "pydantic_core-2.20.1.tar.gz", hash = "sha256:26ca695eeee5f9f1aeeb211ffc12f10bcb6f71e2989988fda61dabd65db878d4"},
 ]
 
+[[package]]
+name = "pydantic-pkgr"
+version = "0.1.4"
+requires_python = ">=3.10"
+summary = "System package manager APIs in strongly typed Python"
+groups = ["default"]
+marker = "python_version == \"3.10\""
+dependencies = [
+    "pydantic-core>=2.18.2",
+    "pydantic>=2.7.1",
+    "typing-extensions>=4.11.0",
+]
+files = [
+    {file = "pydantic_pkgr-0.1.4-py3-none-any.whl", hash = "sha256:bd9ddfa8eeb4d361257c4d3d8d36ba44a72515b497ee52cf0763240c66006417"},
+    {file = "pydantic_pkgr-0.1.4.tar.gz", hash = "sha256:e0422022dd83341f1e869a54da9aca903a6407a983ece0735f69493841b0fbb8"},
+]
+
 [[package]]
 name = "pygments"
 version = "2.18.0"
@@ -841,14 +922,14 @@ files = [
 
 [[package]]
 name = "setuptools"
-version = "73.0.1"
+version = "74.0.0"
 requires_python = ">=3.8"
 summary = "Easily download, build, install, upgrade, and uninstall Python packages"
 groups = ["default"]
 marker = "python_version == \"3.10\""
 files = [
-    {file = "setuptools-73.0.1-py3-none-any.whl", hash = "sha256:b208925fcb9f7af924ed2dc04708ea89791e24bde0d3020b27df0e116088b34e"},
-    {file = "setuptools-73.0.1.tar.gz", hash = "sha256:d59a3e788ab7e012ab2c4baed1b376da6366883ee20d7a5fc426816e3d7b1193"},
+    {file = "setuptools-74.0.0-py3-none-any.whl", hash = "sha256:0274581a0037b638b9fc1c6883cc71c0210865aaa76073f7882376b641b84e8f"},
+    {file = "setuptools-74.0.0.tar.gz", hash = "sha256:a85e96b8be2b906f3e3e789adec6a9323abf79758ecfa3065bd740d81158b11e"},
 ]
 
 [[package]]

+ 14 - 8
pyproject.toml

@@ -29,12 +29,9 @@ dependencies = [
     "croniter>=2.0.5",                # for: archivebox schedule
     "ipython>=8.23.0",                # for: archivebox shell
     # Extractor Dependencies
-    "yt-dlp>=2024.4.9",               # for: media
+    "yt-dlp>=2024.8.6",               # for: media
     # "playwright>=1.43.0; platform_machine != 'armv7l'",  # WARNING: playwright doesn't have any sdist, causes trouble on build systems that refuse to install wheel-only packages
-    # TODO: add more extractors
-    #  - gallery-dl
-    #  - scihubdl
-    #  - See Github issues for more...
+
     "django-signal-webhooks>=0.3.0",
     "django-admin-data-views>=0.3.1",
     "ulid-py>=1.1.0",
@@ -43,6 +40,14 @@
     "django-pydantic-field>=0.3.9",
     "django-jsonform>=2.22.0",
     "django-stubs>=5.0.2",
+
+    # these can be safely omitted when the installation subsystem does not provide them as packages (e.g. apt/debian);
+    # archivebox will automatically load the fallback vendored copies bundled via archivebox/vendor/__init__.py
+    "pydantic-pkgr>=0.1.4",
+    "atomicwrites==1.4.0",
+    "pocket@git+https://github.com/tapanpandita/pocket.git@v0.3.7",
+    "django-taggit==1.3.0",
+    "base32-crockford==0.3.0",
 ]
 
 homepage = "https://github.com/ArchiveBox/ArchiveBox"
@@ -139,7 +144,7 @@ exclude = [
     "**/migrations",
     "archivebox/vendor",
 ]
-stubPath = "./typings"
+stubPath = "./archivebox/typings"
 venvPath = "."
 venv = ".venv"
 # ignore = ["src/oldstuff"]
@@ -169,6 +174,9 @@ debug = [
     "djdt_flamegraph",
     "ipdb",
     "requests-tracker>=0.3.3",
+    "logfire[django]>=0.51.0",
+    "opentelemetry-instrumentation-django>=0.47b0",
+    "opentelemetry-instrumentation-sqlite3>=0.47b0",
 ]
 test = [
     "pytest",
@@ -177,8 +185,6 @@ test = [
 lint = [
     "flake8",
     "mypy",
-]
-dev = [
     "django-autotyping>=0.5.1",
 ]
 

+ 8 - 3
requirements.txt

@@ -5,6 +5,8 @@ annotated-types==0.7.0; python_version == "3.10"
 anyio==4.4.0; python_version == "3.10"
 asgiref==3.8.1; python_version == "3.10"
 asttokens==2.4.1; python_version == "3.10"
+atomicwrites==1.4.0; python_version == "3.10"
+base32-crockford==0.3.0; python_version == "3.10"
 brotli==1.1.0; implementation_name == "cpython" and python_version == "3.10"
 brotlicffi==1.1.0.0; implementation_name != "cpython" and python_version == "3.10"
 certifi==2024.7.4; python_version == "3.10"
@@ -26,13 +28,14 @@ django-settings-holder==0.1.2; python_version == "3.10"
 django-signal-webhooks==0.3.0; python_version == "3.10"
 django-stubs==5.0.4; python_version == "3.10"
 django-stubs-ext==5.0.4; python_version == "3.10"
+django-taggit==1.3.0; python_version == "3.10"
 exceptiongroup==1.2.2; python_version == "3.10"
 executing==2.0.1; python_version == "3.10"
 feedparser==6.0.11; python_version == "3.10"
 h11==0.14.0; python_version == "3.10"
 httpcore==1.0.5; python_version == "3.10"
-httpx==0.27.0; python_version == "3.10"
-idna==3.7; python_version == "3.10"
+httpx==0.27.2; python_version == "3.10"
+idna==3.8; python_version == "3.10"
 ipython==8.26.0; python_version == "3.10"
 jedi==0.19.1; python_version == "3.10"
 matplotlib-inline==0.1.7; python_version == "3.10"
@@ -40,6 +43,7 @@ mutagen==1.47.0; python_version == "3.10"
 mypy-extensions==1.0.0; python_version == "3.10"
 parso==0.8.4; python_version == "3.10"
 pexpect==4.9.0; (sys_platform != "win32" and sys_platform != "emscripten") and python_version == "3.10"
+pocket @ git+https://github.com/tapanpandita/pocket.git@5a144438cc89bfc0ec94db960718ccf1f76468c1 ; python_version == "3.10"
 prompt-toolkit==3.0.47; python_version == "3.10"
 ptyprocess==0.7.0; (sys_platform != "win32" and sys_platform != "emscripten") and python_version == "3.10"
 pure-eval==0.2.3; python_version == "3.10"
@@ -49,6 +53,7 @@ pycparser==2.22; platform_python_implementation != "PyPy" and python_version ==
 pycryptodomex==3.20.0; python_version == "3.10"
 pydantic==2.8.2; python_version == "3.10"
 pydantic-core==2.20.1; python_version == "3.10"
+pydantic-pkgr==0.1.4; python_version == "3.10"
 pygments==2.18.0; python_version == "3.10"
 python-crontab==3.2.0; python_version == "3.10"
 python-dateutil==2.9.0.post0; python_version == "3.10"
@@ -56,7 +61,7 @@ python-ldap==3.4.4; python_version == "3.10"
 pytz==2024.1; python_version == "3.10"
 regex==2024.7.24; python_version == "3.10"
 requests==2.32.3; python_version == "3.10"
-setuptools==73.0.1; python_version == "3.10"
+setuptools==74.0.0; python_version == "3.10"
 sgmllib3k==1.0.0; python_version == "3.10"
 six==1.16.0; python_version == "3.10"
 sniffio==1.3.1; python_version == "3.10"