瀏覽代碼

v0.8.+: Massive speed improvements for Admin UI & common queries, v3 plugins progress, and bugfixes (#1498)

Nick Sweeting 1 年之前
父節點
當前提交
43e87ef437
共有 66 個文件被更改,包括 1150 次插入和 1100 次刪除
  1. 3 0
      .gitmodules
  2. 18 1
      archivebox/abid_utils/models.py
  3. 0 0
      archivebox/builtin_plugins/__init__.py
  4. 0 0
      archivebox/builtin_plugins/base/__init__.py
  5. 3 0
      archivebox/builtin_plugins/base/admin.py
  6. 83 0
      archivebox/builtin_plugins/base/apps.py
  7. 0 0
      archivebox/builtin_plugins/base/migrations/__init__.py
  8. 3 0
      archivebox/builtin_plugins/base/models.py
  9. 3 0
      archivebox/builtin_plugins/base/tests.py
  10. 3 0
      archivebox/builtin_plugins/base/views.py
  11. 0 0
      archivebox/builtin_plugins/singlefile/__init__.py
  12. 113 0
      archivebox/builtin_plugins/singlefile/apps.py
  13. 66 0
      archivebox/builtin_plugins/singlefile/config.yaml
  14. 3 0
      archivebox/builtin_plugins/singlefile/tests.py
  15. 28 15
      archivebox/config.py
  16. 108 25
      archivebox/core/admin.py
  17. 1 1
      archivebox/core/forms.py
  18. 2 2
      archivebox/core/migrations/0027_update_snapshot_ids.py
  19. 1 1
      archivebox/core/migrations/0034_alter_archiveresult_old_id_alter_archiveresult_uuid.py
  20. 1 1
      archivebox/core/migrations/0051_snapshottag_snapshot_alter_snapshottag_snapshot_old.py
  21. 1 1
      archivebox/core/migrations/0059_tag_id.py
  22. 1 1
      archivebox/core/migrations/0063_snapshottag_tag_alter_snapshottag_old_tag.py
  23. 35 0
      archivebox/core/migrations/0069_alter_archiveresult_created_alter_snapshot_added_and_more.py
  24. 68 36
      archivebox/core/models.py
  25. 126 80
      archivebox/core/settings.py
  26. 1 1
      archivebox/extractors/__init__.py
  27. 12 1
      archivebox/extractors/wget.py
  28. 6 6
      archivebox/index/__init__.py
  29. 9 1
      archivebox/index/html.py
  30. 3 1
      archivebox/index/sql.py
  31. 6 5
      archivebox/main.py
  32. 10 10
      archivebox/package-lock.json
  33. 1 1
      archivebox/parsers/pocket_api.py
  34. 0 0
      archivebox/pkg/__init__.py
  35. 3 0
      archivebox/pkg/admin.py
  36. 14 0
      archivebox/pkg/apps.py
  37. 0 0
      archivebox/pkg/management/__init__.py
  38. 0 0
      archivebox/pkg/management/commands/__init__.py
  39. 75 0
      archivebox/pkg/management/commands/pkg.py
  40. 0 0
      archivebox/pkg/migrations/__init__.py
  41. 3 0
      archivebox/pkg/models.py
  42. 86 0
      archivebox/pkg/settings.py
  43. 3 0
      archivebox/pkg/tests.py
  44. 3 0
      archivebox/pkg/views.py
  45. 0 1
      archivebox/plugantic/__init__.py
  46. 11 0
      archivebox/plugantic/apps.py
  47. 22 280
      archivebox/plugantic/binaries.py
  48. 0 561
      archivebox/plugantic/binproviders.py
  49. 1 1
      archivebox/plugantic/extractors.py
  50. 0 12
      archivebox/plugantic/plugins.py
  51. 0 1
      archivebox/plugantic/replayers.py
  52. 48 7
      archivebox/plugantic/views.py
  53. 1 2
      archivebox/system.py
  54. 4 8
      archivebox/util.py
  55. 34 0
      archivebox/vendor/__init__.py
  56. 0 1
      archivebox/vendor/atomicwrites.py
  57. 0 1
      archivebox/vendor/base32_crockford.py
  58. 0 1
      archivebox/vendor/package-lock.json
  59. 0 1
      archivebox/vendor/package.json
  60. 0 1
      archivebox/vendor/pocket.py
  61. 1 0
      archivebox/vendor/pydantic-pkgr
  62. 0 1
      archivebox/vendor/taggit_utils.py
  63. 9 9
      package-lock.json
  64. 92 11
      pdm.lock
  65. 14 8
      pyproject.toml
  66. 8 3
      requirements.txt

+ 3 - 0
.gitmodules

@@ -26,3 +26,6 @@
 [submodule "archivebox/vendor/python-atomicwrites"]
 	path = archivebox/vendor/python-atomicwrites
 	url = https://github.com/untitaker/python-atomicwrites
+[submodule "archivebox/vendor/pydantic-pkgr"]
+	path = archivebox/vendor/pydantic-pkgr
+	url = https://github.com/ArchiveBox/pydantic-pkgr

+ 18 - 1
archivebox/abid_utils/models.py

@@ -61,6 +61,11 @@ def get_or_create_system_user_pk(username='system'):
     return user.pk
 
 
+class AutoDateTimeField(models.DateTimeField):
+    def pre_save(self, model_instance, add):
+        return timezone.now()
+
+
 class ABIDModel(models.Model):
     """
     Abstract Base Model for other models to depend on. Provides ArchiveBox ID (ABID) interface.
@@ -76,13 +81,16 @@ class ABIDModel(models.Model):
     abid = ABIDField(prefix=abid_prefix)
 
     created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk)
-    created = models.DateTimeField(auto_now_add=True)
+    created = AutoDateTimeField(default=timezone.now, db_index=True)
     modified = models.DateTimeField(auto_now=True)
 
     class Meta(TypedModelMeta):
         abstract = True
 
     def save(self, *args: Any, **kwargs: Any) -> None:
+        if self._state.adding or not self.created:
+            self.created = timezone.now()
+
         # when first creating a row, self.ABID is the source of truth
         # overwrite default prefilled self.id & self.abid with generated self.ABID value
         if self._state.adding or not self.id:
@@ -93,6 +101,7 @@ class ABIDModel(models.Model):
         super().save(*args, **kwargs)
         assert str(self.id) == str(self.ABID.uuid), f'self.id {self.id} does not match self.ABID {self.ABID.uuid}'
         assert str(self.abid) == str(self.ABID), f'self.abid {self.id} does not match self.ABID {self.ABID.uuid}'
+        assert str(self.uuid) == str(self.ABID.uuid), f'self.uuid ({self.uuid}) does not match .ABID.uuid ({self.ABID.uuid})'
 
     @property
     def abid_values(self) -> Dict[str, Any]:
@@ -186,6 +195,14 @@ class ABIDModel(models.Model):
         Get a uuid.UUID (v4) representation of the object's ABID.
         """
         return self.ABID.uuid
+    
+    @property
+    def uuid(self) -> str:
+        """
+        Get a str uuid.UUID (v4) representation of the object's ABID.
+        """
+        assert str(self.id) == str(self.ABID.uuid)
+        return str(self.id)
 
     @property
     def TypeID(self) -> TypeID:

+ 0 - 0
archivebox/builtin_plugins/__init__.py


+ 0 - 0
archivebox/builtin_plugins/base/__init__.py


+ 3 - 0
archivebox/builtin_plugins/base/admin.py

@@ -0,0 +1,3 @@
+from django.contrib import admin
+
+# Register your models here.

+ 83 - 0
archivebox/builtin_plugins/base/apps.py

@@ -0,0 +1,83 @@
+import sys
+import inspect
+from typing import List, Dict, Any, Optional
+from pathlib import Path
+
+import django
+from django.apps import AppConfig
+from django.core.checks import Tags, Warning, register
+from django.db.backends.sqlite3.base import Database as sqlite3
+
+from pydantic import (
+    Field,
+    SerializeAsAny,
+)
+
+from pydantic_pkgr import SemVer, BinProvider, BinProviderName, ProviderLookupDict, BinName, Binary, EnvProvider, NpmProvider
+
+from plugantic.extractors import Extractor, ExtractorName
+from plugantic.plugins import Plugin
+from plugantic.configs import ConfigSet, ConfigSectionName
+from plugantic.replayers import Replayer
+
+
+class PythonBinary(Binary):
+    name: BinName = 'python'
+
+    providers_supported: List[BinProvider] = [EnvProvider()]
+    provider_overrides: Dict[str, Any] = {
+        'env': {
+            'subdeps': \
+                lambda: 'python3 python3-minimal python3-pip python3-virtualenv',
+            'abspath': \
+                lambda: sys.executable,
+            'version': \
+                lambda: '{}.{}.{}'.format(*sys.version_info[:3]),
+        },
+    }
+
+class SqliteBinary(Binary):
+    name: BinName = 'sqlite'
+    providers_supported: List[BinProvider] = [EnvProvider()]
+    provider_overrides:  Dict[BinProviderName, ProviderLookupDict] = {
+        'env': {
+            'abspath': \
+                lambda: Path(inspect.getfile(sqlite3)),
+            'version': \
+                lambda: SemVer(sqlite3.version),
+        },
+    }
+
+
+class DjangoBinary(Binary):
+    name: BinName = 'django'
+
+    providers_supported: List[BinProvider] = [EnvProvider()]
+    provider_overrides:  Dict[BinProviderName, ProviderLookupDict] = {
+        'env': {
+            'abspath': \
+                lambda: inspect.getfile(django),
+            'version': \
+                lambda: django.VERSION[:3],
+        },
+    }
+
+
+class BasicReplayer(Replayer):
+    name: str = 'basic'
+
+
+class BasePlugin(Plugin):
+    name: str = 'base'
+    configs: List[SerializeAsAny[ConfigSet]] = []
+    binaries: List[SerializeAsAny[Binary]] = [PythonBinary(), SqliteBinary(), DjangoBinary()]
+    extractors: List[SerializeAsAny[Extractor]] = []
+    replayers: List[SerializeAsAny[Replayer]] = [BasicReplayer()]
+
+
+PLUGINS = [BasePlugin()]
+
+
+class BaseConfig(AppConfig):
+    default_auto_field = 'django.db.models.BigAutoField'
+    name = 'builtin_plugins.base'

+ 0 - 0
archivebox/builtin_plugins/base/migrations/__init__.py


+ 3 - 0
archivebox/builtin_plugins/base/models.py

@@ -0,0 +1,3 @@
+from django.db import models
+
+# Create your models here.

+ 3 - 0
archivebox/builtin_plugins/base/tests.py

@@ -0,0 +1,3 @@
+from django.test import TestCase
+
+# Create your tests here.

+ 3 - 0
archivebox/builtin_plugins/base/views.py

@@ -0,0 +1,3 @@
+from django.shortcuts import render
+
+# Create your views here.

+ 0 - 0
archivebox/builtin_plugins/singlefile/__init__.py


+ 113 - 0
archivebox/builtin_plugins/singlefile/apps.py

@@ -0,0 +1,113 @@
+from typing import List, Optional, Dict
+from pathlib import Path
+
+from django.apps import AppConfig
+from django.core.checks import Tags, Warning, register
+
+from pydantic import (
+    Field,
+    SerializeAsAny,
+)
+
+from pydantic_pkgr import BinProvider, BinName, Binary, EnvProvider, NpmProvider
+from pydantic_pkgr.binprovider import bin_abspath
+from pydantic_pkgr.binary import BinProviderName, ProviderLookupDict
+
+from plugantic.extractors import Extractor, ExtractorName
+from plugantic.plugins import Plugin
+from plugantic.configs import ConfigSet, ConfigSectionName
+
+from pkg.settings import env
+
+
+###################### Config ##########################
+
+class SinglefileToggleConfig(ConfigSet):
+    section: ConfigSectionName = 'ARCHIVE_METHOD_TOGGLES'
+
+    SAVE_SINGLEFILE: bool = True
+
+
+class SinglefileDependencyConfig(ConfigSet):
+    section: ConfigSectionName = 'DEPENDENCY_CONFIG'
+
+    SINGLEFILE_BINARY: str = Field(default='wget')
+    SINGLEFILE_ARGS: Optional[List[str]] = Field(default=None)
+    SINGLEFILE_EXTRA_ARGS: List[str] = []
+    SINGLEFILE_DEFAULT_ARGS: List[str] = ['--timeout={TIMEOUT-10}']
+
+class SinglefileOptionsConfig(ConfigSet):
+    section: ConfigSectionName = 'ARCHIVE_METHOD_OPTIONS'
+
+    # loaded from shared config
+    SINGLEFILE_USER_AGENT: str = Field(default='', alias='USER_AGENT')
+    SINGLEFILE_TIMEOUT: int = Field(default=60, alias='TIMEOUT')
+    SINGLEFILE_CHECK_SSL_VALIDITY: bool = Field(default=True, alias='CHECK_SSL_VALIDITY')
+    SINGLEFILE_RESTRICT_FILE_NAMES: str = Field(default='windows', alias='RESTRICT_FILE_NAMES')
+    SINGLEFILE_COOKIES_FILE: Optional[Path] = Field(default=None, alias='COOKIES_FILE')
+
+
+
+DEFAULT_CONFIG = {
+    'CHECK_SSL_VALIDITY': False,
+    'SAVE_SINGLEFILE': True,
+    'TIMEOUT': 120,
+}
+
+PLUGIN_CONFIG = [
+    SinglefileToggleConfig(**DEFAULT_CONFIG),
+    SinglefileDependencyConfig(**DEFAULT_CONFIG),
+    SinglefileOptionsConfig(**DEFAULT_CONFIG),
+]
+
+###################### Binaries ############################
+
+min_version: str = "1.1.54"
+max_version: str = "2.0.0"
+
+class SinglefileBinary(Binary):
+    name: BinName = 'single-file'
+    providers_supported: List[BinProvider] = [NpmProvider()]
+
+
+    provider_overrides: Dict[BinProviderName, ProviderLookupDict] ={
+        'env': {
+            'abspath': lambda: bin_abspath('single-file-node.js', PATH=env.PATH) or bin_abspath('single-file', PATH=env.PATH),
+        },
+        'npm': {
+            # 'abspath': lambda: bin_abspath('single-file', PATH=NpmProvider().PATH) or bin_abspath('single-file', PATH=env.PATH),
+            'subdeps': lambda: f'single-file-cli@>={min_version} <{max_version}',
+        },
+    }
+
+
+###################### Extractors ##########################
+
+class SinglefileExtractor(Extractor):
+    name: ExtractorName = 'singlefile'
+    binary: Binary = SinglefileBinary()
+
+    def get_output_path(self, snapshot) -> Path:
+        return Path(snapshot.link_dir) / 'singlefile.html'
+
+
+###################### Plugins #############################
+
+
+class SinglefilePlugin(Plugin):
+    name: str = 'singlefile'
+    configs: List[SerializeAsAny[ConfigSet]] = [*PLUGIN_CONFIG]
+    binaries: List[SerializeAsAny[Binary]] = [SinglefileBinary()]
+    extractors: List[SerializeAsAny[Extractor]] = [SinglefileExtractor()]
+
+PLUGINS = [SinglefilePlugin()]
+
+###################### Django Apps #########################
+
+class SinglefileConfig(AppConfig):
+    name = 'builtin_plugins.singlefile'
+    verbose_name = 'SingleFile'
+
+    def ready(self):
+        pass
+        # print('Loaded singlefile plugin')

+ 66 - 0
archivebox/builtin_plugins/singlefile/config.yaml

@@ -0,0 +1,66 @@
+name: singlefile
+plugin_version: '0.0.1'
+plugin_spec: '0.0.1'
+
+binaries:
+    singlefile:
+        providers:
+            - env
+            - npm
+
+commands:
+    - singlefile.exec
+    - singlefile.extract
+    - singlefile.should_extract
+    - singlefile.get_output_path
+
+extractors:
+    singlefile:
+        binary: singlefile
+        test: singlefile.should_extract
+        extract: singlefile.extract
+        output_files:
+            - singlefile.html
+
+configs:
+    ARCHIVE_METHOD_TOGGLES:
+        SAVE_SINGLEFILE:
+            type: bool
+            default: true
+
+    DEPENDENCY_CONFIG:
+        SINGLEFILE_BINARY:
+            type: str
+            default: wget
+        SINGLEFILE_ARGS:
+            type: Optional[List[str]]
+            default: null
+        SINGLEFILE_EXTRA_ARGS:
+            type: List[str]
+            default: []
+        SINGLEFILE_DEFAULT_ARGS:
+            type: List[str]
+            default: 
+            - "--timeout={TIMEOUT-10}"
+
+    ARCHIVE_METHOD_OPTIONS:
+        SINGLEFILE_USER_AGENT:
+            type: str
+            default: ""
+            alias: USER_AGENT
+        SINGLEFILE_TIMEOUT:
+            type: int
+            default: 60
+            alias: TIMEOUT
+        SINGLEFILE_CHECK_SSL_VALIDITY:
+            type: bool
+            default: true
+            alias: CHECK_SSL_VALIDITY
+        SINGLEFILE_RESTRICT_FILE_NAMES:
+            type: str
+            default: windows
+            alias: RESTRICT_FILE_NAMES
+        SINGLEFILE_COOKIES_FILE:
+            type: Optional[Path]
+            default: null
+            alias: COOKIES_FILE

+ 3 - 0
archivebox/builtin_plugins/singlefile/tests.py

@@ -0,0 +1,3 @@
+from django.test import TestCase
+
+# Create your tests here.

+ 28 - 15
archivebox/config.py

@@ -31,8 +31,6 @@ import getpass
 import platform
 import shutil
 import requests
-import django
-from sqlite3 import dbapi2 as sqlite3
 
 from hashlib import md5
 from pathlib import Path
@@ -43,6 +41,11 @@ from configparser import ConfigParser
 from collections import defaultdict
 import importlib.metadata
 
+from pydantic_pkgr import SemVer
+
+import django
+from django.db.backends.sqlite3.base import Database as sqlite3
+
 from .config_stubs import (
     AttrDict,
     SimpleConfigValueDict,
@@ -52,6 +55,11 @@ from .config_stubs import (
     ConfigDefaultDict,
 )
 
+# load fallback libraries from vendor dir
+from .vendor import load_vendored_libs
+load_vendored_libs()
+
+
 
 ############################### Config Schema ##################################
 
@@ -89,13 +97,13 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
         'SECRET_KEY':                {'type': str,   'default': None},
         'BIND_ADDR':                 {'type': str,   'default': lambda c: ['127.0.0.1:8000', '0.0.0.0:8000'][c['IN_DOCKER']]},
         'ALLOWED_HOSTS':             {'type': str,   'default': '*'},     # e.g. archivebox.example.com,archivebox2.example.com
-        'CSRF_TRUSTED_ORIGINS':      {'type': str,   'default': ''},      # e.g. https://archivebox.example.com,https://archivebox2.example.com:8080
+        'CSRF_TRUSTED_ORIGINS':      {'type': str,   'default': lambda c: 'http://localhost:8000,http://127.0.0.1:8000,http://0.0.0.0:8000,http://{}'.format(c['BIND_ADDR'])},   # e.g. https://archivebox.example.com,https://archivebox2.example.com:8080
         'DEBUG':                     {'type': bool,  'default': False},
         'PUBLIC_INDEX':              {'type': bool,  'default': True},
         'PUBLIC_SNAPSHOTS':          {'type': bool,  'default': True},
         'PUBLIC_ADD_VIEW':           {'type': bool,  'default': False},
         'FOOTER_INFO':               {'type': str,   'default': 'Content is hosted for personal archiving purposes only.  Contact server owner for any takedown requests.'},
-        'SNAPSHOTS_PER_PAGE':        {'type': int,   'default': 40},
+        'SNAPSHOTS_PER_PAGE':        {'type': int,   'default': 100},
         'CUSTOM_TEMPLATES_DIR':      {'type': str,   'default': None},
         'TIME_ZONE':                 {'type': str,   'default': 'UTC'},
         'TIMEZONE':                  {'type': str,   'default': 'UTC'},
@@ -565,7 +573,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
     'PYTHON_VERSION':           {'default': lambda c: '{}.{}.{}'.format(*sys.version_info[:3])},
 
     'DJANGO_BINARY':            {'default': lambda c: inspect.getfile(django)},
-    'DJANGO_VERSION':           {'default': lambda c: '{}.{}.{} {} ({})'.format(*django.VERSION)},
+    'DJANGO_VERSION':           {'default': lambda c: '{}.{}.{}'.format(*django.VERSION[:3])},
     
     'SQLITE_BINARY':            {'default': lambda c: inspect.getfile(sqlite3)},
     'SQLITE_VERSION':           {'default': lambda c: sqlite3.version},
@@ -902,16 +910,9 @@ def bin_version(binary: Optional[str], cmd: Optional[str]=None) -> Optional[str]
             version_str = run(cmd or [abspath, "--version"], shell=is_cmd_str, stdout=PIPE, stderr=STDOUT).stdout.strip().decode()
         
         # take first 3 columns of first line of version info
-        version_ptn = re.compile(r"\d+?\.\d+?\.?\d*", re.MULTILINE)
-        try:
-            version_nums = version_ptn.findall(version_str.split('\n')[0])[0]
-            if version_nums:
-                return version_nums
-            else:
-                raise IndexError
-        except IndexError:
-            # take first 3 columns of first line of version info
-            return ' '.join(version_str.split('\n')[0].strip().split()[:3])
+        semver = SemVer.parse(version_str)
+        if semver:
+            return str(semver)
     except OSError:
         pass
         # stderr(f'[X] Unable to find working version of dependency: {binary}', color='red')
@@ -1524,5 +1525,17 @@ def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG,
             assert sql_index_path.exists(), (
                 f'No database file {SQL_INDEX_FILENAME} found in: {config["OUTPUT_DIR"]} (Are you in an ArchiveBox collection directory?)')
 
+
+            # https://docs.pydantic.dev/logfire/integrations/django/ Logfire Debugging
+            if settings.DEBUG_LOGFIRE:
+                from opentelemetry.instrumentation.sqlite3 import SQLite3Instrumentor
+                SQLite3Instrumentor().instrument()
+
+                import logfire
+
+                logfire.configure()
+                logfire.instrument_django(is_sql_commentor_enabled=True)
+                logfire.info(f'Started ArchiveBox v{CONFIG.VERSION}', argv=sys.argv)
+
     except KeyboardInterrupt:
         raise SystemExit(2)

+ 108 - 25
archivebox/core/admin.py

@@ -10,12 +10,15 @@ from datetime import datetime, timezone
 from typing import Dict, Any
 
 from django.contrib import admin
-from django.db.models import Count, Q
-from django.urls import path, reverse
+from django.db.models import Count, Q, Prefetch
+from django.urls import path, reverse, resolve
+from django.utils import timezone
+from django.utils.functional import cached_property
 from django.utils.html import format_html
 from django.utils.safestring import mark_safe
 from django.shortcuts import render, redirect
 from django.contrib.auth import get_user_model
+from django.core.paginator import Paginator
 from django.core.exceptions import ValidationError
 from django.conf import settings
 from django import forms
@@ -126,22 +129,99 @@ archivebox_admin.get_admin_data_urls = get_admin_data_urls.__get__(archivebox_ad
 archivebox_admin.get_urls = get_urls(archivebox_admin.get_urls).__get__(archivebox_admin, ArchiveBoxAdmin)
 
 
+class AccelleratedPaginator(Paginator):
+    """
+    Accellerated Pagniator ignores DISTINCT when counting total number of rows.
+    Speeds up SELECT Count(*) on Admin views by >20x.
+    https://hakibenita.com/optimizing-the-django-admin-paginator
+    """
+
+    @cached_property
+    def count(self):
+        if self.object_list._has_filters():                             # type: ignore
+            # fallback to normal count method on filtered queryset
+            return super().count
+        else:
+            # otherwise count total rows in a separate fast query
+            return self.object_list.model.objects.count()
+    
+        # Alternative approach for PostgreSQL: fallback count takes > 200ms
+        # from django.db import connection, transaction, OperationalError
+        # with transaction.atomic(), connection.cursor() as cursor:
+        #     cursor.execute('SET LOCAL statement_timeout TO 200;')
+        #     try:
+        #         return super().count
+        #     except OperationalError:
+        #         return 9999999999999
+
+
 class ArchiveResultInline(admin.TabularInline):
     name = 'Archive Results Log'
     model = ArchiveResult
+    parent_model = Snapshot
     # fk_name = 'snapshot'
-    extra = 1
-    readonly_fields = ('result_id', 'start_ts', 'end_ts', 'extractor', 'command', 'cmd_version')
-    fields = ('id', *readonly_fields, 'status', 'output')
+    extra = 0
+    sort_fields = ('end_ts', 'extractor', 'output', 'status', 'cmd_version')
+    readonly_fields = ('result_id', 'completed', 'extractor', 'command', 'version')
+    fields = ('id', 'start_ts', 'end_ts', *readonly_fields, 'cmd', 'cmd_version', 'pwd', 'created_by', 'status', 'output')
+    # exclude = ('id',)
+    ordering = ('end_ts',)
     show_change_link = True
     # # classes = ['collapse']
     # # list_display_links = ['abid']
 
+    def get_parent_object_from_request(self, request):
+        resolved = resolve(request.path_info)
+        return self.parent_model.objects.get(pk=resolved.kwargs['object_id'])
+
+    @admin.display(
+        description='Completed',
+        ordering='end_ts',
+    )
+    def completed(self, obj):
+        return format_html('<p style="white-space: nowrap">{}</p>', obj.end_ts.strftime('%Y-%m-%d %H:%M:%S'))
+
     def result_id(self, obj):
-        return format_html('<a href="{}"><small><code>[{}]</code></small></a>', reverse('admin:core_archiveresult_change', args=(obj.id,)), obj.abid)
+        return format_html('<a href="{}"><code style="font-size: 10px">[{}]</code></a>', reverse('admin:core_archiveresult_change', args=(obj.id,)), obj.abid)
     
     def command(self, obj):
         return format_html('<small><code>{}</code></small>', " ".join(obj.cmd or []))
+    
+    def version(self, obj):
+        return format_html('<small><code>{}</code></small>', obj.cmd_version or '-')
+    
+    def get_formset(self, request, obj=None, **kwargs):
+        formset = super().get_formset(request, obj, **kwargs)
+        snapshot = self.get_parent_object_from_request(request)
+
+        # import ipdb; ipdb.set_trace()
+        formset.form.base_fields['id'].widget = formset.form.base_fields['id'].hidden_widget()
+        
+        # default values for new entries
+        formset.form.base_fields['status'].initial = 'succeeded'
+        formset.form.base_fields['start_ts'].initial = timezone.now()
+        formset.form.base_fields['end_ts'].initial = timezone.now()
+        formset.form.base_fields['cmd_version'].initial = '-'
+        formset.form.base_fields['pwd'].initial = str(snapshot.link_dir)
+        formset.form.base_fields['created_by'].initial = request.user
+        formset.form.base_fields['cmd'] = forms.JSONField(initial=['-'])
+        formset.form.base_fields['output'].initial = 'Manually recorded cmd output...'
+        
+        if obj is not None:
+            # hidden values for existing entries and new entries
+            formset.form.base_fields['start_ts'].widget = formset.form.base_fields['start_ts'].hidden_widget()
+            formset.form.base_fields['end_ts'].widget = formset.form.base_fields['end_ts'].hidden_widget()
+            formset.form.base_fields['cmd'].widget = formset.form.base_fields['cmd'].hidden_widget()
+            formset.form.base_fields['pwd'].widget = formset.form.base_fields['pwd'].hidden_widget()
+            formset.form.base_fields['created_by'].widget = formset.form.base_fields['created_by'].hidden_widget()
+            formset.form.base_fields['cmd_version'].widget = formset.form.base_fields['cmd_version'].hidden_widget()
+        return formset
+    
+    def get_readonly_fields(self, request, obj=None):
+        if obj is not None:
+            return self.readonly_fields
+        else:
+            return []
 
 
 class TagInline(admin.TabularInline):
@@ -222,25 +302,22 @@ def get_abid_info(self, obj):
 
 @admin.register(Snapshot, site=archivebox_admin)
 class SnapshotAdmin(SearchResultsAdminMixin, ABIDModelAdmin):
-    class Meta:
-        model = Snapshot
-
     list_display = ('added', 'title_str', 'files', 'size', 'url_str')
-    # list_editable = ('title',)
     sort_fields = ('title_str', 'url_str', 'added', 'files')
-    readonly_fields = ('tags', 'timestamp', 'admin_actions', 'status_info', 'bookmarked', 'added', 'updated', 'created', 'modified', 'API', 'link_dir')
+    readonly_fields = ('tags_str', 'timestamp', 'admin_actions', 'status_info', 'bookmarked', 'added', 'updated', 'created', 'modified', 'API', 'link_dir')
     search_fields = ('id', 'url', 'abid', 'old_id', 'timestamp', 'title', 'tags__name')
-    list_filter = ('added', 'updated', 'archiveresult__status', 'created_by', 'tags')
+    list_filter = ('added', 'updated', 'archiveresult__status', 'created_by', 'tags__name')
     fields = ('url', 'created_by', 'title', *readonly_fields)
     ordering = ['-added']
     actions = ['add_tags', 'remove_tags', 'update_titles', 'update_snapshots', 'resnapshot_snapshot', 'overwrite_snapshots', 'delete_snapshots']
-    autocomplete_fields = ['tags']
     inlines = [TagInline, ArchiveResultInline]
-    list_per_page = CONFIG.SNAPSHOTS_PER_PAGE
+    list_per_page = min(max(5, CONFIG.SNAPSHOTS_PER_PAGE), 5000)
 
     action_form = SnapshotActionForm
+    paginator = AccelleratedPaginator
 
     save_on_top = True
+    show_full_result_count = False
 
     def changelist_view(self, request, extra_context=None):
         extra_context = extra_context or {}
@@ -286,12 +363,15 @@ class SnapshotAdmin(SearchResultsAdminMixin, ABIDModelAdmin):
         ]
         return custom_urls + urls
 
-    def get_queryset(self, request):
-        self.request = request
-        return super().get_queryset(request).prefetch_related('tags', 'archiveresult_set').annotate(archiveresult_count=Count('archiveresult'))
+    # def get_queryset(self, request):
+    #     # tags_qs = SnapshotTag.objects.all().select_related('tag')
+    #     # prefetch = Prefetch('snapshottag_set', queryset=tags_qs)
+
+    #     self.request = request
+    #     return super().get_queryset(request).prefetch_related('archiveresult_set').distinct()  # .annotate(archiveresult_count=Count('archiveresult'))
 
     def tag_list(self, obj):
-        return ', '.join(obj.tags.values_list('name', flat=True))
+        return ', '.join(tag.name for tag in obj.tags.all())
 
     # TODO: figure out a different way to do this, you cant nest forms so this doenst work
     # def action(self, obj):
@@ -360,21 +440,20 @@ class SnapshotAdmin(SearchResultsAdminMixin, ABIDModelAdmin):
         ordering='title',
     )
     def title_str(self, obj):
-        canon = obj.as_link().canonical_outputs()
         tags = ''.join(
-            format_html('<a href="/admin/core/snapshot/?tags__id__exact={}"><span class="tag">{}</span></a> ', tag.id, tag)
+            format_html('<a href="/admin/core/snapshot/?tags__id__exact={}"><span class="tag">{}</span></a> ', tag.pk, tag.name)
             for tag in obj.tags.all()
-            if str(tag).strip()
+            if str(tag.name).strip()
         )
         return format_html(
             '<a href="/{}">'
-                '<img src="/{}/{}" class="favicon" onerror="this.remove()">'
+                '<img src="/{}/favicon.ico" class="favicon" onerror="this.remove()">'
             '</a>'
             '<a href="/{}/index.html">'
                 '<b class="status-{}">{}</b>'
             '</a>',
             obj.archive_path,
-            obj.archive_path, canon['favicon_path'],
+            obj.archive_path,
             obj.archive_path,
             'fetched' if obj.latest_title or obj.title else 'pending',
             urldecode(htmldecode(obj.latest_title or obj.title or ''))[:128] or 'Pending...'
@@ -382,14 +461,14 @@ class SnapshotAdmin(SearchResultsAdminMixin, ABIDModelAdmin):
 
     @admin.display(
         description='Files Saved',
-        ordering='archiveresult_count',
+        # ordering='archiveresult_count',
     )
     def files(self, obj):
         return snapshot_icons(obj)
 
 
     @admin.display(
-        ordering='archiveresult_count'
+        # ordering='archiveresult_count'
     )
     def size(self, obj):
         archive_size = (Path(obj.link_dir) / 'index.html').exists() and obj.archive_size
@@ -536,6 +615,8 @@ class TagAdmin(ABIDModelAdmin):
     actions = ['delete_selected']
     ordering = ['-created']
 
+    paginator = AccelleratedPaginator
+
     def API(self, obj):
         try:
             return get_abid_info(self, obj)
@@ -574,6 +655,8 @@ class ArchiveResultAdmin(ABIDModelAdmin):
     list_filter = ('status', 'extractor', 'start_ts', 'cmd_version')
     ordering = ['-start_ts']
     list_per_page = CONFIG.SNAPSHOTS_PER_PAGE
+    
+    paginator = AccelleratedPaginator
 
     @admin.display(
         description='Snapshot Info'

+ 1 - 1
archivebox/core/forms.py

@@ -4,7 +4,7 @@ from django import forms
 
 from ..util import URL_REGEX
 from ..parsers import PARSERS
-from ..vendor.taggit_utils import edit_string_for_tags, parse_tags
+from taggit.utils import edit_string_for_tags, parse_tags
 
 PARSER_CHOICES = [
     (parser_key, parser[0])

+ 2 - 2
archivebox/core/migrations/0027_update_snapshot_ids.py

@@ -52,7 +52,7 @@ def update_snapshot_ids(apps, schema_editor):
     Snapshot = apps.get_model("core", "Snapshot")
     num_total = Snapshot.objects.all().count()
     print(f'   Updating {num_total} Snapshot.id, Snapshot.uuid values in place...')
-    for idx, snapshot in enumerate(Snapshot.objects.all().only('abid').iterator()):
+    for idx, snapshot in enumerate(Snapshot.objects.all().only('abid').iterator(chunk_size=500)):
         assert snapshot.abid
         snapshot.abid_prefix = 'snp_'
         snapshot.abid_ts_src = 'self.added'
@@ -72,7 +72,7 @@ def update_archiveresult_ids(apps, schema_editor):
     ArchiveResult = apps.get_model("core", "ArchiveResult")
     num_total = ArchiveResult.objects.all().count()
     print(f'   Updating {num_total} ArchiveResult.id, ArchiveResult.uuid values in place... (may take an hour or longer for large collections...)')
-    for idx, result in enumerate(ArchiveResult.objects.all().only('abid', 'snapshot_id').iterator()):
+    for idx, result in enumerate(ArchiveResult.objects.all().only('abid', 'snapshot_id').iterator(chunk_size=500)):
         assert result.abid
         result.abid_prefix = 'res_'
         result.snapshot = Snapshot.objects.get(pk=result.snapshot_id)

+ 1 - 1
archivebox/core/migrations/0034_alter_archiveresult_old_id_alter_archiveresult_uuid.py

@@ -11,7 +11,7 @@ def update_archiveresult_ids(apps, schema_editor):
     ArchiveResult = apps.get_model("core", "ArchiveResult")
     num_total = ArchiveResult.objects.all().count()
     print(f'   Updating {num_total} ArchiveResult.id, ArchiveResult.uuid values in place... (may take an hour or longer for large collections...)')
-    for idx, result in enumerate(ArchiveResult.objects.all().only('abid').iterator()):
+    for idx, result in enumerate(ArchiveResult.objects.all().only('abid').iterator(chunk_size=500)):
         assert result.abid
         result.uuid = ABID.parse(result.abid).uuid
         result.save(update_fields=["uuid"])

+ 1 - 1
archivebox/core/migrations/0051_snapshottag_snapshot_alter_snapshottag_snapshot_old.py

@@ -9,7 +9,7 @@ def update_snapshottag_ids(apps, schema_editor):
     SnapshotTag = apps.get_model("core", "SnapshotTag")
     num_total = SnapshotTag.objects.all().count()
     print(f'   Updating {num_total} SnapshotTag.snapshot_id values in place... (may take an hour or longer for large collections...)')
-    for idx, snapshottag in enumerate(SnapshotTag.objects.all().only('snapshot_old_id').iterator()):
+    for idx, snapshottag in enumerate(SnapshotTag.objects.all().only('snapshot_old_id').iterator(chunk_size=500)):
         assert snapshottag.snapshot_old_id
         snapshot = Snapshot.objects.get(old_id=snapshottag.snapshot_old_id)
         snapshottag.snapshot_id = snapshot.id

+ 1 - 1
archivebox/core/migrations/0059_tag_id.py

@@ -49,7 +49,7 @@ def update_archiveresult_ids(apps, schema_editor):
     Tag = apps.get_model("core", "Tag")
     num_total = Tag.objects.all().count()
     print(f'   Updating {num_total} Tag.id, ArchiveResult.uuid values in place...')
-    for idx, tag in enumerate(Tag.objects.all().iterator()):
+    for idx, tag in enumerate(Tag.objects.all().iterator(chunk_size=500)):
         if not tag.slug:
             tag.slug = tag.name.lower().replace(' ', '_')
         if not tag.name:

+ 1 - 1
archivebox/core/migrations/0063_snapshottag_tag_alter_snapshottag_old_tag.py

@@ -9,7 +9,7 @@ def update_snapshottag_ids(apps, schema_editor):
     SnapshotTag = apps.get_model("core", "SnapshotTag")
     num_total = SnapshotTag.objects.all().count()
     print(f'   Updating {num_total} SnapshotTag.tag_id values in place... (may take an hour or longer for large collections...)')
-    for idx, snapshottag in enumerate(SnapshotTag.objects.all().only('old_tag_id').iterator()):
+    for idx, snapshottag in enumerate(SnapshotTag.objects.all().only('old_tag_id').iterator(chunk_size=500)):
         assert snapshottag.old_tag_id
         tag = Tag.objects.get(old_id=snapshottag.old_tag_id)
         snapshottag.tag_id = tag.id

+ 35 - 0
archivebox/core/migrations/0069_alter_archiveresult_created_alter_snapshot_added_and_more.py

@@ -0,0 +1,35 @@
+# Generated by Django 5.1 on 2024-08-28 09:40
+
+import abid_utils.models
+import django.utils.timezone
+from django.db import migrations
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('core', '0068_alter_archiveresult_options'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='archiveresult',
+            name='created',
+            field=abid_utils.models.AutoDateTimeField(db_index=True, default=django.utils.timezone.now),
+        ),
+        migrations.AlterField(
+            model_name='snapshot',
+            name='added',
+            field=abid_utils.models.AutoDateTimeField(db_index=True, default=django.utils.timezone.now),
+        ),
+        migrations.AlterField(
+            model_name='snapshot',
+            name='created',
+            field=abid_utils.models.AutoDateTimeField(db_index=True, default=django.utils.timezone.now),
+        ),
+        migrations.AlterField(
+            model_name='tag',
+            name='created',
+            field=abid_utils.models.AutoDateTimeField(db_index=True, default=django.utils.timezone.now),
+        ),
+    ]

+ 68 - 36
archivebox/core/models.py

@@ -12,6 +12,7 @@ from uuid import uuid4
 from pathlib import Path
 
 from django.db import models
+from django.utils import timezone
 from django.utils.functional import cached_property
 from django.utils.text import slugify
 from django.core.cache import cache
@@ -19,7 +20,7 @@ from django.urls import reverse, reverse_lazy
 from django.db.models import Case, When, Value, IntegerField
 from django.conf import settings
 
-from abid_utils.models import ABIDModel, ABIDField
+from abid_utils.models import ABIDModel, ABIDField, AutoDateTimeField
 
 from ..system import get_dir_size
 from ..util import parse_date, base_url
@@ -50,7 +51,7 @@ class Tag(ABIDModel):
     Based on django-taggit model + ABID base.
     """
     abid_prefix = 'tag_'
-    abid_ts_src = 'self.created'          # TODO: add created/modified time
+    abid_ts_src = 'self.created'
     abid_uri_src = 'self.slug'
     abid_subtype_src = '"03"'
     abid_rand_src = 'self.old_id'
@@ -60,7 +61,6 @@ class Tag(ABIDModel):
     id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False, unique=True)
     abid = ABIDField(prefix=abid_prefix)
 
-
     name = models.CharField(unique=True, blank=False, max_length=100)
     slug = models.SlugField(unique=True, blank=False, max_length=100, editable=False)
     # slug is autoset on save from name, never set it manually
@@ -125,6 +125,12 @@ class SnapshotTag(models.Model):
         db_table = 'core_snapshot_tags'
         unique_together = [('snapshot', 'tag')]
 
+
+class SnapshotManager(models.Manager):
+    def get_queryset(self):
+        return super().get_queryset().prefetch_related('tags', 'archiveresult_set')  # .annotate(archiveresult_count=models.Count('archiveresult')).distinct()
+
+
 class Snapshot(ABIDModel):
     abid_prefix = 'snp_'
     abid_ts_src = 'self.added'
@@ -143,16 +149,15 @@ class Snapshot(ABIDModel):
     
     tags = models.ManyToManyField(Tag, blank=True, through=SnapshotTag, related_name='snapshot_set', through_fields=('snapshot', 'tag'))
 
-    added = models.DateTimeField(auto_now_add=True, db_index=True)
+    added = AutoDateTimeField(default=timezone.now, db_index=True)
     updated = models.DateTimeField(auto_now=True, blank=True, null=True, db_index=True)
 
     keys = ('url', 'timestamp', 'title', 'tags', 'updated')
 
     archiveresult_set: models.Manager['ArchiveResult']
 
-    @property
-    def uuid(self):
-        return self.id
+    objects = SnapshotManager()
+
 
     def __repr__(self) -> str:
         title = (self.title_stripped or '-')[:64]
@@ -162,13 +167,6 @@ class Snapshot(ABIDModel):
         title = (self.title_stripped or '-')[:64]
         return f'[{self.timestamp}] {self.url[:64]} ({title})'
 
-    def save(self, *args, **kwargs):
-        super().save(*args, **kwargs)
-        try:
-            assert str(self.id) == str(self.ABID.uuid) == str(self.uuid), f'Snapshot.id ({self.id}) does not match .ABID.uuid ({self.ABID.uuid})'
-        except AssertionError as e:
-            print(e)
-
     @classmethod
     def from_json(cls, info: dict):
         info = {k: v for k, v in info.items() if k in cls.keys}
@@ -177,8 +175,7 @@ class Snapshot(ABIDModel):
     def as_json(self, *args) -> dict:
         args = args or self.keys
         return {
-            key: getattr(self, key)
-            if key != 'tags' else self.tags_str()
+            key: getattr(self, key) if key != 'tags' else self.tags_str(nocache=False)
             for key in args
         }
 
@@ -190,8 +187,14 @@ class Snapshot(ABIDModel):
         return load_link_details(self.as_link())
 
     def tags_str(self, nocache=True) -> str | None:
+        calc_tags_str = lambda: ','.join(sorted(tag.name for tag in self.tags.all()))
         cache_key = f'{self.pk}-{(self.updated or self.added).timestamp()}-tags'
-        calc_tags_str = lambda: ','.join(self.tags.order_by('name').values_list('name', flat=True))
+        
+        if hasattr(self, '_prefetched_objects_cache') and 'tags' in self._prefetched_objects_cache:
+            # tags are pre-fetched already, use them directly (best because db is always freshest)
+            tags_str = calc_tags_str()
+            return tags_str
+        
         if nocache:
             tags_str = calc_tags_str()
             cache.set(cache_key, tags_str)
@@ -234,7 +237,10 @@ class Snapshot(ABIDModel):
 
     @cached_property
     def num_outputs(self) -> int:
-        return self.archiveresult_set.filter(status='succeeded').count()
+        # DONT DO THIS: it will trigger a separate query for every snapshot
+        # return self.archiveresult_set.filter(status='succeeded').count()
+        # this is better:
+        return sum((1 for result in self.archiveresult_set.all() if result.status == 'succeeded'))
 
     @cached_property
     def base_url(self):
@@ -262,10 +268,21 @@ class Snapshot(ABIDModel):
 
     @cached_property
     def thumbnail_url(self) -> Optional[str]:
-        result = self.archiveresult_set.filter(
-            extractor='screenshot',
-            status='succeeded'
-        ).only('output').last()
+        if hasattr(self, '_prefetched_objects_cache') and 'archiveresult_set' in self._prefetched_objects_cache:
+            result = (sorted(
+                (
+                    result
+                    for result in self.archiveresult_set.all()
+                    if result.extractor == 'screenshot' and result.status =='succeeded' and result.output
+                ),
+                key=lambda result: result.created,
+            ) or [None])[-1]
+        else:
+            result = self.archiveresult_set.filter(
+                extractor='screenshot',
+                status='succeeded'
+            ).only('output').last()
+
         if result:
             return reverse('Snapshot', args=[f'{str(self.timestamp)}/{result.output}'])
         return None
@@ -292,6 +309,21 @@ class Snapshot(ABIDModel):
         if self.title:
             return self.title   # whoopdedoo that was easy
         
+        # check if ArchiveResult set has already been prefetched, if so use it instead of fetching it from db again
+        if hasattr(self, '_prefetched_objects_cache') and 'archiveresult_set' in self._prefetched_objects_cache:
+            try:
+                return (sorted(
+                    (
+                        result.output.strip()
+                        for result in self.archiveresult_set.all()
+                        if result.extractor == 'title' and result.status =='succeeded' and result.output
+                    ),
+                    key=lambda title: len(title),
+                ) or [None])[-1]
+            except IndexError:
+                pass
+        
+
         try:
             # take longest successful title from ArchiveResult db history
             return sorted(
@@ -355,12 +387,23 @@ class Snapshot(ABIDModel):
 
 class ArchiveResultManager(models.Manager):
     def indexable(self, sorted: bool = True):
+        """Return only ArchiveResults containing text suitable for full-text search (sorted in order of typical result quality)"""
+        
         INDEXABLE_METHODS = [ r[0] for r in ARCHIVE_METHODS_INDEXING_PRECEDENCE ]
-        qs = self.get_queryset().filter(extractor__in=INDEXABLE_METHODS,status='succeeded')
+        qs = self.get_queryset().filter(extractor__in=INDEXABLE_METHODS, status='succeeded')
 
         if sorted:
-            precedence = [ When(extractor=method, then=Value(precedence)) for method, precedence in ARCHIVE_METHODS_INDEXING_PRECEDENCE ]
-            qs = qs.annotate(indexing_precedence=Case(*precedence, default=Value(1000),output_field=IntegerField())).order_by('indexing_precedence')
+            precedence = [
+                When(extractor=method, then=Value(precedence))
+                for method, precedence in ARCHIVE_METHODS_INDEXING_PRECEDENCE
+            ]
+            qs = qs.annotate(
+                indexing_precedence=Case(
+                    *precedence,
+                    default=Value(1000),
+                    output_field=IntegerField()
+                )
+            ).order_by('indexing_precedence')
         return qs
 
 class ArchiveResult(ABIDModel):
@@ -418,17 +461,6 @@ class ArchiveResult(ABIDModel):
     def __str__(self):
         return self.extractor
 
-    def save(self, *args, **kwargs):
-        super().save(*args, **kwargs)
-        try:
-            assert str(self.id) == str(self.ABID.uuid) == str(self.uuid), f'ArchiveResult.id ({self.id}) does not match .ABID.uuid ({self.ABID.uuid})'
-        except AssertionError as e:
-            print(e)
-
-    @property
-    def uuid(self):
-        return self.id
-
     @cached_property
     def snapshot_dir(self):
         return Path(self.snapshot.link_dir)

+ 126 - 80
archivebox/core/settings.py

@@ -4,7 +4,9 @@ import os
 import sys
 import re
 import logging
+import inspect
 import tempfile
+from typing import Any, Dict
 
 from pathlib import Path
 from django.utils.crypto import get_random_string
@@ -33,22 +35,20 @@ APPEND_SLASH = True
 DEBUG = CONFIG.DEBUG or ('--debug' in sys.argv)
 
 
-# add plugins folders to system path, and load plugins in installed_apps
-BUILTIN_PLUGINS_DIR = CONFIG.PACKAGE_DIR / 'plugins'
-USER_PLUGINS_DIR = CONFIG.OUTPUT_DIR / 'plugins'
-sys.path.insert(0, str(BUILTIN_PLUGINS_DIR))
-sys.path.insert(0, str(USER_PLUGINS_DIR))
+BUILTIN_PLUGINS_DIR = CONFIG.PACKAGE_DIR / 'builtin_plugins'
+USER_PLUGINS_DIR = CONFIG.OUTPUT_DIR / 'user_plugins'
 
-def find_plugins(plugins_dir):
-    return {
-        # plugin_entrypoint.parent.name: import_module(plugin_entrypoint.parent.name).METADATA
-        plugin_entrypoint.parent.name: plugin_entrypoint.parent
+def find_plugins(plugins_dir, prefix: str) -> Dict[str, Any]:
+    plugins = {
+        f'{prefix}.{plugin_entrypoint.parent.name}': plugin_entrypoint.parent
         for plugin_entrypoint in plugins_dir.glob('*/apps.py')
     }
+    # print(f'Found {prefix} plugins:\n', '\n    '.join(plugins.keys()))
+    return plugins
 
 INSTALLED_PLUGINS = {
-    **find_plugins(BUILTIN_PLUGINS_DIR),
-    **find_plugins(USER_PLUGINS_DIR),
+    **find_plugins(BUILTIN_PLUGINS_DIR, prefix='builtin_plugins'),
+    **find_plugins(USER_PLUGINS_DIR, prefix='user_plugins'),
 }
 
 
@@ -66,11 +66,11 @@ INSTALLED_APPS = [
     'plugantic',
     'core',
     'api',
+    'pkg',
 
     *INSTALLED_PLUGINS.keys(),
 
     'admin_data_views',
-
     'django_extensions',
 ]
 
@@ -144,64 +144,6 @@ if CONFIG.LDAP:
         # sys.exit(1)
 
 
-################################################################################
-### Debug Settings
-################################################################################
-
-# only enable debug toolbar when in DEBUG mode with --nothreading (it doesnt work in multithreaded mode)
-DEBUG_TOOLBAR = DEBUG and ('--nothreading' in sys.argv) and ('--reload' not in sys.argv)
-if DEBUG_TOOLBAR:
-    try:
-        import debug_toolbar   # noqa
-        DEBUG_TOOLBAR = True
-    except ImportError:
-        DEBUG_TOOLBAR = False
-
-if DEBUG_TOOLBAR:
-    INSTALLED_APPS = [*INSTALLED_APPS, 'debug_toolbar']
-    INTERNAL_IPS = ['0.0.0.0', '127.0.0.1', '*']
-    DEBUG_TOOLBAR_CONFIG = {
-        "SHOW_TOOLBAR_CALLBACK": lambda request: True,
-        "RENDER_PANELS": True,
-    }
-    DEBUG_TOOLBAR_PANELS = [
-        'debug_toolbar.panels.history.HistoryPanel',
-        'debug_toolbar.panels.versions.VersionsPanel',
-        'debug_toolbar.panels.timer.TimerPanel',
-        'debug_toolbar.panels.settings.SettingsPanel',
-        'debug_toolbar.panels.headers.HeadersPanel',
-        'debug_toolbar.panels.request.RequestPanel',
-        'debug_toolbar.panels.sql.SQLPanel',
-        'debug_toolbar.panels.staticfiles.StaticFilesPanel',
-        # 'debug_toolbar.panels.templates.TemplatesPanel',
-        'debug_toolbar.panels.cache.CachePanel',
-        'debug_toolbar.panels.signals.SignalsPanel',
-        'debug_toolbar.panels.logging.LoggingPanel',
-        'debug_toolbar.panels.redirects.RedirectsPanel',
-        'debug_toolbar.panels.profiling.ProfilingPanel',
-        'djdt_flamegraph.FlamegraphPanel',
-    ]
-    MIDDLEWARE = [*MIDDLEWARE, 'debug_toolbar.middleware.DebugToolbarMiddleware']
-
-if DEBUG:
-    from django_autotyping.typing import AutotypingSettingsDict
-
-    INSTALLED_APPS += ['django_autotyping']
-    AUTOTYPING: AutotypingSettingsDict = {
-        "STUBS_GENERATION": {
-            "LOCAL_STUBS_DIR": Path(CONFIG.PACKAGE_DIR) / "typings",
-        }
-    }
-
-# https://github.com/bensi94/Django-Requests-Tracker (improved version of django-debug-toolbar)
-# Must delete archivebox/templates/admin to use because it relies on some things we override
-# visit /__requests_tracker__/ to access
-DEBUG_REQUESTS_TRACKER = False
-if DEBUG_REQUESTS_TRACKER:
-    INSTALLED_APPS += ["requests_tracker"]
-    MIDDLEWARE += ["requests_tracker.middleware.requests_tracker_middleware"]
-    INTERNAL_IPS = ["127.0.0.1", "10.0.2.2", "0.0.0.0", "*"]
-
 
 ################################################################################
 ### Staticfile and Template Settings
@@ -317,13 +259,15 @@ STORAGES = {
 SECRET_KEY = CONFIG.SECRET_KEY or get_random_string(50, 'abcdefghijklmnopqrstuvwxyz0123456789_')
 
 ALLOWED_HOSTS = CONFIG.ALLOWED_HOSTS.split(',')
-CSRF_TRUSTED_ORIGINS = CONFIG.CSRF_TRUSTED_ORIGINS.split(',')
+CSRF_TRUSTED_ORIGINS = list(set(CONFIG.CSRF_TRUSTED_ORIGINS.split(',')))
 
 # automatically fix case when user sets ALLOWED_HOSTS (e.g. to archivebox.example.com)
 # but forgets to add https://archivebox.example.com to CSRF_TRUSTED_ORIGINS
-if CONFIG.ALLOWED_HOSTS != '*' and (not CSRF_TRUSTED_ORIGINS):
-    for hostname in ALLOWED_HOSTS:
-        CSRF_TRUSTED_ORIGINS.append(f'https://{hostname}')
+for hostname in ALLOWED_HOSTS:
+    https_endpoint = f'https://{hostname}'
+    if hostname != '*' and https_endpoint not in CSRF_TRUSTED_ORIGINS:
+        print(f'[!] WARNING: {https_endpoint} from ALLOWED_HOSTS should be added to CSRF_TRUSTED_ORIGINS')
+        CSRF_TRUSTED_ORIGINS.append(https_endpoint)
 
 SECURE_BROWSER_XSS_FILTER = True
 SECURE_CONTENT_TYPE_NOSNIFF = True
@@ -345,6 +289,8 @@ AUTH_PASSWORD_VALIDATORS = [
     {'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator'},
 ]
 
+DATA_UPLOAD_MAX_NUMBER_FIELDS = None
+
 ################################################################################
 ### Shell Settings
 ################################################################################
@@ -385,6 +331,10 @@ IGNORABLE_404_URLS = [
     re.compile(r'robots\.txt$'),
     re.compile(r'.*\.(css|js)\.map$'),
 ]
+IGNORABLE_200_URLS = [
+    re.compile(r'^"GET /static/.* HTTP/.*" (200|30.) .+', re.I | re.M),
+    re.compile(r'^"GET /admin/jsi18n/ HTTP/.*" (200|30.) .+', re.I | re.M),
+]
 
 class NoisyRequestsFilter(logging.Filter):
     def filter(self, record) -> bool:
@@ -396,19 +346,26 @@ class NoisyRequestsFilter(logging.Filter):
             if ignorable_log_pattern.match(logline):
                 return False
 
-        # ignore staticfile requests that 200 or 30*
-        ignoreable_200_log_pattern = re.compile(r'"GET /static/.* HTTP/.*" (200|30.) .+', re.I | re.M)
-        if ignoreable_200_log_pattern.match(logline):
-            return False
+            ignorable_log_pattern = re.compile(f'^Not Found: /.*/?{ignorable_url_pattern.pattern}', re.I | re.M)
+            if ignorable_log_pattern.match(logline):
+                return False
 
+        # ignore staticfile requests that 200 or 30*
+        for ignorable_url_pattern in IGNORABLE_200_URLS:
+            if ignorable_url_pattern.match(logline):
+                return False
+            
         return True
 
+
+ERROR_LOG = tempfile.NamedTemporaryFile().name
+
 if CONFIG.LOGS_DIR.exists():
     ERROR_LOG = (CONFIG.LOGS_DIR / 'errors.log')
 else:
     # historically too many edge cases here around creating log dir w/ correct permissions early on
     # if there's an issue on startup, we trash the log and let user figure it out via stdout/stderr
-    ERROR_LOG = tempfile.NamedTemporaryFile().name
+    print(f'[!] WARNING: data/logs dir does not exist. Logging to temp file: {ERROR_LOG}')
 
 LOGGING = {
     'version': 1,
@@ -445,6 +402,10 @@ LOGGING = {
 }
 
 
+################################################################################
+### REST API Outbound Webhooks settings
+################################################################################
+
 # Add default webhook configuration to the User model
 SIGNAL_WEBHOOKS_CUSTOM_MODEL = 'api.models.OutboundWebhook'
 SIGNAL_WEBHOOKS = {
@@ -458,7 +419,9 @@ SIGNAL_WEBHOOKS = {
     },
 }
 
-DATA_UPLOAD_MAX_NUMBER_FIELDS = None
+################################################################################
+### Admin Data View Settings
+################################################################################
 
 ADMIN_DATA_VIEWS = {
     "NAME": "Environment",
@@ -495,3 +458,86 @@ ADMIN_DATA_VIEWS = {
         },
     ],
 }
+
+
+################################################################################
+### Debug Settings
+################################################################################
+
+# only enable debug toolbar when in DEBUG mode with --nothreading (it doesnt work in multithreaded mode)
+DEBUG_TOOLBAR = False
+DEBUG_TOOLBAR = DEBUG_TOOLBAR and DEBUG and ('--nothreading' in sys.argv) and ('--reload' not in sys.argv)
+if DEBUG_TOOLBAR:
+    try:
+        import debug_toolbar   # noqa
+        DEBUG_TOOLBAR = True
+    except ImportError:
+        DEBUG_TOOLBAR = False
+
+if DEBUG_TOOLBAR:
+    INSTALLED_APPS = [*INSTALLED_APPS, 'debug_toolbar']
+    INTERNAL_IPS = ['0.0.0.0', '127.0.0.1', '*']
+    DEBUG_TOOLBAR_CONFIG = {
+        "SHOW_TOOLBAR_CALLBACK": lambda request: True,
+        "RENDER_PANELS": True,
+    }
+    DEBUG_TOOLBAR_PANELS = [
+        'debug_toolbar.panels.history.HistoryPanel',
+        'debug_toolbar.panels.versions.VersionsPanel',
+        'debug_toolbar.panels.timer.TimerPanel',
+        'debug_toolbar.panels.settings.SettingsPanel',
+        'debug_toolbar.panels.headers.HeadersPanel',
+        'debug_toolbar.panels.request.RequestPanel',
+        'debug_toolbar.panels.sql.SQLPanel',
+        'debug_toolbar.panels.staticfiles.StaticFilesPanel',
+        # 'debug_toolbar.panels.templates.TemplatesPanel',
+        'debug_toolbar.panels.cache.CachePanel',
+        'debug_toolbar.panels.signals.SignalsPanel',
+        'debug_toolbar.panels.logging.LoggingPanel',
+        'debug_toolbar.panels.redirects.RedirectsPanel',
+        'debug_toolbar.panels.profiling.ProfilingPanel',
+        'djdt_flamegraph.FlamegraphPanel',
+    ]
+    MIDDLEWARE = [*MIDDLEWARE, 'debug_toolbar.middleware.DebugToolbarMiddleware']
+
+if DEBUG:
+    from django_autotyping.typing import AutotypingSettingsDict
+
+    INSTALLED_APPS += ['django_autotyping']
+    AUTOTYPING: AutotypingSettingsDict = {
+        "STUBS_GENERATION": {
+            "LOCAL_STUBS_DIR": Path(CONFIG.PACKAGE_DIR) / "typings",
+        }
+    }
+
+# https://github.com/bensi94/Django-Requests-Tracker (improved version of django-debug-toolbar)
+# Must delete archivebox/templates/admin to use because it relies on some things we override
+# visit /__requests_tracker__/ to access
+DEBUG_REQUESTS_TRACKER = True
+DEBUG_REQUESTS_TRACKER = DEBUG_REQUESTS_TRACKER and DEBUG
+if DEBUG_REQUESTS_TRACKER:
+    import requests_tracker
+
+    INSTALLED_APPS += ["requests_tracker"]
+    MIDDLEWARE += ["requests_tracker.middleware.requests_tracker_middleware"]
+    INTERNAL_IPS = ["127.0.0.1", "10.0.2.2", "0.0.0.0", "*"]
+
+    TEMPLATE_DIRS.insert(0, str(Path(inspect.getfile(requests_tracker)).parent / "templates"))
+
+    REQUESTS_TRACKER_CONFIG = {
+        "TRACK_SQL": True,
+        "ENABLE_STACKTRACES": False,
+        "IGNORE_PATHS_PATTERNS": (
+            r".*/favicon\.ico",
+            r".*\.png",
+            r"/admin/jsi18n/",
+        ),
+        "IGNORE_SQL_PATTERNS": (
+            r"^SELECT .* FROM django_migrations WHERE app = 'requests_tracker'",
+            r"^SELECT .* FROM django_migrations WHERE app = 'auth'",
+        ),
+    }
+
+# https://docs.pydantic.dev/logfire/integrations/django/ (similar to DataDog / NewRelic / etc.)
+DEBUG_LOGFIRE = False
+DEBUG_LOGFIRE = DEBUG_LOGFIRE and (Path(CONFIG.OUTPUT_DIR) / '.logfire').is_dir()

+ 1 - 1
archivebox/extractors/__init__.py

@@ -218,7 +218,7 @@ def archive_links(all_links: Union[Iterable[Link], QuerySet], overwrite: bool=Fa
     if type(all_links) is QuerySet:
         num_links: int = all_links.count()
         get_link = lambda x: x.as_link_with_details()
-        all_links = all_links.iterator()
+        all_links = all_links.iterator(chunk_size=500)
     else:
         num_links: int = len(all_links)
         get_link = lambda x: x

+ 12 - 1
archivebox/extractors/wget.py

@@ -197,7 +197,7 @@ def unsafe_wget_output_path(link: Link) -> Optional[str]:
 
 
 @enforce_types
-def wget_output_path(link: Link) -> Optional[str]:
+def wget_output_path(link: Link, nocache: bool=False) -> Optional[str]:
     """calculate the path to the wgetted .html file, since wget may
     adjust some paths to be different than the base_url path.
 
@@ -245,6 +245,15 @@ def wget_output_path(link: Link) -> Optional[str]:
     #    https://example.com/abc/test/?v=zzVa_tX1OiI
     #       > example.com/abc/test/index.html@v=zzVa_tX1OiI.html
 
+    cache_key = f'{link.url_hash}:{link.timestamp}-{link.updated and link.updated.timestamp()}-wget-output-path'
+    
+    if not nocache:
+        from django.core.cache import cache
+        cached_result = cache.get(cache_key)
+        if cached_result:
+            return cached_result
+
+
     # There's also lots of complexity around how the urlencoding and renaming
     # is done for pages with query and hash fragments, extensions like shtml / htm / php / etc,
     # unicode escape sequences, punycode domain names, unicode double-width characters, extensions longer than
@@ -271,6 +280,8 @@ def wget_output_path(link: Link) -> Optional[str]:
         output_path = None
 
     if output_path:
+        if not nocache:
+            cache.set(cache_key, output_path)
         return output_path
 
     # fallback to just the domain dir

+ 6 - 6
archivebox/index/__init__.py

@@ -407,7 +407,7 @@ def snapshot_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type
 
 def get_indexed_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """indexed links without checking archive status or data directory validity"""
-    links = (snapshot.as_link() for snapshot in snapshots.iterator())
+    links = (snapshot.as_link() for snapshot in snapshots.iterator(chunk_size=500))
     return {
         link.link_dir: link
         for link in links
@@ -415,7 +415,7 @@ def get_indexed_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Option
 
 def get_archived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """indexed links that are archived with a valid data directory"""
-    links = (snapshot.as_link() for snapshot in snapshots.iterator())
+    links = (snapshot.as_link() for snapshot in snapshots.iterator(chunk_size=500))
     return {
         link.link_dir: link
         for link in filter(is_archived, links)
@@ -423,7 +423,7 @@ def get_archived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optio
 
 def get_unarchived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """indexed links that are unarchived with no data directory or an empty data directory"""
-    links = (snapshot.as_link() for snapshot in snapshots.iterator())
+    links = (snapshot.as_link() for snapshot in snapshots.iterator(chunk_size=500))
     return {
         link.link_dir: link
         for link in filter(is_unarchived, links)
@@ -448,7 +448,7 @@ def get_present_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Option
 
 def get_valid_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """dirs with a valid index matched to the main index and archived content"""
-    links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()]
+    links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator(chunk_size=500)]
     return {
         link.link_dir: link
         for link in filter(is_valid, links)
@@ -475,7 +475,7 @@ def get_duplicate_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Opti
             if entry.is_dir() and not snapshots.filter(timestamp=entry.name).exists()
     )
 
-    for path in chain(snapshots.iterator(), data_folders):
+    for path in chain(snapshots.iterator(chunk_size=500), data_folders):
         link = None
         if type(path) is not str:
             path = path.as_link().link_dir
@@ -518,7 +518,7 @@ def get_orphaned_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optio
 def get_corrupted_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """dirs that don't contain a valid index and aren't listed in the main index"""
     corrupted = {}
-    for snapshot in snapshots.iterator():
+    for snapshot in snapshots.iterator(chunk_size=500):
         link = snapshot.as_link()
         if is_corrupt(link):
             corrupted[link.link_dir] = link

+ 9 - 1
archivebox/index/html.py

@@ -124,7 +124,15 @@ def snapshot_icons(snapshot) -> str:
         from core.models import ArchiveResult
         # start = datetime.now(timezone.utc)
 
-        archive_results = snapshot.archiveresult_set.filter(status="succeeded", output__isnull=False)
+        if hasattr(snapshot, '_prefetched_objects_cache') and 'archiveresult_set' in snapshot._prefetched_objects_cache:
+            archive_results = [
+                result
+                for result in snapshot.archiveresult_set.all()
+                if result.status == "succeeded" and result.output
+            ]
+        else:
+            archive_results = snapshot.archiveresult_set.filter(status="succeeded", output__isnull=False)
+
         link = snapshot.as_link()
         path = link.archive_path
         canon = link.canonical_outputs()

+ 3 - 1
archivebox/index/sql.py

@@ -37,9 +37,11 @@ def remove_from_sql_main_index(snapshots: QuerySet, atomic: bool=False, out_dir:
 @enforce_types
 def write_link_to_sql_index(link: Link, created_by_id: int | None=None):
     from core.models import Snapshot, ArchiveResult
+    from abid_utils.models import get_or_create_system_user_pk
+
     info = {k: v for k, v in link._asdict().items() if k in Snapshot.keys}
 
-    info['created_by_id'] = created_by_id
+    info['created_by_id'] = created_by_id or get_or_create_system_user_pk()
 
     tag_list = list(dict.fromkeys(
         tag.strip() for tag in re.split(TAG_SEPARATOR_PATTERN, link.tags or '')

+ 6 - 5
archivebox/main.py

@@ -960,7 +960,8 @@ def setup(out_dir: Path=OUTPUT_DIR) -> None:
         run_subcommand('init', stdin=None, pwd=out_dir)
 
     setup_django(out_dir=out_dir, check_db=True)
-    from core.models import User
+    from django.contrib.auth import get_user_model
+    User = get_user_model()
 
     if not User.objects.filter(is_superuser=True).exists():
         stderr('\n[+] Creating new admin user for the Web UI...', color='green')
@@ -979,16 +980,16 @@ def setup(out_dir: Path=OUTPUT_DIR) -> None:
                 '--upgrade',
                 '--no-cache-dir',
                 '--no-warn-script-location',
-                'youtube_dl',
+                'yt-dlp',
             ], capture_output=False, cwd=out_dir)
             pkg_path = run_shell([
                 PYTHON_BINARY, '-m', 'pip',
                 'show',
-                'youtube_dl',
+                'yt-dlp',
             ], capture_output=True, text=True, cwd=out_dir).stdout.decode().split('Location: ')[-1].split('\n', 1)[0]
-            NEW_YOUTUBEDL_BINARY = Path(pkg_path) / 'youtube_dl' / '__main__.py'
+            NEW_YOUTUBEDL_BINARY = Path(pkg_path) / 'yt_dlp' / '__main__.py'
             os.chmod(NEW_YOUTUBEDL_BINARY, 0o777)
-            assert NEW_YOUTUBEDL_BINARY.exists(), f'youtube_dl must exist inside {pkg_path}'
+            assert NEW_YOUTUBEDL_BINARY.exists(), f'yt-dlp must exist inside {pkg_path}'
             config(f'YOUTUBEDL_BINARY={NEW_YOUTUBEDL_BINARY}', set=True, out_dir=out_dir)
         except BaseException as e:                                              # lgtm [py/catch-base-exception]
             stderr(f'[X] Failed to install python packages: {e}', color='red')

+ 10 - 10
archivebox/package-lock.json

@@ -11,7 +11,7 @@
       "dependencies": {
         "@postlight/parser": "^2.2.3",
         "readability-extractor": "github:ArchiveBox/readability-extractor",
-        "single-file-cli": "^1.1.54"
+        "single-file-cli": "^2.0.58"
       }
     },
     "node_modules/@asamuzakjp/dom-selector": {
@@ -236,9 +236,9 @@
       "license": "MIT"
     },
     "node_modules/@types/node": {
-      "version": "22.5.0",
-      "resolved": "https://registry.npmjs.org/@types/node/-/node-22.5.0.tgz",
-      "integrity": "sha512-DkFrJOe+rfdHTqqMg0bSNlGlQ85hSoh2TPzZyhHsXnMtligRWpxUySiyw8FY14ITt24HVCiQPWxS3KO/QlGmWg==",
+      "version": "22.5.1",
+      "resolved": "https://registry.npmjs.org/@types/node/-/node-22.5.1.tgz",
+      "integrity": "sha512-KkHsxej0j9IW1KKOOAA/XBA0z08UFSrRQHErzEfA3Vgq57eXIMYboIlHJuYIfd+lwCQjtKqUu3UnmKbtUc9yRw==",
       "license": "MIT",
       "optional": true,
       "dependencies": {
@@ -353,9 +353,9 @@
       }
     },
     "node_modules/aws4": {
-      "version": "1.13.1",
-      "resolved": "https://registry.npmjs.org/aws4/-/aws4-1.13.1.tgz",
-      "integrity": "sha512-u5w79Rd7SU4JaIlA/zFqG+gOiuq25q5VLyZ8E+ijJeILuTxVzZgp2CaGw/UTw6pXYN9XMO9yiqj/nEHmhTG5CA==",
+      "version": "1.13.2",
+      "resolved": "https://registry.npmjs.org/aws4/-/aws4-1.13.2.tgz",
+      "integrity": "sha512-lHe62zvbTB5eEABUVi/AwVh0ZKY9rMMDhmm+eeyuuUQbQ3+J+fONVQOZyj+DdrvD4BY33uYniyRJ4UJIaSKAfw==",
       "license": "MIT"
     },
     "node_modules/b4a": {
@@ -2376,9 +2376,9 @@
       }
     },
     "node_modules/tslib": {
-      "version": "2.6.3",
-      "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.6.3.tgz",
-      "integrity": "sha512-xNvxJEOUiWPGhUuUdQgAJPKOOJfGnIyKySOc09XkKsgdUV/3E2zvwZYdejjmRgPCgcym1juLH3226yA7sEFJKQ==",
+      "version": "2.7.0",
+      "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.7.0.tgz",
+      "integrity": "sha512-gLXCKdN1/j47AiHiOkJN69hJmcbGTHI0ImLmbYLHykhgeN0jVGola9yVjFgzCUklsZQMW55o+dW7IXv3RCXDzA==",
       "license": "0BSD"
     },
     "node_modules/turndown": {

+ 1 - 1
archivebox/parsers/pocket_api.py

@@ -7,7 +7,7 @@ from typing import IO, Iterable, Optional
 from configparser import ConfigParser
 
 from pathlib import Path
-from ..vendor.pocket import Pocket
+from pocket import Pocket
 
 from ..index.schema import Link
 from ..util import enforce_types

+ 0 - 0
archivebox/pkg/__init__.py


+ 3 - 0
archivebox/pkg/admin.py

@@ -0,0 +1,3 @@
+from django.contrib import admin
+
+# Register your models here.

+ 14 - 0
archivebox/pkg/apps.py

@@ -0,0 +1,14 @@
+__package__ = 'archivebox.pkg'
+
+from django.apps import AppConfig
+
+
+class PkgsConfig(AppConfig):
+    default_auto_field = 'django.db.models.BigAutoField'
+    name = 'pkg'
+
+    def ready(self):
+        from .settings import LOADED_DEPENDENCIES
+
+        # print(LOADED_DEPENDENCIES)
+        

+ 0 - 0
archivebox/pkg/management/__init__.py


+ 0 - 0
archivebox/pkg/management/commands/__init__.py


+ 75 - 0
archivebox/pkg/management/commands/pkg.py

@@ -0,0 +1,75 @@
+__package__ = 'archivebox.pkg.management.commands'
+
+from django.core.management.base import BaseCommand
+from django.conf import settings
+
+from pydantic_pkgr import Binary, BinProvider, BrewProvider, EnvProvider, SemVer
+from pydantic_pkgr.binprovider import bin_abspath
+
+from ....config import NODE_BIN_PATH, bin_path
+
+from plugantic.plugins import LOADED_PLUGINS
+
+from pkg.settings import env
+
+
+class Command(BaseCommand):
+    def handle(self, *args, method, **options):
+        method(*args, **options)
+
+    def add_arguments(self, parser):
+        subparsers = parser.add_subparsers(title="sub-commands", required=True)
+
+        list_parser = subparsers.add_parser("list", help="List archivebox runtime dependencies.")
+        list_parser.set_defaults(method=self.list)
+
+        install_parser = subparsers.add_parser("install", help="Install archivebox runtime dependencies.")
+        install_parser.add_argument("--update", action="store_true", help="Update dependencies to latest versions.")
+        install_parser.add_argument("package_names", nargs="+", type=str)
+        install_parser.set_defaults(method=self.install)
+
+    def list(self, *args, **options):
+        self.stdout.write('################# PLUGINS ####################')
+        for plugin in LOADED_PLUGINS:
+            self.stdout.write(f'{plugin.name}:')
+            for binary in plugin.binaries:
+                try:
+                    binary = binary.install()
+                except Exception as e:
+                    # import ipdb; ipdb.set_trace()
+                    raise
+                self.stdout.write(f'    {binary.name.ljust(14)} {str(binary.version).ljust(11)} {binary.binprovider.INSTALLER_BIN.ljust(5)}  {binary.abspath}')
+
+        self.stdout.write('\n################# LEGACY ####################')
+        for bin_key, dependency in settings.CONFIG.DEPENDENCIES.items():
+            bin_name = settings.CONFIG[bin_key]
+
+            self.stdout.write(f'{bin_key}:     {bin_name}')
+
+            # binary = Binary(name=package_name, providers=[env])
+            # print(binary)
+
+            # try:
+            #     loaded_bin = binary.load()
+            #     self.stdout.write(
+            #         self.style.SUCCESS(f'Successfully loaded {package_name}:') + str(loaded_bin)
+            #     )
+            # except Exception as e:
+            #     self.stderr.write(
+            #         self.style.ERROR(f"Error loading {package_name}: {e}")
+            #     )
+
+    def install(self, *args, **options):
+        for package_name in options["package_names"]:
+            binary = Binary(name=package_name, providers=[env])
+            print(binary)
+
+            try:
+                loaded_bin = binary.load()
+                self.stdout.write(
+                    self.style.SUCCESS(f'Successfully loaded {package_name}:') + str(loaded_bin)
+                )
+            except Exception as e:
+                self.stderr.write(
+                    self.style.ERROR(f"Error loading {package_name}: {e}")
+                )

+ 0 - 0
archivebox/pkg/migrations/__init__.py


+ 3 - 0
archivebox/pkg/models.py

@@ -0,0 +1,3 @@
+from django.db import models
+
+# Create your models here.

+ 86 - 0
archivebox/pkg/settings.py

@@ -0,0 +1,86 @@
+__package__ = 'archivebox.pkg'
+
+import os
+import sys
+import shutil
+import inspect
+from pathlib import Path
+
+import django
+from django.conf import settings
+from django.db.backends.sqlite3.base import Database as sqlite3
+
+from pydantic_pkgr import Binary, BinProvider, BrewProvider, EnvProvider, SemVer
+from pydantic_pkgr.binprovider import bin_abspath
+
+from ..config import NODE_BIN_PATH, bin_path
+
+env = EnvProvider(PATH=NODE_BIN_PATH + ':' + os.environ.get('PATH', '/bin'))
+
+
+LOADED_DEPENDENCIES = {}
+
+for bin_key, dependency in settings.CONFIG.DEPENDENCIES.items():
+    # 'PYTHON_BINARY': {
+    #     'path': bin_path(config['PYTHON_BINARY']),
+    #     'version': config['PYTHON_VERSION'],
+    #     'hash': bin_hash(config['PYTHON_BINARY']),
+    #     'enabled': True,
+    #     'is_valid': bool(config['PYTHON_VERSION']),
+    # },
+    
+
+    bin_name = settings.CONFIG[bin_key]
+
+    if bin_name.endswith('django/__init__.py'):
+        binary_spec = Binary(name='django', providers=[env], provider_overrides={
+            'env': {
+                'abspath': lambda: Path(inspect.getfile(django)),
+                'version': lambda: SemVer('{}.{}.{} {} ({})'.format(*django.VERSION)),
+            }
+        })
+    elif bin_name.endswith('sqlite3/dbapi2.py'):
+        binary_spec = Binary(name='sqlite3', providers=[env], provider_overrides={
+            'env': {
+                'abspath': lambda: Path(inspect.getfile(sqlite3)),
+                'version': lambda: SemVer(sqlite3.version),
+            }
+        })
+    elif bin_name.endswith('archivebox'):
+        binary_spec = Binary(name='archivebox', providers=[env], provider_overrides={
+            'env': {
+                'abspath': lambda: shutil.which(str(Path('archivebox').expanduser())),
+                'version': lambda: settings.CONFIG.VERSION,
+            }
+        })
+    elif bin_name.endswith('postlight/parser/cli.js'):
+        binary_spec = Binary(name='postlight-parser', providers=[env], provider_overrides={
+            'env': {
+                'abspath': lambda: bin_path('postlight-parser'),
+                'version': lambda: SemVer('1.0.0'),
+            }
+        })
+    else:
+        binary_spec = Binary(name=bin_name, providers=[env])
+    
+    try:
+        binary = binary_spec.load()
+    except Exception as e:
+        # print(f"- ❌ Binary {bin_name} failed to load with error: {e}")
+        continue
+
+    assert isinstance(binary.loaded_version, SemVer)
+
+    try:
+        assert str(binary.loaded_version) == dependency['version'], f"Expected {bin_name} version {dependency['version']}, got {binary.loaded_version}"
+        assert str(binary.loaded_respath) == str(bin_abspath(dependency['path']).resolve()), f"Expected {bin_name} abspath {bin_abspath(dependency['path']).resolve()}, got {binary.loaded_respath}"
+        assert binary.is_valid == dependency['is_valid'], f"Expected {bin_name} is_valid={dependency['is_valid']}, got {binary.is_valid}"
+    except Exception as e:
+        pass
+        # print(f"WARNING: Error loading {bin_name}: {e}")
+        # import ipdb; ipdb.set_trace()
+    
+    # print(f"- ✅ Binary {bin_name} loaded successfully")
+    LOADED_DEPENDENCIES[bin_key] = binary
+
+

+ 3 - 0
archivebox/pkg/tests.py

@@ -0,0 +1,3 @@
+from django.test import TestCase
+
+# Create your tests here.

+ 3 - 0
archivebox/pkg/views.py

@@ -0,0 +1,3 @@
+from django.shortcuts import render
+
+# Create your views here.

+ 0 - 1
archivebox/plugantic/__init__.py

@@ -1,6 +1,5 @@
 __package__ = 'archivebox.plugantic'
 
-from .binproviders import BinProvider
 from .binaries import Binary
 from .extractors import Extractor
 from .replayers import Replayer

+ 11 - 0
archivebox/plugantic/apps.py

@@ -1,6 +1,17 @@
+import importlib
 from django.apps import AppConfig
 
 
 class PluganticConfig(AppConfig):
     default_auto_field = 'django.db.models.BigAutoField'
     name = 'plugantic'
+
+    def ready(self) -> None:
+        from django.conf import settings
+        from .plugins import PLUGINS
+
+        for plugin_name in settings.INSTALLED_PLUGINS.keys():
+            lib = importlib.import_module(f'{plugin_name}.apps')
+            if hasattr(lib, 'PLUGINS'):
+                for plugin_instance in lib.PLUGINS:
+                    PLUGINS.append(plugin_instance)

+ 22 - 280
archivebox/plugantic/binaries.py

@@ -10,285 +10,17 @@ from typing import Any, Optional, Dict, List
 from typing_extensions import Self
 from subprocess import run, PIPE
 
+from pydantic_pkgr import Binary, SemVer, BinName, BinProvider, EnvProvider, AptProvider, BrewProvider, PipProvider, BinProviderName, ProviderLookupDict
 
-from pydantic_core import ValidationError
+import django
+from django.db.backends.sqlite3.base import Database as sqlite3
 
-from pydantic import BaseModel, Field, model_validator, computed_field, field_validator, validate_call, field_serializer
 
-from .binproviders import (
-    SemVer,
-    BinName,
-    BinProviderName,
-    HostBinPath,
-    BinProvider,
-    EnvProvider,
-    AptProvider,
-    BrewProvider,
-    PipProvider,
-    ProviderLookupDict,
-    bin_name,
-    bin_abspath,
-    path_is_script,
-    path_is_executable,
-)
 
 
-class Binary(BaseModel):
-    name: BinName
-    description: str = Field(default='')
-
-    providers_supported: List[BinProvider] = Field(default=[EnvProvider()], alias='providers')
-    provider_overrides: Dict[BinProviderName, ProviderLookupDict] = Field(default={}, alias='overrides')
-    
-    loaded_provider: Optional[BinProviderName] = Field(default=None, alias='provider')
-    loaded_abspath: Optional[HostBinPath] = Field(default=None, alias='abspath')
-    loaded_version: Optional[SemVer] = Field(default=None, alias='version')
-    
-    # bin_filename:  see below
-    # is_executable: see below
-    # is_script
-    # is_valid: see below
-
-
-    @model_validator(mode='after')
-    def validate(self):
-        self.loaded_abspath = bin_abspath(self.name) or self.name
-        self.description = self.description or self.name
-        
-        assert self.providers_supported, f'No providers were given for package {self.name}'
-
-        # pull in any overrides from the binproviders
-        for provider in self.providers_supported:
-            overrides_by_provider = provider.get_providers_for_bin(self.name)
-            if overrides_by_provider:
-                self.provider_overrides[provider.name] = {
-                    **overrides_by_provider,
-                    **self.provider_overrides.get(provider.name, {}),
-                }
-        return self
-
-    @field_validator('loaded_abspath', mode='before')
-    def parse_abspath(cls, value: Any):
-        return bin_abspath(value)
-
-    @field_validator('loaded_version', mode='before')
-    def parse_version(cls, value: Any):
-        return value and SemVer(value)
-
-    @field_serializer('provider_overrides', when_used='json')
-    def serialize_overrides(self, provider_overrides: Dict[BinProviderName, ProviderLookupDict]) -> Dict[BinProviderName, Dict[str, str]]:
-        return {
-            provider_name: {
-                key: str(val)
-                for key, val in overrides.items()
-            }
-            for provider_name, overrides in provider_overrides.items()
-        }
-
-    @computed_field                                                                                           # type: ignore[misc]  # see mypy issue #1362
-    @property
-    def bin_filename(self) -> BinName:
-        if self.is_script:
-            # e.g. '.../Python.framework/Versions/3.11/lib/python3.11/sqlite3/__init__.py' -> sqlite
-            name = self.name
-        elif self.loaded_abspath:
-            # e.g. '/opt/homebrew/bin/wget' -> wget
-            name = bin_name(self.loaded_abspath)
-        else:
-            # e.g. 'ytdlp' -> 'yt-dlp'
-            name = bin_name(self.name)
-        return name
-
-    @computed_field                                                                                           # type: ignore[misc]  # see mypy issue #1362
-    @property
-    def is_executable(self) -> bool:
-        try:
-            assert self.loaded_abspath and path_is_executable(self.loaded_abspath)
-            return True
-        except (ValidationError, AssertionError):
-            return False
-
-    @computed_field                                                                                           # type: ignore[misc]  # see mypy issue #1362
-    @property
-    def is_script(self) -> bool:
-        try:
-            assert self.loaded_abspath and path_is_script(self.loaded_abspath)
-            return True
-        except (ValidationError, AssertionError):
-            return False
-
-    @computed_field                                                                                           # type: ignore[misc]  # see mypy issue #1362
-    @property
-    def is_valid(self) -> bool:
-        return bool(
-            self.name
-            and self.loaded_abspath
-            and self.loaded_version
-            and (self.is_executable or self.is_script)
-        )
-
-    @validate_call
-    def install(self) -> Self:
-        if not self.providers_supported:
-            return self
-
-        exc = Exception('No providers were able to install binary', self.name, self.providers_supported)
-        for provider in self.providers_supported:
-            try:
-                installed_bin = provider.install(self.name, overrides=self.provider_overrides.get(provider.name))
-                if installed_bin:
-                    # print('INSTALLED', self.name, installed_bin)
-                    return self.model_copy(update={
-                        'loaded_provider': provider.name,
-                        'loaded_abspath': installed_bin.abspath,
-                        'loaded_version': installed_bin.version,
-                    })
-            except Exception as err:
-                print(err)
-                exc = err
-        raise exc
-
-    @validate_call
-    def load(self, cache=True) -> Self:
-        if self.is_valid:
-            return self
-
-        if not self.providers_supported:
-            return self
-
-        exc = Exception('No providers were able to install binary', self.name, self.providers_supported)
-        for provider in self.providers_supported:
-            try:
-                installed_bin = provider.load(self.name, cache=cache, overrides=self.provider_overrides.get(provider.name))
-                if installed_bin:
-                    # print('LOADED', provider, self.name, installed_bin)
-                    return self.model_copy(update={
-                        'loaded_provider': provider.name,
-                        'loaded_abspath': installed_bin.abspath,
-                        'loaded_version': installed_bin.version,
-                    })
-            except Exception as err:
-                print(err)
-                exc = err
-        raise exc
-
-    @validate_call
-    def load_or_install(self, cache=True) -> Self:
-        if self.is_valid:
-            return self
-
-        if not self.providers_supported:
-            return self
-
-        exc = Exception('No providers were able to install binary', self.name, self.providers_supported)
-        for provider in self.providers_supported:
-            try:
-                installed_bin = provider.load_or_install(self.name, overrides=self.provider_overrides.get(provider.name), cache=cache)
-                if installed_bin:
-                    # print('LOADED_OR_INSTALLED', self.name, installed_bin)
-                    return self.model_copy(update={
-                        'loaded_provider': provider.name,
-                        'loaded_abspath': installed_bin.abspath,
-                        'loaded_version': installed_bin.version,
-                    })
-            except Exception as err:
-                print(err)
-                exc = err
-        raise exc
-
-    @validate_call
-    def exec(self, args=(), pwd='.'):
-        assert self.loaded_abspath
-        assert self.loaded_version
-        return run([self.loaded_abspath, *args], stdout=PIPE, stderr=PIPE, pwd=pwd)
-
-
-
-
-class SystemPythonHelpers:
-    @staticmethod
-    def get_subdeps() -> str:
-        return 'python3 python3-minimal python3-pip python3-virtualenv'
-
-    @staticmethod
-    def get_abspath() -> str:
-        return sys.executable
-    
-    @staticmethod
-    def get_version() -> str:
-        return '{}.{}.{}'.format(*sys.version_info[:3])
-
-
-class SqliteHelpers:
-    @staticmethod
-    def get_abspath() -> Path:
-        import sqlite3
-        importlib.reload(sqlite3)
-        return Path(inspect.getfile(sqlite3))
-
-    @staticmethod
-    def get_version() -> SemVer:
-        import sqlite3
-        importlib.reload(sqlite3)
-        version = sqlite3.version
-        assert version
-        return SemVer(version)
-
-class DjangoHelpers:
-    @staticmethod
-    def get_django_abspath() -> str:
-        import django
-        return inspect.getfile(django)
-    
-
-    @staticmethod
-    def get_django_version() -> str:
-        import django
-        return '{}.{}.{} {} ({})'.format(*django.VERSION)
-
-class YtdlpHelpers:
-    @staticmethod
-    def get_ytdlp_subdeps() -> str:
-        return 'yt-dlp ffmpeg'
-
-    @staticmethod
-    def get_ytdlp_version() -> str:
-        import yt_dlp
-        importlib.reload(yt_dlp)
-
-        version = yt_dlp.version.__version__
-        assert version
-        return version
-
-class PythonBinary(Binary):
-    name: BinName = 'python'
-
-    providers_supported: List[BinProvider] = [
-        EnvProvider(
-            subdeps_provider={'python': 'plugantic.binaries.SystemPythonHelpers.get_subdeps'},
-            abspath_provider={'python': 'plugantic.binaries.SystemPythonHelpers.get_abspath'},
-            version_provider={'python': 'plugantic.binaries.SystemPythonHelpers.get_version'},
-        ),
-    ]
-
-class SqliteBinary(Binary):
-    name: BinName = 'sqlite'
-    providers_supported: List[BinProvider] = [
-        EnvProvider(
-            version_provider={'sqlite': 'plugantic.binaries.SqliteHelpers.get_version'},
-            abspath_provider={'sqlite': 'plugantic.binaries.SqliteHelpers.get_abspath'},
-        ),
-    ]
-
-class DjangoBinary(Binary):
-    name: BinName = 'django'
-    providers_supported: List[BinProvider] = [
-        EnvProvider(
-            abspath_provider={'django': 'plugantic.binaries.DjangoHelpers.get_django_abspath'},
-            version_provider={'django': 'plugantic.binaries.DjangoHelpers.get_django_version'},
-        ),
-    ]
-
+def get_ytdlp_version() -> str:
+    import yt_dlp
+    return yt_dlp.version.__version__
 
 
 
@@ -296,16 +28,26 @@ class DjangoBinary(Binary):
 class YtdlpBinary(Binary):
     name: BinName = 'yt-dlp'
     providers_supported: List[BinProvider] = [
-        # EnvProvider(),
-        PipProvider(version_provider={'yt-dlp': 'plugantic.binaries.YtdlpHelpers.get_ytdlp_version'}),
-        BrewProvider(subdeps_provider={'yt-dlp': 'plugantic.binaries.YtdlpHelpers.get_ytdlp_subdeps'}),
-        # AptProvider(subdeps_provider={'yt-dlp': lambda: 'yt-dlp ffmpeg'}),
+        EnvProvider(),
+        PipProvider(),
+        BrewProvider(),
+        AptProvider(),
     ]
-
+    provider_overrides:  Dict[BinProviderName, ProviderLookupDict] = {
+        'pip': {
+            'version': get_ytdlp_version,
+        },
+        'brew': {
+            'subdeps': lambda: 'yt-dlp ffmpeg',
+        },
+        'apt': {
+            'subdeps': lambda: 'yt-dlp ffmpeg',
+        }
+    }
 
 class WgetBinary(Binary):
     name: BinName = 'wget'
-    providers_supported: List[BinProvider] = [EnvProvider(), AptProvider()]
+    providers_supported: List[BinProvider] = [EnvProvider(), AptProvider(), BrewProvider()]
 
 
 # if __name__ == '__main__':

+ 0 - 561
archivebox/plugantic/binproviders.py

@@ -1,561 +0,0 @@
-__package__ = 'archivebox.plugantic'
-
-import os
-import shutil
-import operator
-
-from typing import Callable, Any, Optional, Type, Dict, Annotated, ClassVar, Literal, cast, TYPE_CHECKING
-from typing_extensions import Self
-from abc import ABC, abstractmethod
-from collections import namedtuple
-from pathlib import Path
-from subprocess import run, PIPE
-
-from pydantic_core import core_schema, ValidationError
-from pydantic import BaseModel, Field, TypeAdapter, AfterValidator, validate_call, GetCoreSchemaHandler
-
-
-
-def func_takes_args_or_kwargs(lambda_func: Callable[..., Any]) -> bool:
-    """returns True if a lambda func takes args/kwargs of any kind, otherwise false if it's pure/argless"""
-    code = lambda_func.__code__
-    has_args = code.co_argcount > 0
-    has_varargs = code.co_flags & 0x04 != 0
-    has_varkw = code.co_flags & 0x08 != 0
-    return has_args or has_varargs or has_varkw
-
-
-def is_semver_str(semver: Any) -> bool:
-    if isinstance(semver, str):
-        return (semver.count('.') == 2 and semver.replace('.', '').isdigit())
-    return False
-
-def semver_to_str(semver: tuple[int, int, int] | str) -> str:
-    if isinstance(semver, (list, tuple)):
-        return '.'.join(str(chunk) for chunk in semver)
-    if is_semver_str(semver):
-        return semver
-    raise ValidationError('Tried to convert invalid SemVer: {}'.format(semver))
-
-
-SemVerTuple = namedtuple('SemVerTuple', ('major', 'minor', 'patch'), defaults=(0, 0, 0))
-SemVerParsableTypes = str | tuple[str | int, ...] | list[str | int]
-
-class SemVer(SemVerTuple):
-    major: int
-    minor: int = 0
-    patch: int = 0
-
-    if TYPE_CHECKING:
-        full_text: str | None = ''
-
-    def __new__(cls, *args, full_text=None, **kwargs):
-        # '1.1.1'
-        if len(args) == 1 and is_semver_str(args[0]):
-            result = SemVer.parse(args[0])
-
-        # ('1', '2', '3')
-        elif len(args) == 1 and isinstance(args[0], (tuple, list)):
-            result = SemVer.parse(args[0])
-
-        # (1, '2', None)
-        elif not all(isinstance(arg, (int, type(None))) for arg in args):
-            result = SemVer.parse(args)
-
-        # (None)
-        elif all(chunk in ('', 0, None) for chunk in (*args, *kwargs.values())):
-            result = None
-
-        # 1, 2, 3
-        else:
-            result = SemVerTuple.__new__(cls, *args, **kwargs)
-
-        if result is not None:
-            # add first line as extra hidden metadata so it can be logged without having to re-run version cmd
-            result.full_text = full_text or str(result)
-        return result
-
-    @classmethod
-    def parse(cls, version_stdout: SemVerParsableTypes) -> Self | None:
-        """
-        parses a version tag string formatted like into (major, minor, patch) ints
-        'Google Chrome 124.0.6367.208'             -> (124, 0, 6367)
-        'GNU Wget 1.24.5 built on darwin23.2.0.'   -> (1, 24, 5)
-        'curl 8.4.0 (x86_64-apple-darwin23.0) ...' -> (8, 4, 0)
-        '2024.04.09'                               -> (2024, 4, 9)
-
-        """
-        # print('INITIAL_VALUE', type(version_stdout).__name__, version_stdout)
-
-        if isinstance(version_stdout, (tuple, list)):
-            version_stdout = '.'.join(str(chunk) for chunk in version_stdout)
-        elif isinstance(version_stdout, bytes):
-            version_stdout = version_stdout.decode()
-        elif not isinstance(version_stdout, str):
-            version_stdout = str(version_stdout)
-        
-        # no text to work with, return None immediately
-        if not version_stdout.strip():
-            # raise Exception('Tried to parse semver from empty version output (is binary installed and available?)')
-            return None
-
-        just_numbers = lambda col: col.lower().strip('v').split('+')[0].split('-')[0].split('_')[0]
-        contains_semver = lambda col: (
-            col.count('.') in (1, 2, 3)
-            and all(chunk.isdigit() for chunk in col.split('.')[:3])  # first 3 chunks can only be nums
-        )
-
-        full_text = version_stdout.split('\n')[0].strip()
-        first_line_columns = full_text.split()[:4]
-        version_columns = list(filter(contains_semver, map(just_numbers, first_line_columns)))
-        
-        # could not find any column of first line that looks like a version number, despite there being some text
-        if not version_columns:
-            # raise Exception('Failed to parse semver from version command output: {}'.format(' '.join(first_line_columns)))
-            return None
-
-        # take first col containing a semver, and truncate it to 3 chunks (e.g. 2024.04.09.91) -> (2024, 04, 09)
-        first_version_tuple = version_columns[0].split('.', 3)[:3]
-
-        # print('FINAL_VALUE', first_version_tuple)
-
-        return cls(*(int(chunk) for chunk in first_version_tuple), full_text=full_text)
-
-    def __str__(self):
-        return '.'.join(str(chunk) for chunk in self)
-
-    # @classmethod
-    # def __get_pydantic_core_schema__(cls, source: Type[Any], handler: GetCoreSchemaHandler) -> core_schema.CoreSchema:
-    #     default_schema = handler(source)
-    #     return core_schema.no_info_after_validator_function(
-    #         cls.parse,
-    #         default_schema,
-    #         serialization=core_schema.plain_serializer_function_ser_schema(
-    #             lambda semver: str(semver),
-    #             info_arg=False,
-    #             return_schema=core_schema.str_schema(),
-    #         ),
-    #     )
-
-assert SemVer(None) == None
-assert SemVer('') == None
-assert SemVer.parse('') == None
-assert SemVer(1) == (1, 0, 0)
-assert SemVer(1, 2) == (1, 2, 0)
-assert SemVer('1.2+234234') == (1, 2, 0)
-assert SemVer((1, 2, 3)) == (1, 2, 3)
-assert getattr(SemVer((1, 2, 3)), 'full_text') == '1.2.3'
-assert SemVer(('1', '2', '3')) == (1, 2, 3)
-assert SemVer.parse('5.6.7') == (5, 6, 7)
-assert SemVer.parse('124.0.6367.208') == (124, 0, 6367)
-assert SemVer.parse('Google Chrome 124.1+234.234') == (124, 1, 0)
-assert SemVer.parse('Google Ch1rome 124.0.6367.208') == (124, 0, 6367)
-assert SemVer.parse('Google Chrome 124.0.6367.208+beta_234. 234.234.123\n123.456.324') == (124, 0, 6367)
-assert getattr(SemVer.parse('Google Chrome 124.0.6367.208+beta_234. 234.234.123\n123.456.324'), 'full_text') == 'Google Chrome 124.0.6367.208+beta_234. 234.234.123'
-assert SemVer.parse('Google Chrome') == None
-
-@validate_call
-def bin_name(bin_path_or_name: str | Path) -> str:
-    name = Path(bin_path_or_name).name
-    assert len(name) > 1
-    assert name.replace('-', '').replace('_', '').replace('.', '').isalnum(), (
-        f'Binary name can only contain a-Z0-9-_.: {name}')
-    return name
-
-BinName = Annotated[str, AfterValidator(bin_name)]
-
-@validate_call
-def path_is_file(path: Path | str) -> Path:
-    path = Path(path) if isinstance(path, str) else path
-    assert path.is_file(), f'Path is not a file: {path}'
-    return path
-
-HostExistsPath = Annotated[Path, AfterValidator(path_is_file)]
-
-@validate_call
-def path_is_executable(path: HostExistsPath) -> HostExistsPath:
-    assert os.access(path, os.X_OK), f'Path is not executable (fix by running chmod +x {path})'
-    return path
-
-@validate_call
-def path_is_script(path: HostExistsPath) -> HostExistsPath:
-    SCRIPT_EXTENSIONS = ('.py', '.js', '.sh')
-    assert path.suffix.lower() in SCRIPT_EXTENSIONS, 'Path is not a script (does not end in {})'.format(', '.join(SCRIPT_EXTENSIONS))
-    return path
-
-HostExecutablePath = Annotated[HostExistsPath, AfterValidator(path_is_executable)]
-
-@validate_call
-def path_is_abspath(path: Path) -> Path:
-    return path.resolve()
-
-HostAbsPath = Annotated[HostExistsPath, AfterValidator(path_is_abspath)]
-HostBinPath = Annotated[Path, AfterValidator(path_is_abspath), AfterValidator(path_is_file)]
-
-
-@validate_call
-def bin_abspath(bin_path_or_name: BinName | Path) -> HostBinPath | None:
-    assert bin_path_or_name
-
-    if str(bin_path_or_name).startswith('/'):
-        # already a path, get its absolute form
-        abspath = Path(bin_path_or_name).resolve()
-    else:
-        # not a path yet, get path using os.which
-        binpath = shutil.which(bin_path_or_name)
-        if not binpath:
-            return None
-        abspath = Path(binpath).resolve()
-
-    try:
-        return TypeAdapter(HostBinPath).validate_python(abspath)
-    except ValidationError:
-        return None
-
-
-@validate_call
-def bin_version(bin_path: HostBinPath, args=('--version',)) -> SemVer | None:
-    return SemVer(run([bin_path, *args], stdout=PIPE).stdout.strip().decode())
-
-
-class InstalledBin(BaseModel):
-    abspath: HostBinPath
-    version: SemVer
-
-
-def is_valid_install_string(pkgs_str: str) -> str:
-    """Make sure a string is a valid install string for a package manager, e.g. 'yt-dlp ffmpeg'"""
-    assert pkgs_str
-    assert all(len(pkg) > 1 for pkg in pkgs_str.split(' '))
-    return pkgs_str
-
-def is_valid_python_dotted_import(import_str: str) -> str:
-    assert import_str and import_str.replace('.', '').replace('_', '').isalnum()
-    return import_str
-
-InstallStr = Annotated[str, AfterValidator(is_valid_install_string)]
-
-LazyImportStr = Annotated[str, AfterValidator(is_valid_python_dotted_import)]
-
-ProviderHandler = Callable[..., Any] | Callable[[], Any]                               # must take no args [], or [bin_name: str, **kwargs]
-#ProviderHandlerStr = Annotated[str, AfterValidator(lambda s: s.startswith('self.'))]
-ProviderHandlerRef = LazyImportStr | ProviderHandler
-ProviderLookupDict = Dict[str, LazyImportStr]
-ProviderType = Literal['abspath', 'version', 'subdeps', 'install']
-
-
-# class Host(BaseModel):
-#     machine: str
-#     system: str
-#     platform: str
-#     in_docker: bool
-#     in_qemu: bool
-#     python: str
-
-BinProviderName = Literal['env', 'pip', 'apt', 'brew', 'npm', 'vendor']
-
-
-class BinProvider(ABC, BaseModel):
-    name: BinProviderName
-    
-    abspath_provider: ProviderLookupDict = Field(default={'*': 'self.on_get_abspath'}, exclude=True)
-    version_provider: ProviderLookupDict = Field(default={'*': 'self.on_get_version'}, exclude=True)
-    subdeps_provider: ProviderLookupDict = Field(default={'*': 'self.on_get_subdeps'}, exclude=True)
-    install_provider: ProviderLookupDict = Field(default={'*': 'self.on_install'}, exclude=True)
-
-    _abspath_cache: ClassVar = {}
-    _version_cache: ClassVar = {}
-    _install_cache: ClassVar = {}
-
-    # def provider_version(self) -> SemVer | None:
-    #     """Version of the actual underlying package manager (e.g. pip v20.4.1)"""
-    #     if self.name in ('env', 'vendor'):
-    #         return SemVer('0.0.0')
-    #     installer_binpath = Path(shutil.which(self.name)).resolve()
-    #     return bin_version(installer_binpath)
-
-    # def provider_host(self) -> Host:
-    #     """Information about the host env, archictecture, and OS needed to select & build packages"""
-    #     p = platform.uname()
-    #     return Host(
-    #         machine=p.machine,
-    #         system=p.system,
-    #         platform=platform.platform(),
-    #         python=sys.implementation.name,
-    #         in_docker=os.environ.get('IN_DOCKER', '').lower() == 'true',
-    #         in_qemu=os.environ.get('IN_QEMU', '').lower() == 'true',
-    #     )
-
-    def get_default_providers(self):
-        return self.get_providers_for_bin('*')
-
-    def resolve_provider_func(self, provider_func: ProviderHandlerRef | None) -> ProviderHandler | None:
-        if provider_func is None:
-            return None
-
-        # if provider_func is a dotted path to a function on self, swap it for the actual function
-        if isinstance(provider_func, str) and provider_func.startswith('self.'):
-            provider_func = getattr(self, provider_func.split('self.', 1)[-1])
-
-        # if provider_func is a dot-formatted import string, import the function
-        if isinstance(provider_func, str):
-            from django.utils.module_loading import import_string
-
-            package_name, module_name, classname, path = provider_func.split('.', 3)   # -> abc, def, ghi.jkl
-
-            # get .ghi.jkl nested attr present on module abc.def
-            imported_module = import_string(f'{package_name}.{module_name}.{classname}')
-            provider_func = operator.attrgetter(path)(imported_module)
-
-            # # abc.def.ghi.jkl  -> 1, 2, 3
-            # for idx in range(1, len(path)):
-            #     parent_path = '.'.join(path[:-idx])  # abc.def.ghi
-            #     try:
-            #         parent_module = import_string(parent_path)
-            #         provider_func = getattr(parent_module, path[-idx])
-            #     except AttributeError, ImportError:
-            #         continue
-
-        assert TypeAdapter(ProviderHandler).validate_python(provider_func), (
-            f'{self.__class__.__name__} provider func for {bin_name} was not a function or dotted-import path: {provider_func}')
-
-        return provider_func
-
-    @validate_call
-    def get_providers_for_bin(self, bin_name: str) -> ProviderLookupDict:
-        providers_for_bin = {
-            'abspath': self.abspath_provider.get(bin_name),
-            'version': self.version_provider.get(bin_name),
-            'subdeps': self.subdeps_provider.get(bin_name),
-            'install': self.install_provider.get(bin_name),
-        }
-        only_set_providers_for_bin = {k: v for k, v in providers_for_bin.items() if v is not None}
-        
-        return only_set_providers_for_bin
-
-    @validate_call
-    def get_provider_for_action(self, bin_name: BinName, provider_type: ProviderType, default_provider: Optional[ProviderHandlerRef]=None, overrides: Optional[ProviderLookupDict]=None) -> ProviderHandler:
-        """
-        Get the provider func for a given key + Dict of provider callbacks + fallback default provider.
-        e.g. get_provider_for_action(bin_name='yt-dlp', 'install', default_provider=self.on_install, ...) -> Callable
-        """
-
-        provider_func_ref = (
-            (overrides or {}).get(provider_type)
-            or self.get_providers_for_bin(bin_name).get(provider_type)
-            or self.get_default_providers().get(provider_type)
-            or default_provider
-        )
-        # print('getting provider for action', bin_name, provider_type, provider_func)
-
-        provider_func = self.resolve_provider_func(provider_func_ref)
-
-        assert provider_func, f'No {self.name} provider func was found for {bin_name} in: {self.__class__.__name__}.'
-
-        return provider_func
-
-    @validate_call
-    def call_provider_for_action(self, bin_name: BinName, provider_type: ProviderType, default_provider: Optional[ProviderHandlerRef]=None, overrides: Optional[ProviderLookupDict]=None, **kwargs) -> Any:
-        provider_func: ProviderHandler = self.get_provider_for_action(
-            bin_name=bin_name,
-            provider_type=provider_type,
-            default_provider=default_provider,
-            overrides=overrides,
-        )
-        if not func_takes_args_or_kwargs(provider_func):
-            # if it's a pure argless lambdas, dont pass bin_path and other **kwargs
-            provider_func_without_args = cast(Callable[[], Any], provider_func)
-            return provider_func_without_args()
-
-        provider_func = cast(Callable[..., Any], provider_func)
-        return provider_func(bin_name, **kwargs)
-
-
-
-    def on_get_abspath(self, bin_name: BinName, **_) -> HostBinPath | None:
-        print(f'[*] {self.__class__.__name__}: Getting abspath for {bin_name}...')
-        try:
-            return bin_abspath(bin_name)
-        except ValidationError:
-            return None
-
-    def on_get_version(self, bin_name: BinName, abspath: Optional[HostBinPath]=None, **_) -> SemVer | None:
-        abspath = abspath or self._abspath_cache.get(bin_name) or self.get_abspath(bin_name)
-        if not abspath: return None
-
-        print(f'[*] {self.__class__.__name__}: Getting version for {bin_name}...')
-        try:
-            return bin_version(abspath)
-        except ValidationError:
-            return None
-
-    def on_get_subdeps(self, bin_name: BinName, **_) -> InstallStr:
-        print(f'[*] {self.__class__.__name__}: Getting subdependencies for {bin_name}')
-        # ... subdependency calculation logic here
-        return TypeAdapter(InstallStr).validate_python(bin_name)
-
-    @abstractmethod
-    def on_install(self, bin_name: BinName, subdeps: Optional[InstallStr]=None, **_):
-        subdeps = subdeps or self.get_subdeps(bin_name)
-        print(f'[*] {self.__class__.__name__}: Installing subdependencies for {bin_name} ({subdeps})')
-        # ... install logic here
-        assert True
-
-
-    @validate_call
-    def get_abspath(self, bin_name: BinName, overrides: Optional[ProviderLookupDict]=None) -> HostBinPath | None:
-        abspath = self.call_provider_for_action(
-            bin_name=bin_name,
-            provider_type='abspath',
-            default_provider=self.on_get_abspath,
-            overrides=overrides,
-        )
-        if not abspath:
-            return None
-        result = TypeAdapter(HostBinPath).validate_python(abspath)
-        self._abspath_cache[bin_name] = result
-        return result
-
-    @validate_call
-    def get_version(self, bin_name: BinName, abspath: Optional[HostBinPath]=None, overrides: Optional[ProviderLookupDict]=None) -> SemVer | None:
-        version = self.call_provider_for_action(
-            bin_name=bin_name,
-            provider_type='version',
-            default_provider=self.on_get_version,
-            overrides=overrides,
-            abspath=abspath,
-        )
-        if not version:
-            return None
-        result = SemVer(version)
-        self._version_cache[bin_name] = result
-        return result
-
-    @validate_call
-    def get_subdeps(self, bin_name: BinName, overrides: Optional[ProviderLookupDict]=None) -> InstallStr:
-        subdeps = self.call_provider_for_action(
-            bin_name=bin_name,
-            provider_type='subdeps',
-            default_provider=self.on_get_subdeps,
-            overrides=overrides,
-        )
-        if not subdeps:
-            subdeps = bin_name
-        result = TypeAdapter(InstallStr).validate_python(subdeps)
-        return result
-
-    @validate_call
-    def install(self, bin_name: BinName, overrides: Optional[ProviderLookupDict]=None) -> InstalledBin | None:
-        subdeps = self.get_subdeps(bin_name, overrides=overrides)
-
-        self.call_provider_for_action(
-            bin_name=bin_name,
-            provider_type='install',
-            default_provider=self.on_install,
-            overrides=overrides,
-            subdeps=subdeps,
-        )
-
-        installed_abspath = self.get_abspath(bin_name)
-        assert installed_abspath, f'Unable to find {bin_name} abspath after installing with {self.name}'
-
-        installed_version = self.get_version(bin_name, abspath=installed_abspath)
-        assert installed_version, f'Unable to find {bin_name} version after installing with {self.name}'
-        
-        result = InstalledBin(abspath=installed_abspath, version=installed_version)
-        self._install_cache[bin_name] = result
-        return result
-
-    @validate_call
-    def load(self, bin_name: BinName, overrides: Optional[ProviderLookupDict]=None, cache: bool=False) -> InstalledBin | None:
-        installed_abspath = None
-        installed_version = None
-
-        if cache:
-            installed_bin = self._install_cache.get(bin_name)
-            if installed_bin:
-                return installed_bin
-            installed_abspath = self._abspath_cache.get(bin_name)
-            installed_version = self._version_cache.get(bin_name)
-
-
-        installed_abspath = installed_abspath or self.get_abspath(bin_name, overrides=overrides)
-        if not installed_abspath:
-            return None
-
-        installed_version = installed_version or self.get_version(bin_name, abspath=installed_abspath, overrides=overrides)
-        if not installed_version:
-            return None
-
-        return InstalledBin(abspath=installed_abspath, version=installed_version)
-
-    @validate_call
-    def load_or_install(self, bin_name: BinName, overrides: Optional[ProviderLookupDict]=None, cache: bool=True) -> InstalledBin | None:
-        installed = self.load(bin_name, overrides=overrides, cache=cache)
-        if not installed:
-            installed = self.install(bin_name, overrides=overrides)
-        return installed
-
-
-class PipProvider(BinProvider):
-    name: BinProviderName = 'pip'
-
-    def on_install(self, bin_name: str, subdeps: Optional[InstallStr]=None, **_):
-        subdeps = subdeps or self.on_get_subdeps(bin_name)
-        print(f'[*] {self.__class__.__name__}: Installing subdependencies for {bin_name} ({subdeps})')
-        
-        proc = run(['pip', 'install', '--upgrade', *subdeps.split(' ')], stdout=PIPE, stderr=PIPE)
-        
-        if proc.returncode != 0:
-            print(proc.stdout.strip().decode())
-            print(proc.stderr.strip().decode())
-            raise Exception(f'{self.__class__.__name__}: install got returncode {proc.returncode} while installing {subdeps}: {subdeps}')
-
-
-class AptProvider(BinProvider):
-    name: BinProviderName = 'apt'
-    
-    subdeps_provider: ProviderLookupDict = {
-        'yt-dlp': lambda: 'yt-dlp ffmpeg',
-    }
-
-    def on_install(self, bin_name: BinName, subdeps: Optional[InstallStr]=None, **_):
-        subdeps = subdeps or self.on_get_subdeps(bin_name)
-        print(f'[*] {self.__class__.__name__}: Installing subdependencies for {bin_name} ({subdeps})')
-        
-        run(['apt-get', 'update', '-qq'])
-        proc = run(['apt-get', 'install', '-y', *subdeps.split(' ')], stdout=PIPE, stderr=PIPE)
-        
-        if proc.returncode != 0:
-            print(proc.stdout.strip().decode())
-            print(proc.stderr.strip().decode())
-            raise Exception(f'{self.__class__.__name__} install got returncode {proc.returncode} while installing {subdeps}: {subdeps}')
-
-class BrewProvider(BinProvider):
-    name: BinProviderName = 'brew'
-
-    def on_install(self, bin_name: str, subdeps: Optional[InstallStr]=None, **_):
-        subdeps = subdeps or self.on_get_subdeps(bin_name)
-        print(f'[*] {self.__class__.__name__}: Installing subdependencies for {bin_name} ({subdeps})')
-        
-        proc = run(['brew', 'install', *subdeps.split(' ')], stdout=PIPE, stderr=PIPE)
-        
-        if proc.returncode != 0:
-            print(proc.stdout.strip().decode())
-            print(proc.stderr.strip().decode())
-            raise Exception(f'{self.__class__.__name__} install got returncode {proc.returncode} while installing {subdeps}: {subdeps}')
-
-
-class EnvProvider(BinProvider):
-    name: BinProviderName = 'env'
-
-    abspath_provider: ProviderLookupDict = {
-        # 'python': lambda: Path('/opt/homebrew/Cellar/[email protected]/3.10.14/Frameworks/Python.framework/Versions/3.10/bin/python3.10'),
-    }
-    version_provider: ProviderLookupDict = {
-        # 'python': lambda: '{}.{}.{}'.format(*sys.version_info[:3]),
-    }
-
-    def on_install(self, bin_name: BinName, subdeps: Optional[InstallStr]=None, **_):
-        """The env provider is ready-only and does not install any packages, so this is a no-op"""
-        pass

+ 1 - 1
archivebox/plugantic/extractors.py

@@ -31,7 +31,7 @@ def no_empty_args(args: List[str]) -> List[str]:
     assert all(len(arg) for arg in args)
     return args
 
-ExtractorName = Literal['wget', 'warc', 'media']
+ExtractorName = Literal['wget', 'warc', 'media', 'singlefile'] | str
 
 HandlerFuncStr = Annotated[str, AfterValidator(lambda s: s.startswith('self.'))]
 CmdArgsList = Annotated[List[str], AfterValidator(no_empty_args)]

+ 0 - 12
archivebox/plugantic/plugins.py

@@ -14,9 +14,6 @@ from pydantic import (
 
 from .binaries import (
     Binary,
-    PythonBinary,
-    SqliteBinary,
-    DjangoBinary,
     WgetBinary,
     YtdlpBinary,
 )
@@ -28,7 +25,6 @@ from .extractors import (
 )
 from .replayers import (
     Replayer,
-    GENERIC_REPLAYER,
     MEDIA_REPLAYER,
 )
 from .configs import (
@@ -80,12 +76,6 @@ class Plugin(BaseModel):
         })
 
 
-class CorePlugin(Plugin):
-    name: str = 'core'
-    configs: List[SerializeAsAny[ConfigSet]] = []
-    binaries: List[SerializeAsAny[Binary]] = [PythonBinary(), SqliteBinary(), DjangoBinary()]
-    extractors: List[SerializeAsAny[Extractor]] = []
-    replayers: List[SerializeAsAny[Replayer]] = [GENERIC_REPLAYER]
 
 class YtdlpPlugin(Plugin):
     name: str = 'ytdlp'
@@ -101,11 +91,9 @@ class WgetPlugin(Plugin):
     extractors: List[SerializeAsAny[Extractor]] = [WgetExtractor(), WarcExtractor()]
 
 
-CORE_PLUGIN = CorePlugin()
 YTDLP_PLUGIN = YtdlpPlugin()
 WGET_PLUGIN = WgetPlugin()
 PLUGINS = [
-    CORE_PLUGIN,
     YTDLP_PLUGIN,
     WGET_PLUGIN,
 ]

+ 0 - 1
archivebox/plugantic/replayers.py

@@ -22,5 +22,4 @@ class Replayer(BaseModel):
     # thumbnail_view: LazyImportStr = 'plugins.generic_replayer.views.get_icon'
 
 
-GENERIC_REPLAYER = Replayer(name='generic')
 MEDIA_REPLAYER = Replayer(name='media')

+ 48 - 7
archivebox/plugantic/views.py

@@ -1,5 +1,8 @@
 __package__ = 'archivebox.plugantic'
 
+import inspect
+from typing import Any
+
 from django.http import HttpRequest
 from django.utils.html import format_html, mark_safe
 
@@ -10,6 +13,44 @@ from admin_data_views.utils import render_with_table_view, render_with_item_view
 from plugantic.plugins import LOADED_PLUGINS
 from django.conf import settings
 
+def obj_to_yaml(obj: Any, indent: int=0) -> str:
+    indent_str = "  " * indent
+    
+    if isinstance(obj, dict):
+        if not obj:
+            return "{}"
+        result = "\n"
+        for key, value in obj.items():
+            result += f"{indent_str}{key}:{obj_to_yaml(value, indent + 1)}\n"
+        return result
+    
+    elif isinstance(obj, list):
+        if not obj:
+            return "[]"
+        result = "\n"
+        for item in obj:
+            result += f"{indent_str}- {obj_to_yaml(item, indent + 1).lstrip()}\n"
+        return result.rstrip()
+    
+    elif isinstance(obj, str):
+        if "\n" in obj:
+            return f" |\n{indent_str}  " + obj.replace("\n", f"\n{indent_str}  ")
+        else:
+            return f" {obj}"
+    
+    elif isinstance(obj, (int, float, bool)):
+        return f" {str(obj)}"
+    
+    elif callable(obj):
+        source = '\n'.join(
+            '' if 'def ' in line else line
+            for line in inspect.getsource(obj).split('\n')
+            if line.strip()
+        ).split('lambda: ')[-1].rstrip(',')
+        return f" {indent_str}  " + source.replace("\n", f"\n{indent_str}  ")
+    
+    else:
+        return f" {str(obj)}"
 
 @render_with_table_view
 def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
@@ -18,13 +59,13 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
 
     rows = {
         "Binary": [],
-        "From Plugin": [],
         "Found Version": [],
+        "From Plugin": [],
         "Provided By": [],
         "Found Abspath": [],
         "Related Configuration": [],
         "Overrides": [],
-        "Description": [],
+        # "Description": [],
     }
 
     relevant_configs = {
@@ -38,8 +79,8 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
             binary = binary.load_or_install()
 
             rows['Binary'].append(ItemLink(binary.name, key=binary.name))
-            rows['From Plugin'].append(plugin.name)
             rows['Found Version'].append(binary.loaded_version)
+            rows['From Plugin'].append(plugin.name)
             rows['Provided By'].append(binary.loaded_provider)
             rows['Found Abspath'].append(binary.loaded_abspath)
             rows['Related Configuration'].append(mark_safe(', '.join(
@@ -48,8 +89,8 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
                     if binary.name.lower().replace('-', '').replace('_', '').replace('ytdlp', 'youtubedl') in config_key.lower()
                     # or binary.name.lower().replace('-', '').replace('_', '') in str(config_value).lower()
             )))
-            rows['Overrides'].append(str(binary.provider_overrides))
-            rows['Description'].append(binary.description)
+            rows['Overrides'].append(obj_to_yaml(binary.provider_overrides))
+            # rows['Description'].append(binary.description)
 
     return TableContext(
         title="Binaries",
@@ -85,8 +126,8 @@ def binary_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
                     'binprovider': binary.loaded_provider,
                     'abspath': binary.loaded_abspath,
                     'version': binary.loaded_version,
-                    'overrides': str(binary.provider_overrides),
-                    'providers': str(binary.providers_supported),
+                    'overrides': obj_to_yaml(binary.provider_overrides),
+                    'providers': obj_to_yaml(binary.providers_supported),
                 },
                 "help_texts": {
                     # TODO

+ 1 - 2
archivebox/system.py

@@ -11,13 +11,12 @@ from typing import Optional, Union, Set, Tuple
 from subprocess import _mswindows, PIPE, Popen, CalledProcessError, CompletedProcess, TimeoutExpired
 
 from crontab import CronTab
-from .vendor.atomicwrites import atomic_write as lib_atomic_write
+from atomicwrites import atomic_write as lib_atomic_write
 
 from .util import enforce_types, ExtendedEncoder
 from .config import PYTHON_BINARY, OUTPUT_PERMISSIONS, DIR_OUTPUT_PERMISSIONS, ENFORCE_ATOMIC_WRITES
 
 
-
 def run(cmd, *args, input=None, capture_output=True, timeout=None, check=False, text=False, start_new_session=True, **kwargs):
     """Patched of subprocess.run to kill forked child subprocesses and fix blocking io making timeout=innefective
         Mostly copied from https://github.com/python/cpython/blob/master/Lib/subprocess.py

+ 4 - 8
archivebox/util.py

@@ -16,7 +16,7 @@ from datetime import datetime, timezone
 from dateparser import parse as dateparser
 from requests.exceptions import RequestException, ReadTimeout
 
-from .vendor.base32_crockford import encode as base32_encode                            # type: ignore
+from base32_crockford import encode as base32_encode                            # type: ignore
 from w3lib.encoding import html_body_declared_encoding, http_content_type_encoding
 from os.path import lexists
 from os import remove as remove_file
@@ -273,8 +273,8 @@ def get_headers(url: str, timeout: int=None) -> str:
         {
             'URL': url,
             'Status-Code': response.status_code,
-            'Elapsed': response.elapsed,
-            'Encoding': response.encoding,
+            'Elapsed': response.elapsed.total_seconds()*1000,
+            'Encoding': str(response.encoding),
             'Apparent-Encoding': response.apparent_encoding,
             **dict(response.headers),
         },
@@ -304,11 +304,7 @@ def chrome_args(**options) -> List[str]:
     cmd_args += CHROME_EXTRA_ARGS
 
     if options['CHROME_HEADLESS']:
-        chrome_major_version = int(re.search(r'\s(\d+)\.\d', CHROME_VERSION)[1])
-        if chrome_major_version >= 111:
-            cmd_args += ("--headless=new",)
-        else:
-            cmd_args += ('--headless',)
+        cmd_args += ("--headless=new",)   # expects chrome version >= 111
 
     if not options['CHROME_SANDBOX']:
         # assume this means we are running inside a docker container

+ 34 - 0
archivebox/vendor/__init__.py

@@ -0,0 +1,34 @@
+import sys
+import inspect
+import importlib
+from pathlib import Path
+
+VENDOR_DIR = Path(__file__).parent
+
+VENDORED_LIBS = {
+    # sys.path dir:         library name
+    'python-atomicwrites':  'atomicwrites',
+    'django-taggit':        'taggit',
+    'pydantic-pkgr':        'pydantic_pkgr',
+    'pocket':               'pocket',
+    'base32-crockford':     'base32_crockford',
+}
+
+def load_vendored_libs():
+    for lib_subdir, lib_name in VENDORED_LIBS.items():
+        lib_dir = VENDOR_DIR / lib_subdir
+        assert lib_dir.is_dir(), f'Expected vendored library {lib_name} could not be found in {lib_dir}'
+
+        try:
+            lib = importlib.import_module(lib_name)
+            # print(f"Successfully imported lib from environment {lib_name}: {inspect.getfile(lib)}")
+        except ImportError:
+            sys.path.append(str(lib_dir))
+            try:
+                lib = importlib.import_module(lib_name)
+                # print(f"Successfully imported lib from vendored fallback {lib_name}: {inspect.getfile(lib)}")
+            except ImportError as e:
+                print(f"Failed to import lib from environment or vendored fallback {lib_name}: {e}", file=sys.stderr)
+                sys.exit(1)
+        
+

+ 0 - 1
archivebox/vendor/atomicwrites.py

@@ -1 +0,0 @@
-python-atomicwrites/atomicwrites/__init__.py

+ 0 - 1
archivebox/vendor/base32_crockford.py

@@ -1 +0,0 @@
-base32-crockford/base32_crockford.py

+ 0 - 1
archivebox/vendor/package-lock.json

@@ -1 +0,0 @@
-../../package-lock.json

+ 0 - 1
archivebox/vendor/package.json

@@ -1 +0,0 @@
-../../package.json

+ 0 - 1
archivebox/vendor/pocket.py

@@ -1 +0,0 @@
-pocket/pocket.py

+ 1 - 0
archivebox/vendor/pydantic-pkgr

@@ -0,0 +1 @@
+Subproject commit 2cd844533d888ce29b9bf32b8363510dd0d76166

+ 0 - 1
archivebox/vendor/taggit_utils.py

@@ -1 +0,0 @@
-django-taggit/taggit/utils.py

+ 9 - 9
package-lock.json

@@ -236,9 +236,9 @@
       "license": "MIT"
     },
     "node_modules/@types/node": {
-      "version": "22.5.0",
-      "resolved": "https://registry.npmjs.org/@types/node/-/node-22.5.0.tgz",
-      "integrity": "sha512-DkFrJOe+rfdHTqqMg0bSNlGlQ85hSoh2TPzZyhHsXnMtligRWpxUySiyw8FY14ITt24HVCiQPWxS3KO/QlGmWg==",
+      "version": "22.5.1",
+      "resolved": "https://registry.npmjs.org/@types/node/-/node-22.5.1.tgz",
+      "integrity": "sha512-KkHsxej0j9IW1KKOOAA/XBA0z08UFSrRQHErzEfA3Vgq57eXIMYboIlHJuYIfd+lwCQjtKqUu3UnmKbtUc9yRw==",
       "license": "MIT",
       "optional": true,
       "dependencies": {
@@ -353,9 +353,9 @@
       }
     },
     "node_modules/aws4": {
-      "version": "1.13.1",
-      "resolved": "https://registry.npmjs.org/aws4/-/aws4-1.13.1.tgz",
-      "integrity": "sha512-u5w79Rd7SU4JaIlA/zFqG+gOiuq25q5VLyZ8E+ijJeILuTxVzZgp2CaGw/UTw6pXYN9XMO9yiqj/nEHmhTG5CA==",
+      "version": "1.13.2",
+      "resolved": "https://registry.npmjs.org/aws4/-/aws4-1.13.2.tgz",
+      "integrity": "sha512-lHe62zvbTB5eEABUVi/AwVh0ZKY9rMMDhmm+eeyuuUQbQ3+J+fONVQOZyj+DdrvD4BY33uYniyRJ4UJIaSKAfw==",
       "license": "MIT"
     },
     "node_modules/b4a": {
@@ -2376,9 +2376,9 @@
       }
     },
     "node_modules/tslib": {
-      "version": "2.6.3",
-      "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.6.3.tgz",
-      "integrity": "sha512-xNvxJEOUiWPGhUuUdQgAJPKOOJfGnIyKySOc09XkKsgdUV/3E2zvwZYdejjmRgPCgcym1juLH3226yA7sEFJKQ==",
+      "version": "2.7.0",
+      "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.7.0.tgz",
+      "integrity": "sha512-gLXCKdN1/j47AiHiOkJN69hJmcbGTHI0ImLmbYLHykhgeN0jVGola9yVjFgzCUklsZQMW55o+dW7IXv3RCXDzA==",
       "license": "0BSD"
     },
     "node_modules/turndown": {

+ 92 - 11
pdm.lock

@@ -5,7 +5,7 @@
 groups = ["default", "ldap", "sonic"]
 strategy = ["inherit_metadata"]
 lock_version = "4.5.0"
-content_hash = "sha256:f2f7ca01f2e18a1ef07d59b7a8985d89785a4b8a2a4e66452f1f9e8e8ad529ad"
+content_hash = "sha256:c6aa1f436032d18d079a4c2e9d9b95a5110579eb96a449751bfaf4d472eba401"
 
 [[metadata.targets]]
 requires_python = "==3.10.*"
@@ -78,6 +78,29 @@ files = [
     {file = "asttokens-2.4.1.tar.gz", hash = "sha256:b03869718ba9a6eb027e134bfdf69f38a236d681c83c160d510768af11254ba0"},
 ]
 
+[[package]]
+name = "atomicwrites"
+version = "1.4.0"
+requires_python = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
+summary = "Atomic file writes."
+groups = ["default"]
+marker = "python_version == \"3.10\""
+files = [
+    {file = "atomicwrites-1.4.0-py2.py3-none-any.whl", hash = "sha256:6d1784dea7c0c8d4a5172b6c620f40b6e4cbfdf96d783691f2e1302a7b88e197"},
+    {file = "atomicwrites-1.4.0.tar.gz", hash = "sha256:ae70396ad1a434f9c7046fd2dd196fc04b12f9e91ffb859164193be8b6168a7a"},
+]
+
+[[package]]
+name = "base32-crockford"
+version = "0.3.0"
+summary = "A Python implementation of Douglas Crockford's base32 encoding scheme"
+groups = ["default"]
+marker = "python_version == \"3.10\""
+files = [
+    {file = "base32-crockford-0.3.0.tar.gz", hash = "sha256:115f5bd32ae32b724035cb02eb65069a8824ea08c08851eb80c8b9f63443a969"},
+    {file = "base32_crockford-0.3.0-py2.py3-none-any.whl", hash = "sha256:295ef5ffbf6ed96b6e739ffd36be98fa7e90a206dd18c39acefb15777eedfe6e"},
+]
+
 [[package]]
 name = "brotli"
 version = "1.1.0"
@@ -407,6 +430,21 @@ files = [
     {file = "django_stubs_ext-5.0.4.tar.gz", hash = "sha256:85da065224204774208be29c7d02b4482d5a69218a728465c2fbe41725fdc819"},
 ]
 
+[[package]]
+name = "django-taggit"
+version = "1.3.0"
+requires_python = ">=3.5"
+summary = "django-taggit is a reusable Django application for simple tagging."
+groups = ["default"]
+marker = "python_version == \"3.10\""
+dependencies = [
+    "Django>=1.11",
+]
+files = [
+    {file = "django-taggit-1.3.0.tar.gz", hash = "sha256:4a833bf71f4c2deddd9745924eee53be1c075d7f0020a06f12e29fa3d752732d"},
+    {file = "django_taggit-1.3.0-py3-none-any.whl", hash = "sha256:609b0223d8a652f3fae088b7fd29f294fdadaca2d7931d45c27d6c59b02fdf31"},
+]
+
 [[package]]
 name = "exceptiongroup"
 version = "1.2.2"
@@ -479,7 +517,7 @@ files = [
 
 [[package]]
 name = "httpx"
-version = "0.27.0"
+version = "0.27.2"
 requires_python = ">=3.8"
 summary = "The next generation HTTP client."
 groups = ["default"]
@@ -492,20 +530,20 @@ dependencies = [
     "sniffio",
 ]
 files = [
-    {file = "httpx-0.27.0-py3-none-any.whl", hash = "sha256:71d5465162c13681bff01ad59b2cc68dd838ea1f10e51574bac27103f00c91a5"},
-    {file = "httpx-0.27.0.tar.gz", hash = "sha256:a0cb88a46f32dc874e04ee956e4c2764aba2aa228f650b06788ba6bda2962ab5"},
+    {file = "httpx-0.27.2-py3-none-any.whl", hash = "sha256:7bb2708e112d8fdd7829cd4243970f0c223274051cb35ee80c03301ee29a3df0"},
+    {file = "httpx-0.27.2.tar.gz", hash = "sha256:f7c2be1d2f3c3c3160d441802406b206c2b76f5947b11115e6df10c6c65e66c2"},
 ]
 
 [[package]]
 name = "idna"
-version = "3.7"
-requires_python = ">=3.5"
+version = "3.8"
+requires_python = ">=3.6"
 summary = "Internationalized Domain Names in Applications (IDNA)"
 groups = ["default"]
 marker = "python_version == \"3.10\""
 files = [
-    {file = "idna-3.7-py3-none-any.whl", hash = "sha256:82fee1fc78add43492d3a1898bfa6d8a904cc97d8427f683ed8e798d07761aa0"},
-    {file = "idna-3.7.tar.gz", hash = "sha256:028ff3aadf0609c1fd278d8ea3089299412a7a8b9bd005dd08b9f8285bcb5cfc"},
+    {file = "idna-3.8-py3-none-any.whl", hash = "sha256:050b4e5baadcd44d760cedbd2b8e639f2ff89bbc7a5730fcc662954303377aac"},
+    {file = "idna-3.8.tar.gz", hash = "sha256:d838c2c0ed6fced7693d5e8ab8e734d5f8fda53a039c0164afb0b82e771e3603"},
 ]
 
 [[package]]
@@ -613,6 +651,32 @@ files = [
     {file = "pexpect-4.9.0.tar.gz", hash = "sha256:ee7d41123f3c9911050ea2c2dac107568dc43b2d3b0c7557a33212c398ead30f"},
 ]
 
+[[package]]
+name = "pocket"
+version = "0.3.7"
+git = "https://github.com/tapanpandita/pocket.git"
+ref = "v0.3.7"
+revision = "5a144438cc89bfc0ec94db960718ccf1f76468c1"
+summary = "api wrapper for getpocket.com"
+groups = ["default"]
+marker = "python_version == \"3.10\""
+dependencies = [
+    "requests",
+]
+
+[[package]]
+name = "pocket"
+version = "0.3.7"
+git = "https://github.com/tapanpandita/pocket.git"
+ref = "v0.3.7"
+revision = "5a144438cc89bfc0ec94db960718ccf1f76468c1"
+summary = "api wrapper for getpocket.com"
+groups = ["default"]
+marker = "python_version == \"3.10\""
+dependencies = [
+    "requests",
+]
+
 [[package]]
 name = "prompt-toolkit"
 version = "3.0.47"
@@ -739,6 +803,23 @@ files = [
     {file = "pydantic_core-2.20.1.tar.gz", hash = "sha256:26ca695eeee5f9f1aeeb211ffc12f10bcb6f71e2989988fda61dabd65db878d4"},
 ]
 
+[[package]]
+name = "pydantic-pkgr"
+version = "0.1.4"
+requires_python = ">=3.10"
+summary = "System package manager APIs in strongly typed Python"
+groups = ["default"]
+marker = "python_version == \"3.10\""
+dependencies = [
+    "pydantic-core>=2.18.2",
+    "pydantic>=2.7.1",
+    "typing-extensions>=4.11.0",
+]
+files = [
+    {file = "pydantic_pkgr-0.1.4-py3-none-any.whl", hash = "sha256:bd9ddfa8eeb4d361257c4d3d8d36ba44a72515b497ee52cf0763240c66006417"},
+    {file = "pydantic_pkgr-0.1.4.tar.gz", hash = "sha256:e0422022dd83341f1e869a54da9aca903a6407a983ece0735f69493841b0fbb8"},
+]
+
 [[package]]
 name = "pygments"
 version = "2.18.0"
@@ -841,14 +922,14 @@ files = [
 
 [[package]]
 name = "setuptools"
-version = "73.0.1"
+version = "74.0.0"
 requires_python = ">=3.8"
 summary = "Easily download, build, install, upgrade, and uninstall Python packages"
 groups = ["default"]
 marker = "python_version == \"3.10\""
 files = [
-    {file = "setuptools-73.0.1-py3-none-any.whl", hash = "sha256:b208925fcb9f7af924ed2dc04708ea89791e24bde0d3020b27df0e116088b34e"},
-    {file = "setuptools-73.0.1.tar.gz", hash = "sha256:d59a3e788ab7e012ab2c4baed1b376da6366883ee20d7a5fc426816e3d7b1193"},
+    {file = "setuptools-74.0.0-py3-none-any.whl", hash = "sha256:0274581a0037b638b9fc1c6883cc71c0210865aaa76073f7882376b641b84e8f"},
+    {file = "setuptools-74.0.0.tar.gz", hash = "sha256:a85e96b8be2b906f3e3e789adec6a9323abf79758ecfa3065bd740d81158b11e"},
 ]
 
 [[package]]

+ 14 - 8
pyproject.toml

@@ -29,12 +29,9 @@ dependencies = [
     "croniter>=2.0.5",                # for: archivebox schedule
     "ipython>=8.23.0",                # for: archivebox shell
     # Extractor Dependencies
-    "yt-dlp>=2024.4.9",               # for: media
+    "yt-dlp>=2024.8.6",               # for: media
     # "playwright>=1.43.0; platform_machine != 'armv7l'",  # WARNING: playwright doesn't have any sdist, causes trouble on build systems that refuse to install wheel-only packages
-    # TODO: add more extractors
-    #  - gallery-dl
-    #  - scihubdl
-    #  - See Github issues for more...
+
     "django-signal-webhooks>=0.3.0",
     "django-admin-data-views>=0.3.1",
     "ulid-py>=1.1.0",
@@ -43,6 +40,14 @@ dependencies = [
     "django-pydantic-field>=0.3.9",
     "django-jsonform>=2.22.0",
     "django-stubs>=5.0.2",
+
+    # these can be safely omitted when installation subsystem does not provide these as packages (e.g. apt/debian)
+    # archivebox will automatically load fallback vendored copies bundled via archivebox/vendor/__init__.py
+    "pydantic-pkgr>=0.1.4",
+    "atomicwrites==1.4.0",
+    "pocket@git+https://github.com/tapanpandita/[email protected]",
+    "django-taggit==1.3.0",
+    "base32-crockford==0.3.0",
 ]
 
 homepage = "https://github.com/ArchiveBox/ArchiveBox"
@@ -139,7 +144,7 @@ exclude = [
     "**/migrations",
     "archivebox/vendor",
 ]
-stubPath = "./typings"
+stubPath = "./archivebox/typings"
 venvPath = "."
 venv = ".venv"
 # ignore = ["src/oldstuff"]
@@ -169,6 +174,9 @@ debug = [
     "djdt_flamegraph",
     "ipdb",
     "requests-tracker>=0.3.3",
+    "logfire[django]>=0.51.0",
+    "opentelemetry-instrumentation-django>=0.47b0",
+    "opentelemetry-instrumentation-sqlite3>=0.47b0",
 ]
 test = [
     "pytest",
@@ -177,8 +185,6 @@ test = [
 lint = [
     "flake8",
     "mypy",
-]
-dev = [
     "django-autotyping>=0.5.1",
 ]
 

+ 8 - 3
requirements.txt

@@ -5,6 +5,8 @@ annotated-types==0.7.0; python_version == "3.10"
 anyio==4.4.0; python_version == "3.10"
 asgiref==3.8.1; python_version == "3.10"
 asttokens==2.4.1; python_version == "3.10"
+atomicwrites==1.4.0; python_version == "3.10"
+base32-crockford==0.3.0; python_version == "3.10"
 brotli==1.1.0; implementation_name == "cpython" and python_version == "3.10"
 brotlicffi==1.1.0.0; implementation_name != "cpython" and python_version == "3.10"
 certifi==2024.7.4; python_version == "3.10"
@@ -26,13 +28,14 @@ django-settings-holder==0.1.2; python_version == "3.10"
 django-signal-webhooks==0.3.0; python_version == "3.10"
 django-stubs==5.0.4; python_version == "3.10"
 django-stubs-ext==5.0.4; python_version == "3.10"
+django-taggit==1.3.0; python_version == "3.10"
 exceptiongroup==1.2.2; python_version == "3.10"
 executing==2.0.1; python_version == "3.10"
 feedparser==6.0.11; python_version == "3.10"
 h11==0.14.0; python_version == "3.10"
 httpcore==1.0.5; python_version == "3.10"
-httpx==0.27.0; python_version == "3.10"
-idna==3.7; python_version == "3.10"
+httpx==0.27.2; python_version == "3.10"
+idna==3.8; python_version == "3.10"
 ipython==8.26.0; python_version == "3.10"
 jedi==0.19.1; python_version == "3.10"
 matplotlib-inline==0.1.7; python_version == "3.10"
@@ -40,6 +43,7 @@ mutagen==1.47.0; python_version == "3.10"
 mypy-extensions==1.0.0; python_version == "3.10"
 parso==0.8.4; python_version == "3.10"
 pexpect==4.9.0; (sys_platform != "win32" and sys_platform != "emscripten") and python_version == "3.10"
+pocket @ git+https://github.com/tapanpandita/pocket.git@5a144438cc89bfc0ec94db960718ccf1f76468c1 ; python_version == "3.10"
 prompt-toolkit==3.0.47; python_version == "3.10"
 ptyprocess==0.7.0; (sys_platform != "win32" and sys_platform != "emscripten") and python_version == "3.10"
 pure-eval==0.2.3; python_version == "3.10"
@@ -49,6 +53,7 @@ pycparser==2.22; platform_python_implementation != "PyPy" and python_version ==
 pycryptodomex==3.20.0; python_version == "3.10"
 pydantic==2.8.2; python_version == "3.10"
 pydantic-core==2.20.1; python_version == "3.10"
+pydantic-pkgr==0.1.4; python_version == "3.10"
 pygments==2.18.0; python_version == "3.10"
 python-crontab==3.2.0; python_version == "3.10"
 python-dateutil==2.9.0.post0; python_version == "3.10"
@@ -56,7 +61,7 @@ python-ldap==3.4.4; python_version == "3.10"
 pytz==2024.1; python_version == "3.10"
 regex==2024.7.24; python_version == "3.10"
 requests==2.32.3; python_version == "3.10"
-setuptools==73.0.1; python_version == "3.10"
+setuptools==74.0.0; python_version == "3.10"
 sgmllib3k==1.0.0; python_version == "3.10"
 six==1.16.0; python_version == "3.10"
 sniffio==1.3.1; python_version == "3.10"