
New Snapshot detail page UI (#1429)

Nick Sweeting committed 1 year ago
parent commit: 925b6d943d
79 changed files with 4556 additions and 226 deletions
  1. .gitignore (+1 -0)
  2. archivebox/__init__.py (+1 -4)
  3. archivebox/abid_utils/__init__.py (+1 -0)
  4. archivebox/abid_utils/abid.py (+191 -0)
  5. archivebox/abid_utils/apps.py (+7 -0)
  6. archivebox/abid_utils/migrations/__init__.py (+0 -0)
  7. archivebox/abid_utils/models.py (+314 -0)
  8. archivebox/abid_utils/tests.py (+3 -0)
  9. archivebox/api/apps.py (+4 -0)
 10. archivebox/api/migrations/0003_outboundwebhook_apitoken_abid_apitoken_uuid_and_more.py (+60 -0)
 11. archivebox/api/migrations/0004_rename_user_apitoken_created_by_apitoken_modified_and_more.py (+58 -0)
 12. archivebox/api/models.py (+56 -4)
 13. archivebox/api/v1_auth.py (+1 -1)
 14. archivebox/api/v1_core.py (+113 -32)
 15. archivebox/cli/__init__.py (+43 -2)
 16. archivebox/config.py (+14 -2)
 17. archivebox/core/admin.py (+104 -71)
 18. archivebox/core/migrations/0007_archiveresult.py (+0 -2)
 19. archivebox/core/migrations/0023_alter_archiveresult_options_archiveresult_abid_and_more.py (+43 -0)
 20. archivebox/core/migrations/0024_auto_20240513_1143.py (+95 -0)
 21. archivebox/core/migrations/0025_alter_archiveresult_uuid.py (+19 -0)
 22. archivebox/core/migrations/0026_archiveresult_created_archiveresult_created_by_and_more.py (+76 -0)
 23. archivebox/core/models.py (+157 -32)
 24. archivebox/core/settings.py (+78 -14)
 25. archivebox/core/views.py (+99 -13)
 26. archivebox/extractors/__init__.py (+39 -3)
 27. archivebox/extractors/archive_org.py (+5 -3)
 28. archivebox/extractors/dom.py (+6 -3)
 29. archivebox/extractors/favicon.py (+7 -2)
 30. archivebox/extractors/git.py (+15 -2)
 31. archivebox/extractors/headers.py (+7 -3)
 32. archivebox/extractors/htmltotext.py (+8 -2)
 33. archivebox/extractors/media.py (+16 -2)
 34. archivebox/extractors/mercury.py (+9 -3)
 35. archivebox/extractors/pdf.py (+7 -3)
 36. archivebox/extractors/readability.py (+9 -3)
 37. archivebox/extractors/screenshot.py (+5 -2)
 38. archivebox/extractors/singlefile.py (+8 -3)
 39. archivebox/extractors/title.py (+8 -0)
 40. archivebox/extractors/wget.py (+12 -0)
 41. archivebox/index/html.py (+4 -4)
 42. archivebox/index/schema.py (+17 -2)
 43. archivebox/index/sql.py (+4 -3)
 44. archivebox/monkey_patches.py (+16 -0)
 45. archivebox/plugantic/__init__.py (+17 -0)
 46. archivebox/plugantic/admin.py (+26 -0)
 47. archivebox/plugantic/apps.py (+6 -0)
 48. archivebox/plugantic/binaries.py (+323 -0)
 49. archivebox/plugantic/binproviders.py (+561 -0)
 50. archivebox/plugantic/configs.py (+53 -0)
 51. archivebox/plugantic/extractors.py (+118 -0)
 52. archivebox/plugantic/ini_to_toml.py (+396 -0)
 53. archivebox/plugantic/migrations/0001_initial.py (+38 -0)
 54. archivebox/plugantic/migrations/0002_alter_plugin_schema.py (+21 -0)
 55. archivebox/plugantic/migrations/0003_alter_plugin_schema.py (+21 -0)
 56. archivebox/plugantic/migrations/0004_remove_plugin_schema_plugin_configs_plugin_name.py (+32 -0)
 57. archivebox/plugantic/migrations/0005_customplugin_delete_plugin.py (+39 -0)
 58. archivebox/plugantic/migrations/0006_alter_customplugin_path.py (+19 -0)
 59. archivebox/plugantic/migrations/0007_alter_customplugin_path.py (+19 -0)
 60. archivebox/plugantic/migrations/0008_alter_customplugin_path.py (+19 -0)
 61. archivebox/plugantic/migrations/0009_alter_customplugin_path.py (+18 -0)
 62. archivebox/plugantic/migrations/0010_alter_customplugin_path.py (+18 -0)
 63. archivebox/plugantic/migrations/0011_alter_customplugin_path.py (+18 -0)
 64. archivebox/plugantic/migrations/0012_alter_customplugin_path.py (+18 -0)
 65. archivebox/plugantic/migrations/0013_alter_customplugin_path.py (+18 -0)
 66. archivebox/plugantic/migrations/0014_alter_customplugin_path.py (+18 -0)
 67. archivebox/plugantic/migrations/0015_alter_customplugin_path.py (+18 -0)
 68. archivebox/plugantic/migrations/0016_delete_customplugin.py (+16 -0)
 69. archivebox/plugantic/migrations/__init__.py (+0 -0)
 70. archivebox/plugantic/models.py (+50 -0)
 71. archivebox/plugantic/plugins.py (+134 -0)
 72. archivebox/plugantic/replayers.py (+26 -0)
 73. archivebox/plugantic/tests.py (+3 -0)
 74. archivebox/plugantic/views.py (+169 -0)
 75. archivebox/search/__init__.py (+5 -5)
 76. archivebox/templates/admin/base.html (+12 -0)
 77. archivebox/templates/admin/snapshots_grid.html (+1 -1)
 78. archivebox/templates/core/snapshot_live.html (+545 -0)
 79. pyproject.toml (+21 -0)

+ 1 - 0
.gitignore

@@ -29,6 +29,7 @@ dist/
 data/
 data*/
 output/
+index.sqlite3
 
 # vim
 *.sw?

+ 1 - 4
archivebox/__init__.py

@@ -1,7 +1,4 @@
 __package__ = 'archivebox'
 
 
-# monkey patch django timezone to add back utc (it was removed in Django 5.0)
-import datetime
-from django.utils import timezone
-timezone.utc = datetime.timezone.utc
+from .monkey_patches import *

+ 1 - 0
archivebox/abid_utils/__init__.py

@@ -0,0 +1 @@
+__package__ = 'abid_utils'

+ 191 - 0
archivebox/abid_utils/abid.py

@@ -0,0 +1,191 @@
+from typing import NamedTuple, Any, Union, Optional
+
+import ulid
+import uuid6
+import hashlib
+from urllib.parse import urlparse
+
+from uuid import UUID
+from typeid import TypeID            # type: ignore[import-untyped]
+from datetime import datetime
+
+
+
+ABID_PREFIX_LEN = 4
+ABID_SUFFIX_LEN = 26
+ABID_LEN = 30
+ABID_TS_LEN = 10
+ABID_URI_LEN = 8
+ABID_SUBTYPE_LEN = 2
+ABID_RAND_LEN = 6
+
+DEFAULT_ABID_PREFIX = 'obj_'
+
+
+class ABID(NamedTuple):
+    """
+    e.g. ABID('obj_01HX9FPYTRE4A5CCD901ZYEBQE')
+    """
+    prefix: str            # e.g. obj_
+    ts: str                # e.g. 01HX9FPYTR
+    uri: str               # e.g. E4A5CCD9
+    subtype: str           # e.g. 01
+    rand: str              # e.g. ZYEBQE
+
+    def __getattr__(self, attr: str) -> Any:
+        return getattr(self.ulid, attr)
+
+    def __eq__(self, other: Any) -> bool:
+        try:
+            return self.ulid == other.ulid
+        except AttributeError:
+            return NotImplemented
+
+    def __str__(self) -> str:
+        return self.prefix + self.suffix
+
+    def __len__(self) -> int:
+        return len(self.prefix + self.suffix)
+
+    @classmethod
+    def parse(cls, buffer: Union[str, UUID, ulid.ULID, TypeID, 'ABID'], prefix=DEFAULT_ABID_PREFIX) -> 'ABID':
+        assert buffer, f'Attempted to create ABID from null value {buffer}'
+
+        buffer = str(buffer)
+        if '_' in buffer:
+            prefix, suffix = buffer.split('_')
+        else:
+            prefix, suffix = prefix.strip('_'), buffer
+
+        assert len(prefix) == ABID_PREFIX_LEN - 1   # length without trailing _
+        assert len(suffix) == ABID_SUFFIX_LEN, f'Suffix {suffix} from {buffer} was not {ABID_SUFFIX_LEN} chars long'
+
+        return cls(
+            prefix=abid_part_from_prefix(prefix),
+            ts=suffix[0:10].upper(),
+            uri=suffix[10:18].upper(),
+            subtype=suffix[18:20].upper(),
+            rand=suffix[20:26].upper(),
+        )
+
+    @property
+    def suffix(self):
+        return ''.join((self.ts, self.uri, self.subtype, self.rand))
+    
+    @property
+    def ulid(self) -> ulid.ULID:
+        return ulid.parse(self.suffix)
+
+    @property
+    def uuid(self) -> UUID:
+        return self.ulid.uuid
+
+    @property
+    def uuid6(self) -> uuid6.UUID:
+        return uuid6.UUID(hex=self.uuid.hex)
+
+    @property
+    def typeid(self) -> TypeID:
+        return TypeID.from_uuid(prefix=self.prefix.strip('_'), suffix=self.uuid6)
+
+    @property
+    def datetime(self) -> datetime:
+        return self.ulid.timestamp().datetime
+
+
+
+####################################################
+
+
+def uri_hash(uri: Union[str, bytes]) -> str:
+    """
+    'E4A5CCD9AF4ED2A6E0954DF19FD274E9CDDB4853051F033FD518BFC90AA1AC25'
+    """
+    if isinstance(uri, bytes):
+        uri_str: str = uri.decode()
+    else:
+        uri_str = uri
+
+    # only hash the domain part of URLs
+    if '://' in uri_str:
+        try:
+            domain = urlparse(uri_str).netloc
+            if domain:
+                uri_str = domain
+        except AttributeError:
+            pass
+    
+    uri_bytes = uri_str.encode('utf-8')
+
+    return hashlib.sha256(uri_bytes).hexdigest().upper()
+
+def abid_part_from_prefix(prefix: Optional[str]) -> str:
+    """
+    'snp_'
+    """
+    if prefix is None:
+        return 'obj_'
+
+    prefix = prefix.strip('_').lower()
+    assert len(prefix) == 3
+    return prefix + '_'
+
+def abid_part_from_uri(uri: str) -> str:
+    """
+    'E4A5CCD9'     # takes first 8 characters of sha256(url)
+    """
+    uri = str(uri)
+    return uri_hash(uri)[:ABID_URI_LEN]
+
+def abid_part_from_ts(ts: Optional[datetime]) -> str:
+    """
+    '01HX9FPYTR'   # produces 10 character Timestamp section of ulid based on added date
+    """
+    return str(ulid.from_timestamp(ts) if ts else ulid.new())[:ABID_TS_LEN]
+
+def abid_part_from_subtype(subtype: str) -> str:
+    """
+    Snapshots have 01 type, other objects have other subtypes like wget/media/etc.
+    Also allows us to change the ulid spec later by putting special sigil values here.
+    """
+    subtype = str(subtype)
+    if len(subtype) == ABID_SUBTYPE_LEN:
+        return subtype
+
+    return hashlib.sha256(subtype.encode('utf-8')).hexdigest()[:ABID_SUBTYPE_LEN].upper()
+
+def abid_part_from_rand(rand: Union[str, UUID, None, int]) -> str:
+    """
+    'ZYEBQE'   # takes last 6 characters of randomness from existing legacy uuid db field
+    """
+    if rand is None:
+        # if it's None we take the last 6 characters of a freshly generated ULID
+        return str(ulid.new())[-ABID_RAND_LEN:]
+    elif isinstance(rand, UUID):
+        # if it's a uuid we take the last 6 characters of the ULID representation of it
+        return str(ulid.from_uuid(rand))[-ABID_RAND_LEN:]
+    elif isinstance(rand, int):
+        # if it's a BigAutoInteger field we convert it from an int to a 0-padded string
+        rand_str = str(rand)[-ABID_RAND_LEN:]
+        padding_needed = ABID_RAND_LEN - len(rand_str)
+        rand_str = ('0'*padding_needed) + rand_str
+        return rand_str
+
+    # otherwise treat it as a string, take the last 6 characters of it verbatim
+    return str(rand)[-ABID_RAND_LEN:].upper()
+
+
+def abid_from_values(prefix, ts, uri, subtype, rand) -> ABID:
+    """
+    Return a freshly derived ABID (assembled from attrs defined in ABIDModel.abid_*_src).
+    """
+
+    abid = ABID(
+        prefix=abid_part_from_prefix(prefix),
+        ts=abid_part_from_ts(ts),
+        uri=abid_part_from_uri(uri),
+        subtype=abid_part_from_subtype(subtype),
+        rand=abid_part_from_rand(rand),
+    )
+    assert abid.ulid and abid.uuid and abid.typeid, f'Failed to calculate {prefix}_ABID for ts={ts} uri={uri} subtype={subtype} rand={rand}'
+    return abid
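
For reference, a minimal sketch of how these helpers compose an ABID (names match the module above; the input values are made up for illustration):

    from datetime import datetime, timezone

    from abid_utils.abid import ABID, abid_from_values

    abid = abid_from_values(
        prefix='snp',                                   # 3-char model prefix ('_' is appended)
        ts=datetime(2024, 5, 13, tzinfo=timezone.utc),  # -> 10-char ULID timestamp section
        uri='https://example.com',                      # -> first 8 chars of sha256(domain)
        subtype='01',                                   # -> 2-char subtype section
        rand=1234,                                      # -> zero-padded 6-char randomness section
    )
    print(str(abid))                       # 'snp_' + 26-char suffix (30 chars total)
    assert ABID.parse(str(abid)) == abid   # round-trips through ABID.parse()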

+ 7 - 0
archivebox/abid_utils/apps.py

@@ -0,0 +1,7 @@
+from django.apps import AppConfig
+
+
+class AbidUtilsConfig(AppConfig):
+    default_auto_field = 'django.db.models.BigAutoField'
+    
+    name = 'abid_utils'

+ 0 - 0
archivebox/index.sqlite3 → archivebox/abid_utils/migrations/__init__.py


+ 314 - 0
archivebox/abid_utils/models.py

@@ -0,0 +1,314 @@
+"""
+This file provides the Django ABIDField and ABIDModel base model to inherit from.
+
+It implements the ArchiveBox ID (ABID) interfaces including abid_values, get_abid, .abid, .uuid, .id.
+"""
+
+from typing import Any, Dict, Union, List, Set, NamedTuple, cast
+
+from ulid import ULID
+from uuid import uuid4, UUID
+from typeid import TypeID            # type: ignore[import-untyped]
+from datetime import datetime
+from functools import partial
+from charidfield import CharIDField  # type: ignore[import-untyped]
+
+from django.conf import settings
+from django.db import models
+from django.db.utils import OperationalError
+from django.contrib.auth import get_user_model
+
+from django_stubs_ext.db.models import TypedModelMeta
+
+from .abid import (
+    ABID,
+    ABID_LEN,
+    ABID_RAND_LEN,
+    ABID_SUFFIX_LEN,
+    DEFAULT_ABID_PREFIX,
+    abid_part_from_prefix,
+    abid_from_values
+)
+
+####################################################
+
+
+# Database Field for typeid/ulid style IDs with a prefix, e.g. snp_01BJQMF54D093DXEAWZ6JYRPAQ
+ABIDField = partial(
+    CharIDField,
+    max_length=ABID_LEN,
+    help_text="ABID-format identifier for this entity (e.g. snp_01BJQMF54D093DXEAWZ6JYRPAQ)",
+    default=None,
+    null=True,
+    blank=True,
+    db_index=True,
+    unique=True,
+)
+
+def get_or_create_system_user_pk(username='system'):
+    """Get or create a system user with is_superuser=True to be the default owner for new DB rows"""
+
+    User = get_user_model()
+
+    # if only one user exists total, return that user
+    if User.objects.filter(is_superuser=True).count() == 1:
+        return User.objects.filter(is_superuser=True).values_list('pk', flat=True)[0]
+
+    # otherwise, create a dedicated "system" user
+    user, created = User.objects.get_or_create(username=username, is_staff=True, is_superuser=True, defaults={'email': '', 'password': ''})
+    return user.pk
+
+
+class ABIDModel(models.Model):
+    """
+    Abstract Base Model for other models to depend on. Provides ArchiveBox ID (ABID) interface.
+    """
+    abid_prefix: str = DEFAULT_ABID_PREFIX  # e.g. 'tag_'
+    abid_ts_src = 'None'                    # e.g. 'self.created'
+    abid_uri_src = 'None'                   # e.g. 'self.uri'
+    abid_subtype_src = 'None'               # e.g. 'self.extractor'
+    abid_rand_src = 'None'                  # e.g. 'self.uuid' or 'self.id'
+
+    id = models.UUIDField(primary_key=True, default=uuid4, editable=True)
+    uuid = models.UUIDField(blank=True, null=True, editable=True, unique=True)
+    abid = ABIDField(prefix=abid_prefix)
+
+    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk)
+    created = models.DateTimeField(auto_now_add=True)
+    modified = models.DateTimeField(auto_now=True)
+
+    class Meta(TypedModelMeta):
+        abstract = True
+
+    def save(self, *args: Any, **kwargs: Any) -> None:
+        if hasattr(self, 'abid'):
+            # self.abid = ABID.parse(self.abid) if self.abid else self.get_abid()
+            self.abid = self.get_abid()
+        else:
+            print(f'[!] WARNING: {self.__class__.__name__}.abid is not a DB field so ABID will not be persisted!')
+            self.abid = self.get_abid()
+        
+        super().save(*args, **kwargs)
+
+    @property
+    def abid_values(self) -> Dict[str, Any]:
+        return {
+            'prefix': self.abid_prefix,
+            'ts': eval(self.abid_ts_src),
+            'uri': eval(self.abid_uri_src),
+            'subtype': eval(self.abid_subtype_src),
+            'rand': eval(self.abid_rand_src),
+        }
+
+    def get_abid(self) -> ABID:
+        """
+        Return a freshly derived ABID (assembled from attrs defined in ABIDModel.abid_*_src).
+        """
+        prefix, ts, uri, subtype, rand = self.abid_values.values()
+
+        if (not prefix) or prefix == DEFAULT_ABID_PREFIX:
+            suggested_abid = self.__class__.__name__[:3].lower()
+            raise Exception(f'{self.__class__.__name__}.abid_prefix must be defined to calculate ABIDs (suggested: {suggested_abid})')
+
+        if not ts:
+            ts = datetime.utcfromtimestamp(0)
+            print(f'[!] WARNING: Generating ABID with ts=0000000000 placeholder because {self.__class__.__name__}.abid_ts_src={self.abid_ts_src} is unset!', ts.isoformat())
+
+        if not uri:
+            uri = str(self)
+            print(f'[!] WARNING: Generating ABID with uri=str(self) placeholder because {self.__class__.__name__}.abid_uri_src={self.abid_uri_src} is unset!', uri)
+
+        if not subtype:
+            subtype = self.__class__.__name__
+            print(f'[!] WARNING: Generating ABID with subtype={subtype} placeholder because {self.__class__.__name__}.abid_subtype_src={self.abid_subtype_src} is unset!', subtype)
+
+        if not rand:
+            rand = getattr(self, 'uuid', None) or getattr(self, 'id', None) or getattr(self, 'pk')
+            print(f'[!] WARNING: Generating ABID with rand=self.id placeholder because {self.__class__.__name__}.abid_rand_src={self.abid_rand_src} is unset!', rand)
+
+        abid = abid_from_values(
+            prefix=prefix,
+            ts=ts,
+            uri=uri,
+            subtype=subtype,
+            rand=rand,
+        )
+        assert abid.ulid and abid.uuid and abid.typeid, f'Failed to calculate {prefix}_ABID for {self.__class__.__name__}'
+        return abid
+
+    @property
+    def ABID(self) -> ABID:
+        """
+        ULIDParts(timestamp='01HX9FPYTR', url='E4A5CCD9', subtype='00', randomness='ZYEBQE')
+        """
+        return ABID.parse(self.abid) if getattr(self, 'abid', None) else self.get_abid()
+
+    @property
+    def ULID(self) -> ULID:
+        """
+        Get a ulid.ULID representation of the object's ABID.
+        """
+        return self.ABID.ulid
+
+    @property
+    def UUID(self) -> UUID:
+        """
+        Get a uuid.UUID (v4) representation of the object's ABID.
+        """
+        return self.ABID.uuid
+
+    @property
+    def TypeID(self) -> TypeID:
+        """
+        Get a typeid.TypeID (stripe-style) representation of the object's ABID.
+        """
+        return self.ABID.typeid
+
+
+
+####################################################
+
+# Django helpers
+def find_all_abid_prefixes() -> Dict[str, type[models.Model]]:
+    """
+    Return the mapping of all ABID prefixes to their models.
+    e.g. {'tag_': core.models.Tag, 'snp_': core.models.Snapshot, ...}
+    """
+    import django.apps
+    prefix_map = {}
+
+    for model in django.apps.apps.get_models():
+        abid_prefix = getattr(model, 'abid_prefix', None)
+        if abid_prefix:
+            prefix_map[abid_prefix] = model
+    return prefix_map
+
+def find_prefix_for_abid(abid: ABID) -> str:
+    """
+    Find the correct prefix for a given ABID that may be missing its prefix (slow).
+    e.g. ABID('obj_01BJQMF54D093DXEAWZ6JYRPAQ') -> 'snp_'
+    """
+    # if existing abid prefix is correct, lookup is easy
+    model = find_model_from_abid(abid)
+    if model:
+        assert issubclass(model, ABIDModel)
+        return model.abid_prefix
+
+    # prefix might be obj_ or missing, fuzzy-search to find any object that matches
+    return find_obj_from_abid_rand(abid)[0].abid_prefix
+
+def find_model_from_abid_prefix(prefix: str) -> type[ABIDModel] | None:
+    """
+    Return the Django Model that corresponds to a given ABID prefix.
+    e.g. 'tag_' -> core.models.Tag
+    """
+    prefix = abid_part_from_prefix(prefix)
+
+    import django.apps
+
+    for model in django.apps.apps.get_models():
+        if not issubclass(model, ABIDModel): continue   # skip non-ABID-enabled models
+        if not hasattr(model, 'objects'): continue      # skip abstract models
+
+        if (model.abid_prefix == prefix):
+            return model
+
+    return None
+
+def find_model_from_abid(abid: ABID) -> type[models.Model] | None:
+    """
+    Shortcut for find_model_from_abid_prefix(abid.prefix)
+    """
+    return find_model_from_abid_prefix(abid.prefix)
+
+def find_obj_from_abid_rand(rand: Union[ABID, str], model=None) -> List[ABIDModel]:
+    """
+    Find an object corresponding to an ABID by exhaustively searching using its random suffix (slow).
+    e.g. 'obj_....................JYRPAQ' -> Snapshot('snp_01BJQMF54D093DXEAWZ6JYRPAQ')
+    """
+
+    # convert str to ABID if necessary
+    if isinstance(rand, ABID):
+        abid: ABID = rand
+    else:
+        rand = str(rand)
+        if len(rand) < ABID_SUFFIX_LEN:
+            padding_needed = ABID_SUFFIX_LEN - len(rand)
+            rand = ('0'*padding_needed) + rand
+        abid = ABID.parse(rand)
+
+    import django.apps
+
+    partial_matches: List[ABIDModel] = []
+
+    models_to_try = cast(Set[type[models.Model]], set(filter(bool, (
+        model,
+        find_model_from_abid(abid),
+        *django.apps.apps.get_models(),
+    ))))
+    # print(abid, abid.rand, abid.uuid, models_to_try)
+
+    for model in models_to_try:
+        if not issubclass(model, ABIDModel): continue   # skip Models that aren't ABID-enabled
+        if not hasattr(model, 'objects'): continue      # skip abstract Models
+        assert hasattr(model, 'objects')                # force-fix for type hint nit about missing manager https://github.com/typeddjango/django-stubs/issues/1684
+
+        # continue on to try fuzzy searching by randomness portion derived from uuid field
+        try:
+            qs = []
+            if hasattr(model, 'abid'):
+                qs = model.objects.filter(abid__endswith=abid.rand)
+            elif hasattr(model, 'uuid'):
+                qs = model.objects.filter(uuid__endswith=str(abid.uuid)[-ABID_RAND_LEN:])
+            elif hasattr(model, 'id'):
+                # NOTE: this only works on SQLite where every column is a string
+                # other DB backends like postgres don't let you do __endswith if this is a BigAutoInteger field
+                
+                # try to search for uuid=...-2354352
+                # try to search for id=...2354352
+                # try to search for id=2354352
+                qs = model.objects.filter(
+                    models.Q(id__endswith=str(abid.uuid)[-ABID_RAND_LEN:])
+                    | models.Q(id__endswith=abid.rand)
+                    | models.Q(id__startswith=str(int(abid.rand)) if abid.rand.isdigit() else abid.rand)
+                )
+
+            for obj in qs:
+                if obj.get_abid() == abid:
+                    # found exact match, no need to keep iterating
+                    return [obj]
+                partial_matches.append(obj)
+        except OperationalError as err:
+            print(f'[!] WARNING: Got error while trying to iterate through QuerySet for {model}:', err, '\n')
+
+    return partial_matches
+
+def find_obj_from_abid(abid: ABID, model=None, fuzzy=False) -> Any:
+    """
+    Find an object with a given ABID by filtering possible models for a matching abid/uuid/id (fast).
+    e.g. 'snp_01BJQMF54D093DXEAWZ6JYRPAQ' -> Snapshot('snp_01BJQMF54D093DXEAWZ6JYRPAQ')
+    """
+
+    model = model or find_model_from_abid(abid)
+    assert model, f'Could not find model that could match this ABID type: {abid}'
+
+    try:
+        if hasattr(model, 'abid'):
+            return model.objects.get(abid__endswith=abid.suffix)
+        if hasattr(model, 'uuid'):
+            return model.objects.get(uuid=abid.uuid)
+        return model.objects.get(id=abid.uuid)
+    except model.DoesNotExist:
+        # if the model has an abid field then it should've matched, pointless to fuzzy search in that case
+        if hasattr(model, 'abid') or (not fuzzy):
+            raise
+
+    # continue on to try fuzzy searching by randomness portion derived from uuid field
+    match_by_rand = find_obj_from_abid_rand(abid, model=model)
+    if match_by_rand:
+        if match_by_rand[0].abid_prefix != abid.prefix:
+            print(f"[!] WARNING: fetched object {match_by_rand} even though prefix {abid.prefix} doesn't match!", abid, '\n')
+        return match_by_rand
+
+    raise model.DoesNotExist
+
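
To illustrate how a model opts into this interface, here is a hypothetical subclass (not part of this commit) wiring up the abid_*_src attributes that save()/get_abid() eval against the instance:

    from django.db import models

    from abid_utils.models import ABIDModel

    class Bookmark(ABIDModel):               # hypothetical example model
        abid_prefix = 'bkm_'                 # 3 chars plus trailing underscore
        abid_ts_src = 'self.created'         # eval'd to fill the timestamp section
        abid_uri_src = 'self.url'            # eval'd to fill the uri-hash section
        abid_subtype_src = '"01"'            # string literals are eval'd too
        abid_rand_src = 'self.id'            # eval'd to fill the randomness section

        url = models.URLField()

    # Bookmark().save() then computes and persists .abid via get_abid()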

+ 3 - 0
archivebox/abid_utils/tests.py

@@ -0,0 +1,3 @@
+from django.test import TestCase
+
+# Create your tests here.

+ 4 - 0
archivebox/api/apps.py

@@ -3,5 +3,9 @@ __package__ = 'archivebox.api'
 from django.apps import AppConfig
 
 
+
 class APIConfig(AppConfig):
     name = 'api'
+
+    def ready(self):
+        pass

+ 60 - 0
archivebox/api/migrations/0003_outboundwebhook_apitoken_abid_apitoken_uuid_and_more.py

@@ -0,0 +1,60 @@
+# Generated by Django 5.0.6 on 2024-05-13 10:58
+
+import charidfield.fields
+import signal_webhooks.fields
+import signal_webhooks.utils
+import uuid
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('api', '0002_alter_apitoken_options'),
+    ]
+
+    operations = [
+        migrations.CreateModel(
+            name='OutboundWebhook',
+            fields=[
+                ('name', models.CharField(db_index=True, help_text='Give your webhook a descriptive name (e.g. Notify ACME Slack channel of any new ArchiveResults).', max_length=255, unique=True, verbose_name='name')),
+                ('signal', models.CharField(choices=[('CREATE', 'Create'), ('UPDATE', 'Update'), ('DELETE', 'Delete'), ('M2M', 'M2M changed'), ('CREATE_OR_UPDATE', 'Create or Update'), ('CREATE_OR_DELETE', 'Create or Delete'), ('CREATE_OR_M2M', 'Create or M2M changed'), ('UPDATE_OR_DELETE', 'Update or Delete'), ('UPDATE_OR_M2M', 'Update or M2M changed'), ('DELETE_OR_M2M', 'Delete or M2M changed'), ('CREATE_UPDATE_OR_DELETE', 'Create, Update or Delete'), ('CREATE_UPDATE_OR_M2M', 'Create, Update or M2M changed'), ('CREATE_DELETE_OR_M2M', 'Create, Delete or M2M changed'), ('UPDATE_DELETE_OR_M2M', 'Update, Delete or M2M changed'), ('CREATE_UPDATE_DELETE_OR_M2M', 'Create, Update or Delete, or M2M changed')], help_text='The type of event the webhook should fire for (e.g. Create, Update, Delete).', max_length=255, verbose_name='signal')),
+                ('ref', models.CharField(db_index=True, help_text='Dot import notation of the model the webhook should fire for (e.g. core.models.Snapshot or core.models.ArchiveResult).', max_length=1023, validators=[signal_webhooks.utils.model_from_reference], verbose_name='referenced model')),
+                ('endpoint', models.URLField(help_text='External URL to POST the webhook notification to (e.g. https://someapp.example.com/webhook/some-webhook-receiver).', max_length=2047, verbose_name='endpoint')),
+                ('headers', models.JSONField(blank=True, default=dict, help_text='Headers to send with the webhook request.', validators=[signal_webhooks.utils.is_dict], verbose_name='headers')),
+                ('auth_token', signal_webhooks.fields.TokenField(blank=True, default='', help_text='Authentication token to use in an Authorization header.', max_length=8000, validators=[signal_webhooks.utils.decode_cipher_key], verbose_name='authentication token')),
+                ('enabled', models.BooleanField(default=True, help_text='Is this webhook enabled?', verbose_name='enabled')),
+                ('keep_last_response', models.BooleanField(default=False, help_text='Should the webhook keep a log of the latest response it got?', verbose_name='keep last response')),
+                ('created', models.DateTimeField(auto_now_add=True, help_text='When the webhook was created.', verbose_name='created')),
+                ('updated', models.DateTimeField(auto_now=True, help_text='When the webhook was last updated.', verbose_name='updated')),
+                ('last_response', models.CharField(blank=True, default='', help_text='Latest response to this webhook.', max_length=8000, verbose_name='last response')),
+                ('last_success', models.DateTimeField(default=None, help_text='When the webhook last succeeded.', null=True, verbose_name='last success')),
+                ('last_failure', models.DateTimeField(default=None, help_text='When the webhook last failed.', null=True, verbose_name='last failure')),
+                ('uuid', models.UUIDField(default=uuid.uuid4, primary_key=True, serialize=False)),
+                ('abid', charidfield.fields.CharIDField(blank=True, db_index=True, default=None, help_text='ABID-format identifier for this entity (e.g. snp_01BJQMF54D093DXEAWZ6JYRPAQ)', max_length=30, null=True, prefix='whk', unique=True)),
+            ],
+            options={
+                'verbose_name': 'API Outbound Webhook',
+                'abstract': False,
+            },
+        ),
+        migrations.AddField(
+            model_name='apitoken',
+            name='abid',
+            field=charidfield.fields.CharIDField(blank=True, db_index=True, default=None, help_text='ABID-format identifier for this entity (e.g. snp_01BJQMF54D093DXEAWZ6JYRPAQ)', max_length=30, null=True, prefix='apt', unique=True),
+        ),
+        migrations.AddField(
+            model_name='apitoken',
+            name='uuid',
+            field=models.UUIDField(blank=True, null=True, unique=True),
+        ),
+        migrations.AlterField(
+            model_name='apitoken',
+            name='id',
+            field=models.UUIDField(default=uuid.uuid4, primary_key=True, serialize=False),
+        ),
+        migrations.AddConstraint(
+            model_name='outboundwebhook',
+            constraint=models.UniqueConstraint(fields=('ref', 'endpoint'), name='prevent_duplicate_hooks_api_outboundwebhook'),
+        ),
+    ]

+ 58 - 0
archivebox/api/migrations/0004_rename_user_apitoken_created_by_apitoken_modified_and_more.py

@@ -0,0 +1,58 @@
+# Generated by Django 5.0.6 on 2024-05-13 14:36
+
+import abid_utils.models
+import charidfield.fields
+import django.db.models.deletion
+from django.conf import settings
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('api', '0003_outboundwebhook_apitoken_abid_apitoken_uuid_and_more'),
+        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
+    ]
+
+    operations = [
+        migrations.RenameField(
+            model_name='apitoken',
+            old_name='user',
+            new_name='created_by',
+        ),
+        migrations.AddField(
+            model_name='apitoken',
+            name='modified',
+            field=models.DateTimeField(auto_now=True),
+        ),
+        migrations.AddField(
+            model_name='outboundwebhook',
+            name='created_by',
+            field=models.ForeignKey(default=abid_utils.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
+        ),
+        migrations.AddField(
+            model_name='outboundwebhook',
+            name='id',
+            field=models.UUIDField(blank=True, null=True, unique=True),
+        ),
+        migrations.AddField(
+            model_name='outboundwebhook',
+            name='modified',
+            field=models.DateTimeField(auto_now=True),
+        ),
+        migrations.AlterField(
+            model_name='apitoken',
+            name='abid',
+            field=charidfield.fields.CharIDField(blank=True, db_index=True, default=None, help_text='ABID-format identifier for this entity (e.g. snp_01BJQMF54D093DXEAWZ6JYRPAQ)', max_length=30, null=True, prefix='apt_', unique=True),
+        ),
+        migrations.AlterField(
+            model_name='outboundwebhook',
+            name='abid',
+            field=charidfield.fields.CharIDField(blank=True, db_index=True, default=None, help_text='ABID-format identifier for this entity (e.g. snp_01BJQMF54D093DXEAWZ6JYRPAQ)', max_length=30, null=True, prefix='whk_', unique=True),
+        ),
+        migrations.AlterField(
+            model_name='outboundwebhook',
+            name='created',
+            field=models.DateTimeField(auto_now_add=True),
+        ),
+    ]

+ 56 - 4
archivebox/api/models.py

@@ -8,22 +8,39 @@ from django.conf import settings
 from django.db import models
 from django.utils import timezone
 
+from signal_webhooks.models import WebhookBase
+
 from django_stubs_ext.db.models import TypedModelMeta
 
+from abid_utils.models import ABIDModel, ABIDField
+
 
 def generate_secret_token() -> str:
     # returns cryptographically secure string with len() == 32
     return secrets.token_hex(16)
 
 
-class APIToken(models.Model):
-    id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)
+class APIToken(ABIDModel):
+    """
+    A secret key generated by a User that's used to authenticate REST API requests to ArchiveBox.
+    """
+    # ABID: apt_<created_ts>_<token_hash>_<user_id_hash>_<uuid_rand>
+    abid_prefix = 'apt_'
+    abid_ts_src = 'self.created'
+    abid_uri_src = 'self.token'
+    abid_subtype_src = 'self.created_by_id'
+    abid_rand_src = 'self.id'
 
-    user = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE)
+    id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=True)
+    uuid = models.UUIDField(blank=True, null=True, editable=True, unique=True)
+    abid = ABIDField(prefix=abid_prefix)
+
+    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE)
     token = models.CharField(max_length=32, default=generate_secret_token, unique=True)
     
     created = models.DateTimeField(auto_now_add=True)
     expires = models.DateTimeField(null=True, blank=True)
+    
 
     class Meta(TypedModelMeta):
         verbose_name = "API Key"
@@ -38,7 +55,8 @@ class APIToken(models.Model):
     def __json__(self) -> dict:
         return {
             "TYPE":             "APIToken",    
-            "id":               str(self.id),
+            "uuid":             str(self.id),
+            "abid":             str(self.get_abid()),
             "user_id":          str(self.created_by_id),
             "user_username":    self.created_by.username,
             "token":            self.token,
@@ -61,3 +79,37 @@ class APIToken(models.Model):
 
         return True
 
+
+
+
+
+
+# monkey patch django-signals-webhooks to change how it shows up in Admin UI
+
+class OutboundWebhook(ABIDModel, WebhookBase):
+    """
+    Model used in place of (extending) signals_webhooks.models.WebhookModel. Swapped using:
+        settings.SIGNAL_WEBHOOKS_CUSTOM_MODEL = 'api.models.OutboundWebhook'
+    """
+    abid_prefix = 'whk_'
+    abid_ts_src = 'self.created'
+    abid_uri_src = 'self.endpoint'
+    abid_subtype_src = 'self.ref'
+    abid_rand_src = 'self.id'
+
+    id = models.UUIDField(blank=True, null=True, unique=True, editable=True)
+    uuid = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=True)
+    abid = ABIDField(prefix=abid_prefix)
+
+    WebhookBase._meta.get_field('name').help_text = (
+        'Give your webhook a descriptive name (e.g. Notify ACME Slack channel of any new ArchiveResults).')
+    WebhookBase._meta.get_field('signal').help_text = (
+        'The type of event the webhook should fire for (e.g. Create, Update, Delete).')
+    WebhookBase._meta.get_field('ref').help_text = (
+        'Dot import notation of the model the webhook should fire for (e.g. core.models.Snapshot or core.models.ArchiveResult).')
+    WebhookBase._meta.get_field('endpoint').help_text = (
+        'External URL to POST the webhook notification to (e.g. https://someapp.example.com/webhook/some-webhook-receiver).')
+
+    class Meta(WebhookBase.Meta):
+        verbose_name = 'API Outbound Webhook'
+
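
As the docstring above notes, django-signal-webhooks is pointed at this replacement model via its swappable-model setting, e.g. in settings.py:

    SIGNAL_WEBHOOKS_CUSTOM_MODEL = 'api.models.OutboundWebhook'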

+ 1 - 1
archivebox/api/v1_auth.py

@@ -47,6 +47,6 @@ def check_api_token(request, token_data: TokenAuthSchema):
         request=request,
     )
     if user:
-        return {"success": True, "user_id": str(user.id)}
+        return {"success": True, "user_id": str(user.pk)}
     
     return {"success": False, "user_id": None}

+ 113 - 32
archivebox/api/v1_core.py

@@ -4,13 +4,14 @@ from uuid import UUID
 from typing import List, Optional
 from datetime import datetime
 
+from django.db.models import Q
 from django.shortcuts import get_object_or_404
 
 from ninja import Router, Schema, FilterSchema, Field, Query
 from ninja.pagination import paginate
 
 from core.models import Snapshot, ArchiveResult, Tag
-
+from abid_utils.abid import ABID
 
 router = Router(tags=['Core Models'])
 
@@ -20,24 +21,39 @@ router = Router(tags=['Core Models'])
 ### ArchiveResult #########################################################################
 
 class ArchiveResultSchema(Schema):
-    id: UUID
+    abid: str
+    uuid: UUID
+    pk: str
+    modified: datetime
+    created: datetime
+    created_by_id: str
 
-    snapshot_id: UUID
+    snapshot_abid: str
     snapshot_url: str
     snapshot_tags: str
 
     extractor: str
+    cmd_version: str
     cmd: List[str]
     pwd: str
-    cmd_version: str
-    output: str
     status: str
+    output: str
 
-    created: datetime
+    @staticmethod
+    def resolve_created_by_id(obj):
+        return str(obj.created_by_id)
+
+    @staticmethod
+    def resolve_pk(obj):
+        return str(obj.pk)
+
+    @staticmethod
+    def resolve_uuid(obj):
+        return str(obj.uuid)
 
     @staticmethod
-    def resolve_id(obj):
-        return obj.uuid
+    def resolve_abid(obj):
+        return str(obj.ABID)
 
     @staticmethod
     def resolve_created(obj):
@@ -47,18 +63,23 @@ class ArchiveResultSchema(Schema):
     def resolve_snapshot_url(obj):
         return obj.snapshot.url
 
+    @staticmethod
+    def resolve_snapshot_abid(obj):
+        return str(obj.snapshot.ABID)
+
     @staticmethod
     def resolve_snapshot_tags(obj):
         return obj.snapshot.tags_str()
 
 
 class ArchiveResultFilterSchema(FilterSchema):
-    id: Optional[UUID] = Field(None, q='uuid')
+    uuid: Optional[UUID] = Field(None, q='uuid')
+    # abid: Optional[str] = Field(None, q='abid')
 
     search: Optional[str] = Field(None, q=['snapshot__url__icontains', 'snapshot__title__icontains', 'snapshot__tags__name__icontains', 'extractor', 'output__icontains'])
-    snapshot_id: Optional[UUID] = Field(None, q='snapshot_id')
-    snapshot_url: Optional[str] = Field(None, q='snapshot__url')
-    snapshot_tag: Optional[str] = Field(None, q='snapshot__tags__name')
+    snapshot_uuid: Optional[UUID] = Field(None, q='snapshot__uuid__icontains')
+    snapshot_url: Optional[str] = Field(None, q='snapshot__url__icontains')
+    snapshot_tag: Optional[str] = Field(None, q='snapshot__tags__name__icontains')
     
     status: Optional[str] = Field(None, q='status')
     output: Optional[str] = Field(None, q='output__icontains')
@@ -75,6 +96,7 @@ class ArchiveResultFilterSchema(FilterSchema):
 @router.get("/archiveresults", response=List[ArchiveResultSchema])
 @paginate
 def list_archiveresults(request, filters: ArchiveResultFilterSchema = Query(...)):
+    """List all ArchiveResult entries matching these filters."""
     qs = ArchiveResult.objects.all()
     results = filters.filter(qs)
     return results
@@ -82,8 +104,8 @@ def list_archiveresults(request, filters: ArchiveResultFilterSchema = Query(...)
 
 @router.get("/archiveresult/{archiveresult_id}", response=ArchiveResultSchema)
 def get_archiveresult(request, archiveresult_id: str):
-    archiveresult = get_object_or_404(ArchiveResult, id=archiveresult_id)
-    return archiveresult
+    """Get a specific ArchiveResult by abid, uuid, or pk."""
+    return ArchiveResult.objects.get(Q(pk__icontains=archiveresult_id) | Q(abid__icontains=archiveresult_id) | Q(uuid__icontains=archiveresult_id))
 
 
 # @router.post("/archiveresult", response=ArchiveResultSchema)
@@ -115,27 +137,50 @@ def get_archiveresult(request, archiveresult_id: str):
 
 
 class SnapshotSchema(Schema):
-    id: UUID
+    abid: str
+    uuid: UUID
+    pk: str
+    modified: datetime
+    created: datetime
+    created_by_id: str
 
     url: str
     tags: str
     title: Optional[str]
     timestamp: str
+    archive_path: str
+
     bookmarked: datetime
     added: datetime
-    updated: datetime
-    archive_path: str
+    updated: Optional[datetime]
 
+    num_archiveresults: int
     archiveresults: List[ArchiveResultSchema]
 
-    # @staticmethod
-    # def resolve_id(obj):
-    #     return str(obj.id)
+    @staticmethod
+    def resolve_created_by_id(obj):
+        return str(obj.created_by_id)
+
+    @staticmethod
+    def resolve_pk(obj):
+        return str(obj.pk)
+
+    @staticmethod
+    def resolve_uuid(obj):
+        return str(obj.uuid)
+
+    @staticmethod
+    def resolve_abid(obj):
+        return str(obj.ABID)
 
     @staticmethod
     def resolve_tags(obj):
         return obj.tags_str()
 
+    @staticmethod
+    def resolve_num_archiveresults(obj, context):
+        return obj.archiveresult_set.all().distinct().count()
+
     @staticmethod
     def resolve_archiveresults(obj, context):
         if context['request'].with_archiveresults:
@@ -144,23 +189,32 @@ class SnapshotSchema(Schema):
 
 
 class SnapshotFilterSchema(FilterSchema):
-    id: Optional[UUID] = Field(None, q='id')
-
-    search: Optional[str] = Field(None, q=['url__icontains', 'title__icontains', 'tags__name__icontains'])
+    abid: Optional[str] = Field(None, q='abid__icontains')
+    uuid: Optional[str] = Field(None, q='uuid__icontains')
+    pk: Optional[str] = Field(None, q='pk__icontains')
+    created_by_id: str = Field(None, q='created_by_id__icontains')
+    created__gte: datetime = Field(None, q='created__gte')
+    created__lt: datetime = Field(None, q='created__lt')
+    created: datetime = Field(None, q='created')
+    modified: datetime = Field(None, q='modified')
+    modified__gte: datetime = Field(None, q='modified__gte')
+    modified__lt: datetime = Field(None, q='modified__lt')
+
+    search: Optional[str] = Field(None, q=['url__icontains', 'title__icontains', 'tags__name__icontains', 'abid__icontains', 'uuid__icontains'])
     url: Optional[str] = Field(None, q='url')
     tag: Optional[str] = Field(None, q='tags__name')
     title: Optional[str] = Field(None, q='title__icontains')
-    
     timestamp: Optional[str] = Field(None, q='timestamp__startswith')
     
-    added: Optional[datetime] = Field(None, q='added')
     added__gte: Optional[datetime] = Field(None, q='added__gte')
     added__lt: Optional[datetime] = Field(None, q='added__lt')
 
 
+
 @router.get("/snapshots", response=List[SnapshotSchema])
 @paginate
 def list_snapshots(request, filters: SnapshotFilterSchema = Query(...), with_archiveresults: bool=True):
+    """List all Snapshot entries matching these filters."""
     request.with_archiveresults = with_archiveresults
 
     qs = Snapshot.objects.all()
@@ -169,8 +223,24 @@ def list_snapshots(request, filters: SnapshotFilterSchema = Query(...), with_arc
 
 @router.get("/snapshot/{snapshot_id}", response=SnapshotSchema)
 def get_snapshot(request, snapshot_id: str, with_archiveresults: bool=True):
+    """Get a specific Snapshot by abid, uuid, or pk."""
     request.with_archiveresults = with_archiveresults
-    snapshot = get_object_or_404(Snapshot, id=snapshot_id)
+    snapshot = None
+    try:
+        snapshot = Snapshot.objects.get(Q(uuid__startswith=snapshot_id) | Q(abid__startswith=snapshot_id) | Q(pk__startswith=snapshot_id))
+    except Snapshot.DoesNotExist:
+        pass
+
+    try:
+        # fall back to an exact primary-key lookup
+        snapshot = snapshot or Snapshot.objects.get(pk=snapshot_id)
+    except Snapshot.DoesNotExist:
+        pass
+
+    try:
+        snapshot = snapshot or Snapshot.objects.get(Q(uuid__icontains=snapshot_id) | Q(abid__icontains=snapshot_id))
+    except Snapshot.DoesNotExist:
+        pass
+
     return snapshot
 
 
@@ -179,9 +249,9 @@ def get_snapshot(request, snapshot_id: str, with_archiveresults: bool=True):
 #     snapshot = Snapshot.objects.create(**payload.dict())
 #     return snapshot
 #
-# @router.put("/snapshot/{snapshot_id}", response=SnapshotSchema)
-# def update_snapshot(request, snapshot_id: str, payload: SnapshotSchema):
-#     snapshot = get_object_or_404(Snapshot, id=snapshot_id)
+# @router.put("/snapshot/{snapshot_uuid}", response=SnapshotSchema)
+# def update_snapshot(request, snapshot_uuid: str, payload: SnapshotSchema):
+#     snapshot = get_object_or_404(Snapshot, uuid=snapshot_uuid)
 #
 #     for attr, value in payload.dict().items():
 #         setattr(snapshot, attr, value)
@@ -189,9 +259,9 @@ def get_snapshot(request, snapshot_id: str, with_archiveresults: bool=True):
 #
 #     return snapshot
 #
-# @router.delete("/snapshot/{snapshot_id}")
-# def delete_snapshot(request, snapshot_id: str):
-#     snapshot = get_object_or_404(Snapshot, id=snapshot_id)
+# @router.delete("/snapshot/{snapshot_uuid}")
+# def delete_snapshot(request, snapshot_uuid: str):
+#     snapshot = get_object_or_404(Snapshot, uuid=snapshot_uuid)
 #     snapshot.delete()
 #     return {"success": True}
 
 
@@ -201,10 +271,21 @@ def get_snapshot(request, snapshot_id: str, with_archiveresults: bool=True):
 
 
 class TagSchema(Schema):
+    abid: Optional[UUID] = Field(None, q='abid')
+    uuid: Optional[UUID] = Field(None, q='uuid')
+    pk: Optional[UUID] = Field(None, q='pk')
+    modified: datetime
+    created: datetime
+    created_by_id: str
+
     name: str
     slug: str
 
 
+    @staticmethod
+    def resolve_created_by_id(obj):
+        return str(obj.created_by_id)
+
 @router.get("/tags", response=List[TagSchema])
 def list_tags(request):
     return Tag.objects.all()
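
A quick sketch of what the looser lookup above buys API clients (the /api/v1 mount point, the example ABID, and open auth are assumptions here; adjust for your deployment):

    import requests

    BASE = 'http://localhost:8000/api/v1'   # assumed mount point

    # an ABID, uuid, or pk prefix all resolve to the same Snapshot now
    resp = requests.get(f'{BASE}/snapshot/snp_01BJQMF54D093DXEAWZ6JYRPAQ',
                        params={'with_archiveresults': 'false'})
    print(resp.json()['url'], resp.json()['num_archiveresults'])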

+ 43 - 2
archivebox/cli/__init__.py

@@ -4,14 +4,18 @@ __command__ = 'archivebox'
 import os
 import sys
 import argparse
+import threading
+from time import sleep
 
-from typing import Optional, Dict, List, IO, Union
+from typing import Optional, Dict, List, IO, Union, Iterable
 from pathlib import Path
 
-from ..config import OUTPUT_DIR, check_data_folder, check_migrations
+from ..config import OUTPUT_DIR, check_data_folder, check_migrations, stderr
 
 from importlib import import_module
 
+BUILTIN_LIST = list
+
 CLI_DIR = Path(__file__).resolve().parent
 
 # these common commands will appear sorted before any others for ease-of-use
@@ -33,6 +37,40 @@ is_valid_cli_module = lambda module, subcommand: (
 )
 
 
+IGNORED_BG_THREADS = ('MainThread', 'ThreadPoolExecutor', 'IPythonHistorySavingThread')  # threads we don't have to wait for before exiting
+
+
+def wait_for_bg_threads_to_exit(thread_names: Iterable[str]=(), ignore_names: Iterable[str]=IGNORED_BG_THREADS, timeout: int=60) -> int:
+    """
+    Block until the specified threads exit. e.g. pass thread_names=('default_hook_handler',) to wait for webhooks.
+    Useful for waiting for signal handlers, webhooks, etc. to finish running after a mgmt command completes.
+    """
+
+    wait_for_all: bool = thread_names == ()
+
+    thread_matches = lambda thread, ptns: any(ptn in repr(thread) for ptn in ptns)
+
+    should_wait = lambda thread: (
+        not thread_matches(thread, ignore_names)
+        and (wait_for_all or thread_matches(thread, thread_names)))
+
+    for tries in range(timeout):
+        all_threads = [*threading.enumerate()]
+        blocking_threads = [*filter(should_wait, all_threads)]
+        threads_summary = ', '.join(repr(t) for t in blocking_threads)
+        if blocking_threads:
+            sleep(1)
+            if tries == 5:                            # only show stderr message if we need to wait more than 5s
+                stderr(
+                    f'[…] Waiting up to {timeout}s for background jobs (e.g. webhooks) to finish...',
+                    threads_summary,
+                )
+        else:
+            return tries
+
+    raise Exception(f'Background threads failed to exit after {tries}s: {threads_summary}')
+
+
 def list_subcommands() -> Dict[str, str]:
     """find and import all valid archivebox_<subcommand>.py files in CLI_DIR"""
 
@@ -79,6 +117,9 @@ def run_subcommand(subcommand: str,
     module = import_module('.archivebox_{}'.format(subcommand), __package__)
     module.main(args=subcommand_args, stdin=stdin, pwd=pwd)    # type: ignore
 
+    # wait for webhooks, signals, and other background jobs to finish before exit
+    wait_for_bg_threads_to_exit(timeout=60)
+
 
 SUBCOMMANDS = list_subcommands()
 
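
A minimal standalone demo of the new helper's behavior (illustrative only; the fake thread name stands in for django-signal-webhooks' handler threads):

    import threading
    import time

    from archivebox.cli import wait_for_bg_threads_to_exit

    # simulate a webhook handler that takes ~3s to finish
    t = threading.Thread(target=time.sleep, args=(3,), name='default_hook_handler')
    t.start()

    # blocks until no matching thread is alive, polling once per second
    waited = wait_for_bg_threads_to_exit(thread_names=('default_hook_handler',), timeout=10)
    print(f'waited ~{waited}s for background threads')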

+ 14 - 2
archivebox/config.py

@@ -37,7 +37,7 @@ from sqlite3 import dbapi2 as sqlite3
 from hashlib import md5
 from hashlib import md5
 from pathlib import Path
 from pathlib import Path
 from datetime import datetime, timezone
 from datetime import datetime, timezone
-from typing import Optional, Type, Tuple, Dict, Union, List
+from typing import Optional, Type, Tuple, Dict, Union, List, Any
 from subprocess import run, PIPE, DEVNULL
 from subprocess import run, PIPE, DEVNULL
 from configparser import ConfigParser
 from configparser import ConfigParser
 from collections import defaultdict
 from collections import defaultdict
@@ -281,6 +281,7 @@ TEMPLATES_DIR_NAME = 'templates'
 ARCHIVE_DIR_NAME = 'archive'
 ARCHIVE_DIR_NAME = 'archive'
 SOURCES_DIR_NAME = 'sources'
 SOURCES_DIR_NAME = 'sources'
 LOGS_DIR_NAME = 'logs'
 LOGS_DIR_NAME = 'logs'
+CACHE_DIR_NAME = 'cache'
 PERSONAS_DIR_NAME = 'personas'
 PERSONAS_DIR_NAME = 'personas'
 CRONTABS_DIR_NAME = 'crontabs'
 CRONTABS_DIR_NAME = 'crontabs'
 SQL_INDEX_FILENAME = 'index.sqlite3'
 SQL_INDEX_FILENAME = 'index.sqlite3'
@@ -360,6 +361,7 @@ ALLOWED_IN_OUTPUT_DIR = {
     ARCHIVE_DIR_NAME,
     ARCHIVE_DIR_NAME,
     SOURCES_DIR_NAME,
     SOURCES_DIR_NAME,
     LOGS_DIR_NAME,
     LOGS_DIR_NAME,
+    CACHE_DIR_NAME,
     PERSONAS_DIR_NAME,
     PERSONAS_DIR_NAME,
     SQL_INDEX_FILENAME,
     SQL_INDEX_FILENAME,
     f'{SQL_INDEX_FILENAME}-wal',
     f'{SQL_INDEX_FILENAME}-wal',
@@ -511,6 +513,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
     'ARCHIVE_DIR':              {'default': lambda c: c['OUTPUT_DIR'] / ARCHIVE_DIR_NAME},
     'ARCHIVE_DIR':              {'default': lambda c: c['OUTPUT_DIR'] / ARCHIVE_DIR_NAME},
     'SOURCES_DIR':              {'default': lambda c: c['OUTPUT_DIR'] / SOURCES_DIR_NAME},
     'SOURCES_DIR':              {'default': lambda c: c['OUTPUT_DIR'] / SOURCES_DIR_NAME},
     'LOGS_DIR':                 {'default': lambda c: c['OUTPUT_DIR'] / LOGS_DIR_NAME},
     'LOGS_DIR':                 {'default': lambda c: c['OUTPUT_DIR'] / LOGS_DIR_NAME},
+    'CACHE_DIR':                {'default': lambda c: c['OUTPUT_DIR'] / CACHE_DIR_NAME},
     'PERSONAS_DIR':             {'default': lambda c: c['OUTPUT_DIR'] / PERSONAS_DIR_NAME},
     'PERSONAS_DIR':             {'default': lambda c: c['OUTPUT_DIR'] / PERSONAS_DIR_NAME},
     'CONFIG_FILE':              {'default': lambda c: Path(c['CONFIG_FILE']).resolve() if c['CONFIG_FILE'] else c['OUTPUT_DIR'] / CONFIG_FILENAME},
     'CONFIG_FILE':              {'default': lambda c: Path(c['CONFIG_FILE']).resolve() if c['CONFIG_FILE'] else c['OUTPUT_DIR'] / CONFIG_FILENAME},
     'COOKIES_FILE':             {'default': lambda c: c['COOKIES_FILE'] and Path(c['COOKIES_FILE']).resolve()},
     'COOKIES_FILE':             {'default': lambda c: c['COOKIES_FILE'] and Path(c['COOKIES_FILE']).resolve()},
@@ -1038,6 +1041,11 @@ def get_data_locations(config: ConfigDict) -> ConfigValue:
             'enabled': True,
             'enabled': True,
             'is_valid': config['LOGS_DIR'].exists(),
             'is_valid': config['LOGS_DIR'].exists(),
         },
         },
+        'CACHE_DIR': {
+            'path': config['CACHE_DIR'].resolve(),
+            'enabled': True,
+            'is_valid': config['CACHE_DIR'].exists(),
+        },
         'CUSTOM_TEMPLATES_DIR': {
         'CUSTOM_TEMPLATES_DIR': {
             'path': config['CUSTOM_TEMPLATES_DIR'] and Path(config['CUSTOM_TEMPLATES_DIR']).resolve(),
             'path': config['CUSTOM_TEMPLATES_DIR'] and Path(config['CUSTOM_TEMPLATES_DIR']).resolve(),
             'enabled': bool(config['CUSTOM_TEMPLATES_DIR']),
             'enabled': bool(config['CUSTOM_TEMPLATES_DIR']),
@@ -1299,7 +1307,10 @@ def check_system_config(config: ConfigDict=CONFIG) -> None:
                 stderr()
                 stderr('    Try removing /Default from the end e.g.:')
                 stderr('        CHROME_USER_DATA_DIR="{}"'.format(config['CHROME_USER_DATA_DIR'].split('/Default')[0]))
-            raise SystemExit(2)
+
+            # hard error is too annoying here, instead just set it to nothing
+            # raise SystemExit(2)
+            config['CHROME_USER_DATA_DIR'] = None


 def check_dependencies(config: ConfigDict=CONFIG, show_help: bool=True) -> None:
@@ -1385,6 +1396,7 @@ def check_migrations(out_dir: Union[str, Path, None]=None, config: ConfigDict=CO

     (Path(output_dir) / SOURCES_DIR_NAME).mkdir(exist_ok=True)
     (Path(output_dir) / LOGS_DIR_NAME).mkdir(exist_ok=True)
+    (Path(output_dir) / CACHE_DIR_NAME).mkdir(exist_ok=True)
     (Path(output_dir) / PERSONAS_DIR_NAME).mkdir(exist_ok=True)
     (Path(output_dir) / PERSONAS_DIR_NAME / 'Default').mkdir(exist_ok=True)


+ 104 - 71
archivebox/core/admin.py

@@ -15,8 +15,8 @@ from django.contrib.auth import get_user_model
 from django import forms


-from signal_webhooks.apps import DjangoSignalWebhooksConfig
-from signal_webhooks.admin import WebhookAdmin, WebhookModel
+from signal_webhooks.admin import WebhookAdmin, get_webhook_model
+# from plugantic.admin import CustomPlugin

 from ..util import htmldecode, urldecode, ansi_to_html

@@ -38,6 +38,7 @@ from config import (
     CAN_UPGRADE
 )

+
 GLOBAL_CONTEXT = {'VERSION': VERSION, 'VERSIONS_AVAILABLE': VERSIONS_AVAILABLE, 'CAN_UPGRADE': CAN_UPGRADE}

 # Admin URLs
@@ -104,23 +105,16 @@ class ArchiveBoxAdmin(admin.AdminSite):
         return render(template_name='add.html', request=request, context=context)


-# monkey patch django-signals-webhooks to change how it shows up in Admin UI
-DjangoSignalWebhooksConfig.verbose_name = 'API'
-WebhookModel._meta.get_field('name').help_text = 'Give your webhook a descriptive name (e.g. Notify ACME Slack channel of any new ArchiveResults).'
-WebhookModel._meta.get_field('signal').help_text = 'The type of event the webhook should fire for (e.g. Create, Update, Delete).'
-WebhookModel._meta.get_field('ref').help_text = 'Dot import notation of the model the webhook should fire for (e.g. core.models.Snapshot or core.models.ArchiveResult).'
-WebhookModel._meta.get_field('endpoint').help_text = 'External URL to POST the webhook notification to (e.g. https://someapp.example.com/webhook/some-webhook-receiver).'
-WebhookModel._meta.app_label = 'api'
-
-
 archivebox_admin = ArchiveBoxAdmin()
 archivebox_admin.register(get_user_model())
 archivebox_admin.register(APIToken)
-archivebox_admin.register(WebhookModel, WebhookAdmin)
+archivebox_admin.register(get_webhook_model(), WebhookAdmin)
 archivebox_admin.disable_action('delete_selected')

+# archivebox_admin.register(CustomPlugin)

-# patch admin with methods to add data views
+# patch admin with methods to add data views (implemented by admin_data_views package)
+############### Additional sections are defined in settings.ADMIN_DATA_VIEWS #########
 from admin_data_views.admin import get_app_list, admin_data_index_view, get_admin_data_urls, get_urls

 archivebox_admin.get_app_list = get_app_list.__get__(archivebox_admin, ArchiveBoxAdmin)
@@ -170,14 +164,41 @@ class SnapshotActionForm(ActionForm):
     # )


+def get_abid_info(self, obj):
+    return format_html(
+        # URL Hash: <code style="font-size: 10px; user-select: all">{}</code><br/>
+        '''
+        &nbsp; &nbsp; ABID:&nbsp; <code style="font-size: 16px; user-select: all"><b>{}</b></code><br/>
+        &nbsp; &nbsp; TS: &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;<code style="font-size: 10px; user-select: all"><b>{}</b></code> ({})<br/>
+        &nbsp; &nbsp; URI: &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; <code style="font-size: 10px; user-select: all"><b>{}</b></code> ({})<br/>
+        &nbsp; &nbsp; SUBTYPE: &nbsp; &nbsp; &nbsp; <code style="font-size: 10px; user-select: all"><b>{}</b></code> ({})<br/>
+        &nbsp; &nbsp; RAND: &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;&nbsp; <code style="font-size: 10px; user-select: all"><b>{}</b></code> ({})<br/><br/>
+        &nbsp; &nbsp; ABID AS UUID:&nbsp; <code style="font-size: 10px; user-select: all">{}</code> &nbsp; &nbsp;<br/><br/>
+
+        &nbsp; &nbsp; .uuid: &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; <code style="font-size: 10px; user-select: all">{}</code> &nbsp; &nbsp;<br/>
+        &nbsp; &nbsp; .id: &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;&nbsp; <code style="font-size: 10px; user-select: all">{}</code> &nbsp; &nbsp;<br/>
+        &nbsp; &nbsp; .pk: &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; <code style="font-size: 10px; user-select: all">{}</code> &nbsp; &nbsp;<br/><br/>
+        ''',
+        obj.abid,
+        obj.ABID.ts, obj.abid_values['ts'].isoformat() if isinstance(obj.abid_values['ts'], datetime) else obj.abid_values['ts'],
+        obj.ABID.uri, str(obj.abid_values['uri']),
+        obj.ABID.subtype, str(obj.abid_values['subtype']),
+        obj.ABID.rand, str(obj.abid_values['rand'])[-7:],
+        obj.ABID.uuid,
+        obj.uuid,
+        obj.id,
+        obj.pk,
+    )
+
+
 @admin.register(Snapshot, site=archivebox_admin)
 class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
     list_display = ('added', 'title_str', 'files', 'size', 'url_str')
     sort_fields = ('title_str', 'url_str', 'added', 'files')
-    readonly_fields = ('info', 'bookmarked', 'added', 'updated')
-    search_fields = ('id', 'url', 'timestamp', 'title', 'tags__name')
-    fields = ('timestamp', 'url', 'title', 'tags', *readonly_fields)
-    list_filter = ('added', 'updated', 'tags', 'archiveresult__status')
+    readonly_fields = ('admin_actions', 'status_info', 'bookmarked', 'added', 'updated', 'created', 'modified', 'identifiers')
+    search_fields = ('id', 'url', 'abid', 'uuid', 'timestamp', 'title', 'tags__name')
+    fields = ('url', 'timestamp', 'created_by', 'tags', 'title', *readonly_fields)
+    list_filter = ('added', 'updated', 'tags', 'archiveresult__status', 'created_by')
     ordering = ['-added']
     actions = ['add_tags', 'remove_tags', 'update_titles', 'update_snapshots', 'resnapshot_snapshot', 'overwrite_snapshots', 'delete_snapshots']
     autocomplete_fields = ['tags']
@@ -223,40 +244,46 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
     #             </form>
     #         ''',
     #         csrf.get_token(self.request),
-    #         obj.id,
+    #         obj.pk,
     #     )

-    def info(self, obj):
+    def admin_actions(self, obj):
         return format_html(
+            # URL Hash: <code style="font-size: 10px; user-select: all">{}</code><br/>
+            '''
+            <a class="btn" style="font-size: 18px; display: inline-block; border-radius: 10px; border: 3px solid #eee; padding: 4px 8px" href="/archive/{}">Summary page ➡️</a> &nbsp; &nbsp;
+            <a class="btn" style="font-size: 18px; display: inline-block; border-radius: 10px; border: 3px solid #eee; padding: 4px 8px" href="/archive/{}/index.html#all">Result files 📑</a> &nbsp; &nbsp;
+            <a class="btn" style="font-size: 18px; display: inline-block; border-radius: 10px; border: 3px solid #eee; padding: 4px 8px" href="/admin/core/snapshot/?id__exact={}">Admin actions ⚙️</a>
+            ''',
+            obj.timestamp,
+            obj.timestamp,
+            obj.pk,
+        )
+
+    def status_info(self, obj):
+        return format_html(
+            # URL Hash: <code style="font-size: 10px; user-select: all">{}</code><br/>
             '''
-            UUID: <code style="font-size: 10px; user-select: all">{}</code> &nbsp; &nbsp;
-            Timestamp: <code style="font-size: 10px; user-select: all">{}</code> &nbsp; &nbsp;
-            URL Hash: <code style="font-size: 10px; user-select: all">{}</code><br/>
             Archived: {} ({} files {}) &nbsp; &nbsp;
             Favicon: <img src="{}" style="height: 20px"/> &nbsp; &nbsp;
-            Status code: {} &nbsp; &nbsp;
+            Status code: {} &nbsp; &nbsp;<br/>
             Server: {} &nbsp; &nbsp;
             Content type: {} &nbsp; &nbsp;
             Extension: {} &nbsp; &nbsp;
-            <br/><br/>
-            <a href="/archive/{}">View Snapshot index ➡️</a> &nbsp; &nbsp;
-            <a href="/admin/core/snapshot/?id__exact={}">View actions ⚙️</a>
             ''',
-            obj.id,
-            obj.timestamp,
-            obj.url_hash,
             '✅' if obj.is_archived else '❌',
             obj.num_outputs,
-            self.size(obj),
+            self.size(obj) or '0kb',
             f'/archive/{obj.timestamp}/favicon.ico',
-            obj.status_code or '?',
-            obj.headers and obj.headers.get('Server') or '?',
-            obj.headers and obj.headers.get('Content-Type') or '?',
-            obj.extension or '?',
-            obj.timestamp,
-            obj.id,
+            obj.status_code or '-',
+            obj.headers and obj.headers.get('Server') or '-',
+            obj.headers and obj.headers.get('Content-Type') or '-',
+            obj.extension or '-',
         )

+    def identifiers(self, obj):
+        return get_abid_info(self, obj)
+
     @admin.display(
         description='Title',
         ordering='title',
@@ -316,7 +343,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
         return format_html(
             '<a href="{}"><code style="user-select: all;">{}</code></a>',
             obj.url,
-            obj.url,
+            obj.url[:128],
         )

     def grid_view(self, request, extra_context=None):
@@ -419,42 +446,45 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):

 @admin.register(Tag, site=archivebox_admin)
 class TagAdmin(admin.ModelAdmin):
-    list_display = ('slug', 'name', 'num_snapshots', 'snapshots', 'id')
-    sort_fields = ('id', 'name', 'slug')
-    readonly_fields = ('id', 'num_snapshots', 'snapshots')
-    search_fields = ('id', 'name', 'slug')
-    fields = (*readonly_fields, 'name', 'slug')
+    list_display = ('slug', 'name', 'num_snapshots', 'snapshots', 'abid')
+    sort_fields = ('id', 'name', 'slug', 'abid')
+    readonly_fields = ('created', 'modified', 'identifiers', 'num_snapshots', 'snapshots')
+    search_fields = ('id', 'abid', 'uuid', 'name', 'slug')
+    fields = ('name', 'slug', 'created_by', *readonly_fields, )
     actions = ['delete_selected']
     ordering = ['-id']

-    def num_snapshots(self, obj):
+    def identifiers(self, obj):
+        return get_abid_info(self, obj)
+
+    def num_snapshots(self, tag):
         return format_html(
             '<a href="/admin/core/snapshot/?tags__id__exact={}">{} total</a>',
-            obj.id,
-            obj.snapshot_set.count(),
+            tag.id,
+            tag.snapshot_set.count(),
         )

-    def snapshots(self, obj):
-        total_count = obj.snapshot_set.count()
+    def snapshots(self, tag):
+        total_count = tag.snapshot_set.count()
         return mark_safe('<br/>'.join(
             format_html(
                 '{} <code><a href="/admin/core/snapshot/{}/change"><b>[{}]</b></a> {}</code>',
                 snap.updated.strftime('%Y-%m-%d %H:%M') if snap.updated else 'pending...',
-                snap.id,
-                snap.timestamp,
+                snap.pk,
+                snap.abid,
                 snap.url,
             )
-            for snap in obj.snapshot_set.order_by('-updated')[:10]
-        ) + (f'<br/><a href="/admin/core/snapshot/?tags__id__exact={obj.id}">and {total_count-10} more...<a>' if obj.snapshot_set.count() > 10 else ''))
+            for snap in tag.snapshot_set.order_by('-updated')[:10]
+        ) + (f'<br/><a href="/admin/core/snapshot/?tags__id__exact={tag.id}">and {total_count-10} more...</a>' if tag.snapshot_set.count() > 10 else ''))


 @admin.register(ArchiveResult, site=archivebox_admin)
 class ArchiveResultAdmin(admin.ModelAdmin):
-    list_display = ('id', 'start_ts', 'extractor', 'snapshot_str', 'tags_str', 'cmd_str', 'status', 'output_str')
+    list_display = ('start_ts', 'snapshot_info', 'tags_str', 'extractor', 'cmd_str', 'status', 'output_str')
     sort_fields = ('start_ts', 'extractor', 'status')
-    readonly_fields = ('id', 'uuid', 'snapshot_str', 'tags_str')
-    search_fields = ('id', 'uuid', 'snapshot__url', 'extractor', 'output', 'cmd_version', 'cmd', 'snapshot__timestamp')
-    fields = (*readonly_fields, 'snapshot', 'extractor', 'status', 'start_ts', 'end_ts', 'output', 'pwd', 'cmd', 'cmd_version')
+    readonly_fields = ('snapshot_info', 'tags_str', 'created_by', 'created', 'modified', 'identifiers')
+    search_fields = ('id', 'uuid', 'abid', 'snapshot__url', 'extractor', 'output', 'cmd_version', 'cmd', 'snapshot__timestamp')
+    fields = ('snapshot', 'extractor', 'status', 'output', 'pwd', 'cmd',  'start_ts', 'end_ts', 'cmd_version', *readonly_fields)
     autocomplete_fields = ['snapshot']

     list_filter = ('status', 'extractor', 'start_ts', 'cmd_version')
@@ -462,33 +492,36 @@ class ArchiveResultAdmin(admin.ModelAdmin):
     list_per_page = SNAPSHOTS_PER_PAGE

     @admin.display(
-        description='snapshot'
+        description='Snapshot Info'
     )
-    def snapshot_str(self, obj):
+    def snapshot_info(self, result):
         return format_html(
-            '<a href="/archive/{}/index.html"><b><code>[{}]</code></b></a><br/>'
-            '<small>{}</small>',
-            obj.snapshot.timestamp,
-            obj.snapshot.timestamp,
-            obj.snapshot.url[:128],
+            '<a href="/archive/{}/index.html"><b><code>[{}]</code></b> &nbsp; {} &nbsp; {}</a><br/>',
+            result.snapshot.timestamp,
+            result.snapshot.abid,
+            result.snapshot.added.strftime('%Y-%m-%d %H:%M'),
+            result.snapshot.url[:128],
         )

+    def identifiers(self, obj):
+        return get_abid_info(self, obj)
+
     @admin.display(
-        description='tags'
+        description='Snapshot Tags'
     )
-    def tags_str(self, obj):
-        return obj.snapshot.tags_str()
+    def tags_str(self, result):
+        return result.snapshot.tags_str()

-    def cmd_str(self, obj):
+    def cmd_str(self, result):
         return format_html(
             '<pre>{}</pre>',
-            ' '.join(obj.cmd) if isinstance(obj.cmd, list) else str(obj.cmd),
+            ' '.join(result.cmd) if isinstance(result.cmd, list) else str(result.cmd),
         )

-    def output_str(self, obj):
+    def output_str(self, result):
         return format_html(
             '<a href="/archive/{}/{}" class="output-link">↗️</a><pre>{}</pre>',
-            obj.snapshot.timestamp,
-            obj.output if (obj.status == 'succeeded') and obj.extractor not in ('title', 'archive_org') else 'index.html',
-            obj.output,
+            result.snapshot.timestamp,
+            result.output if (result.status == 'succeeded') and result.extractor not in ('title', 'archive_org') else 'index.html',
+            result.output,
         )
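
Note the swap above from the hardcoded WebhookModel import to get_webhook_model(): paired with the SIGNAL_WEBHOOKS_CUSTOM_MODEL = 'api.models.OutboundWebhook' setting added in core/settings.py below, the admin now registers whichever webhook model is configured instead of the library default. A hedged sketch of what such a lookup helper typically does (the real signal_webhooks internals may differ; the fallback path here is a guess):

    # hypothetical stand-in for signal_webhooks.admin.get_webhook_model
    from django.conf import settings
    from django.utils.module_loading import import_string

    def get_webhook_model_sketch():
        # resolve the dotted path configured in settings at call time
        path = getattr(settings, 'SIGNAL_WEBHOOKS_CUSTOM_MODEL', 'signal_webhooks.models.Webhook')
        return import_string(path)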

+ 0 - 2
archivebox/core/migrations/0007_archiveresult.py

@@ -17,8 +17,6 @@ except AttributeError:


 def forwards_func(apps, schema_editor):
-    from core.models import EXTRACTORS
-
     Snapshot = apps.get_model("core", "Snapshot")
     ArchiveResult = apps.get_model("core", "ArchiveResult")


+ 43 - 0
archivebox/core/migrations/0023_alter_archiveresult_options_archiveresult_abid_and_more.py

@@ -0,0 +1,43 @@
+# Generated by Django 5.0.6 on 2024-05-13 10:56
+
+import charidfield.fields
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('core', '0022_auto_20231023_2008'),
+    ]
+
+    operations = [
+        migrations.AlterModelOptions(
+            name='archiveresult',
+            options={'verbose_name': 'Result'},
+        ),
+        migrations.AddField(
+            model_name='archiveresult',
+            name='abid',
+            field=charidfield.fields.CharIDField(blank=True, db_index=True, default=None, help_text='ABID-format identifier for this entity (e.g. snp_01BJQMF54D093DXEAWZ6JYRPAQ)', max_length=30, null=True, prefix='res_', unique=True),
+        ),
+        migrations.AddField(
+            model_name='snapshot',
+            name='abid',
+            field=charidfield.fields.CharIDField(blank=True, db_index=True, default=None, help_text='ABID-format identifier for this entity (e.g. snp_01BJQMF54D093DXEAWZ6JYRPAQ)', max_length=30, null=True, prefix='snp_', unique=True),
+        ),
+        migrations.AddField(
+            model_name='snapshot',
+            name='uuid',
+            field=models.UUIDField(blank=True, null=True, unique=True),
+        ),
+        migrations.AddField(
+            model_name='tag',
+            name='abid',
+            field=charidfield.fields.CharIDField(blank=True, db_index=True, default=None, help_text='ABID-format identifier for this entity (e.g. snp_01BJQMF54D093DXEAWZ6JYRPAQ)', max_length=30, null=True, prefix='tag_', unique=True),
+        ),
+        migrations.AlterField(
+            model_name='archiveresult',
+            name='extractor',
+            field=models.CharField(choices=[('htmltotext', 'htmltotext'), ('git', 'git'), ('singlefile', 'singlefile'), ('media', 'media'), ('archive_org', 'archive_org'), ('readability', 'readability'), ('mercury', 'mercury'), ('favicon', 'favicon'), ('pdf', 'pdf'), ('headers', 'headers'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('title', 'title'), ('wget', 'wget')], max_length=32),
+        ),
+    ]
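
The abid columns added here are plain indexed char columns; the ABIDField helper used on the models (see core/models.py below) presumably just fills in everything except the per-model prefix. Written out long-hand, the Snapshot column amounts to (options copied from this migration; CharIDField is from the charidfield package imported above):

    abid = CharIDField(
        prefix='snp_',
        max_length=30,
        unique=True,
        db_index=True,
        blank=True,
        null=True,
        default=None,
        help_text='ABID-format identifier for this entity (e.g. snp_01BJQMF54D093DXEAWZ6JYRPAQ)',
    )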

+ 95 - 0
archivebox/core/migrations/0024_auto_20240513_1143.py

@@ -0,0 +1,95 @@
+# Generated by Django 5.0.6 on 2024-05-13 11:43
+
+from django.db import migrations
+from datetime import datetime
+from abid_utils.abid import abid_from_values
+
+
+def calculate_abid(self):
+    """
+    Return a freshly derived ABID (assembled from attrs defined in ABIDModel.abid_*_src).
+    """
+    prefix = self.abid_prefix
+    ts = eval(self.abid_ts_src)
+    uri = eval(self.abid_uri_src)
+    subtype = eval(self.abid_subtype_src)
+    rand = eval(self.abid_rand_src)
+
+    if (not prefix) or prefix == 'obj_':
+        suggested_abid = self.__class__.__name__[:3].lower()
+        raise Exception(f'{self.__class__.__name__}.abid_prefix must be defined to calculate ABIDs (suggested: {suggested_abid})')
+
+    if not ts:
+        ts = datetime.utcfromtimestamp(0)
+        print(f'[!] WARNING: Generating ABID with ts=0000000000 placeholder because {self.__class__.__name__}.abid_ts_src={self.abid_ts_src} is unset!', ts.isoformat())
+
+    if not uri:
+        uri = str(self)
+        print(f'[!] WARNING: Generating ABID with uri=str(self) placeholder because {self.__class__.__name__}.abid_uri_src={self.abid_uri_src} is unset!', uri)
+
+    if not subtype:
+        subtype = self.__class__.__name__
+        print(f'[!] WARNING: Generating ABID with subtype={subtype} placeholder because {self.__class__.__name__}.abid_subtype_src={self.abid_subtype_src} is unset!', subtype)
+
+    if not rand:
+        rand = getattr(self, 'uuid', None) or getattr(self, 'id', None) or getattr(self, 'pk')
+        print(f'[!] WARNING: Generating ABID with rand=self.id placeholder because {self.__class__.__name__}.abid_rand_src={self.abid_rand_src} is unset!', rand)
+
+    abid = abid_from_values(
+        prefix=prefix,
+        ts=ts,
+        uri=uri,
+        subtype=subtype,
+        rand=rand,
+    )
+    assert abid.ulid and abid.uuid and abid.typeid, f'Failed to calculate {prefix}_ABID for {self.__class__.__name__}'
+    return abid
+
+
+def copy_snapshot_uuids(apps, schema_editor):
+    Snapshot = apps.get_model("core", "Snapshot")
+    for snapshot in Snapshot.objects.all():
+        snapshot.uuid = snapshot.id
+        snapshot.save(update_fields=["uuid"])
+
+def generate_snapshot_abids(apps, schema_editor):
+    Snapshot = apps.get_model("core", "Snapshot")
+    for snapshot in Snapshot.objects.all():
+        snapshot.abid_prefix = 'snp_'
+        snapshot.abid_ts_src = 'self.added'
+        snapshot.abid_uri_src = 'self.url'
+        snapshot.abid_subtype_src = '"01"'
+        snapshot.abid_rand_src = 'self.uuid'
+
+        snapshot.abid = calculate_abid(snapshot)
+        snapshot.save(update_fields=["abid"])
+
+def generate_archiveresult_abids(apps, schema_editor):
+    ArchiveResult = apps.get_model("core", "ArchiveResult")
+    Snapshot = apps.get_model("core", "Snapshot")
+    for result in ArchiveResult.objects.all():
+        result.abid_prefix = 'res_'
+        result.snapshot = Snapshot.objects.get(pk=result.snapshot_id)
+        result.snapshot_added = result.snapshot.added
+        result.snapshot_url = result.snapshot.url
+        result.abid_ts_src = 'self.snapshot_added'
+        result.abid_uri_src = 'self.snapshot_url'
+        result.abid_subtype_src = 'self.extractor'
+        result.abid_rand_src = 'self.id'
+
+        result.abid = calculate_abid(result)
+        result.uuid = result.abid.uuid
+        result.save(update_fields=["abid", "uuid"])
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('core', '0023_alter_archiveresult_options_archiveresult_abid_and_more'),
+    ]
+
+    operations = [
+        migrations.RunPython(copy_snapshot_uuids, reverse_code=migrations.RunPython.noop),
+        migrations.RunPython(generate_snapshot_abids, reverse_code=migrations.RunPython.noop),
+        migrations.RunPython(generate_archiveresult_abids, reverse_code=migrations.RunPython.noop),
+    ]
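
calculate_abid() is copied into this migration rather than imported from ABIDModel because historical models returned by apps.get_model() don't carry custom methods or class attributes — which is also why each abid_*_src string has to be re-attached to the instance before the call. For a single Snapshot row, with the eval() indirection unrolled, the backfill is equivalent to this sketch:

    from abid_utils.abid import abid_from_values

    def backfill_one(snapshot):
        abid = abid_from_values(
            prefix='snp_',
            ts=snapshot.added,      # abid_ts_src = 'self.added'
            uri=snapshot.url,       # abid_uri_src = 'self.url'
            subtype='01',           # abid_subtype_src = '"01"'
            rand=snapshot.uuid,     # abid_rand_src = 'self.uuid'
        )
        snapshot.abid = abid
        snapshot.save(update_fields=['abid'])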

+ 19 - 0
archivebox/core/migrations/0025_alter_archiveresult_uuid.py

@@ -0,0 +1,19 @@
+# Generated by Django 5.0.6 on 2024-05-13 12:08
+
+import uuid
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('core', '0024_auto_20240513_1143'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='archiveresult',
+            name='uuid',
+            field=models.UUIDField(default=uuid.uuid4, editable=False, unique=True),
+        ),
+    ]

+ 76 - 0
archivebox/core/migrations/0026_archiveresult_created_archiveresult_created_by_and_more.py

@@ -0,0 +1,76 @@
+# Generated by Django 5.0.6 on 2024-05-13 13:01
+
+import abid_utils.models
+import django.db.models.deletion
+import django.utils.timezone
+from django.conf import settings
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('core', '0025_alter_archiveresult_uuid'),
+        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='archiveresult',
+            name='created',
+            field=models.DateTimeField(auto_now_add=True, default=django.utils.timezone.now),
+            preserve_default=False,
+        ),
+        migrations.AddField(
+            model_name='archiveresult',
+            name='created_by',
+            field=models.ForeignKey(default=abid_utils.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
+        ),
+        migrations.AddField(
+            model_name='archiveresult',
+            name='modified',
+            field=models.DateTimeField(auto_now=True),
+        ),
+        migrations.AddField(
+            model_name='snapshot',
+            name='created',
+            field=models.DateTimeField(auto_now_add=True, default=django.utils.timezone.now),
+            preserve_default=False,
+        ),
+        migrations.AddField(
+            model_name='snapshot',
+            name='created_by',
+            field=models.ForeignKey(default=abid_utils.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
+        ),
+        migrations.AddField(
+            model_name='snapshot',
+            name='modified',
+            field=models.DateTimeField(auto_now=True),
+        ),
+        migrations.AddField(
+            model_name='tag',
+            name='created',
+            field=models.DateTimeField(auto_now_add=True, default=django.utils.timezone.now),
+            preserve_default=False,
+        ),
+        migrations.AddField(
+            model_name='tag',
+            name='created_by',
+            field=models.ForeignKey(default=abid_utils.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
+        ),
+        migrations.AddField(
+            model_name='tag',
+            name='modified',
+            field=models.DateTimeField(auto_now=True),
+        ),
+        migrations.AddField(
+            model_name='tag',
+            name='uuid',
+            field=models.UUIDField(blank=True, null=True, unique=True),
+        ),
+        migrations.AlterField(
+            model_name='archiveresult',
+            name='uuid',
+            field=models.UUIDField(blank=True, null=True, unique=True),
+        ),
+    ]
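
Note that created_by uses a callable default (get_or_create_system_user_pk) rather than a literal value, so existing rows are backfilled at migrate time and new rows keep receiving it until a real user is attached. The helper's name implies it returns a primary key, which is what a ForeignKey default expects; the general shape (sketch only, the helper's internals are not shown in this diff):

    created_by = models.ForeignKey(
        settings.AUTH_USER_MODEL,
        on_delete=models.CASCADE,
        default=get_or_create_system_user_pk,  # callable returning a user pk, evaluated lazily
    )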

+ 157 - 32
archivebox/core/models.py

@@ -1,11 +1,14 @@
 __package__ = 'archivebox.core'


-import uuid
+from typing import Optional, List, Dict
+from django_stubs_ext.db.models import TypedModelMeta
+
 import json

+import uuid
+from uuid import uuid4
 from pathlib import Path
-from typing import Optional, List

 from django.db import models
 from django.utils.functional import cached_property
@@ -15,40 +18,58 @@ from django.urls import reverse
 from django.db.models import Case, When, Value, IntegerField
 from django.contrib.auth.models import User   # noqa

+from abid_utils.models import ABIDModel, ABIDField
+
 from ..config import ARCHIVE_DIR, ARCHIVE_DIR_NAME
 from ..system import get_dir_size
-from ..util import parse_date, base_url, hashurl
+from ..util import parse_date, base_url
 from ..index.schema import Link
 from ..index.html import snapshot_icons
-from ..extractors import get_default_archive_methods, ARCHIVE_METHODS_INDEXING_PRECEDENCE
+from ..extractors import ARCHIVE_METHODS_INDEXING_PRECEDENCE, EXTRACTORS
+

-EXTRACTORS = [(extractor[0], extractor[0]) for extractor in get_default_archive_methods()]
+EXTRACTOR_CHOICES = [(extractor_name, extractor_name) for extractor_name in EXTRACTORS.keys()]
 STATUS_CHOICES = [
     ("succeeded", "succeeded"),
     ("failed", "failed"),
     ("skipped", "skipped")
 ]

-try:
-    JSONField = models.JSONField
-except AttributeError:
-    import jsonfield
-    JSONField = jsonfield.JSONField


-class Tag(models.Model):
+# class BaseModel(models.Model):
+#     # TODO: migrate all models to a shared base class with all our standard fields and helpers:
+#     #       ulid/created/modified/owner/is_deleted/as_json/from_json/etc.
+#     #
+#     # id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID')
+#     # ulid = models.CharField(max_length=26, null=True, blank=True, db_index=True, unique=True)
+
+#     class Meta(TypedModelMeta):
+#         abstract = True
+
+
+class Tag(ABIDModel):
     """
-    Based on django-taggit model
+    Based on django-taggit model + ABID base.
     """
+    abid_prefix = 'tag_'
+    abid_ts_src = 'self.created'          # TODO: add created/modified time
+    abid_uri_src = 'self.name'
+    abid_subtype_src = '"03"'
+    abid_rand_src = 'self.id'
+
+    # id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=True)
     id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID')
+    uuid = models.UUIDField(blank=True, null=True, editable=True, unique=True)
+    abid = ABIDField(prefix=abid_prefix)

-    name = models.CharField(unique=True, blank=False, max_length=100)

-    # slug is autoset on save from name, never set it manually
+    name = models.CharField(unique=True, blank=False, max_length=100)
     slug = models.SlugField(unique=True, blank=True, max_length=100)
+    # slug is autoset on save from name, never set it manually


-    class Meta:
+    class Meta(TypedModelMeta):
         verbose_name = "Tag"
         verbose_name_plural = "Tags"

@@ -84,8 +105,16 @@ class Tag(models.Model):
             return super().save(*args, **kwargs)


-class Snapshot(models.Model):
-    id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)
+class Snapshot(ABIDModel):
+    abid_prefix = 'snp_'
+    abid_ts_src = 'self.added'
+    abid_uri_src = 'self.url'
+    abid_subtype_src = '"01"'
+    abid_rand_src = 'self.id'
+
+    id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)  # legacy pk
+    uuid = models.UUIDField(blank=True, null=True, editable=True, unique=True)
+    abid = ABIDField(prefix=abid_prefix)

     url = models.URLField(unique=True, db_index=True)
     timestamp = models.CharField(max_length=32, unique=True, db_index=True)
@@ -98,6 +127,7 @@ class Snapshot(models.Model):

     keys = ('url', 'timestamp', 'title', 'tags', 'updated')

+
     def __repr__(self) -> str:
         title = self.title or '-'
         return f'[{self.timestamp}] {self.url[:64]} ({title[:64]})'
@@ -126,8 +156,8 @@ class Snapshot(models.Model):
         from ..index import load_link_details
         return load_link_details(self.as_link())

-    def tags_str(self, nocache=True) -> str:
-        cache_key = f'{self.id}-{(self.updated or self.added).timestamp()}-tags'
+    def tags_str(self, nocache=True) -> str | None:
+        cache_key = f'{self.pk}-{(self.updated or self.added).timestamp()}-tags'
         calc_tags_str = lambda: ','.join(self.tags.order_by('name').values_list('name', flat=True))
         if nocache:
             tags_str = calc_tags_str()
@@ -157,13 +187,9 @@ class Snapshot(models.Model):
         return self.as_link().is_archived

     @cached_property
-    def num_outputs(self):
+    def num_outputs(self) -> int:
         return self.archiveresult_set.filter(status='succeeded').count()

-    @cached_property
-    def url_hash(self):
-        return hashurl(self.url)
-
     @cached_property
     def base_url(self):
         return base_url(self.url)
@@ -178,7 +204,7 @@ class Snapshot(models.Model):

     @cached_property
     def archive_size(self):
-        cache_key = f'{str(self.id)[:12]}-{(self.updated or self.added).timestamp()}-size'
+        cache_key = f'{str(self.pk)[:12]}-{(self.updated or self.added).timestamp()}-size'

         def calc_dir_size():
             try:
@@ -199,7 +225,7 @@ class Snapshot(models.Model):
         return None

     @cached_property
-    def headers(self) -> Optional[dict]:
+    def headers(self) -> Optional[Dict[str, str]]:
         try:
             return json.loads((Path(self.link_dir) / 'headers.json').read_text(encoding='utf-8').strip())
         except Exception:
@@ -250,11 +276,37 @@ class Snapshot(models.Model):
         tags_id = []
         for tag in tags:
             if tag.strip():
-                tags_id.append(Tag.objects.get_or_create(name=tag)[0].id)
+                tags_id.append(Tag.objects.get_or_create(name=tag)[0].pk)
         self.tags.clear()
         self.tags.add(*tags_id)


+    # def get_storage_dir(self, create=True, symlink=True) -> Path:
+    #     date_str = self.added.strftime('%Y%m%d')
+    #     domain_str = domain(self.url)
+    #     abs_storage_dir = Path(ARCHIVE_DIR) / 'snapshots' / date_str / domain_str / str(self.ulid)
+
+    #     if create and not abs_storage_dir.is_dir():
+    #         abs_storage_dir.mkdir(parents=True, exist_ok=True)
+
+    #     if symlink:
+    #         LINK_PATHS = [
+    #             Path(ARCHIVE_DIR).parent / 'index' / 'all_by_id' / str(self.ulid),
+    #             # Path(ARCHIVE_DIR).parent / 'index' / 'snapshots_by_id' / str(self.ulid),
+    #             Path(ARCHIVE_DIR).parent / 'index' / 'snapshots_by_date' / date_str / domain_str / str(self.ulid),
+    #             Path(ARCHIVE_DIR).parent / 'index' / 'snapshots_by_domain' / domain_str / date_str / str(self.ulid),
+    #         ]
+    #         for link_path in LINK_PATHS:
+    #             link_path.parent.mkdir(parents=True, exist_ok=True)
+    #             try:
+    #                 link_path.symlink_to(abs_storage_dir)
+    #             except FileExistsError:
+    #                 link_path.unlink()
+    #                 link_path.symlink_to(abs_storage_dir)
+
+    #     return abs_storage_dir
+
+
 class ArchiveResultManager(models.Manager):
     def indexable(self, sorted: bool = True):
         INDEXABLE_METHODS = [ r[0] for r in ARCHIVE_METHODS_INDEXING_PRECEDENCE ]
@@ -266,13 +318,22 @@ class ArchiveResultManager(models.Manager):
         return qs


-class ArchiveResult(models.Model):
-    id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID')
-    uuid = models.UUIDField(default=uuid.uuid4, editable=False)
+class ArchiveResult(ABIDModel):
+    abid_prefix = 'res_'
+    abid_ts_src = 'self.snapshot.added'
+    abid_uri_src = 'self.snapshot.url'
+    abid_subtype_src = 'self.extractor'
+    abid_rand_src = 'self.uuid'
+    EXTRACTOR_CHOICES = EXTRACTOR_CHOICES
+
+    # id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)
+    id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID')   # legacy pk
+    uuid = models.UUIDField(blank=True, null=True, editable=True, unique=True)
+    abid = ABIDField(prefix=abid_prefix)

     snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE)
-    extractor = models.CharField(choices=EXTRACTORS, max_length=32)
-    cmd = JSONField()
+    extractor = models.CharField(choices=EXTRACTOR_CHOICES, max_length=32)
+    cmd = models.JSONField()
     pwd = models.CharField(max_length=256)
     cmd_version = models.CharField(max_length=128, default=None, null=True, blank=True)
     output = models.CharField(max_length=1024)
@@ -282,5 +343,69 @@ class ArchiveResult(models.Model):

     objects = ArchiveResultManager()

+    class Meta(TypedModelMeta):
+        verbose_name = 'Result'
+
     def __str__(self):
         return self.extractor
+
+    @cached_property
+    def snapshot_dir(self):
+        return Path(self.snapshot.link_dir)
+
+
+    @property
+    def extractor_module(self):
+        return EXTRACTORS[self.extractor]
+
+    def output_path(self) -> str:
+        """return the canonical output filename or directory name within the snapshot dir"""
+        return self.extractor_module.get_output_path()
+
+    def embed_path(self) -> str:
+        """
+        return the actual runtime-calculated path to the file on-disk that
+        should be used for user-facing iframe embeds of this result
+        """
+
+        if hasattr(self.extractor_module, 'get_embed_path'):
+            return self.extractor_module.get_embed_path(self)
+
+        return self.extractor_module.get_output_path()
+
+    def legacy_output_path(self):
+        link = self.snapshot.as_link()
+        return link.canonical_outputs().get(f'{self.extractor}_path')
+
+    def output_exists(self) -> bool:
+        return Path(self.output_path()).exists()
+
+
+    # def get_storage_dir(self, create=True, symlink=True):
+    #     date_str = self.snapshot.added.strftime('%Y%m%d')
+    #     domain_str = domain(self.snapshot.url)
+    #     abs_storage_dir = Path(ARCHIVE_DIR) / 'results' / date_str / domain_str / self.extractor / str(self.ulid)
+
+    #     if create and not abs_storage_dir.is_dir():
+    #         abs_storage_dir.mkdir(parents=True, exist_ok=True)
+
+    #     if symlink:
+    #         LINK_PATHS = [
+    #             Path(ARCHIVE_DIR).parent / 'index' / 'all_by_id' / str(self.ulid),
+    #             # Path(ARCHIVE_DIR).parent / 'index' / 'results_by_id' / str(self.ulid),
+    #             # Path(ARCHIVE_DIR).parent / 'index' / 'results_by_date' / date_str / domain_str / self.extractor / str(self.ulid),
+    #             Path(ARCHIVE_DIR).parent / 'index' / 'results_by_domain' / domain_str / date_str / self.extractor / str(self.ulid),
+    #             Path(ARCHIVE_DIR).parent / 'index' / 'results_by_type' / self.extractor / date_str / domain_str / str(self.ulid),
+    #         ]
+    #         for link_path in LINK_PATHS:
+    #             link_path.parent.mkdir(parents=True, exist_ok=True)
+    #             try:
+    #                 link_path.symlink_to(abs_storage_dir)
+    #             except FileExistsError:
+    #                 link_path.unlink()
+    #                 link_path.symlink_to(abs_storage_dir)
+
+    #     return abs_storage_dir
+
+    # def symlink_index(self, create=True):
+    #     abs_result_dir = self.get_storage_dir(create=create)
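
The new output_path()/embed_path() split on ArchiveResult is what the Snapshot detail page keys off: output_path() is the extractor's canonical output name, while embed_path() lets an extractor substitute a different on-disk file for user-facing iframe embeds. Roughly how a caller consumes it (sketch; see render_live_index() in core/views.py below for the real usage):

    result = snapshot.archiveresult_set.filter(status='succeeded').first()
    if result is not None:
        rel_path = result.embed_path()               # file to embed in the UI
        abs_path = result.snapshot_dir / rel_path    # snapshot_dir is a Path
        if abs_path.exists():
            iframe_src = f'/archive/{snapshot.timestamp}/{rel_path}'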

+ 78 - 14
archivebox/core/settings.py

@@ -10,6 +10,7 @@ from pathlib import Path
 from django.utils.crypto import get_random_string

 from ..config import (
+    CONFIG,
     DEBUG,
     SECRET_KEY,
     ALLOWED_HOSTS,
@@ -20,6 +21,7 @@ from ..config import (
     OUTPUT_DIR,
     ARCHIVE_DIR,
     LOGS_DIR,
+    CACHE_DIR,
     TIMEZONE,

     LDAP,
@@ -53,6 +55,26 @@ APPEND_SLASH = True

 DEBUG = DEBUG or ('--debug' in sys.argv)

+
+# add plugins folders to system path, and load plugins in installed_apps
+BUILTIN_PLUGINS_DIR = PACKAGE_DIR / 'plugins'
+USER_PLUGINS_DIR = OUTPUT_DIR / 'plugins'
+sys.path.insert(0, str(BUILTIN_PLUGINS_DIR))
+sys.path.insert(0, str(USER_PLUGINS_DIR))
+
+def find_plugins(plugins_dir):
+    return {
+        # plugin_entrypoint.parent.name: import_module(plugin_entrypoint.parent.name).METADATA
+        plugin_entrypoint.parent.name: plugin_entrypoint.parent
+        for plugin_entrypoint in plugins_dir.glob('*/apps.py')
+    }
+
+INSTALLED_PLUGINS = {
+    **find_plugins(BUILTIN_PLUGINS_DIR),
+    **find_plugins(USER_PLUGINS_DIR),
+}
+
+
 INSTALLED_APPS = [
     'django.contrib.auth',
     'django.contrib.contenttypes',
@@ -60,13 +82,18 @@ INSTALLED_APPS = [
     'django.contrib.messages',
     'django.contrib.staticfiles',
     'django.contrib.admin',
+    'django_jsonform',

+    'signal_webhooks',
+    'abid_utils',
+    'plugantic',
     'core',
     'api',

+    *INSTALLED_PLUGINS.keys(),
+
     'admin_data_views',

-    'signal_webhooks',
     'django_extensions',
 ]

@@ -227,6 +254,11 @@ TEMPLATES = [
 ### External Service Settings
 ################################################################################

+
+CACHE_DB_FILENAME = 'cache.sqlite3'
+CACHE_DB_PATH = CACHE_DIR / CACHE_DB_FILENAME
+CACHE_DB_TABLE = 'django_cache'
+
 DATABASE_FILE = Path(OUTPUT_DIR) / SQL_INDEX_FILENAME
 DATABASE_NAME = os.environ.get("ARCHIVEBOX_DATABASE_NAME", str(DATABASE_FILE))

@@ -240,18 +272,28 @@ DATABASES = {
         },
         'TIME_ZONE': TIMEZONE,
         # DB setup is sometimes modified at runtime by setup_django() in config.py
-    }
+    },
+    # 'cache': {
+    #     'ENGINE': 'django.db.backends.sqlite3',
+    #     'NAME': CACHE_DB_PATH,
+    #     'OPTIONS': {
+    #         'timeout': 60,
+    #         'check_same_thread': False,
+    #     },
+    #     'TIME_ZONE': TIMEZONE,
+    # },
 }
+MIGRATION_MODULES = {'signal_webhooks': None}
+
+# as much as I'd love this to be a UUID or ULID field, it's not supported yet as of Django 5.0
+DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField'

-CACHE_BACKEND = 'django.core.cache.backends.locmem.LocMemCache'
-# CACHE_BACKEND = 'django.core.cache.backends.db.DatabaseCache'
-# CACHE_BACKEND = 'django.core.cache.backends.dummy.DummyCache'

 CACHES = {
-    'default': {
-        'BACKEND': CACHE_BACKEND,
-        'LOCATION': 'django_cache_default',
-    }
+    'default': {'BACKEND': 'django.core.cache.backends.locmem.LocMemCache'},
+    # 'sqlite': {'BACKEND': 'django.core.cache.backends.db.DatabaseCache', 'LOCATION': 'cache'},
+    # 'dummy': {'BACKEND': 'django.core.cache.backends.dummy.DummyCache'},
+    # 'filebased': {"BACKEND": "django.core.cache.backends.filebased.FileBasedCache", "LOCATION": CACHE_DIR / 'cache_filebased'},
 }

 EMAIL_BACKEND = 'django.core.mail.backends.console.EmailBackend'
@@ -409,9 +451,11 @@ LOGGING = {


 # Add default webhook configuration to the User model
+SIGNAL_WEBHOOKS_CUSTOM_MODEL = 'api.models.OutboundWebhook'
 SIGNAL_WEBHOOKS = {
     "HOOKS": {
-        "django.contrib.auth.models.User": ...,  # ... is a special value that means "use the default autogenerated hooks"
+        # ... is a special sigil value that means "use the default autogenerated hooks"
+        "django.contrib.auth.models.User": ...,
         "core.models.Snapshot": ...,
         "core.models.ArchiveResult": ...,
         "core.models.Tag": ...,
@@ -421,16 +465,36 @@ SIGNAL_WEBHOOKS = {


 ADMIN_DATA_VIEWS = {
-    "NAME": "configuration",
+    "NAME": "Environment",
     "URLS": [
         {
-            "route": "live/",
+            "route": "config/",
             "view": "core.views.live_config_list_view",
-            "name": "live",
+            "name": "Configuration",
             "items": {
                 "route": "<str:key>/",
                 "view": "core.views.live_config_value_view",
-                "name": "live_config_value",
+                "name": "config_val",
+            },
+        },
+        {
+            "route": "binaries/",
+            "view": "plugantic.views.binaries_list_view",
+            "name": "Binaries",
+            "items": {
+                "route": "<str:key>/",
+                "view": "plugantic.views.binary_detail_view",
+                "name": "binary",
+            },
+        },
+        {
+            "route": "plugins/",
+            "view": "plugantic.views.plugins_list_view",
+            "name": "Plugins",
+            "items": {
+                "route": "<str:key>/",
+                "view": "plugantic.views.plugin_detail_view",
+                "name": "plugin",
             },
         },
     ],
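
Plugin discovery above is purely filesystem-based: any directory containing an apps.py under either plugins folder becomes an installed Django app, since both parent folders are pushed onto sys.path. With a hypothetical user plugin at OUTPUT_DIR/plugins/my_custom_extractor/apps.py and OUTPUT_DIR=/data:

    # find_plugins(USER_PLUGINS_DIR) would yield:
    # {'my_custom_extractor': Path('/data/plugins/my_custom_extractor')}
    # ...and 'my_custom_extractor' is then appended to INSTALLED_APPS via *INSTALLED_PLUGINS.keys()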

+ 99 - 13
archivebox/core/views.py

@@ -3,6 +3,7 @@ __package__ = 'archivebox.core'
 from typing import Callable

 from io import StringIO
+from pathlib import Path
 from contextlib import redirect_stdout

 from django.shortcuts import render, redirect
@@ -36,10 +37,14 @@ from ..config import (
     CONFIG_SCHEMA,
     DYNAMIC_CONFIG_SCHEMA,
     USER_CONFIG,
+    SAVE_ARCHIVE_DOT_ORG,
+    PREVIEW_ORIGINALS,
 )
+from ..logging_util import printable_filesize
 from ..main import add
-from ..util import base_url, ansi_to_html
+from ..util import base_url, ansi_to_html, htmlencode, urldecode, urlencode, ts_to_date_str
 from ..search import query_search_index
+from ..extractors.wget import wget_output_path


 class HomepageView(View):
@@ -56,10 +61,80 @@ class HomepageView(View):
 class SnapshotView(View):
     # render static html index from filesystem archive/<timestamp>/index.html

+    @staticmethod
+    def render_live_index(request, snapshot):
+        TITLE_LOADING_MSG = 'Not yet archived...'
+        HIDDEN_RESULTS = ('favicon', 'headers', 'title', 'htmltotext', 'warc', 'archive_org')
+
+        archiveresults = {}
+
+        results = snapshot.archiveresult_set.all()
+
+        for result in results:
+            embed_path = result.embed_path()
+            abs_path = result.snapshot_dir / (embed_path or 'None')
+
+            if (result.status == 'succeeded'
+                and (result.extractor not in HIDDEN_RESULTS)
+                and embed_path
+                and abs_path.exists()):
+                if abs_path.is_dir() and not any(abs_path.glob('*.*')):
+                    continue
+
+                result_info = {
+                    'name': result.extractor,
+                    'path': embed_path,
+                    'ts': ts_to_date_str(result.end_ts),
+                }
+                archiveresults[result.extractor] = result_info
+
+        preferred_types = ('singlefile', 'wget', 'screenshot', 'dom', 'media', 'pdf', 'readability', 'mercury')
+        all_types = preferred_types + tuple(result_type for result_type in archiveresults.keys() if result_type not in preferred_types)
+
+        best_result = {'path': 'None'}
+        for result_type in preferred_types:
+            if result_type in archiveresults:
+                best_result = archiveresults[result_type]
+                break
+
+        link = snapshot.as_link()
+
+        link_info = link._asdict(extended=True)
+
+        try:
+            warc_path = 'warc/' + list(Path(snapshot.link_dir).glob('warc/*.warc.*'))[0].name
+        except IndexError:
+            warc_path = 'warc/'
+
+        context = {
+            **link_info,
+            **link_info['canonical'],
+            'title': htmlencode(
+                link.title
+                or (link.base_url if link.is_archived else TITLE_LOADING_MSG)
+            ),
+            'extension': link.extension or 'html',
+            'tags': link.tags or 'untagged',
+            'size': printable_filesize(link.archive_size) if link.archive_size else 'pending',
+            'status': 'archived' if link.is_archived else 'not yet archived',
+            'status_color': 'success' if link.is_archived else 'danger',
+            'oldest_archive_date': ts_to_date_str(link.oldest_archive_date),
+            'warc_path': warc_path,
+            'SAVE_ARCHIVE_DOT_ORG': SAVE_ARCHIVE_DOT_ORG,
+            'PREVIEW_ORIGINALS': PREVIEW_ORIGINALS,
+            'archiveresults': sorted(archiveresults.values(), key=lambda r: all_types.index(r['name'])),
+            'best_result': best_result,
+            # 'tags_str': 'somealskejrewlkrjwer,werlmwrwlekrjewlkrjwer324m532l,4m32,23m324234',
+        }
+        return render(template_name='core/snapshot_live.html', request=request, context=context)
+
+
     def get(self, request, path):
         if not request.user.is_authenticated and not PUBLIC_SNAPSHOTS:
             return redirect(f'/admin/login/?next={request.path}')

+        snapshot = None
+
         try:
             slug, archivefile = path.split('/', 1)
         except (IndexError, ValueError):
@@ -75,7 +150,11 @@ class SnapshotView(View):
             try:
                 try:
                     snapshot = Snapshot.objects.get(Q(timestamp=slug) | Q(id__startswith=slug))
-                    response = static.serve(request, archivefile, document_root=snapshot.link_dir, show_indexes=True)
+                    if archivefile == 'index.html':
+                        # if they requested snapshot index, serve live rendered template instead of static html
+                        response = self.render_live_index(request, snapshot)
+                    else:
+                        response = static.serve(request, archivefile, document_root=snapshot.link_dir, show_indexes=True)
                     response["Link"] = f'<{snapshot.url}>; rel="canonical"'
                     return response
                 except Snapshot.DoesNotExist:
@@ -127,26 +206,33 @@ class SnapshotView(View):
                     status=404,
                     status=404,
                 )
                 )
             except Http404:
             except Http404:
+                assert snapshot     # (Snapshot.DoesNotExist is already handled above)
+
                 # Snapshot dir exists but file within does not e.g. 124235.324234/screenshot.png
                 # Snapshot dir exists but file within does not e.g. 124235.324234/screenshot.png
                 return HttpResponse(
                 return HttpResponse(
                     format_html(
                     format_html(
                         (
                         (
                             '<center><br/><br/><br/>'
                             '<center><br/><br/><br/>'
-                            f'Snapshot <a href="/archive/{snapshot.timestamp}/index.html" target="_top"><b><code>[{snapshot.timestamp}]</code></b></a> exists in DB, but resource <b><code>{snapshot.timestamp}/'
+                            f'Snapshot <a href="/archive/{snapshot.timestamp}/index.html" target="_top"><b><code>[{snapshot.timestamp}]</code></b></a>: <a href="{snapshot.url}" target="_blank" rel="noreferrer">{snapshot.url}</a><br/>'
+                            f'was queued on {str(snapshot.added).split(".")[0]}, '
+                            f'but no files have been saved yet in:<br/><b><a href="/archive/{snapshot.timestamp}/" target="_top"><code>{snapshot.timestamp}</code></a><code>/'
                             '{}'
-                            f'</code></b> does not exist in the <a href="/archive/{snapshot.timestamp}/" target="_top">snapshot dir</a> yet.<br/><br/>'
-                            'It\'s possible that this resource type is not available for the Snapshot,<br/>or that the archiving process has not completed yet.<br/>'
-                            f'<pre><code># if interrupted, run this cmd to finish archiving this Snapshot<br/>archivebox update -t timestamp {snapshot.timestamp}</code></pre><br/><br/>'
+                            f'</code></b><br/><br/>'
+                            'It\'s possible {} '
+                            f'during the last capture on {str(snapshot.added).split(".")[0]},<br/>or that the archiving process has not completed yet.<br/>'
+                            f'<pre><code># run this cmd to finish/retry archiving this Snapshot</code><br/>'
+                            f'<code style="user-select: all; color: #333">archivebox update -t timestamp {snapshot.timestamp}</code></pre><br/><br/>'
                             '<div class="text-align: left; width: 100%; max-width: 400px">'
                             '<i><b>Next steps:</i></b><br/>'
                             f'- list all the <a href="/archive/{snapshot.timestamp}/" target="_top">Snapshot files <code>.*</code></a><br/>'
                             f'- view the <a href="/archive/{snapshot.timestamp}/index.html" target="_top">Snapshot <code>./index.html</code></a><br/>'
-                            f'- go to the <a href="/admin/core/snapshot/{snapshot.id}/change/" target="_top">Snapshot admin</a> to edit<br/>'
-                            f'- go to the <a href="/admin/core/snapshot/?id__startswith={snapshot.id}" target="_top">Snapshot actions</a> to re-archive<br/>'
+                            f'- go to the <a href="/admin/core/snapshot/{snapshot.pk}/change/" target="_top">Snapshot admin</a> to edit<br/>'
+                            f'- go to the <a href="/admin/core/snapshot/?uuid__startswith={snapshot.uuid}" target="_top">Snapshot actions</a> to re-archive<br/>'
                             '- or return to <a href="/" target="_top">the main index...</a></div>'
                             '</center>'
                         ),
-                        archivefile,
+                        archivefile if str(archivefile) != 'None' else '',
+                        f'the {archivefile} resource could not be fetched' if str(archivefile) != 'None' else 'the original site was not available',
                     ),
                     content_type="text/html",
                     status=404,
@@ -369,21 +455,21 @@ def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext:
 
     for section in CONFIG_SCHEMA.keys():
         for key in CONFIG_SCHEMA[section].keys():
-            rows['Section'].append(section.replace('_', ' ').title().replace(' Config', ''))
+            rows['Section'].append(section)   # section.replace('_', ' ').title().replace(' Config', '')
             rows['Key'].append(ItemLink(key, key=key))
             rows['Type'].append(mark_safe(f'<code>{find_config_type(key)}</code>'))
             rows['Value'].append(mark_safe(f'<code>{CONFIG[key]}</code>') if key_is_safe(key) else '******** (redacted)')
-            rows['Default'].append(mark_safe(f'<a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig.py+%27{key}%27&type=code"><code style="text-decoration: underline">{find_config_default(key) or 'See here...'}</code></a>'))
+            rows['Default'].append(mark_safe(f'<a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig.py+%27{key}%27&type=code"><code style="text-decoration: underline">{find_config_default(key) or "See here..."}</code></a>'))
             # rows['Documentation'].append(mark_safe(f'Wiki: <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#{key.lower()}">{key}</a>'))
             rows['Aliases'].append(', '.join(CONFIG_SCHEMA[section][key].get('aliases', [])))
 
     section = 'DYNAMIC'
     for key in DYNAMIC_CONFIG_SCHEMA.keys():
-        rows['Section'].append(section.replace('_', ' ').title().replace(' Config', ''))
+        rows['Section'].append(section)   # section.replace('_', ' ').title().replace(' Config', '')
         rows['Key'].append(ItemLink(key, key=key))
         rows['Type'].append(mark_safe(f'<code>{find_config_type(key)}</code>'))
         rows['Value'].append(mark_safe(f'<code>{CONFIG[key]}</code>') if key_is_safe(key) else '******** (redacted)')
-        rows['Default'].append(mark_safe(f'<a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig.py+%27{key}%27&type=code"><code style="text-decoration: underline">{find_config_default(key) or 'See here...'}</code></a>'))
+        rows['Default'].append(mark_safe(f'<a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig.py+%27{key}%27&type=code"><code style="text-decoration: underline">{find_config_default(key) or "See here..."}</code></a>'))
         # rows['Documentation'].append(mark_safe(f'Wiki: <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#{key.lower()}">{key}</a>'))
         rows['Aliases'].append(ItemLink(key, key=key) if key in USER_CONFIG else '')
 

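One subtlety in the rows['Default'] lines above: besides keeping the GitHub search link, the fix swaps the quotes around "See here..." from single to double. Before Python 3.12, an f-string expression could not reuse the quote character that delimits the f-string itself, so the old form was a SyntaxError on Python 3.11 and older. A minimal standalone illustration (hypothetical variable names):

    key_default = None
    # SyntaxError on Python <= 3.11: the inner single quotes terminate the f-string early
    # text = f'<code>{key_default or 'See here...'}</code>'
    text = f'<code>{key_default or "See here..."}</code>'   # the portable form used above
    print(text)   # -> <code>See here...</code>
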
+ 39 - 3
archivebox/extractors/__init__.py

@@ -1,11 +1,13 @@
 __package__ = 'archivebox.extractors'
 
+from typing import Callable, Optional, Dict, List, Iterable, Union, Protocol, cast
+
 import os
 import sys
 from pathlib import Path
-
-from typing import Callable, Optional, List, Iterable, Union
+from importlib import import_module
 from datetime import datetime, timezone
+
 from django.db.models import QuerySet
 
 from ..config import (
@@ -158,7 +160,7 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
                     # bump the updated time on the main Snapshot here, this is critical
                     # to be able to cache summaries of the ArchiveResults for a given
                     # snapshot without having to load all the results from the DB each time.
-                    # (we use {Snapshot.id}-{Snapshot.updated} as the cache key and assume
+                    # (we use {Snapshot.pk}-{Snapshot.updated} as the cache key and assume
                     # ArchiveResults are unchanged as long as the updated timestamp is unchanged)
                     snapshot.save()
                 else:
@@ -240,3 +242,37 @@ def archive_links(all_links: Union[Iterable[Link], QuerySet], overwrite: bool=Fa
 
     log_archiving_finished(num_links)
     return all_links
+
+
+
+EXTRACTORS_DIR = Path(__file__).parent
+
+class ExtractorModuleProtocol(Protocol):
+    """Type interface for an Extractor Module (WIP)"""
+    
+    get_output_path: Callable
+    
+    # TODO:
+    # get_embed_path: Callable | None
+    # should_extract(Snapshot)
+    # extract(Snapshot)
+
+
+def get_extractors(dir: Path=EXTRACTORS_DIR) -> Dict[str, ExtractorModuleProtocol]:
+    """iterate through archivebox/extractors/*.py and load extractor modules"""
+    EXTRACTORS = {}
+
+    for filename in EXTRACTORS_DIR.glob('*.py'):
+        if filename.name.startswith('__'):
+            continue
+
+        extractor_name = filename.name.replace('.py', '')
+
+        extractor_module = cast(ExtractorModuleProtocol, import_module(f'.{extractor_name}', package=__package__))
+
+        assert getattr(extractor_module, 'get_output_path')
+        EXTRACTORS[extractor_name] = extractor_module
+
+    return EXTRACTORS
+
+EXTRACTORS = get_extractors(EXTRACTORS_DIR)

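The loader above effectively turns each archivebox/extractors/*.py file into a plugin: any module that defines get_output_path() is imported and registered in the EXTRACTORS dict at import time (note that get_extractors() never actually reads its dir argument; the glob always runs over EXTRACTORS_DIR). A hedged sketch of how a consumer might walk the registry, not code from this commit:

    from archivebox.extractors import EXTRACTORS

    for name, module in EXTRACTORS.items():
        # get_output_path() is guaranteed by the assert in get_extractors();
        # get_embed_path() is optional and only defined by some extractors
        line = f'{name:<12} -> {module.get_output_path()}'
        get_embed_path = getattr(module, 'get_embed_path', None)
        if get_embed_path:
            line += f' (default embed: {get_embed_path()})'
        print(line)
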
+ 5 - 3
archivebox/extractors/archive_org.py

@@ -24,6 +24,8 @@ from ..config import (
 )
 from ..logging_util import TimedProgress
 
+def get_output_path():
+    return 'archive.org.txt'
 
 
 @enforce_types
@@ -32,7 +34,7 @@ def should_save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, overwr
         return False
 
     out_dir = out_dir or Path(link.link_dir)
-    if not overwrite and (out_dir / 'archive.org.txt').exists():
+    if not overwrite and (out_dir / get_output_path()).exists():
         # if open(path, 'r', encoding='utf-8').read().strip() != 'None':
         return False
 
@@ -43,7 +45,7 @@ def save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, timeout: int=
     """submit site to archive.org for archiving via their service, save returned archive url"""
     """submit site to archive.org for archiving via their service, save returned archive url"""
 
 
     out_dir = out_dir or Path(link.link_dir)
     out_dir = out_dir or Path(link.link_dir)
-    output: ArchiveOutput = 'archive.org.txt'
+    output: ArchiveOutput = get_output_path()
     archive_org_url = None
     submit_url = 'https://web.archive.org/save/{}'.format(link.url)
     # later options take precedence
@@ -88,7 +90,7 @@ def save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, timeout: int=
         archive_org_url = archive_org_url or submit_url
         with open(str(out_dir / output), 'w', encoding='utf-8') as f:
             f.write(archive_org_url)
-        chmod_file('archive.org.txt', cwd=str(out_dir))
+        chmod_file(str(out_dir / output), cwd=str(out_dir))
         output = archive_org_url
 
     return ArchiveResult(

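This file establishes the pattern repeated in every extractor diff below: the output filename is declared once in a module-level get_output_path(), and should_save_*() / save_*() / chmod_file() all reference it, which is what lets the new Snapshot detail page ask each extractor where its output lives. The chmod_file() change also fixes a small path bug, since it previously received the bare filename instead of a path under out_dir. The shared shape, abridged into a generic sketch (names are illustrative, not verbatim):

    from pathlib import Path

    def get_output_path():
        # single source of truth for this extractor's output,
        # relative to the snapshot directory
        return 'archive.org.txt'

    def should_save(link_dir: str, overwrite: bool = False) -> bool:
        # skip re-extraction when output exists and overwrite is off
        return overwrite or not (Path(link_dir) / get_output_path()).exists()
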
+ 6 - 3
archivebox/extractors/dom.py

@@ -19,6 +19,9 @@ from ..config import (
 from ..logging_util import TimedProgress
 
 
+def get_output_path():
+    return 'output.html'
+
 
 @enforce_types
 def should_save_dom(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
@@ -26,8 +29,8 @@ def should_save_dom(link: Link, out_dir: Optional[Path]=None, overwrite: Optiona
         return False
 
     out_dir = out_dir or Path(link.link_dir)
-    if not overwrite and (out_dir / 'output.html').exists():
-        if (out_dir / 'output.html').stat().st_size > 1:
+    if not overwrite and (out_dir / get_output_path()).exists():
+        if (out_dir / get_output_path()).stat().st_size > 1:
             return False
 
     return SAVE_DOM
@@ -37,7 +40,7 @@ def save_dom(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
     """print HTML of site to file using chrome --dump-html"""
     """print HTML of site to file using chrome --dump-html"""
 
 
     out_dir = out_dir or Path(link.link_dir)
     out_dir = out_dir or Path(link.link_dir)
-    output: ArchiveOutput = 'output.html'
+    output: ArchiveOutput = get_output_path()
     output_path = out_dir / output
     cmd = [
         *chrome_args(),

+ 7 - 2
archivebox/extractors/favicon.py

@@ -8,8 +8,8 @@ from ..index.schema import Link, ArchiveResult, ArchiveOutput
 from ..system import chmod_file, run
 from ..util import (
     enforce_types,
-     domain,
-     dedupe,
+    domain,
+    dedupe,
 )
 from ..config import (
     TIMEOUT,
@@ -33,6 +33,11 @@ def should_save_favicon(link: Link, out_dir: Optional[str]=None, overwrite: Opti
 
     return SAVE_FAVICON
 
+@enforce_types
+def get_output_path():
+    return 'favicon.ico'
+
+
 @enforce_types
 def save_favicon(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
     """download site favicon from google's favicon api"""

+ 15 - 2
archivebox/extractors/git.py

@@ -26,6 +26,19 @@ from ..config import (
 from ..logging_util import TimedProgress
 
 
+def get_output_path():
+    return 'git/'
+
+def get_embed_path(archiveresult=None):
+    if not archiveresult:
+        return get_output_path()
+
+    try:
+        return get_output_path() + list((archiveresult.snapshot_dir / get_output_path()).glob('*'))[0].name + '/'
+    except IndexError:
+        pass
+
+    return get_output_path()
 
 @enforce_types
 def should_save_git(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
@@ -33,7 +46,7 @@ def should_save_git(link: Link, out_dir: Optional[Path]=None, overwrite: Optiona
         return False
 
     out_dir = out_dir or Path(link.link_dir)
-    if not overwrite and (out_dir / 'git').exists():
+    if not overwrite and (out_dir / get_output_path()).exists():
         return False
 
     is_clonable_url = (
@@ -51,7 +64,7 @@ def save_git(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
     """download full site using git"""
     """download full site using git"""
 
 
     out_dir = out_dir or Path(link.link_dir)
     out_dir = out_dir or Path(link.link_dir)
-    output: ArchiveOutput = 'git'
+    output: ArchiveOutput = get_output_path()
     output_path = out_dir / output
     output_path.mkdir(exist_ok=True)
     cmd = [

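git (and media below) are the extractors whose displayable file only gets its name during extraction, e.g. the cloned repo's directory or the downloaded video, so they additionally expose get_embed_path(archiveresult), which globs the output directory at render time and degrades gracefully to the bare output directory when nothing has been saved yet. The same lookup as a standalone hedged sketch (assuming snapshot_dir is a Path, as the diff does):

    from pathlib import Path

    def embed_path_for(snapshot_dir: Path, output_path: str = 'git/') -> str:
        out_dir = snapshot_dir / output_path
        # first repo subdirectory, if the clone already happened
        for entry in sorted(out_dir.glob('*')):
            return output_path + entry.name + '/'
        # nothing extracted yet: fall back to the directory itself
        return output_path
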
+ 7 - 3
archivebox/extractors/headers.py

@@ -23,10 +23,14 @@ from ..config import (
 )
 from ..logging_util import TimedProgress
 
+def get_output_path():
+    return 'headers.json'
+
+
 @enforce_types
 def should_save_headers(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
     out_dir = out_dir or Path(link.link_dir)
-    if not overwrite and (out_dir / 'headers.json').exists():
+    if not overwrite and (out_dir / get_output_path()).exists():
         return False
 
     return SAVE_HEADERS
@@ -38,7 +42,7 @@ def save_headers(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT)
 
     out_dir = Path(out_dir or link.link_dir)
     output_folder = out_dir.absolute()
-    output: ArchiveOutput = 'headers.json'
+    output: ArchiveOutput = get_output_path()
 
     status = 'succeeded'
     timer = TimedProgress(timeout, prefix='      ')
@@ -59,7 +63,7 @@ def save_headers(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT)
     try:
         json_headers = get_headers(link.url, timeout=timeout)
         output_folder.mkdir(exist_ok=True)
-        atomic_write(str(output_folder / "headers.json"), json_headers)
+        atomic_write(str(output_folder / get_output_path()), json_headers)
     except (Exception, OSError) as err:
         status = 'failed'
         output = err

+ 8 - 2
archivebox/extractors/htmltotext.py

@@ -19,6 +19,12 @@ from ..util import (
 )
 from .title import get_html
 
+
+def get_output_path():
+    return "htmltotext.txt"
+
+
+
 class HTMLTextExtractor(HTMLParser):
     TEXT_ATTRS = [
         "alt", "cite", "href", "label",
@@ -109,7 +115,7 @@ def should_save_htmltotext(link: Link, out_dir: Optional[Path]=None, overwrite:
         return False
 
     out_dir = out_dir or Path(link.link_dir)
-    if not overwrite and (out_dir / 'htmltotext.txt').exists():
+    if not overwrite and (out_dir / get_output_path()).exists():
         return False
 
     return SAVE_HTMLTOTEXT
@@ -120,7 +126,7 @@ def save_htmltotext(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
     """extract search-indexing-friendly text from an HTML document"""
     """extract search-indexing-friendly text from an HTML document"""
 
 
     out_dir = Path(out_dir or link.link_dir)
     out_dir = Path(out_dir or link.link_dir)
-    output = "htmltotext.txt"
+    output = get_output_path()
     cmd = ['(internal) archivebox.extractors.htmltotext', './{singlefile,dom}.html']
 
     timer = TimedProgress(timeout, prefix='      ')

+ 16 - 2
archivebox/extractors/media.py

@@ -22,13 +22,27 @@ from ..config import (
 from ..logging_util import TimedProgress
 
 
+def get_output_path():
+    return 'media/'
+
+def get_embed_path(archiveresult=None):
+    if not archiveresult:
+        return get_output_path()
+
+    out_dir = archiveresult.snapshot_dir / get_output_path()
+    try:
+        return get_output_path() + list(out_dir.glob('*.mp4'))[0].name
+    except IndexError:
+        return get_output_path()
+
+
 @enforce_types
 def should_save_media(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
     if is_static_file(link.url):
         return False
 
     out_dir = out_dir or Path(link.link_dir)
-    if not overwrite and (out_dir / 'media').exists():
+    if not overwrite and (out_dir / get_output_path()).exists():
         return False
 
     return SAVE_MEDIA
@@ -38,7 +52,7 @@ def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIME
     """Download playlists or individual video, audio, and subtitles using youtube-dl or yt-dlp"""
     """Download playlists or individual video, audio, and subtitles using youtube-dl or yt-dlp"""
 
 
     out_dir = out_dir or Path(link.link_dir)
     out_dir = out_dir or Path(link.link_dir)
-    output: ArchiveOutput = 'media'
+    output: ArchiveOutput = get_output_path()
     output_path = out_dir / output
     output_path.mkdir(exist_ok=True)
     # later options take precedence

+ 9 - 3
archivebox/extractors/mercury.py

@@ -24,6 +24,12 @@ from ..config import (
 from ..logging_util import TimedProgress
 
 
+def get_output_path():
+    return 'mercury/'
+
+def get_embed_path(archiveresult=None):
+    return get_output_path() + 'content.html'
+
 
 @enforce_types
 def ShellError(cmd: List[str], result: CompletedProcess, lines: int=20) -> ArchiveError:
@@ -44,7 +50,7 @@ def should_save_mercury(link: Link, out_dir: Optional[str]=None, overwrite: Opti
         return False
 
     out_dir = out_dir or Path(link.link_dir)
-    if not overwrite and (out_dir / 'mercury').exists():
+    if not overwrite and (out_dir / get_output_path()).exists():
         return False
 
     return SAVE_MERCURY
@@ -55,8 +61,8 @@ def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT)
     """download reader friendly version using @postlight/mercury-parser"""
     """download reader friendly version using @postlight/mercury-parser"""
 
 
     out_dir = Path(out_dir or link.link_dir)
     out_dir = Path(out_dir or link.link_dir)
-    output_folder = out_dir.absolute() / "mercury"
-    output = "mercury"
+    output_folder = out_dir.absolute() / get_output_path()
+    output = get_output_path()
 
     status = 'succeeded'
     timer = TimedProgress(timeout, prefix='      ')

+ 7 - 3
archivebox/extractors/pdf.py

@@ -19,13 +19,17 @@ from ..config import (
 from ..logging_util import TimedProgress
 
 
+def get_output_path():
+    return 'output.pdf'
+
+
 @enforce_types
 def should_save_pdf(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
     if is_static_file(link.url):
         return False
 
     out_dir = out_dir or Path(link.link_dir)
-    if not overwrite and (out_dir / 'output.pdf').exists():
+    if not overwrite and (out_dir / get_output_path()).exists():
         return False
 
     return SAVE_PDF
@@ -36,7 +40,7 @@ def save_pdf(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
     """print PDF of site to file using chrome --headless"""
     """print PDF of site to file using chrome --headless"""
 
 
     out_dir = out_dir or Path(link.link_dir)
     out_dir = out_dir or Path(link.link_dir)
-    output: ArchiveOutput = 'output.pdf'
+    output: ArchiveOutput = get_output_path()
     cmd = [
         *chrome_args(),
         '--print-to-pdf',
@@ -51,7 +55,7 @@ def save_pdf(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
             hints = (result.stderr or result.stdout).decode()
             raise ArchiveError('Failed to save PDF', hints)
 
-        chmod_file('output.pdf', cwd=str(out_dir))
+        chmod_file(get_output_path(), cwd=str(out_dir))
     except Exception as err:
         status = 'failed'
         output = err

+ 9 - 3
archivebox/extractors/readability.py

@@ -22,6 +22,12 @@ from ..config import (
 from ..logging_util import TimedProgress
 from .title import get_html
 
+def get_output_path():
+    return 'readability/'
+
+def get_embed_path(archiveresult=None):
+    return get_output_path() + 'content.html'
+
 
 @enforce_types
 def should_save_readability(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
@@ -29,7 +35,7 @@ def should_save_readability(link: Link, out_dir: Optional[str]=None, overwrite:
         return False
 
     out_dir = out_dir or Path(link.link_dir)
-    if not overwrite and (out_dir / 'readability').exists():
+    if not overwrite and (out_dir / get_output_path()).exists():
         return False
 
     return SAVE_READABILITY
@@ -40,8 +46,8 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
     """download reader friendly version using @mozilla/readability"""
     """download reader friendly version using @mozilla/readability"""
 
 
     out_dir = Path(out_dir or link.link_dir)
     out_dir = Path(out_dir or link.link_dir)
-    output_folder = out_dir.absolute() / "readability"
-    output = "readability"
+    output_folder = out_dir.absolute() / get_output_path()
+    output = get_output_path()
 
     # Readability Docs: https://github.com/mozilla/readability
 

+ 5 - 2
archivebox/extractors/screenshot.py

@@ -19,6 +19,9 @@ from ..config import (
 from ..logging_util import TimedProgress
 
 
+def get_output_path():
+    return 'screenshot.png'
+
 
 @enforce_types
 def should_save_screenshot(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
@@ -26,7 +29,7 @@ def should_save_screenshot(link: Link, out_dir: Optional[Path]=None, overwrite:
         return False
 
     out_dir = out_dir or Path(link.link_dir)
-    if not overwrite and (out_dir / 'screenshot.png').exists():
+    if not overwrite and (out_dir / get_output_path()).exists():
         return False
 
     return SAVE_SCREENSHOT
@@ -36,7 +39,7 @@ def save_screenshot(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
     """take screenshot of site using chrome --headless"""
     """take screenshot of site using chrome --headless"""
     
     
     out_dir = out_dir or Path(link.link_dir)
     out_dir = out_dir or Path(link.link_dir)
-    output: ArchiveOutput = 'screenshot.png'
+    output: ArchiveOutput = get_output_path()
     cmd = [
         *chrome_args(),
         '--screenshot',

+ 8 - 3
archivebox/extractors/singlefile.py

@@ -26,13 +26,17 @@ from ..config import (
 from ..logging_util import TimedProgress
 
 
+def get_output_path():
+    return 'singlefile.html'
+
+
 @enforce_types
 def should_save_singlefile(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
     if is_static_file(link.url):
         return False
 
     out_dir = out_dir or Path(link.link_dir)
-    if not overwrite and (out_dir / 'singlefile.html').exists():
+    if not overwrite and (out_dir / get_output_path()).exists():
         return False
 
     return SAVE_SINGLEFILE
@@ -43,7 +47,7 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
     """download full site using single-file"""
     """download full site using single-file"""
 
 
     out_dir = out_dir or Path(link.link_dir)
     out_dir = out_dir or Path(link.link_dir)
-    output = "singlefile.html"
+    output = get_output_path()
 
     browser_args = chrome_args(CHROME_TIMEOUT=0)
 
@@ -90,7 +94,8 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
         status = 'failed'
         # TODO: Make this prettier. This is necessary to run the command (escape JSON internal quotes).
         cmd[2] = browser_args.replace('"', "\\\"")
-        err.hints = (result.stdout + result.stderr).decode().split('\n')
+        if result:
+            err.hints = (result.stdout + result.stderr).decode().split('\n')
         output = err
     finally:
         timer.end()

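The new `if result:` guard above closes a secondary failure mode: when single-file dies before the subprocess is even launched (for example, the binary is missing), result is still None, and unconditionally reading result.stdout inside the exception handler raised a second error that masked the original one. The same defensive shape in isolation:

    import subprocess

    result = None
    try:
        result = subprocess.run(['single-file', '--version'], capture_output=True)
        result.check_returncode()
    except Exception as err:
        if result:   # only attach captured output if the process actually ran
            err.hints = (result.stdout + result.stderr).decode().split('\n')
        raise
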
+ 8 - 0
archivebox/extractors/title.py

@@ -60,6 +60,7 @@ class TitleParser(HTMLParser):
         if tag.lower() == "title":
             self.inside_title_tag = False
 
+
 @enforce_types
 def get_html(link: Link, path: Path, timeout: int=TIMEOUT) -> str:
     """
@@ -84,6 +85,13 @@ def get_html(link: Link, path: Path, timeout: int=TIMEOUT) -> str:
     else:
         return document
 
+
+def get_output_path():
+    # TODO: actually save title to this file
+    # (currently only saved in ArchiveResult.output as charfield value, not saved to filesystem)
+    return 'title.json'
+
+
 @enforce_types
 def should_save_title(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
     # if link already has valid title, skip it

+ 12 - 0
archivebox/extractors/wget.py

@@ -35,6 +35,18 @@ from ..config import (
 from ..logging_util import TimedProgress
 
 
+def get_output_path():
+    # TODO: actually save output into this folder, instead of do {domain}/**/index.html
+    return 'wget/'
+
+def get_embed_path(archiveresult=None):
+    if not archiveresult:
+        return get_output_path()
+
+    link = archiveresult.snapshot.as_link()
+    return wget_output_path(link)
+
+
 @enforce_types
 def should_save_wget(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
     output_path = wget_output_path(link)

+ 4 - 4
archivebox/index/html.py

@@ -118,10 +118,10 @@ def render_django_template(template: str, context: Mapping[str, str]) -> str:
 
 
 def snapshot_icons(snapshot) -> str:
-    cache_key = f'{snapshot.id}-{(snapshot.updated or snapshot.added).timestamp()}-snapshot-icons'
+    cache_key = f'{snapshot.pk}-{(snapshot.updated or snapshot.added).timestamp()}-snapshot-icons'
     
     def calc_snapshot_icons():
-        from core.models import EXTRACTORS
+        from core.models import EXTRACTOR_CHOICES
         # start = datetime.now(timezone.utc)
 
         archive_results = snapshot.archiveresult_set.filter(status="succeeded", output__isnull=False)
@@ -147,12 +147,12 @@ def snapshot_icons(snapshot) -> str:
         # Missing specific entry for WARC
 
         extractor_outputs = defaultdict(lambda: None)
-        for extractor, _ in EXTRACTORS:
+        for extractor, _ in EXTRACTOR_CHOICES:
             for result in archive_results:
                 if result.extractor == extractor and result:
                     extractor_outputs[extractor] = result
 
-        for extractor, _ in EXTRACTORS:
+        for extractor, _ in EXTRACTOR_CHOICES:
             if extractor not in exclude:
                 existing = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
                 # Check filesystsem to see if anything is actually present (too slow, needs optimization/caching)

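Two things change in snapshot_icons(): the cache key is built from snapshot.pk, which keeps working regardless of which concrete column ends up as the primary key while the UUID/ABID migration is in flight, and the loops iterate EXTRACTOR_CHOICES, the renamed constant in core.models. Because the key embeds the updated timestamp, and archive_link() (in the extractors diff above) bumps updated on every pass, re-archiving rotates the key so a stale cached summary is never read again. A hedged sketch of the idiom using Django's low-level cache API (calc_snapshot_icons is the inner helper shown above):

    from django.core.cache import cache

    def cached_snapshot_icons(snapshot) -> str:
        cache_key = f'{snapshot.pk}-{(snapshot.updated or snapshot.added).timestamp()}-snapshot-icons'
        html = cache.get(cache_key)
        if html is None:
            html = calc_snapshot_icons()   # expensive: walks the ArchiveResults
            cache.set(cache_key, html)     # TTL left to the cache backend's default
        return html
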
+ 17 - 2
archivebox/index/schema.py

@@ -192,6 +192,9 @@ class Link:
         if extended:
             info.update({
                 'snapshot_id': self.snapshot_id,
+                'snapshot_uuid': self.snapshot_uuid,
+                'snapshot_abid': self.snapshot_abid,
+
                 'link_dir': self.link_dir,
                 'archive_path': self.archive_path,
 
@@ -261,9 +264,21 @@ class Link:
         return to_csv(self, cols=cols or self.field_names(), separator=separator, ljust=ljust)
 
     @cached_property
-    def snapshot_id(self):
+    def snapshot(self):
         from core.models import Snapshot
-        return str(Snapshot.objects.only('id').get(url=self.url).id)
+        return Snapshot.objects.only('uuid').get(url=self.url)
+
+    @cached_property
+    def snapshot_id(self):
+        return str(self.snapshot.pk)
+
+    @cached_property
+    def snapshot_uuid(self):
+        return str(self.snapshot.uuid)
+
+    @cached_property
+    def snapshot_abid(self):
+        return str(self.snapshot.ABID)
 
     @classmethod
     def field_names(cls):

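Link is the legacy in-memory schema object, so these cached_propertys are its bridge into the ORM: a single .only('uuid') query resolves the Snapshot row once, and snapshot_id / snapshot_uuid / snapshot_abid all derive from that cached instance (Django always fetches the primary key even under .only(), which is why .pk is safe here). Hypothetical usage, not from this commit:

    link = load_link_from_archive('https://example.com')   # hypothetical loader helper

    # one DB query on first access, then served from @cached_property:
    print(link.snapshot_id)     # str(pk)  -- stable across the id/uuid migration
    print(link.snapshot_uuid)   # the Snapshot's UUID
    print(link.snapshot_abid)   # the new ABID identifier
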
+ 4 - 3
archivebox/index/sql.py

@@ -45,7 +45,8 @@ def write_link_to_sql_index(link: Link):
     info.pop('tags')
 
     try:
-        info["timestamp"] = Snapshot.objects.get(url=link.url).timestamp
+        snapshot = Snapshot.objects.get(url=link.url)
+        info["timestamp"] = snapshot.timestamp
     except Snapshot.DoesNotExist:
         while Snapshot.objects.filter(timestamp=info["timestamp"]).exists():
             info["timestamp"] = str(float(info["timestamp"]) + 1.0)
@@ -57,7 +58,7 @@ def write_link_to_sql_index(link: Link):
         for entry in entries:
             if isinstance(entry, dict):
                 result, _ = ArchiveResult.objects.get_or_create(
-                    snapshot_id=snapshot.id,
+                    snapshot_id=snapshot.pk,
                     extractor=extractor,
                     start_ts=parse_date(entry['start_ts']),
                     defaults={
@@ -71,7 +72,7 @@ def write_link_to_sql_index(link: Link):
                 )
             else:
                 result, _ = ArchiveResult.objects.update_or_create(
-                    snapshot_id=snapshot.id,
+                    snapshot_id=snapshot.pk,
                     extractor=extractor,
                     start_ts=parse_date(entry.start_ts),
                     defaults={

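Keeping the fetched Snapshot in a local variable lets the get_or_create()/update_or_create() calls key off snapshot.pk without re-querying, and leaves the timestamp collision loop intact: timestamps double as directory names under archive/, so a new link whose timestamp is already taken is bumped one second at a time until a free slot appears. A toy model of that probing, with a set standing in for the Snapshot table:

    existing = {'1718000000.0', '1718000001.0'}    # timestamps already in the DB

    timestamp = '1718000000.0'
    while timestamp in existing:                   # Snapshot.objects.filter(...).exists()
        timestamp = str(float(timestamp) + 1.0)    # bump by one second and retry

    assert timestamp == '1718000002.0'             # first free slot wins
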
+ 16 - 0
archivebox/monkey_patches.py

@@ -0,0 +1,16 @@
+__package__ = 'archivebox'
+
+import django_stubs_ext
+
+django_stubs_ext.monkeypatch()
+
+
+# monkey patch django timezone to add back utc (it was removed in Django 5.0)
+import datetime
+from django.utils import timezone
+timezone.utc = datetime.timezone.utc
+
+
+# monkey patch django-signals-webhooks to change how it shows up in Admin UI
+# from signal_webhooks.apps import DjangoSignalWebhooksConfig
+# DjangoSignalWebhooksConfig.verbose_name = 'API'

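django.utils.timezone.utc was an alias for datetime.timezone.utc that Django deprecated in 4.1 and removed in 5.0; re-attaching it here keeps older ArchiveBox call sites (and any third-party code doing `from django.utils import timezone; timezone.utc`) working unchanged on Django 5. What the patch restores, in miniature:

    import datetime
    from django.utils import timezone

    timezone.utc = datetime.timezone.utc   # the monkey patch from above

    # legacy call sites keep working without edits:
    print(datetime.datetime.now(tz=timezone.utc).isoformat())
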
+ 17 - 0
archivebox/plugantic/__init__.py

@@ -0,0 +1,17 @@
+__package__ = 'archivebox.plugantic'
+
+from .binproviders import BinProvider
+from .binaries import Binary
+from .extractors import Extractor
+from .replayers import Replayer
+from .configs import ConfigSet
+from .plugins import Plugin
+
+# __all__ = [
+#     'BinProvider',
+#     'Binary',
+#     'Extractor',
+#     'Replayer',
+#     'ConfigSet',
+#     'Plugin',
+# ]

+ 26 - 0
archivebox/plugantic/admin.py

@@ -0,0 +1,26 @@
+# from django.contrib import admin
+# from django import forms
+
+# from django_jsonform.widgets import JSONFormWidget
+
+# from django_pydantic_field.v2.fields import PydanticSchemaField
+
+# from .models import CustomPlugin
+
+
+# class PluginForm(forms.ModelForm):
+#     class Meta:
+#         model = CustomPlugin
+#         fields = '__all__'
+#         widgets = {
+#             'items': JSONFormWidget(schema=PluginSchema),
+#         }
+
+
+# class PluginAdmin(admin.ModelAdmin):
+#     formfield_overrides = {
+#         PydanticSchemaField: {"widget": JSONFormWidget},
+#     }
+#     form = PluginForm
+
+    

+ 6 - 0
archivebox/plugantic/apps.py

@@ -0,0 +1,6 @@
+from django.apps import AppConfig
+
+
+class PluganticConfig(AppConfig):
+    default_auto_field = 'django.db.models.BigAutoField'
+    name = 'plugantic'

+ 323 - 0
archivebox/plugantic/binaries.py

@@ -0,0 +1,323 @@
+__package__ = 'archivebox.plugantic'
+
+import sys
+import inspect
+import importlib
+from pathlib import Path
+
+
+from typing import Any, Optional, Dict, List
+from typing_extensions import Self
+from subprocess import run, PIPE
+
+
+from pydantic_core import ValidationError
+
+from pydantic import BaseModel, Field, model_validator, computed_field, field_validator, validate_call, field_serializer
+
+from .binproviders import (
+    SemVer,
+    BinName,
+    BinProviderName,
+    HostBinPath,
+    BinProvider,
+    EnvProvider,
+    AptProvider,
+    BrewProvider,
+    PipProvider,
+    ProviderLookupDict,
+    bin_name,
+    bin_abspath,
+    path_is_script,
+    path_is_executable,
+)
+
+
+class Binary(BaseModel):
+    name: BinName
+    description: str = Field(default='')
+
+    providers_supported: List[BinProvider] = Field(default=[EnvProvider()], alias='providers')
+    provider_overrides: Dict[BinProviderName, ProviderLookupDict] = Field(default={}, alias='overrides')
+    
+    loaded_provider: Optional[BinProviderName] = Field(default=None, alias='provider')
+    loaded_abspath: Optional[HostBinPath] = Field(default=None, alias='abspath')
+    loaded_version: Optional[SemVer] = Field(default=None, alias='version')
+    
+    # bin_filename:  see below
+    # is_executable: see below
+    # is_script
+    # is_valid: see below
+
+
+    @model_validator(mode='after')
+    def validate(self):
+        self.loaded_abspath = bin_abspath(self.name) or self.name
+        self.description = self.description or self.name
+        
+        assert self.providers_supported, f'No providers were given for package {self.name}'
+
+        # pull in any overrides from the binproviders
+        for provider in self.providers_supported:
+            overrides_by_provider = provider.get_providers_for_bin(self.name)
+            if overrides_by_provider:
+                self.provider_overrides[provider.name] = {
+                    **overrides_by_provider,
+                    **self.provider_overrides.get(provider.name, {}),
+                }
+        return self
+
+    @field_validator('loaded_abspath', mode='before')
+    def parse_abspath(cls, value: Any):
+        return bin_abspath(value)
+
+    @field_validator('loaded_version', mode='before')
+    def parse_version(cls, value: Any):
+        return value and SemVer(value)
+
+    @field_serializer('provider_overrides', when_used='json')
+    def serialize_overrides(self, provider_overrides: Dict[BinProviderName, ProviderLookupDict]) -> Dict[BinProviderName, Dict[str, str]]:
+        return {
+            provider_name: {
+                key: str(val)
+                for key, val in overrides.items()
+            }
+            for provider_name, overrides in provider_overrides.items()
+        }
+
+    @computed_field                                                                                           # type: ignore[misc]  # see mypy issue #1362
+    @property
+    def bin_filename(self) -> BinName:
+        if self.is_script:
+            # e.g. '.../Python.framework/Versions/3.11/lib/python3.11/sqlite3/__init__.py' -> sqlite
+            name = self.name
+        elif self.loaded_abspath:
+            # e.g. '/opt/homebrew/bin/wget' -> wget
+            name = bin_name(self.loaded_abspath)
+        else:
+            # e.g. 'ytdlp' -> 'yt-dlp'
+            name = bin_name(self.name)
+        return name
+
+    @computed_field                                                                                           # type: ignore[misc]  # see mypy issue #1362
+    @property
+    def is_executable(self) -> bool:
+        try:
+            assert self.loaded_abspath and path_is_executable(self.loaded_abspath)
+            return True
+        except (ValidationError, AssertionError):
+            return False
+
+    @computed_field                                                                                           # type: ignore[misc]  # see mypy issue #1362
+    @property
+    def is_script(self) -> bool:
+        try:
+            assert self.loaded_abspath and path_is_script(self.loaded_abspath)
+            return True
+        except (ValidationError, AssertionError):
+            return False
+
+    @computed_field                                                                                           # type: ignore[misc]  # see mypy issue #1362
+    @property
+    def is_valid(self) -> bool:
+        return bool(
+            self.name
+            and self.loaded_abspath
+            and self.loaded_version
+            and (self.is_executable or self.is_script)
+        )
+
+    @validate_call
+    def install(self) -> Self:
+        if not self.providers_supported:
+            return self
+
+        exc = Exception('No providers were able to install binary', self.name, self.providers_supported)
+        for provider in self.providers_supported:
+            try:
+                installed_bin = provider.install(self.name, overrides=self.provider_overrides.get(provider.name))
+                if installed_bin:
+                    # print('INSTALLED', self.name, installed_bin)
+                    return self.model_copy(update={
+                        'loaded_provider': provider.name,
+                        'loaded_abspath': installed_bin.abspath,
+                        'loaded_version': installed_bin.version,
+                    })
+            except Exception as err:
+                print(err)
+                exc = err
+        raise exc
+
+    @validate_call
+    def load(self, cache=True) -> Self:
+        if self.is_valid:
+            return self
+
+        if not self.providers_supported:
+            return self
+
+        exc = Exception('No providers were able to install binary', self.name, self.providers_supported)
+        for provider in self.providers_supported:
+            try:
+                installed_bin = provider.load(self.name, cache=cache, overrides=self.provider_overrides.get(provider.name))
+                if installed_bin:
+                    # print('LOADED', provider, self.name, installed_bin)
+                    return self.model_copy(update={
+                        'loaded_provider': provider.name,
+                        'loaded_abspath': installed_bin.abspath,
+                        'loaded_version': installed_bin.version,
+                    })
+            except Exception as err:
+                print(err)
+                exc = err
+        raise exc
+
+    @validate_call
+    def load_or_install(self, cache=True) -> Self:
+        if self.is_valid:
+            return self
+
+        if not self.providers_supported:
+            return self
+
+        exc = Exception('No providers were able to install binary', self.name, self.providers_supported)
+        for provider in self.providers_supported:
+            try:
+                installed_bin = provider.load_or_install(self.name, overrides=self.provider_overrides.get(provider.name), cache=cache)
+                if installed_bin:
+                    # print('LOADED_OR_INSTALLED', self.name, installed_bin)
+                    return self.model_copy(update={
+                        'loaded_provider': provider.name,
+                        'loaded_abspath': installed_bin.abspath,
+                        'loaded_version': installed_bin.version,
+                    })
+            except Exception as err:
+                print(err)
+                exc = err
+        raise exc
+
+    @validate_call
+    def exec(self, args=(), pwd='.'):
+        assert self.loaded_abspath
+        assert self.loaded_version
+        return run([self.loaded_abspath, *args], stdout=PIPE, stderr=PIPE, cwd=pwd)
+
+
+
+
+class SystemPythonHelpers:
+    @staticmethod
+    def get_subdeps() -> str:
+        return 'python3 python3-minimal python3-pip python3-virtualenv'
+
+    @staticmethod
+    def get_abspath() -> str:
+        return sys.executable
+    
+    @staticmethod
+    def get_version() -> str:
+        return '{}.{}.{}'.format(*sys.version_info[:3])
+
+
+class SqliteHelpers:
+    @staticmethod
+    def get_abspath() -> Path:
+        import sqlite3
+        importlib.reload(sqlite3)
+        return Path(inspect.getfile(sqlite3))
+
+    @staticmethod
+    def get_version() -> SemVer:
+        import sqlite3
+        importlib.reload(sqlite3)
+        version = sqlite3.version
+        assert version
+        return SemVer(version)
+
+class DjangoHelpers:
+    @staticmethod
+    def get_django_abspath() -> str:
+        import django
+        return inspect.getfile(django)
+    
+
+    @staticmethod
+    def get_django_version() -> str:
+        import django
+        return '{}.{}.{} {} ({})'.format(*django.VERSION)
+
+class YtdlpHelpers:
+    @staticmethod
+    def get_ytdlp_subdeps() -> str:
+        return 'yt-dlp ffmpeg'
+
+    @staticmethod
+    def get_ytdlp_version() -> str:
+        import yt_dlp
+        importlib.reload(yt_dlp)
+
+        version = yt_dlp.version.__version__
+        assert version
+        return version
+
+class PythonBinary(Binary):
+    name: BinName = 'python'
+
+    providers_supported: List[BinProvider] = [
+        EnvProvider(
+            subdeps_provider={'python': 'plugantic.binaries.SystemPythonHelpers.get_subdeps'},
+            abspath_provider={'python': 'plugantic.binaries.SystemPythonHelpers.get_abspath'},
+            version_provider={'python': 'plugantic.binaries.SystemPythonHelpers.get_version'},
+        ),
+    ]
+
+class SqliteBinary(Binary):
+    name: BinName = 'sqlite'
+    providers_supported: List[BinProvider] = [
+        EnvProvider(
+            version_provider={'sqlite': 'plugantic.binaries.SqliteHelpers.get_version'},
+            abspath_provider={'sqlite': 'plugantic.binaries.SqliteHelpers.get_abspath'},
+        ),
+    ]
+
+class DjangoBinary(Binary):
+    name: BinName = 'django'
+    providers_supported: List[BinProvider] = [
+        EnvProvider(
+            abspath_provider={'django': 'plugantic.binaries.DjangoHelpers.get_django_abspath'},
+            version_provider={'django': 'plugantic.binaries.DjangoHelpers.get_django_version'},
+        ),
+    ]
+
+
+
+
+
+class YtdlpBinary(Binary):
+    name: BinName = 'yt-dlp'
+    providers_supported: List[BinProvider] = [
+        # EnvProvider(),
+        PipProvider(version_provider={'yt-dlp': 'plugantic.binaries.YtdlpHelpers.get_ytdlp_version'}),
+        BrewProvider(subdeps_provider={'yt-dlp': 'plugantic.binaries.YtdlpHelpers.get_ytdlp_subdeps'}),
+        # AptProvider(subdeps_provider={'yt-dlp': lambda: 'yt-dlp ffmpeg'}),
+    ]
+
+
+class WgetBinary(Binary):
+    name: BinName = 'wget'
+    providers_supported: List[BinProvider] = [EnvProvider(), AptProvider()]
+
+
+# if __name__ == '__main__':
+#     PYTHON_BINARY = PythonBinary()
+#     SQLITE_BINARY = SqliteBinary()
+#     DJANGO_BINARY = DjangoBinary()
+#     WGET_BINARY = WgetBinary()
+#     YTDLP_BINARY = YtdlpBinary()
+
+#     print('-------------------------------------DEFINING BINARIES---------------------------------')
+#     print(PYTHON_BINARY)
+#     print(SQLITE_BINARY)
+#     print(DJANGO_BINARY)
+#     print(WGET_BINARY)
+#     print(YTDLP_BINARY)

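Binary folds "which executable, from which provider, at what version" into a single pydantic model: load() only detects what is already on the host, install() forces an install through the first provider that succeeds, and load_or_install() is the everyday entry point. Note that each of these returns an updated model_copy() rather than mutating in place. A hedged usage sketch (provider names are illustrative):

    from archivebox.plugantic.binaries import WgetBinary

    wget = WgetBinary().load_or_install()   # new copy with loaded_* fields filled in

    if wget.is_valid:
        print(wget.loaded_provider)                  # e.g. 'env' or 'apt'
        print(wget.loaded_abspath, wget.loaded_version)
        wget.exec(args=['--version'])                # thin wrapper around subprocess.run
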
+ 561 - 0
archivebox/plugantic/binproviders.py

@@ -0,0 +1,561 @@
+__package__ = 'archivebox.plugantic'
+
+import os
+import shutil
+import operator
+
+from typing import Callable, Any, Optional, Type, Dict, Annotated, ClassVar, Literal, cast, TYPE_CHECKING
+from typing_extensions import Self
+from abc import ABC, abstractmethod
+from collections import namedtuple
+from pathlib import Path
+from subprocess import run, PIPE
+
+from pydantic_core import core_schema, ValidationError
+from pydantic import BaseModel, Field, TypeAdapter, AfterValidator, validate_call, GetCoreSchemaHandler
+
+
+
+def func_takes_args_or_kwargs(lambda_func: Callable[..., Any]) -> bool:
+    """returns True if a lambda func takes args/kwargs of any kind, otherwise false if it's pure/argless"""
+    code = lambda_func.__code__
+    has_args = code.co_argcount > 0
+    has_varargs = code.co_flags & 0x04 != 0
+    has_varkw = code.co_flags & 0x08 != 0
+    return has_args or has_varargs or has_varkw
+
+
+def is_semver_str(semver: Any) -> bool:
+    if isinstance(semver, str):
+        return (semver.count('.') == 2 and semver.replace('.', '').isdigit())
+    return False
+
+def semver_to_str(semver: tuple[int, int, int] | str) -> str:
+    if isinstance(semver, (list, tuple)):
+        return '.'.join(str(chunk) for chunk in semver)
+    if is_semver_str(semver):
+        return semver
+    raise ValidationError('Tried to convert invalid SemVer: {}'.format(semver))
+
+
+SemVerTuple = namedtuple('SemVerTuple', ('major', 'minor', 'patch'), defaults=(0, 0, 0))
+SemVerParsableTypes = str | tuple[str | int, ...] | list[str | int]
+
+class SemVer(SemVerTuple):
+    major: int
+    minor: int = 0
+    patch: int = 0
+
+    if TYPE_CHECKING:
+        full_text: str | None = ''
+
+    def __new__(cls, *args, full_text=None, **kwargs):
+        # '1.1.1'
+        if len(args) == 1 and is_semver_str(args[0]):
+            result = SemVer.parse(args[0])
+
+        # ('1', '2', '3')
+        elif len(args) == 1 and isinstance(args[0], (tuple, list)):
+            result = SemVer.parse(args[0])
+
+        # (1, '2', None)
+        elif not all(isinstance(arg, (int, type(None))) for arg in args):
+            result = SemVer.parse(args)
+
+        # (None)
+        elif all(chunk in ('', 0, None) for chunk in (*args, *kwargs.values())):
+            result = None
+
+        # 1, 2, 3
+        else:
+            result = SemVerTuple.__new__(cls, *args, **kwargs)
+
+        if result is not None:
+            # add first line as extra hidden metadata so it can be logged without having to re-run version cmd
+            result.full_text = full_text or str(result)
+        return result
+
+    @classmethod
+    def parse(cls, version_stdout: SemVerParsableTypes) -> Self | None:
+        """
+        parses a version tag string formatted like into (major, minor, patch) ints
+        'Google Chrome 124.0.6367.208'             -> (124, 0, 6367)
+        'GNU Wget 1.24.5 built on darwin23.2.0.'   -> (1, 24, 5)
+        'curl 8.4.0 (x86_64-apple-darwin23.0) ...' -> (8, 4, 0)
+        '2024.04.09'                               -> (2024, 4, 9)
+
+        """
+        # print('INITIAL_VALUE', type(version_stdout).__name__, version_stdout)
+
+        if isinstance(version_stdout, (tuple, list)):
+            version_stdout = '.'.join(str(chunk) for chunk in version_stdout)
+        elif isinstance(version_stdout, bytes):
+            version_stdout = version_stdout.decode()
+        elif not isinstance(version_stdout, str):
+            version_stdout = str(version_stdout)
+        
+        # no text to work with, return None immediately
+        if not version_stdout.strip():
+            # raise Exception('Tried to parse semver from empty version output (is binary installed and available?)')
+            return None
+
+        just_numbers = lambda col: col.lower().strip('v').split('+')[0].split('-')[0].split('_')[0]
+        contains_semver = lambda col: (
+            col.count('.') in (1, 2, 3)
+            and all(chunk.isdigit() for chunk in col.split('.')[:3])  # first 3 chunks can only be nums
+        )
+
+        full_text = version_stdout.split('\n')[0].strip()
+        first_line_columns = full_text.split()[:4]
+        version_columns = list(filter(contains_semver, map(just_numbers, first_line_columns)))
+        
+        # could not find any column of first line that looks like a version number, despite there being some text
+        if not version_columns:
+            # raise Exception('Failed to parse semver from version command output: {}'.format(' '.join(first_line_columns)))
+            return None
+
+        # take first col containing a semver, and truncate it to 3 chunks (e.g. 2024.04.09.91) -> (2024, 04, 09)
+        first_version_tuple = version_columns[0].split('.', 3)[:3]
+
+        # print('FINAL_VALUE', first_version_tuple)
+
+        return cls(*(int(chunk) for chunk in first_version_tuple), full_text=full_text)
+
+    def __str__(self):
+        return '.'.join(str(chunk) for chunk in self)
+
+    # @classmethod
+    # def __get_pydantic_core_schema__(cls, source: Type[Any], handler: GetCoreSchemaHandler) -> core_schema.CoreSchema:
+    #     default_schema = handler(source)
+    #     return core_schema.no_info_after_validator_function(
+    #         cls.parse,
+    #         default_schema,
+    #         serialization=core_schema.plain_serializer_function_ser_schema(
+    #             lambda semver: str(semver),
+    #             info_arg=False,
+    #             return_schema=core_schema.str_schema(),
+    #         ),
+    #     )
+
+assert SemVer(None) == None
+assert SemVer('') == None
+assert SemVer.parse('') == None
+assert SemVer(1) == (1, 0, 0)
+assert SemVer(1, 2) == (1, 2, 0)
+assert SemVer('1.2+234234') == (1, 2, 0)
+assert SemVer((1, 2, 3)) == (1, 2, 3)
+assert getattr(SemVer((1, 2, 3)), 'full_text') == '1.2.3'
+assert SemVer(('1', '2', '3')) == (1, 2, 3)
+assert SemVer.parse('5.6.7') == (5, 6, 7)
+assert SemVer.parse('124.0.6367.208') == (124, 0, 6367)
+assert SemVer.parse('Google Chrome 124.1+234.234') == (124, 1, 0)
+assert SemVer.parse('Google Ch1rome 124.0.6367.208') == (124, 0, 6367)
+assert SemVer.parse('Google Chrome 124.0.6367.208+beta_234. 234.234.123\n123.456.324') == (124, 0, 6367)
+assert getattr(SemVer.parse('Google Chrome 124.0.6367.208+beta_234. 234.234.123\n123.456.324'), 'full_text') == 'Google Chrome 124.0.6367.208+beta_234. 234.234.123'
+assert SemVer.parse('Google Chrome') == None
+
+@validate_call
+def bin_name(bin_path_or_name: str | Path) -> str:
+    name = Path(bin_path_or_name).name
+    assert len(name) > 1
+    assert name.replace('-', '').replace('_', '').replace('.', '').isalnum(), (
+        f'Binary name can only contain a-Z0-9-_.: {name}')
+    return name
+
+BinName = Annotated[str, AfterValidator(bin_name)]
+
+@validate_call
+def path_is_file(path: Path | str) -> Path:
+    path = Path(path) if isinstance(path, str) else path
+    assert path.is_file(), f'Path is not a file: {path}'
+    return path
+
+HostExistsPath = Annotated[Path, AfterValidator(path_is_file)]
+
+@validate_call
+def path_is_executable(path: HostExistsPath) -> HostExistsPath:
+    assert os.access(path, os.X_OK), f'Path is not executable (fix by running chmod +x {path})'
+    return path
+
+@validate_call
+def path_is_script(path: HostExistsPath) -> HostExistsPath:
+    SCRIPT_EXTENSIONS = ('.py', '.js', '.sh')
+    assert path.suffix.lower() in SCRIPT_EXTENSIONS, 'Path is not a script (does not end in {})'.format(', '.join(SCRIPT_EXTENSIONS))
+    return path
+
+HostExecutablePath = Annotated[HostExistsPath, AfterValidator(path_is_executable)]
+
+@validate_call
+def path_is_abspath(path: Path) -> Path:
+    return path.resolve()
+
+HostAbsPath = Annotated[HostExistsPath, AfterValidator(path_is_abspath)]
+HostBinPath = Annotated[Path, AfterValidator(path_is_abspath), AfterValidator(path_is_file)]
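+
+# Illustrative behavior (assumes /bin/sh exists on the host):
+#   TypeAdapter(HostBinPath).validate_python('/bin/sh')       -> Path to the resolved binary
+#   TypeAdapter(HostBinPath).validate_python('/nonexistent')  -> raises ValidationError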
+
+
+@validate_call
+def bin_abspath(bin_path_or_name: BinName | Path) -> HostBinPath | None:
+    assert bin_path_or_name
+
+    if str(bin_path_or_name).startswith('/'):
+        # already a path, get its absolute form
+        abspath = Path(bin_path_or_name).resolve()
+    else:
+        # not a path yet, get path using os.which
+        binpath = shutil.which(bin_path_or_name)
+        if not binpath:
+            return None
+        abspath = Path(binpath).resolve()
+
+    try:
+        return TypeAdapter(HostBinPath).validate_python(abspath)
+    except ValidationError:
+        return None
+
+
+@validate_call
+def bin_version(bin_path: HostBinPath, args=('--version',)) -> SemVer | None:
+    return SemVer(run([bin_path, *args], stdout=PIPE).stdout.strip().decode())
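+
+# Illustrative example (assumes git is installed and on $PATH):
+#   bin_version(bin_abspath('git'))   # parses 'git version 2.44.0' -> SemVer(2, 44, 0)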
+
+
+class InstalledBin(BaseModel):
+    abspath: HostBinPath
+    version: SemVer
+
+
+def is_valid_install_string(pkgs_str: str) -> str:
+    """Make sure a string is a valid install string for a package manager, e.g. 'yt-dlp ffmpeg'"""
+    assert pkgs_str
+    assert all(len(pkg) > 1 for pkg in pkgs_str.split(' '))
+    return pkgs_str
+
+def is_valid_python_dotted_import(import_str: str) -> str:
+    assert import_str and import_str.replace('.', '').replace('_', '').isalnum()
+    return import_str
+
+InstallStr = Annotated[str, AfterValidator(is_valid_install_string)]
+
+LazyImportStr = Annotated[str, AfterValidator(is_valid_python_dotted_import)]
+
+ProviderHandler = Callable[..., Any] | Callable[[], Any]                               # must take no args [], or [bin_name: str, **kwargs]
+#ProviderHandlerStr = Annotated[str, AfterValidator(lambda s: s.startswith('self.'))]
+ProviderHandlerRef = LazyImportStr | ProviderHandler
+ProviderLookupDict = Dict[str, LazyImportStr]
+ProviderType = Literal['abspath', 'version', 'subdeps', 'install']
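+
+# Illustrative shape of a ProviderLookupDict (the import path below is hypothetical):
+#   {
+#       '*':      'self.on_get_version',                          # fallback for all bins
+#       'yt-dlp': 'archivebox.plugins.media.get_ytdlp_version',   # per-bin override
+#   }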
+
+
+# class Host(BaseModel):
+#     machine: str
+#     system: str
+#     platform: str
+#     in_docker: bool
+#     in_qemu: bool
+#     python: str
+
+BinProviderName = Literal['env', 'pip', 'apt', 'brew', 'npm', 'vendor']
+
+
+class BinProvider(ABC, BaseModel):
+    name: BinProviderName
+    
+    abspath_provider: ProviderLookupDict = Field(default={'*': 'self.on_get_abspath'}, exclude=True)
+    version_provider: ProviderLookupDict = Field(default={'*': 'self.on_get_version'}, exclude=True)
+    subdeps_provider: ProviderLookupDict = Field(default={'*': 'self.on_get_subdeps'}, exclude=True)
+    install_provider: ProviderLookupDict = Field(default={'*': 'self.on_install'}, exclude=True)
+
+    _abspath_cache: ClassVar = {}
+    _version_cache: ClassVar = {}
+    _install_cache: ClassVar = {}
+
+    # def provider_version(self) -> SemVer | None:
+    #     """Version of the actual underlying package manager (e.g. pip v20.4.1)"""
+    #     if self.name in ('env', 'vendor'):
+    #         return SemVer('0.0.0')
+    #     installer_binpath = Path(shutil.which(self.name)).resolve()
+    #     return bin_version(installer_binpath)
+
+    # def provider_host(self) -> Host:
+    #     """Information about the host env, archictecture, and OS needed to select & build packages"""
+    #     p = platform.uname()
+    #     return Host(
+    #         machine=p.machine,
+    #         system=p.system,
+    #         platform=platform.platform(),
+    #         python=sys.implementation.name,
+    #         in_docker=os.environ.get('IN_DOCKER', '').lower() == 'true',
+    #         in_qemu=os.environ.get('IN_QEMU', '').lower() == 'true',
+    #     )
+
+    def get_default_providers(self):
+        return self.get_providers_for_bin('*')
+
+    def resolve_provider_func(self, provider_func: ProviderHandlerRef | None) -> ProviderHandler | None:
+        if provider_func is None:
+            return None
+
+        # if provider_func is a dotted path to a function on self, swap it for the actual function
+        if isinstance(provider_func, str) and provider_func.startswith('self.'):
+            provider_func = getattr(self, provider_func.split('self.', 1)[-1])
+
+        # if provider_func is a dot-formatted import string, import the function
+        if isinstance(provider_func, str):
+            from django.utils.module_loading import import_string
+
+            package_name, module_name, classname, path = provider_func.split('.', 3)   # 'abc.def.Ghi.jkl.mno' -> 'abc', 'def', 'Ghi', 'jkl.mno'
+
+            # import the object at abc.def.Ghi, then get the .jkl.mno nested attr on it
+            imported_module = import_string(f'{package_name}.{module_name}.{classname}')
+            provider_func = operator.attrgetter(path)(imported_module)
+
+            # # abc.def.ghi.jkl  -> 1, 2, 3
+            # for idx in range(1, len(path)):
+            #     parent_path = '.'.join(path[:-idx])  # abc.def.ghi
+            #     try:
+            #         parent_module = import_string(parent_path)
+            #         provider_func = getattr(parent_module, path[-idx])
+            #     except (AttributeError, ImportError):
+            #         continue
+
+        assert TypeAdapter(ProviderHandler).validate_python(provider_func), (
+            f'{self.__class__.__name__} provider func was not a function or dotted-import path: {provider_func}')
+
+        return provider_func
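+
+    # Illustrative resolutions performed above (names below are hypothetical):
+    #   'self.on_get_abspath'          -> the bound method self.on_get_abspath
+    #   'pkg.module.SomeClass.handler' -> import_string('pkg.module.SomeClass'), then .handler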
+
+    @validate_call
+    def get_providers_for_bin(self, bin_name: str) -> ProviderLookupDict:
+        providers_for_bin = {
+            'abspath': self.abspath_provider.get(bin_name),
+            'version': self.version_provider.get(bin_name),
+            'subdeps': self.subdeps_provider.get(bin_name),
+            'install': self.install_provider.get(bin_name),
+        }
+        only_set_providers_for_bin = {k: v for k, v in providers_for_bin.items() if v is not None}
+        
+        return only_set_providers_for_bin
+
+    @validate_call
+    def get_provider_for_action(self, bin_name: BinName, provider_type: ProviderType, default_provider: Optional[ProviderHandlerRef]=None, overrides: Optional[ProviderLookupDict]=None) -> ProviderHandler:
+        """
+        Get the provider func for a given key + Dict of provider callbacks + fallback default provider.
+        e.g. get_provider_for_action(bin_name='yt-dlp', provider_type='install', default_provider=self.on_install) -> Callable
+        """
+
+        provider_func_ref = (
+            (overrides or {}).get(provider_type)
+            or self.get_providers_for_bin(bin_name).get(provider_type)
+            or self.get_default_providers().get(provider_type)
+            or default_provider
+        )
+        # print('getting provider for action', bin_name, provider_type, provider_func)
+
+        provider_func = self.resolve_provider_func(provider_func_ref)
+
+        assert provider_func, f'No {self.name} provider func was found for {bin_name} in: {self.__class__.__name__}.'
+
+        return provider_func
+
+    @validate_call
+    def call_provider_for_action(self, bin_name: BinName, provider_type: ProviderType, default_provider: Optional[ProviderHandlerRef]=None, overrides: Optional[ProviderLookupDict]=None, **kwargs) -> Any:
+        provider_func: ProviderHandler = self.get_provider_for_action(
+            bin_name=bin_name,
+            provider_type=provider_type,
+            default_provider=default_provider,
+            overrides=overrides,
+        )
+        if not func_takes_args_or_kwargs(provider_func):
+            # if it's a pure argless lambda, don't pass bin_name and other **kwargs
+            provider_func_without_args = cast(Callable[[], Any], provider_func)
+            return provider_func_without_args()
+
+        provider_func = cast(Callable[..., Any], provider_func)
+        return provider_func(bin_name, **kwargs)
+
+
+
+    def on_get_abspath(self, bin_name: BinName, **_) -> HostBinPath | None:
+        print(f'[*] {self.__class__.__name__}: Getting abspath for {bin_name}...')
+        try:
+            return bin_abspath(bin_name)
+        except ValidationError:
+            return None
+
+    def on_get_version(self, bin_name: BinName, abspath: Optional[HostBinPath]=None, **_) -> SemVer | None:
+        abspath = abspath or self._abspath_cache.get(bin_name) or self.get_abspath(bin_name)
+        if not abspath: return None
+
+        print(f'[*] {self.__class__.__name__}: Getting version for {bin_name}...')
+        try:
+            return bin_version(abspath)
+        except ValidationError:
+            return None
+
+    def on_get_subdeps(self, bin_name: BinName, **_) -> InstallStr:
+        print(f'[*] {self.__class__.__name__}: Getting subdependencies for {bin_name}')
+        # ... subdependency calculation logic here
+        return TypeAdapter(InstallStr).validate_python(bin_name)
+
+    @abstractmethod
+    def on_install(self, bin_name: BinName, subdeps: Optional[InstallStr]=None, **_):
+        subdeps = subdeps or self.get_subdeps(bin_name)
+        print(f'[*] {self.__class__.__name__}: Installing subdependencies for {bin_name} ({subdeps})')
+        # ... install logic here
+        assert True
+
+
+    @validate_call
+    def get_abspath(self, bin_name: BinName, overrides: Optional[ProviderLookupDict]=None) -> HostBinPath | None:
+        abspath = self.call_provider_for_action(
+            bin_name=bin_name,
+            provider_type='abspath',
+            default_provider=self.on_get_abspath,
+            overrides=overrides,
+        )
+        if not abspath:
+            return None
+        result = TypeAdapter(HostBinPath).validate_python(abspath)
+        self._abspath_cache[bin_name] = result
+        return result
+
+    @validate_call
+    def get_version(self, bin_name: BinName, abspath: Optional[HostBinPath]=None, overrides: Optional[ProviderLookupDict]=None) -> SemVer | None:
+        version = self.call_provider_for_action(
+            bin_name=bin_name,
+            provider_type='version',
+            default_provider=self.on_get_version,
+            overrides=overrides,
+            abspath=abspath,
+        )
+        if not version:
+            return None
+        result = SemVer(version)
+        self._version_cache[bin_name] = result
+        return result
+
+    @validate_call
+    def get_subdeps(self, bin_name: BinName, overrides: Optional[ProviderLookupDict]=None) -> InstallStr:
+        subdeps = self.call_provider_for_action(
+            bin_name=bin_name,
+            provider_type='subdeps',
+            default_provider=self.on_get_subdeps,
+            overrides=overrides,
+        )
+        if not subdeps:
+            subdeps = bin_name
+        result = TypeAdapter(InstallStr).validate_python(subdeps)
+        return result
+
+    @validate_call
+    def install(self, bin_name: BinName, overrides: Optional[ProviderLookupDict]=None) -> InstalledBin | None:
+        subdeps = self.get_subdeps(bin_name, overrides=overrides)
+
+        self.call_provider_for_action(
+            bin_name=bin_name,
+            provider_type='install',
+            default_provider=self.on_install,
+            overrides=overrides,
+            subdeps=subdeps,
+        )
+
+        installed_abspath = self.get_abspath(bin_name)
+        assert installed_abspath, f'Unable to find {bin_name} abspath after installing with {self.name}'
+
+        installed_version = self.get_version(bin_name, abspath=installed_abspath)
+        assert installed_version, f'Unable to find {bin_name} version after installing with {self.name}'
+        
+        result = InstalledBin(abspath=installed_abspath, version=installed_version)
+        self._install_cache[bin_name] = result
+        return result
+
+    @validate_call
+    def load(self, bin_name: BinName, overrides: Optional[ProviderLookupDict]=None, cache: bool=False) -> InstalledBin | None:
+        installed_abspath = None
+        installed_version = None
+
+        if cache:
+            installed_bin = self._install_cache.get(bin_name)
+            if installed_bin:
+                return installed_bin
+            installed_abspath = self._abspath_cache.get(bin_name)
+            installed_version = self._version_cache.get(bin_name)
+
+
+        installed_abspath = installed_abspath or self.get_abspath(bin_name, overrides=overrides)
+        if not installed_abspath:
+            return None
+
+        installed_version = installed_version or self.get_version(bin_name, abspath=installed_abspath, overrides=overrides)
+        if not installed_version:
+            return None
+
+        return InstalledBin(abspath=installed_abspath, version=installed_version)
+
+    @validate_call
+    def load_or_install(self, bin_name: BinName, overrides: Optional[ProviderLookupDict]=None, cache: bool=True) -> InstalledBin | None:
+        installed = self.load(bin_name, overrides=overrides, cache=cache)
+        if not installed:
+            installed = self.install(bin_name, overrides=overrides)
+        return installed
+
+
+class PipProvider(BinProvider):
+    name: BinProviderName = 'pip'
+
+    def on_install(self, bin_name: str, subdeps: Optional[InstallStr]=None, **_):
+        subdeps = subdeps or self.on_get_subdeps(bin_name)
+        print(f'[*] {self.__class__.__name__}: Installing subdependencies for {bin_name} ({subdeps})')
+        
+        proc = run(['pip', 'install', '--upgrade', *subdeps.split(' ')], stdout=PIPE, stderr=PIPE)
+        
+        if proc.returncode != 0:
+            print(proc.stdout.strip().decode())
+            print(proc.stderr.strip().decode())
+            raise Exception(f'{self.__class__.__name__}: install got returncode {proc.returncode} while installing {bin_name}: {subdeps}')
+
+
+class AptProvider(BinProvider):
+    name: BinProviderName = 'apt'
+    
+    subdeps_provider: ProviderLookupDict = {
+        'yt-dlp': lambda: 'yt-dlp ffmpeg',
+    }
+
+    def on_install(self, bin_name: BinName, subdeps: Optional[InstallStr]=None, **_):
+        subdeps = subdeps or self.on_get_subdeps(bin_name)
+        print(f'[*] {self.__class__.__name__}: Installing subdependencies for {bin_name} ({subdeps})')
+        
+        run(['apt-get', 'update', '-qq'])
+        proc = run(['apt-get', 'install', '-y', *subdeps.split(' ')], stdout=PIPE, stderr=PIPE)
+        
+        if proc.returncode != 0:
+            print(proc.stdout.strip().decode())
+            print(proc.stderr.strip().decode())
+            raise Exception(f'{self.__class__.__name__}: install got returncode {proc.returncode} while installing {bin_name}: {subdeps}')
+
+class BrewProvider(BinProvider):
+    name: BinProviderName = 'brew'
+
+    def on_install(self, bin_name: str, subdeps: Optional[InstallStr]=None, **_):
+        subdeps = subdeps or self.on_get_subdeps(bin_name)
+        print(f'[*] {self.__class__.__name__}: Installing subdependencies for {bin_name} ({subdeps})')
+        
+        proc = run(['brew', 'install', *subdeps.split(' ')], stdout=PIPE, stderr=PIPE)
+        
+        if proc.returncode != 0:
+            print(proc.stdout.strip().decode())
+            print(proc.stderr.strip().decode())
+            raise Exception(f'{self.__class__.__name__}: install got returncode {proc.returncode} while installing {bin_name}: {subdeps}')
+
+
+class EnvProvider(BinProvider):
+    name: BinProviderName = 'env'
+
+    abspath_provider: ProviderLookupDict = {
+        # 'python': lambda: Path('/opt/homebrew/Cellar/[email protected]/3.10.14/Frameworks/Python.framework/Versions/3.10/bin/python3.10'),
+    }
+    version_provider: ProviderLookupDict = {
+        # 'python': lambda: '{}.{}.{}'.format(*sys.version_info[:3]),
+    }
+
+    def on_install(self, bin_name: BinName, subdeps: Optional[InstallStr]=None, **_):
+        """The env provider is ready-only and does not install any packages, so this is a no-op"""
+        pass
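+
+
+# Illustrative usage sketch (assumes wget is already available on $PATH):
+#   env = EnvProvider()
+#   wget = env.load_or_install('wget')
+#   if wget:
+#       print(wget.abspath, wget.version)   # e.g. /usr/bin/wget 1.24.5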

+ 53 - 0
archivebox/plugantic/configs.py

@@ -0,0 +1,53 @@
+__package__ = 'archivebox.plugantic'
+
+
+from typing import Optional, List, Literal
+from pathlib import Path
+from pydantic import BaseModel, Field
+
+
+ConfigSectionName = Literal['GENERAL_CONFIG', 'ARCHIVE_METHOD_TOGGLES', 'ARCHIVE_METHOD_OPTIONS', 'DEPENDENCY_CONFIG']
+
+
+class ConfigSet(BaseModel):
+    section: ConfigSectionName = 'GENERAL_CONFIG'
+
+class WgetToggleConfig(ConfigSet):
+    section: ConfigSectionName = 'ARCHIVE_METHOD_TOGGLES'
+
+    SAVE_WGET: bool = True
+    SAVE_WARC: bool = True
+
+class WgetDependencyConfig(ConfigSet):
+    section: ConfigSectionName = 'DEPENDENCY_CONFIG'
+
+    WGET_BINARY: str = Field(default='wget')
+    WGET_ARGS: Optional[List[str]] = Field(default=None)
+    WGET_EXTRA_ARGS: List[str] = []
+    WGET_DEFAULT_ARGS: List[str] = ['--timeout={TIMEOUT-10}']
+
+class WgetOptionsConfig(ConfigSet):
+    section: ConfigSectionName = 'ARCHIVE_METHOD_OPTIONS'
+
+    # loaded from shared config
+    WGET_AUTO_COMPRESSION: bool = Field(default=True)
+    SAVE_WGET_REQUISITES: bool = Field(default=True)
+    WGET_USER_AGENT: str = Field(default='', alias='USER_AGENT')
+    WGET_TIMEOUT: int = Field(default=60, alias='TIMEOUT')
+    WGET_CHECK_SSL_VALIDITY: bool = Field(default=True, alias='CHECK_SSL_VALIDITY')
+    WGET_RESTRICT_FILE_NAMES: str = Field(default='windows', alias='RESTRICT_FILE_NAMES')
+    WGET_COOKIES_FILE: Optional[Path] = Field(default=None, alias='COOKIES_FILE')
+
+
+CONFIG = {
+    'CHECK_SSL_VALIDITY': False,
+    'SAVE_WARC': False,
+    'TIMEOUT': 999,
+}
+
+
+WGET_CONFIG = [
+    WgetToggleConfig(**CONFIG),
+    WgetDependencyConfig(**CONFIG),
+    WgetOptionsConfig(**CONFIG),
+]
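+
+# Illustrative checks: shared config keys are applied to each ConfigSet via its field
+# aliases, and keys a given section doesn't define are ignored (pydantic's default)
+assert WGET_CONFIG[0].SAVE_WARC is False            # WgetToggleConfig
+assert WGET_CONFIG[2].WGET_TIMEOUT == 999           # WgetOptionsConfig, via alias TIMEOUT
+assert WGET_CONFIG[2].WGET_CHECK_SSL_VALIDITY is False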

+ 118 - 0
archivebox/plugantic/extractors.py

@@ -0,0 +1,118 @@
+__package__ = 'archivebox.plugantic'
+
+from typing import Optional, List, Literal, Annotated, Dict, Any
+from typing_extensions import Self
+
+from abc import ABC
+from pathlib import Path
+
+from pydantic import BaseModel, model_validator, field_serializer, AfterValidator
+
+from .binaries import (
+    Binary,
+    YtdlpBinary,
+    WgetBinary,
+)
+
+
+# stubs
+class Snapshot:
+    pass
+
+class ArchiveResult:
+    pass
+
+def get_wget_output_path(*args, **kwargs) -> Path:
+    return Path('.').resolve()
+
+
+
+def no_empty_args(args: List[str]) -> List[str]:
+    assert all(len(arg) for arg in args)
+    return args
+
+ExtractorName = Literal['wget', 'warc', 'media']
+
+def is_handler_func_str(s: str) -> str:
+    # AfterValidator funcs must return the validated value (returning a bool would replace it)
+    assert s.startswith('self.'), f'Handler func must be a "self." method reference: {s}'
+    return s
+
+HandlerFuncStr = Annotated[str, AfterValidator(is_handler_func_str)]
+CmdArgsList = Annotated[List[str], AfterValidator(no_empty_args)]
+
+
+class Extractor(ABC, BaseModel):
+    name: ExtractorName
+    binary: Binary
+
+    output_path_func: HandlerFuncStr = 'self.get_output_path'
+    should_extract_func: HandlerFuncStr = 'self.should_extract'
+    extract_func: HandlerFuncStr = 'self.extract'
+    exec_func: HandlerFuncStr = 'self.exec'
+
+    default_args: CmdArgsList = []
+    extra_args: CmdArgsList = []
+    args: Optional[CmdArgsList] = None
+
+    @model_validator(mode='after')
+    def validate_model(self) -> Self:
+        if self.args is None:
+            self.args = [*self.default_args, *self.extra_args]
+        return self
+
+    @field_serializer('binary', when_used='json')
+    def dump_binary(self, binary: Binary) -> str:
+        return binary.name
+
+    def get_output_path(self, snapshot) -> Path:
+        return Path(self.name)
+
+    def should_extract(self, snapshot) -> bool:
+        output_dir = self.get_output_path(snapshot)
+        if any(output_dir.glob('*.*')):  # any() needed: glob() returns a generator, which is always truthy
+            return False
+        return True
+
+
+    def extract(self, url: str, **kwargs) -> Dict[str, Any]:
+        output_dir = self.get_output_path(url, **kwargs)
+
+        cmd = [url, *self.args] if self.args is not None else [url, *self.default_args, *self.extra_args]
+        proc = self.exec(cmd, pwd=output_dir)
+
+        return {
+            'status': 'succeeded' if proc.returncode == 0 else 'failed',
+            'output': proc.stdout.decode().strip().split('\n')[-1],
+            'output_files': list(output_dir.glob('*.*')),
+
+            'stdout': proc.stdout.decode().strip(),
+            'stderr': proc.stderr.decode().strip(),
+            'returncode': proc.returncode,
+        }
+
+    def exec(self, args: CmdArgsList, pwd: Optional[Path]=None):
+        pwd = pwd or Path('.')
+        assert self.binary.loaded_provider
+        return self.binary.exec(args, pwd=pwd)
+
+
+class YtdlpExtractor(Extractor):
+    name: ExtractorName = 'media'
+    binary: Binary = YtdlpBinary()
+
+    def get_output_path(self, snapshot) -> Path:
+        return Path(self.name)
+
+
+class WgetExtractor(Extractor):
+    name: ExtractorName = 'wget'
+    binary: Binary = WgetBinary()
+
+    def get_output_path(self, snapshot) -> Path:
+        return get_wget_output_path(snapshot)
+
+
+class WarcExtractor(Extractor):
+    name: ExtractorName = 'warc'
+    binary: Binary = WgetBinary()
+
+    def get_output_path(self, snapshot) -> Path:
+        return get_wget_output_path(snapshot)
+
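+# Illustrative usage sketch (assumes the wget Binary was loaded via a provider first):
+#   extractor = WgetExtractor()
+#   if extractor.should_extract(snapshot):
+#       result = extractor.extract(snapshot.url)
+#       print(result['status'], result['output'])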
+

+ 396 - 0
archivebox/plugantic/ini_to_toml.py

@@ -0,0 +1,396 @@
+from typing import Dict, Any, List
+
+import configparser
+import json
+import ast
+
+JSONValue = str | bool | int | None | List['JSONValue']
+
+def load_ini_value(val: str) -> JSONValue:
+    """Convert lax INI values into strict TOML-compliant (JSON) values"""
+    if val.lower() in ('true', 'yes', '1'):
+        return True
+    if val.lower() in ('false', 'no', '0'):
+        return False
+    if val.isdigit():
+        return int(val)
+
+    try:
+        return ast.literal_eval(val)
+    except Exception:
+        pass
+
+    try:
+        return json.loads(val)
+    except Exception:
+        pass
+    
+    return val
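+
+# Illustrative examples of the coercion above:
+assert load_ini_value('True') is True
+assert load_ini_value('no') is False
+assert load_ini_value('60') == 60
+assert load_ini_value("['--silent', '--location']") == ['--silent', '--location']
+assert load_ini_value('None') is None
+assert load_ini_value('windows') == 'windows'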
+
+
+def convert(ini_str: str) -> str:
+    """Convert a string of INI config into its TOML equivalent (warning: strips comments)"""
+
+    config = configparser.ConfigParser()
+    config.optionxform = str  # preserve the case of key names (configparser lowercases them by default)
+    config.read_string(ini_str)
+
+    # Initialize an empty dictionary to store the TOML representation
+    toml_dict = {}
+
+    # Iterate over each section in the INI configuration
+    for section in config.sections():
+        toml_dict[section.upper()] = {}  # match the .upper() lookup used when assigning keys below
+
+        # Iterate over each key-value pair in the section
+        for key, value in config.items(section):
+            parsed_value = load_ini_value(value)
+
+            # Convert the parsed value to its TOML-compatible JSON representation
+            toml_dict[section.upper()][key.upper()] = json.dumps(parsed_value)
+
+    # Build the TOML string
+    toml_str = ""
+    for section, items in toml_dict.items():
+        toml_str += f"[{section}]\n"
+        for key, value in items.items():
+            toml_str += f"{key} = {value}\n"
+        toml_str += "\n"
+
+    return toml_str.strip()
+
+
+
+### Basic Assertions
+
+test_input = """
+[SERVER_CONFIG]
+IS_TTY=False
+USE_COLOR=False
+SHOW_PROGRESS=False
+IN_DOCKER=False
+IN_QEMU=False
+PUID=501
+PGID=20
+OUTPUT_DIR=/opt/archivebox/data
+CONFIG_FILE=/opt/archivebox/data/ArchiveBox.conf
+ONLY_NEW=True
+TIMEOUT=60
+MEDIA_TIMEOUT=3600
+OUTPUT_PERMISSIONS=644
+RESTRICT_FILE_NAMES=windows
+URL_DENYLIST=\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$
+URL_ALLOWLIST=None
+ADMIN_USERNAME=None
+ADMIN_PASSWORD=None
+ENFORCE_ATOMIC_WRITES=True
+TAG_SEPARATOR_PATTERN=[,]
+SECRET_KEY=xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
+BIND_ADDR=127.0.0.1:8000
+ALLOWED_HOSTS=*
+DEBUG=False
+PUBLIC_INDEX=True
+PUBLIC_SNAPSHOTS=True
+PUBLIC_ADD_VIEW=False
+FOOTER_INFO=Content is hosted for personal archiving purposes only.  Contact server owner for any takedown requests.
+SNAPSHOTS_PER_PAGE=40
+CUSTOM_TEMPLATES_DIR=None
+TIME_ZONE=UTC
+TIMEZONE=UTC
+REVERSE_PROXY_USER_HEADER=Remote-User
+REVERSE_PROXY_WHITELIST=
+LOGOUT_REDIRECT_URL=/
+PREVIEW_ORIGINALS=True
+LDAP=False
+LDAP_SERVER_URI=None
+LDAP_BIND_DN=None
+LDAP_BIND_PASSWORD=None
+LDAP_USER_BASE=None
+LDAP_USER_FILTER=None
+LDAP_USERNAME_ATTR=None
+LDAP_FIRSTNAME_ATTR=None
+LDAP_LASTNAME_ATTR=None
+LDAP_EMAIL_ATTR=None
+LDAP_CREATE_SUPERUSER=False
+SAVE_TITLE=True
+SAVE_FAVICON=True
+SAVE_WGET=True
+SAVE_WGET_REQUISITES=True
+SAVE_SINGLEFILE=True
+SAVE_READABILITY=True
+SAVE_MERCURY=True
+SAVE_HTMLTOTEXT=True
+SAVE_PDF=True
+SAVE_SCREENSHOT=True
+SAVE_DOM=True
+SAVE_HEADERS=True
+SAVE_WARC=True
+SAVE_GIT=True
+SAVE_MEDIA=True
+SAVE_ARCHIVE_DOT_ORG=True
+RESOLUTION=1440,2000
+GIT_DOMAINS=github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht
+CHECK_SSL_VALIDITY=True
+MEDIA_MAX_SIZE=750m
+USER_AGENT=None
+CURL_USER_AGENT=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/0.8.0 (+https://github.com/ArchiveBox/ArchiveBox/) curl/curl 8.4.0 (x86_64-apple-darwin23.0)
+WGET_USER_AGENT=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/0.8.0 (+https://github.com/ArchiveBox/ArchiveBox/) wget/GNU Wget 1.24.5
+CHROME_USER_AGENT=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/0.8.0 (+https://github.com/ArchiveBox/ArchiveBox/)
+COOKIES_FILE=None
+CHROME_USER_DATA_DIR=None
+CHROME_TIMEOUT=0
+CHROME_HEADLESS=True
+CHROME_SANDBOX=True
+CHROME_EXTRA_ARGS=[]
+YOUTUBEDL_ARGS=['--restrict-filenames', '--trim-filenames', '128', '--write-description', '--write-info-json', '--write-annotations', '--write-thumbnail', '--no-call-home', '--write-sub', '--write-auto-subs', '--convert-subs=srt', '--yes-playlist', '--continue', '--no-abort-on-error', '--ignore-errors', '--geo-bypass', '--add-metadata', '--format=(bv*+ba/b)[filesize<=750m][filesize_approx<=?750m]/(bv*+ba/b)']
+YOUTUBEDL_EXTRA_ARGS=[]
+WGET_ARGS=['--no-verbose', '--adjust-extension', '--convert-links', '--force-directories', '--backup-converted', '--span-hosts', '--no-parent', '-e', 'robots=off']
+WGET_EXTRA_ARGS=[]
+CURL_ARGS=['--silent', '--location', '--compressed']
+CURL_EXTRA_ARGS=[]
+GIT_ARGS=['--recursive']
+SINGLEFILE_ARGS=[]
+SINGLEFILE_EXTRA_ARGS=[]
+MERCURY_ARGS=['--format=text']
+MERCURY_EXTRA_ARGS=[]
+FAVICON_PROVIDER=https://www.google.com/s2/favicons?domain={}
+USE_INDEXING_BACKEND=True
+USE_SEARCHING_BACKEND=True
+SEARCH_BACKEND_ENGINE=ripgrep
+SEARCH_BACKEND_HOST_NAME=localhost
+SEARCH_BACKEND_PORT=1491
+SEARCH_BACKEND_PASSWORD=SecretPassword
+SEARCH_PROCESS_HTML=True
+SONIC_COLLECTION=archivebox
+SONIC_BUCKET=snapshots
+SEARCH_BACKEND_TIMEOUT=90
+FTS_SEPARATE_DATABASE=True
+FTS_TOKENIZERS=porter unicode61 remove_diacritics 2
+FTS_SQLITE_MAX_LENGTH=1000000000
+USE_CURL=True
+USE_WGET=True
+USE_SINGLEFILE=True
+USE_READABILITY=True
+USE_MERCURY=True
+USE_GIT=True
+USE_CHROME=True
+USE_NODE=True
+USE_YOUTUBEDL=True
+USE_RIPGREP=True
+CURL_BINARY=curl
+GIT_BINARY=git
+WGET_BINARY=wget
+SINGLEFILE_BINARY=single-file
+READABILITY_BINARY=readability-extractor
+MERCURY_BINARY=postlight-parser
+YOUTUBEDL_BINARY=yt-dlp
+NODE_BINARY=node
+RIPGREP_BINARY=rg
+CHROME_BINARY=chrome
+POCKET_CONSUMER_KEY=None
+USER=squash
+PACKAGE_DIR=/opt/archivebox/archivebox
+TEMPLATES_DIR=/opt/archivebox/archivebox/templates
+ARCHIVE_DIR=/opt/archivebox/data/archive
+SOURCES_DIR=/opt/archivebox/data/sources
+LOGS_DIR=/opt/archivebox/data/logs
+PERSONAS_DIR=/opt/archivebox/data/personas
+URL_DENYLIST_PTN=re.compile('\\.(css|js|otf|ttf|woff|woff2|gstatic\\.com|googleapis\\.com/css)(\\?.*)?$', re.IGNORECASE|re.MULTILINE)
+URL_ALLOWLIST_PTN=None
+DIR_OUTPUT_PERMISSIONS=755
+ARCHIVEBOX_BINARY=/opt/archivebox/.venv/bin/archivebox
+VERSION=0.8.0
+COMMIT_HASH=102e87578c6036bb0132dd1ebd17f8f05ffc880f
+BUILD_TIME=2024-05-15 03:28:05 1715768885
+VERSIONS_AVAILABLE=None
+CAN_UPGRADE=False
+PYTHON_BINARY=/opt/archivebox/.venv/bin/python3.10
+PYTHON_ENCODING=UTF-8
+PYTHON_VERSION=3.10.14
+DJANGO_BINARY=/opt/archivebox/.venv/lib/python3.10/site-packages/django/__init__.py
+DJANGO_VERSION=5.0.6 final (0)
+SQLITE_BINARY=/opt/homebrew/Cellar/[email protected]/3.10.14/Frameworks/Python.framework/Versions/3.10/lib/python3.10/sqlite3/dbapi2.py
+SQLITE_VERSION=2.6.0
+CURL_VERSION=curl 8.4.0 (x86_64-apple-darwin23.0)
+WGET_VERSION=GNU Wget 1.24.5
+WGET_AUTO_COMPRESSION=True
+RIPGREP_VERSION=ripgrep 14.1.0
+SINGLEFILE_VERSION=None
+READABILITY_VERSION=None
+MERCURY_VERSION=None
+GIT_VERSION=git version 2.44.0
+YOUTUBEDL_VERSION=2024.04.09
+CHROME_VERSION=Google Chrome 124.0.6367.207
+NODE_VERSION=v21.7.3
+"""
+
+
+expected_output = '''[SERVER_CONFIG]
+IS_TTY = false
+USE_COLOR = false
+SHOW_PROGRESS = false
+IN_DOCKER = false
+IN_QEMU = false
+PUID = 501
+PGID = 20
+OUTPUT_DIR = "/opt/archivebox/data"
+CONFIG_FILE = "/opt/archivebox/data/ArchiveBox.conf"
+ONLY_NEW = true
+TIMEOUT = 60
+MEDIA_TIMEOUT = 3600
+OUTPUT_PERMISSIONS = 644
+RESTRICT_FILE_NAMES = "windows"
+URL_DENYLIST = "\\\\.(css|js|otf|ttf|woff|woff2|gstatic\\\\.com|googleapis\\\\.com/css)(\\\\?.*)?$"
+URL_ALLOWLIST = null
+ADMIN_USERNAME = null
+ADMIN_PASSWORD = null
+ENFORCE_ATOMIC_WRITES = true
+TAG_SEPARATOR_PATTERN = "[,]"
+SECRET_KEY = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
+BIND_ADDR = "127.0.0.1:8000"
+ALLOWED_HOSTS = "*"
+DEBUG = false
+PUBLIC_INDEX = true
+PUBLIC_SNAPSHOTS = true
+PUBLIC_ADD_VIEW = false
+FOOTER_INFO = "Content is hosted for personal archiving purposes only.  Contact server owner for any takedown requests."
+SNAPSHOTS_PER_PAGE = 40
+CUSTOM_TEMPLATES_DIR = null
+TIME_ZONE = "UTC"
+TIMEZONE = "UTC"
+REVERSE_PROXY_USER_HEADER = "Remote-User"
+REVERSE_PROXY_WHITELIST = ""
+LOGOUT_REDIRECT_URL = "/"
+PREVIEW_ORIGINALS = true
+LDAP = false
+LDAP_SERVER_URI = null
+LDAP_BIND_DN = null
+LDAP_BIND_PASSWORD = null
+LDAP_USER_BASE = null
+LDAP_USER_FILTER = null
+LDAP_USERNAME_ATTR = null
+LDAP_FIRSTNAME_ATTR = null
+LDAP_LASTNAME_ATTR = null
+LDAP_EMAIL_ATTR = null
+LDAP_CREATE_SUPERUSER = false
+SAVE_TITLE = true
+SAVE_FAVICON = true
+SAVE_WGET = true
+SAVE_WGET_REQUISITES = true
+SAVE_SINGLEFILE = true
+SAVE_READABILITY = true
+SAVE_MERCURY = true
+SAVE_HTMLTOTEXT = true
+SAVE_PDF = true
+SAVE_SCREENSHOT = true
+SAVE_DOM = true
+SAVE_HEADERS = true
+SAVE_WARC = true
+SAVE_GIT = true
+SAVE_MEDIA = true
+SAVE_ARCHIVE_DOT_ORG = true
+RESOLUTION = [1440, 2000]
+GIT_DOMAINS = "github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht"
+CHECK_SSL_VALIDITY = true
+MEDIA_MAX_SIZE = "750m"
+USER_AGENT = null
+CURL_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/0.8.0 (+https://github.com/ArchiveBox/ArchiveBox/) curl/curl 8.4.0 (x86_64-apple-darwin23.0)"
+WGET_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/0.8.0 (+https://github.com/ArchiveBox/ArchiveBox/) wget/GNU Wget 1.24.5"
+CHROME_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/0.8.0 (+https://github.com/ArchiveBox/ArchiveBox/)"
+COOKIES_FILE = null
+CHROME_USER_DATA_DIR = null
+CHROME_TIMEOUT = false
+CHROME_HEADLESS = true
+CHROME_SANDBOX = true
+CHROME_EXTRA_ARGS = []
+YOUTUBEDL_ARGS = ["--restrict-filenames", "--trim-filenames", "128", "--write-description", "--write-info-json", "--write-annotations", "--write-thumbnail", "--no-call-home", "--write-sub", "--write-auto-subs", "--convert-subs=srt", "--yes-playlist", "--continue", "--no-abort-on-error", "--ignore-errors", "--geo-bypass", "--add-metadata", "--format=(bv*+ba/b)[filesize<=750m][filesize_approx<=?750m]/(bv*+ba/b)"]
+YOUTUBEDL_EXTRA_ARGS = []
+WGET_ARGS = ["--no-verbose", "--adjust-extension", "--convert-links", "--force-directories", "--backup-converted", "--span-hosts", "--no-parent", "-e", "robots=off"]
+WGET_EXTRA_ARGS = []
+CURL_ARGS = ["--silent", "--location", "--compressed"]
+CURL_EXTRA_ARGS = []
+GIT_ARGS = ["--recursive"]
+SINGLEFILE_ARGS = []
+SINGLEFILE_EXTRA_ARGS = []
+MERCURY_ARGS = ["--format=text"]
+MERCURY_EXTRA_ARGS = []
+FAVICON_PROVIDER = "https://www.google.com/s2/favicons?domain={}"
+USE_INDEXING_BACKEND = true
+USE_SEARCHING_BACKEND = true
+SEARCH_BACKEND_ENGINE = "ripgrep"
+SEARCH_BACKEND_HOST_NAME = "localhost"
+SEARCH_BACKEND_PORT = 1491
+SEARCH_BACKEND_PASSWORD = "SecretPassword"
+SEARCH_PROCESS_HTML = true
+SONIC_COLLECTION = "archivebox"
+SONIC_BUCKET = "snapshots"
+SEARCH_BACKEND_TIMEOUT = 90
+FTS_SEPARATE_DATABASE = true
+FTS_TOKENIZERS = "porter unicode61 remove_diacritics 2"
+FTS_SQLITE_MAX_LENGTH = 1000000000
+USE_CURL = true
+USE_WGET = true
+USE_SINGLEFILE = true
+USE_READABILITY = true
+USE_MERCURY = true
+USE_GIT = true
+USE_CHROME = true
+USE_NODE = true
+USE_YOUTUBEDL = true
+USE_RIPGREP = true
+CURL_BINARY = "curl"
+GIT_BINARY = "git"
+WGET_BINARY = "wget"
+SINGLEFILE_BINARY = "single-file"
+READABILITY_BINARY = "readability-extractor"
+MERCURY_BINARY = "postlight-parser"
+YOUTUBEDL_BINARY = "yt-dlp"
+NODE_BINARY = "node"
+RIPGREP_BINARY = "rg"
+CHROME_BINARY = "chrome"
+POCKET_CONSUMER_KEY = null
+USER = "squash"
+PACKAGE_DIR = "/opt/archivebox/archivebox"
+TEMPLATES_DIR = "/opt/archivebox/archivebox/templates"
+ARCHIVE_DIR = "/opt/archivebox/data/archive"
+SOURCES_DIR = "/opt/archivebox/data/sources"
+LOGS_DIR = "/opt/archivebox/data/logs"
+PERSONAS_DIR = "/opt/archivebox/data/personas"
+URL_DENYLIST_PTN = "re.compile(\'\\\\.(css|js|otf|ttf|woff|woff2|gstatic\\\\.com|googleapis\\\\.com/css)(\\\\?.*)?$\', re.IGNORECASE|re.MULTILINE)"
+URL_ALLOWLIST_PTN = null
+DIR_OUTPUT_PERMISSIONS = 755
+ARCHIVEBOX_BINARY = "/opt/archivebox/.venv/bin/archivebox"
+VERSION = "0.8.0"
+COMMIT_HASH = "102e87578c6036bb0132dd1ebd17f8f05ffc880f"
+BUILD_TIME = "2024-05-15 03:28:05 1715768885"
+VERSIONS_AVAILABLE = null
+CAN_UPGRADE = false
+PYTHON_BINARY = "/opt/archivebox/.venv/bin/python3.10"
+PYTHON_ENCODING = "UTF-8"
+PYTHON_VERSION = "3.10.14"
+DJANGO_BINARY = "/opt/archivebox/.venv/lib/python3.10/site-packages/django/__init__.py"
+DJANGO_VERSION = "5.0.6 final (0)"
+SQLITE_BINARY = "/opt/homebrew/Cellar/[email protected]/3.10.14/Frameworks/Python.framework/Versions/3.10/lib/python3.10/sqlite3/dbapi2.py"
+SQLITE_VERSION = "2.6.0"
+CURL_VERSION = "curl 8.4.0 (x86_64-apple-darwin23.0)"
+WGET_VERSION = "GNU Wget 1.24.5"
+WGET_AUTO_COMPRESSION = true
+RIPGREP_VERSION = "ripgrep 14.1.0"
+SINGLEFILE_VERSION = null
+READABILITY_VERSION = null
+MERCURY_VERSION = null
+GIT_VERSION = "git version 2.44.0"
+YOUTUBEDL_VERSION = "2024.04.09"
+CHROME_VERSION = "Google Chrome 124.0.6367.207"
+NODE_VERSION = "v21.7.3"'''
+
+
+first_output = convert(test_input)      # make sure ini -> toml parses correctly
+second_output = convert(first_output)   # make sure toml -> toml parses/dumps consistently
+assert first_output == second_output == expected_output  # make sure parsing is idempotent
+
+# # DEBUGGING
+# import sys
+# import difflib
+# sys.stdout.writelines(difflib.context_diff(first_output, second_output, fromfile='first', tofile='second'))
+# print(repr(second_output))

+ 38 - 0
archivebox/plugantic/migrations/0001_initial.py

@@ -0,0 +1,38 @@
+# Generated by Django 5.0.6 on 2024-05-18 00:16
+
+import abid_utils.models
+import archivebox.plugantic.plugins
+import charidfield.fields
+import django.core.serializers.json
+import django.db.models.deletion
+import django_pydantic_field.fields
+import uuid
+from django.conf import settings
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    initial = True
+
+    dependencies = [
+        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
+    ]
+
+    operations = [
+        migrations.CreateModel(
+            name='Plugin',
+            fields=[
+                ('created', models.DateTimeField(auto_now_add=True)),
+                ('modified', models.DateTimeField(auto_now=True)),
+                ('id', models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False)),
+                ('uuid', models.UUIDField(blank=True, null=True, unique=True)),
+                ('abid', charidfield.fields.CharIDField(blank=True, db_index=True, default=None, help_text='ABID-format identifier for this entity (e.g. snp_01BJQMF54D093DXEAWZ6JYRPAQ)', max_length=30, null=True, prefix='plg_', unique=True)),
+                ('schema', django_pydantic_field.fields.PydanticSchemaField(config=None, encoder=django.core.serializers.json.DjangoJSONEncoder, schema=archivebox.plugantic.plugins.Plugin)),
+                ('created_by', models.ForeignKey(default=abid_utils.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)),
+            ],
+            options={
+                'abstract': False,
+            },
+        ),
+    ]

+ 21 - 0
archivebox/plugantic/migrations/0002_alter_plugin_schema.py

@@ -0,0 +1,21 @@
+# Generated by Django 5.0.6 on 2024-05-18 01:16
+
+import archivebox.plugantic.plugins
+import django.core.serializers.json
+import django_pydantic_field.fields
+from django.db import migrations
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('plugantic', '0001_initial'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='plugin',
+            name='schema',
+            field=django_pydantic_field.fields.PydanticSchemaField(config=None, default=None, encoder=django.core.serializers.json.DjangoJSONEncoder, schema=archivebox.plugantic.plugins.Plugin),
+        ),
+    ]

+ 21 - 0
archivebox/plugantic/migrations/0003_alter_plugin_schema.py

@@ -0,0 +1,21 @@
+# Generated by Django 5.0.6 on 2024-05-18 01:25
+
+import archivebox.plugantic.replayers
+import django.core.serializers.json
+import django_pydantic_field.fields
+from django.db import migrations
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('plugantic', '0002_alter_plugin_schema'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='plugin',
+            name='schema',
+            field=django_pydantic_field.fields.PydanticSchemaField(config=None, default={'embed_template': 'plugins/generic_replayer/templates/embed.html', 'fullpage_template': 'plugins/generic_replayer/templates/fullpage.html', 'name': 'GenericReplayer', 'row_template': 'plugins/generic_replayer/templates/row.html', 'url_pattern': '*'}, encoder=django.core.serializers.json.DjangoJSONEncoder, schema=archivebox.plugantic.replayers.Replayer),
+        ),
+    ]

+ 32 - 0
archivebox/plugantic/migrations/0004_remove_plugin_schema_plugin_configs_plugin_name.py

@@ -0,0 +1,32 @@
+# Generated by Django 5.0.6 on 2024-05-18 01:28
+
+import archivebox.plugantic.configs
+import django.core.serializers.json
+import django_pydantic_field.compat.django
+import django_pydantic_field.fields
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('plugantic', '0003_alter_plugin_schema'),
+    ]
+
+    operations = [
+        migrations.RemoveField(
+            model_name='plugin',
+            name='schema',
+        ),
+        migrations.AddField(
+            model_name='plugin',
+            name='configs',
+            field=django_pydantic_field.fields.PydanticSchemaField(config=None, default=[], encoder=django.core.serializers.json.DjangoJSONEncoder, schema=django_pydantic_field.compat.django.GenericContainer(list, (archivebox.plugantic.configs.ConfigSet,))),
+        ),
+        migrations.AddField(
+            model_name='plugin',
+            name='name',
+            field=models.CharField(default='name', max_length=64, unique=True),
+            preserve_default=False,
+        ),
+    ]

+ 39 - 0
archivebox/plugantic/migrations/0005_customplugin_delete_plugin.py

@@ -0,0 +1,39 @@
+# Generated by Django 5.0.6 on 2024-05-18 01:42
+
+import abid_utils.models
+import charidfield.fields
+import django.db.models.deletion
+import pathlib
+import uuid
+from django.conf import settings
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('plugantic', '0004_remove_plugin_schema_plugin_configs_plugin_name'),
+        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
+    ]
+
+    operations = [
+        migrations.CreateModel(
+            name='CustomPlugin',
+            fields=[
+                ('created', models.DateTimeField(auto_now_add=True)),
+                ('modified', models.DateTimeField(auto_now=True)),
+                ('id', models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False)),
+                ('uuid', models.UUIDField(blank=True, null=True, unique=True)),
+                ('abid', charidfield.fields.CharIDField(blank=True, db_index=True, default=None, help_text='ABID-format identifier for this entity (e.g. snp_01BJQMF54D093DXEAWZ6JYRPAQ)', max_length=30, null=True, prefix='plg_', unique=True)),
+                ('name', models.CharField(max_length=64, unique=True)),
+                ('path', models.FilePathField(path=pathlib.PurePosixPath('/Volumes/NVME/Users/squash/Local/Code/archiveboxes/ArchiveBox/archivebox/plugins'))),
+                ('created_by', models.ForeignKey(default=abid_utils.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)),
+            ],
+            options={
+                'abstract': False,
+            },
+        ),
+        migrations.DeleteModel(
+            name='Plugin',
+        ),
+    ]

+ 19 - 0
archivebox/plugantic/migrations/0006_alter_customplugin_path.py

@@ -0,0 +1,19 @@
+# Generated by Django 5.0.6 on 2024-05-18 01:45
+
+import pathlib
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('plugantic', '0005_customplugin_delete_plugin'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='customplugin',
+            name='path',
+            field=models.FilePathField(allow_files=False, allow_folders=True, path=pathlib.PurePosixPath('/Volumes/NVME/Users/squash/Local/Code/archiveboxes/ArchiveBox/archivebox/plugins'), recursive=True),
+        ),
+    ]

+ 19 - 0
archivebox/plugantic/migrations/0007_alter_customplugin_path.py

@@ -0,0 +1,19 @@
+# Generated by Django 5.0.6 on 2024-05-18 01:46
+
+import pathlib
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('plugantic', '0006_alter_customplugin_path'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='customplugin',
+            name='path',
+            field=models.FilePathField(allow_files=False, allow_folders=True, path=pathlib.PurePosixPath('/Volumes/NVME/Users/squash/Local/Code/archiveboxes/ArchiveBox/data/plugins'), recursive=True),
+        ),
+    ]

+ 19 - 0
archivebox/plugantic/migrations/0008_alter_customplugin_path.py

@@ -0,0 +1,19 @@
+# Generated by Django 5.0.6 on 2024-05-18 01:47
+
+import pathlib
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('plugantic', '0007_alter_customplugin_path'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='customplugin',
+            name='path',
+            field=models.FilePathField(allow_files=False, allow_folders=True, path=pathlib.PurePosixPath('/Volumes/NVME/Users/squash/Local/Code/archiveboxes/ArchiveBox/data'), recursive=True),
+        ),
+    ]

+ 18 - 0
archivebox/plugantic/migrations/0009_alter_customplugin_path.py

@@ -0,0 +1,18 @@
+# Generated by Django 5.0.6 on 2024-05-18 01:48
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('plugantic', '0008_alter_customplugin_path'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='customplugin',
+            name='path',
+            field=models.FilePathField(allow_files=False, allow_folders=True, path='/Volumes/NVME/Users/squash/Local/Code/archiveboxes/ArchiveBox/data', recursive=True),
+        ),
+    ]

+ 18 - 0
archivebox/plugantic/migrations/0010_alter_customplugin_path.py

@@ -0,0 +1,18 @@
+# Generated by Django 5.0.6 on 2024-05-18 01:48
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('plugantic', '0009_alter_customplugin_path'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='customplugin',
+            name='path',
+            field=models.FilePathField(allow_files=False, allow_folders=True, match='/plugins/*', path='/Volumes/NVME/Users/squash/Local/Code/archiveboxes/ArchiveBox/data', recursive=True),
+        ),
+    ]

+ 18 - 0
archivebox/plugantic/migrations/0011_alter_customplugin_path.py

@@ -0,0 +1,18 @@
+# Generated by Django 5.0.6 on 2024-05-18 01:48
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('plugantic', '0010_alter_customplugin_path'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='customplugin',
+            name='path',
+            field=models.FilePathField(allow_files=False, allow_folders=True, match='plugins/*', path='/Volumes/NVME/Users/squash/Local/Code/archiveboxes/ArchiveBox/data', recursive=True),
+        ),
+    ]

+ 18 - 0
archivebox/plugantic/migrations/0012_alter_customplugin_path.py

@@ -0,0 +1,18 @@
+# Generated by Django 5.0.6 on 2024-05-18 01:49
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('plugantic', '0011_alter_customplugin_path'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='customplugin',
+            name='path',
+            field=models.FilePathField(allow_files=False, allow_folders=True, default='example_plugin', match='plugins/*', path='/Volumes/NVME/Users/squash/Local/Code/archiveboxes/ArchiveBox/data', recursive=True),
+        ),
+    ]

+ 18 - 0
archivebox/plugantic/migrations/0013_alter_customplugin_path.py

@@ -0,0 +1,18 @@
+# Generated by Django 5.0.6 on 2024-05-18 01:49
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('plugantic', '0012_alter_customplugin_path'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='customplugin',
+            name='path',
+            field=models.FilePathField(allow_files=False, allow_folders=True, default='/plugins/example_plugin', match='plugins/*', path='/Volumes/NVME/Users/squash/Local/Code/archiveboxes/ArchiveBox/data', recursive=True),
+        ),
+    ]

+ 18 - 0
archivebox/plugantic/migrations/0014_alter_customplugin_path.py

@@ -0,0 +1,18 @@
+# Generated by Django 5.0.6 on 2024-05-18 01:50
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('plugantic', '0013_alter_customplugin_path'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='customplugin',
+            name='path',
+            field=models.FilePathField(allow_files=False, allow_folders=True, default='/plugins/example_plugin', match='*', path='/Volumes/NVME/Users/squash/Local/Code/archiveboxes/ArchiveBox/data/plugins', recursive=True),
+        ),
+    ]

+ 18 - 0
archivebox/plugantic/migrations/0015_alter_customplugin_path.py

@@ -0,0 +1,18 @@
+# Generated by Django 5.0.6 on 2024-05-18 01:51
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('plugantic', '0014_alter_customplugin_path'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='customplugin',
+            name='path',
+            field=models.FilePathField(allow_files=False, allow_folders=True, match='*', path='/Volumes/NVME/Users/squash/Local/Code/archiveboxes/ArchiveBox/data/plugins', recursive=True),
+        ),
+    ]

+ 16 - 0
archivebox/plugantic/migrations/0016_delete_customplugin.py

@@ -0,0 +1,16 @@
+# Generated by Django 5.0.6 on 2024-05-18 01:57
+
+from django.db import migrations
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('plugantic', '0015_alter_customplugin_path'),
+    ]
+
+    operations = [
+        migrations.DeleteModel(
+            name='CustomPlugin',
+        ),
+    ]

+ 0 - 0
archivebox/plugantic/migrations/__init__.py


+ 50 - 0
archivebox/plugantic/models.py

@@ -0,0 +1,50 @@
+__package__ = 'archivebox.plugantic'
+
+
+# import uuid
+# from django.db import models
+# from typing_extensions import Self
+
+# from django_pydantic_field import SchemaField
+# from django.conf import settings
+
+# from abid_utils.models import ABIDModel, ABIDField
+
+# # from .plugins import Plugin as PluginSchema, CORE_PLUGIN
+# from .binproviders import BinProvider
+# from .binaries import Binary
+# from .configs import WgetOptionsConfig
+# from .extractors import Extractor
+# from .replayers import Replayer
+
+
+# PLUGINS_ROOT = settings.CONFIG['OUTPUT_DIR'] / 'plugins'
+# PLUGINS_ROOT.mkdir(exist_ok=True)
+
+
+# class CustomPlugin(ABIDModel):
+#     abid_prefix = 'plg_'
+#     abid_ts_src = 'self.added'
+#     abid_uri_src = 'self.name'
+#     abid_subtype_src = '"09"'
+#     abid_rand_src = 'self.id'
+
+#     id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)  # legacy pk
+#     uuid = models.UUIDField(blank=True, null=True, editable=True, unique=True)
+#     abid = ABIDField(prefix=abid_prefix)
+
+#     name = models.CharField(max_length=64, blank=False, unique=True)
+
+#     path = models.FilePathField(path=str(PLUGINS_ROOT), match='*', recursive=True, allow_folders=True, allow_files=False)
+
+#     # replayers: list[Replayer] = SchemaField()
+#     # binaries: list[Replayer] = SchemaField()
+#     # extractors: list[Replayer] = SchemaField()
+
+
+#     # @classmethod
+#     # def from_loaded_plugin(cls, plugin: PluginSchema) -> Self:
+#     #     new_obj = cls(
+#     #         schema=plugin,
+#     #     )
+#     #     return new_obj

+ 134 - 0
archivebox/plugantic/plugins.py

@@ -0,0 +1,134 @@
+__package__ = 'archivebox.plugantic'
+
+from typing import List
+from typing_extensions import Self
+
+from pydantic import (
+    BaseModel,
+    ConfigDict,
+    Field,
+    model_validator,
+    validate_call,
+    SerializeAsAny,
+)
+
+from .binaries import (
+    Binary,
+    PythonBinary,
+    SqliteBinary,
+    DjangoBinary,
+    WgetBinary,
+    YtdlpBinary,
+)
+from .extractors import (
+    Extractor,
+    YtdlpExtractor,
+    WgetExtractor,
+    WarcExtractor,
+)
+from .replayers import (
+    Replayer,
+    GENERIC_REPLAYER,
+    MEDIA_REPLAYER,
+)
+from .configs import (
+    ConfigSet,
+    WGET_CONFIG,
+)
+
+
+class Plugin(BaseModel):
+    model_config = ConfigDict(arbitrary_types_allowed=True, extra='ignore', populate_by_name=True)
+
+    name: str = Field(default='baseplugin')                   # e.g. media
+    description: str = Field(default='')                      # e.g. get media using yt-dlp
+    
+    configs: List[SerializeAsAny[ConfigSet]] = Field(default=[])
+    binaries: List[SerializeAsAny[Binary]] = Field(default=[])                # e.g. [Binary(name='yt-dlp')]
+    extractors: List[SerializeAsAny[Extractor]] = Field(default=[])
+    replayers: List[SerializeAsAny[Replayer]] = Field(default=[])
+
+    @model_validator(mode='after')
+    def validate(self):
+        self.description = self.description or self.name
+        return self   # 'after' model validators must return the model instance
+
+    @validate_call
+    def install(self) -> Self:
+        new_binaries = []
+        for idx, binary in enumerate(self.binaries):
+            new_binaries.append(binary.install() or binary)
+        return self.model_copy(update={
+            'binaries': new_binaries,
+        })
+
+    @validate_call
+    def load(self, cache=True) -> Self:
+        new_binaries = []
+        for idx, binary in enumerate(self.binaries):
+            new_binaries.append(binary.load(cache=cache) or binary)
+        return self.model_copy(update={
+            'binaries': new_binaries,
+        })
+
+    @validate_call
+    def load_or_install(self, cache=True) -> Self:
+        new_binaries = []
+        for idx, binary in enumerate(self.binaries):
+            new_binaries.append(binary.load_or_install(cache=cache) or binary)
+        return self.model_copy(update={
+            'binaries': new_binaries,
+        })
+
+
+class CorePlugin(Plugin):
+    name: str = 'core'
+    configs: List[SerializeAsAny[ConfigSet]] = []
+    binaries: List[SerializeAsAny[Binary]] = [PythonBinary(), SqliteBinary(), DjangoBinary()]
+    extractors: List[SerializeAsAny[Extractor]] = []
+    replayers: List[SerializeAsAny[Replayer]] = [GENERIC_REPLAYER]
+
+class YtdlpPlugin(Plugin):
+    name: str = 'ytdlp'
+    configs: List[SerializeAsAny[ConfigSet]] = []
+    binaries: List[SerializeAsAny[Binary]] = [YtdlpBinary()]
+    extractors: List[SerializeAsAny[Extractor]] = [YtdlpExtractor()]
+    replayers: List[SerializeAsAny[Replayer]] = [MEDIA_REPLAYER]
+
+class WgetPlugin(Plugin):
+    name: str = 'wget'
+    configs: List[SerializeAsAny[ConfigSet]] = [*WGET_CONFIG]
+    binaries: List[SerializeAsAny[Binary]] = [WgetBinary()]
+    extractors: List[SerializeAsAny[Extractor]] = [WgetExtractor(), WarcExtractor()]
+
+
+CORE_PLUGIN = CorePlugin()
+YTDLP_PLUGIN = YtdlpPlugin()
+WGET_PLUGIN = WgetPlugin()
+PLUGINS = [
+    CORE_PLUGIN,
+    YTDLP_PLUGIN,
+    WGET_PLUGIN,
+]
+LOADED_PLUGINS = PLUGINS
+
+
+import json
+
+# smoke test at import time: every plugin must be able to serialize its own JSON schema
+for plugin in PLUGINS:
+    try:
+        json.dumps(plugin.model_json_schema(), indent=4)
+    except Exception:
+        print(f'Failed to generate JSON schema for {plugin.name}')
+        raise
+
+# print('-------------------------------------BEFORE INSTALL---------------------------------')
+# for plugin in PLUGINS:
+#     print(plugin.model_dump_json(indent=4))
+# print('-------------------------------------DURING LOAD/INSTALL---------------------------------')
+# for plugin in PLUGINS:
+    # LOADED_PLUGINS.append(plugin.install())
+# print('-------------------------------------AFTER INSTALL---------------------------------')
+# for plugin in LOADED_PLUGINS:
+    # print(plugin.model_dump_json(indent=4))
+
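
Example (not part of this diff): a minimal usage sketch of the Plugin lifecycle defined above, assuming a context where the plugantic package is importable (the same absolute-import style views.py uses below). Note that load_or_install() returns an updated copy via model_copy() rather than mutating the plugin in place:

    from plugantic.plugins import WGET_PLUGIN

    loaded = WGET_PLUGIN.load_or_install()   # returns a new Plugin carrying loaded Binary copies
    for binary in loaded.binaries:
        # loaded_version / loaded_provider / loaded_abspath are filled in by the Binary lookup
        print(binary.name, binary.loaded_version, binary.loaded_abspath)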

+ 26 - 0
archivebox/plugantic/replayers.py

@@ -0,0 +1,26 @@
+__package__ = 'archivebox.plugantic'
+
+
+from pydantic import BaseModel
+
+# from .binproviders import LazyImportStr
+
+
+class Replayer(BaseModel):
+    """Describes how to render an ArchiveResult in several contexts"""
+    name: str = 'GenericReplayer'
+    url_pattern: str = '*'
+
+    row_template: str = 'plugins/generic_replayer/templates/row.html'
+    embed_template: str = 'plugins/generic_replayer/templates/embed.html'
+    fullpage_template: str = 'plugins/generic_replayer/templates/fullpage.html'
+
+    # row_view: LazyImportStr = 'plugins.generic_replayer.views.row_view'
+    # embed_view: LazyImportStr = 'plugins.generic_replayer.views.embed_view'
+    # fullpage_view: LazyImportStr = 'plugins.generic_replayer.views.fullpage_view'
+    # icon_view: LazyImportStr = 'plugins.generic_replayer.views.get_icon'
+    # thumbnail_view: LazyImportStr = 'plugins.generic_replayer.views.get_icon'
+
+
+GENERIC_REPLAYER = Replayer(name='generic')
+MEDIA_REPLAYER = Replayer(name='media')
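
Example (not part of this diff): a sketch of how a plugin could declare a Replayer for one output type. The 'pdf' name and template paths are hypothetical stand-ins; only fields defined on the Replayer model above are used:

    from plugantic.replayers import Replayer

    PDF_REPLAYER = Replayer(
        name='pdf',
        url_pattern='*.pdf',                                     # which outputs this replayer claims
        row_template='plugins/pdf_replayer/templates/row.html',  # hypothetical template paths
        embed_template='plugins/pdf_replayer/templates/embed.html',
        fullpage_template='plugins/pdf_replayer/templates/fullpage.html',
    )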

+ 3 - 0
archivebox/plugantic/tests.py

@@ -0,0 +1,3 @@
+from django.test import TestCase
+
+# Create your tests here.

+ 169 - 0
archivebox/plugantic/views.py

@@ -0,0 +1,169 @@
+__package__ = 'archivebox.plugantic'
+
+from django.http import HttpRequest
+from django.utils.html import format_html, mark_safe
+
+from admin_data_views.typing import TableContext, ItemContext
+from admin_data_views.utils import render_with_table_view, render_with_item_view, ItemLink
+
+
+from plugantic.plugins import LOADED_PLUGINS
+from django.conf import settings
+
+
+@render_with_table_view
+def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
+
+    assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
+
+    rows = {
+        "Binary": [],
+        "From Plugin": [],
+        "Found Version": [],
+        "Provided By": [],
+        "Found Abspath": [],
+        "Related Configuration": [],
+        "Overrides": [],
+        "Description": [],
+    }
+
+    relevant_configs = {
+        key: val
+        for key, val in settings.CONFIG.items()
+        if '_BINARY' in key or '_VERSION' in key
+    }
+
+    for plugin in LOADED_PLUGINS:
+        for binary in plugin.binaries:
+            binary = binary.load_or_install()
+
+            rows['Binary'].append(ItemLink(binary.name, key=binary.name))
+            rows['From Plugin'].append(plugin.name)
+            rows['Found Version'].append(binary.loaded_version)
+            rows['Provided By'].append(binary.loaded_provider)
+            rows['Found Abspath'].append(binary.loaded_abspath)
+            rows['Related Configuration'].append(mark_safe(', '.join(
+                f'<a href="/admin/environment/config/{config_key}/">{config_key}</a>'
+                for config_key, config_value in relevant_configs.items()
+                    if binary.name.lower().replace('-', '').replace('_', '').replace('ytdlp', 'youtubedl') in config_key.lower()
+                    # or binary.name.lower().replace('-', '').replace('_', '') in str(config_value).lower()
+            )))
+            rows['Overrides'].append(str(binary.provider_overrides))
+            rows['Description'].append(binary.description)
+
+    return TableContext(
+        title="Binaries",
+        table=rows,
+    )
+
+@render_with_item_view
+def binary_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
+
+    assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
+
+    binary = None
+    plugin = None
+    for loaded_plugin in LOADED_PLUGINS:
+        for loaded_binary in loaded_plugin.binaries:
+            if loaded_binary.name == key:
+                binary = loaded_binary
+                plugin = loaded_plugin
+
+    assert plugin and binary, f'Could not find a binary matching the specified name: {key}'
+
+    binary = binary.load_or_install()
+
+    return ItemContext(
+        slug=key,
+        title=key,
+        data=[
+            {
+                "name": binary.name,
+                "description": binary.description,
+                "fields": {
+                    'plugin': plugin.name,
+                    'binprovider': binary.loaded_provider,
+                    'abspath': binary.loaded_abspath,
+                    'version': binary.loaded_version,
+                    'overrides': str(binary.provider_overrides),
+                    'providers': str(binary.providers_supported),
+                },
+                "help_texts": {
+                    # TODO
+                },
+            },
+        ],
+    )
+
+
+@render_with_table_view
+def plugins_list_view(request: HttpRequest, **kwargs) -> TableContext:
+
+    assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
+
+    rows = {
+        "Name": [],
+        "binaries": [],
+        "extractors": [],
+        "replayers": [],
+        "configs": [],
+        "description": [],
+    }
+
+
+    for plugin in LOADED_PLUGINS:
+        plugin = plugin.load_or_install()
+
+        rows['Name'].append(ItemLink(plugin.name, key=plugin.name))
+        rows['binaries'].append(mark_safe(', '.join(
+            f'<a href="/admin/environment/binaries/{binary.name}/">{binary.name}</a>'
+            for binary in plugin.binaries
+        )))
+        rows['extractors'].append(', '.join(extractor.name for extractor in plugin.extractors))
+        rows['replayers'].append(', '.join(replayer.name for replayer in plugin.replayers))
+        rows['configs'].append(mark_safe(', '.join(
+            f'<a href="/admin/environment/config/{config_key}/">{config_key}</a>'
+            for configset in plugin.configs
+                for config_key in configset.model_fields.keys()
+                    if config_key != 'section' and config_key in settings.CONFIG
+        )))
+        rows['description'].append(str(plugin.description))
+
+    return TableContext(
+        title="Installed plugins",
+        table=rows,
+    )
+
+@render_with_item_view
+def plugin_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
+
+    assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
+
+    plugin = None
+    for loaded_plugin in LOADED_PLUGINS:
+        if loaded_plugin.name == key:
+            plugin = loaded_plugin
+
+    assert plugin, f'Could not find a plugin matching the specified name: {key}'
+
+    plugin = plugin.load_or_install()
+
+    return ItemContext(
+        slug=key,
+        title=key,
+        data=[
+            {
+                "name": plugin.name,
+                "description": plugin.description,
+                "fields": {
+                    'configs': plugin.configs,
+                    'binaries': plugin.binaries,
+                    'extractors': plugin.extractors,
+                    'replayers': plugin.replayers,
+                },
+                "help_texts": {
+                    # TODO
+                },
+            },
+        ],
+    )
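
Example (not part of this diff): the admin_data_views tables above are built column-major, i.e. a dict mapping each column header to the list of cell values in that column, with one append per column per row so all the lists stay the same length. A stripped-down sketch of the same pattern:

    from admin_data_views.typing import TableContext

    from plugantic.plugins import LOADED_PLUGINS

    rows = {"Name": [], "Description": []}
    for plugin in LOADED_PLUGINS:
        rows["Name"].append(plugin.name)
        rows["Description"].append(plugin.description)

    context = TableContext(title="Plugins", table=rows)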

+ 5 - 5
archivebox/search/__init__.py

@@ -39,7 +39,7 @@ def write_search_index(link: Link, texts: Union[List[str], None]=None, out_dir:
         backend = import_backend()
         if snap:
             try:
-                backend.index(snapshot_id=str(snap.id), texts=texts)
+                backend.index(snapshot_id=str(snap.pk), texts=texts)
             except Exception as err:
                 stderr()
                 stderr(
@@ -54,7 +54,7 @@ def query_search_index(query: str, out_dir: Path=OUTPUT_DIR) -> QuerySet:
     if search_backend_enabled():
         backend = import_backend()
         try:
-            snapshot_ids = backend.search(query)
+            snapshot_pks = backend.search(query)
         except Exception as err:
             stderr()
             stderr(
@@ -64,7 +64,7 @@ def query_search_index(query: str, out_dir: Path=OUTPUT_DIR) -> QuerySet:
             raise
         else:
             # TODO preserve ordering from backend
-            qsearch = Snapshot.objects.filter(pk__in=snapshot_ids)
+            qsearch = Snapshot.objects.filter(pk__in=snapshot_pks)
             return qsearch
     
     return Snapshot.objects.none()
@@ -74,9 +74,9 @@ def flush_search_index(snapshots: QuerySet):
     if not indexing_enabled() or not snapshots:
         return
     backend = import_backend()
-    snapshot_ids=(str(pk) for pk in snapshots.values_list('pk',flat=True))
+    snapshot_pks = (str(pk) for pk in snapshots.values_list('pk', flat=True))
     try:
-        backend.flush(snapshot_ids)
+        backend.flush(snapshot_pks)
     except Exception as err:
         stderr()
         stderr(
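
Example (not part of this diff): the renamed call sites above imply the interface a search backend has to expose: index(snapshot_id, texts), search(query) returning snapshot pks, and flush(snapshot_pks). A toy in-memory backend satisfying that contract (purely illustrative; the shipped backends live under archivebox/search/backends/):

    from typing import Iterable, List

    class InMemoryBackend:
        def __init__(self):
            self._texts = {}                      # snapshot pk -> concatenated indexed text

        def index(self, snapshot_id: str, texts: List[str]) -> None:
            self._texts[snapshot_id] = ' '.join(texts)

        def search(self, query: str) -> List[str]:
            return [pk for pk, text in self._texts.items() if query.lower() in text.lower()]

        def flush(self, snapshot_pks: Iterable[str]) -> None:
            for pk in snapshot_pks:
                self._texts.pop(pk, None)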

+ 12 - 0
archivebox/templates/admin/base.html

@@ -277,10 +277,22 @@
                         $(this).parents('.card').removeClass('selected-card')
                 })
             };
+            function selectSnapshotIfHotlinked() {
+                // if we arrive at the snapshot list via a hotlink like ?id__startswith=...,
+                // the intent was to make it easy to act on that one snapshot,
+                // so preselect it here to save the user a click
+                if (window.location.search.startsWith('?id__startswith=') || window.location.search.startsWith('?id__exact=')) {
+                    const result_checkboxes = [...document.querySelectorAll('#result_list .action-checkbox input[type=checkbox]')]
+                    if (result_checkboxes.length === 1) {
+                        result_checkboxes[0].click()
+                    }
+                }
+            }
             $(document).ready(function() {
                 fix_actions()
                 setupSnapshotGridListToggle()
                 setTimeOffset()
+                selectSnapshotIfHotlinked()
             })
         </script>
     </body>
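
Example (not part of this diff): with the hook above, any page can deep-link into the snapshot changelist with one row preselected by linking to the filtered URL it watches for (snapshot_live.html below builds the same URL inline):

    # hypothetical helper; equivalent to the links hardcoded in the templates
    def snapshot_admin_hotlink(snapshot_id: str) -> str:
        return f'/admin/core/snapshot/?id__startswith={snapshot_id}'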

+ 1 - 1
archivebox/templates/admin/snapshots_grid.html

@@ -147,7 +147,7 @@
     {% for obj in results %}
       <div class="card">
           <div class="card-info">
-            <a href="{% url 'admin:core_snapshot_change' obj.id %}">
+            <a href="{% url 'admin:core_snapshot_change' obj.pk %}">
              <span class="timestamp">{{obj.added}}</span>
            </a>
            <label>

+ 545 - 0
archivebox/templates/core/snapshot_live.html

@@ -0,0 +1,545 @@
+{% load static tz core_tags %}
+
+<!DOCTYPE html>
+<html lang="en">
+    <head>
+        <title>{{title}}</title>
+        <meta charset="utf-8">
+        <meta name="viewport" content="width=device-width, initial-scale=1">
+        <link href="{% static 'bootstrap.min.css' %}" rel="stylesheet">
+        <style>
+            /* Keep this inline, don't move to external css file because this template is used to generate static exports that need to be usable as-is without an accompanying staticfiles dir */
+            html, body {
+                width: 100%;
+                height: 100%;
+                background-color: #ddd;
+            }
+            header {
+                background-color: #aa1e55;
+            }
+            small {
+                font-weight: 200;
+            }
+            header a:hover {
+                text-decoration: none;
+            }
+            .header-top {
+                width: 100%;
+                height: auto;
+                min-height: 40px;
+                margin: 0px;
+                text-align: center;
+                color: #f6f6f6;
+                font-size: calc(10px + 0.84vw);
+                font-weight: 200;
+                padding: 3px 4px;
+                background-color: #aa1e55;
+            }
+            .header-top .nav {
+                width: 100%;
+            }
+            .nav > div {
+                min-height: 30px;
+                line-height: 1.2;
+            }
+            .header-top .header-url {
+                width: 100%;
+                background-color: rgba(216, 216, 235, 0.05);
+                text-align: center;
+                line-height: 1.3;
+                font-family: monospace;
+                white-space: nowrap;
+                font-weight: 200;
+                display: block;
+                margin-top: -1px;
+                font-size: 23px;
+                opacity: 0.8;
+                border-radius: 0px 0px 8px 8px;
+            }
+            .header-top .header-url a.header-url-text {
+                color: #f6f6f6;
+                user-select: all;
+                text-overflow: ellipsis;
+            }
+            .header-top .header-url a.header-url-text:hover {
+                color: rgb(144, 161, 255);
+            }
+            .header-top a {
+                text-decoration: none;
+                color: rgba(0,0,0,0.6);
+            }
+            .header-top a:hover {
+                text-decoration: none;
+                color: rgba(0,0,0,0.9);
+            }
+            .header-top .header-title {
+                color: rgba(0,0,0,0.6);
+            }
+            .header-top .favicon {
+                height: 24px;
+                vertical-align: -5px;
+                margin-right: 4px;
+            }
+            .header-top .col-lg-4 {
+                text-align: center;
+                padding-top: 4px;
+                padding-bottom: 4px;
+            }
+            .header-archivebox img {
+                display: inline-block;
+                margin-right: 3px;
+                height: 30px;
+                margin-left: 12px;
+                margin-top: -4px;
+                margin-bottom: 2px;
+            }
+            .header-archivebox img:hover {
+                opacity: 0.5;
+            }
+            header small code {
+                white-space: nowrap;
+                font-weight: 200;
+                display: block;
+                margin-top: -1px;
+                font-size: 13px;
+                opacity: 0.8;
+                user-select: all;
+            }
+            .header-toggle {
+                line-height: 12px;
+                font-size: 70px;
+                vertical-align: -12px;
+                margin-left: 4px;
+            }
+            
+            .info-row {
+                margin-top: 2px;
+                margin-bottom: 5px;
+            }
+            .info-row .alert {
+                margin-bottom: 0px;
+            }
+            .row.header-bottom {
+                margin-left: -10px;
+                margin-right: -10px;
+            }
+            .header-bottom .col-lg-2 {
+                padding-left: 4px;
+                padding-right: 4px;
+            }
+
+            .header-bottom-frames .card {
+                box-shadow: 2px 2px 7px 0px rgba(0, 0, 0, 0.1);
+                margin-bottom: 5px;
+                border: 1px solid rgba(0, 0, 0, 0.06);
+                border-radius: 10px;
+                background-color: #efefef;
+                overflow: hidden;
+                height: 130px;
+            }
+            .card h4 {
+                font-size: 0.8em;
+                display: inline-block;
+                width: auto;
+                text-transform: uppercase;
+                margin-top: 0px;
+                margin-bottom: 5px;
+                color: rgb(93, 105, 110);
+            }
+            .card-body {
+                font-size: 14px;
+                padding: 4px 10px;
+                padding-bottom: 0px;
+                /* padding-left: 3px; */
+                /* padding-right: 3px; */
+                /* padding-bottom: 3px; */
+                line-height: 1;
+                word-wrap: break-word;
+                max-height: 102px;
+                overflow: hidden;
+                text-overflow: ellipsis;
+                color: #d3d3d3;
+            }
+            .card-title {
+                margin-bottom: 4px;
+                text-transform: uppercase;
+            }
+            .card-img-top {
+                border: 0px;
+                padding: 0px;
+                margin: 0px;
+                overflow: hidden;
+                opacity: 0.8;
+                border-top: 1px solid rgba(0,0,0,0);
+                border-radius: 4px;
+                border-bottom: 1px solid rgba(0,0,0,0);
+                height: 430px;
+                width: 405%;
+                margin-bottom: -330px;
+                background-color: #333;
+                margin-left: -1%;
+                margin-right: -1%;
+                pointer-events: none;
+
+                transform: scale(0.25); 
+                transform-origin: 0 0;
+            }
+            #main-frame {
+                border-top: 1px solid #ddd;
+                width: 100%;
+                height: calc(100vh - 210px);
+                margin: 0px;
+                border: 0px;
+                border-top: 3px solid #aa1e55;
+            }
+            .card.selected-card {
+                border: 2px solid orange;
+                box-shadow: 0px -6px 13px 1px rgba(0,0,0,0.05);
+            }
+            .iframe-large {
+                height: calc(100vh - 70px);
+            }
+            img.external {
+                height: 30px;
+                margin-right: -10px;
+                padding: 3px;
+                border-radius: 4px;
+                vertical-align: middle;
+                border: 4px solid rgba(0,0,0,0);
+            }
+            img.external:hover {
+                border: 4px solid green;
+            }
+            .screenshot {
+                background-color: #333;
+                transform: none;
+                width: 100%;
+                min-height: 100px;
+                max-height: 100px;
+                margin-bottom: 0px;
+                object-fit: cover;
+                object-position: top center;
+            }
+            .header-bottom {
+                border-top: 1px solid rgba(170, 30, 85, 0.9);
+                padding-bottom: 1px;
+                border-bottom: 5px solid rgb(170, 30, 85);
+                margin-bottom: -1px;
+
+                border-radius: 0px;
+                background-color: #f4eeee;
+                border: 1px solid rgba(0,0,0,0.2);
+                box-shadow: 4px 4px 4px rgba(0,0,0,0.2);
+                margin-top: 0px;
+            }
+            .header-bottom-info {
+                color: #6f6f6f;
+                padding-top: 0px;
+                padding-bottom: 0px;
+                margin: 0px -15px;
+            }
+
+            .header-bottom-info > div {
+                text-align: center;
+            }
+            .header-bottom-info h5 {
+                font-size: 12px;
+                font-weight: 400;
+                margin-top: 3px;
+                margin-bottom: 3px;
+            }
+            .info-chunk {
+                width: auto;
+                display: inline-block;
+                text-align: center;
+                margin: 8px 4px;
+                vertical-align: top;
+                font-size: 14px;
+            }
+            header .badge {
+                margin-top: 3px;
+                font-size: 0.9rem;
+                font-weight: 200;
+                font-family: monospace;
+            }
+            header .internal-links {
+                text-align: left;
+                opacity: 1;
+                background-color: rgba(0,0,0,0.03);
+                padding: 1px 3px;
+            }
+            header .external-links {
+                text-align: center;
+                opacity: 0.9;
+                /*background-color: rgba(0,0,0,0.03);*/
+                margin-top: 0px;
+                padding: 1px 3px;
+                font-size: 14px;
+                color: #ddd;
+                width: 100%;
+                overflow: hidden;
+            }
+            .header-bottom-frames {
+                padding-top: 5px;
+                justify-content: center;
+            }
+            .header-bottom-frames .card-title {
+                width: 100%;
+                text-align: center;
+                font-size: 17px;
+                margin-bottom: 0px;
+                display: inline-block;
+                color: #d3d3d3;
+                font-weight: 200;
+                vertical-align: 3px;
+            }
+            .header-bottom-frames .card-text {
+/*                width: 100%;
+                text-align: center;*/
+                font-size: 0.9em;
+                display: inline-block;
+                position: relative;
+/*                top: -11px;*/
+            }
+            .card-text code {
+                padding: .1rem .2rem;
+                font-size: 90%;
+                color: #bd4147;
+                background-color: rgb(204, 204, 204, 0.28);
+                border-radius: .25rem;
+            }
+
+            /*@media(max-width: 1092px) {
+                iframe {
+                    display: none;
+                }
+            }*/
+                
+
+            @media(max-width: 728px) {
+                .card h4 {
+                    font-size: 5vw;
+                }
+                .card-body {
+                    font-size: 4vw;
+                }
+                .card {
+                    margin-bottom: 5px;
+                }
+                header > h1 > a.header-url, header > h1 > a.header-archivebox {
+                    display: none;
+                }
+            }
+        </style>
+    </head>
+    <body>
+        <header>
+            <div class="header-top container-fluid">
+                <div class="row nav">
+                    <div class="col-lg-2" style="line-height: 58px; vertical-align: middle">
+                        <a href="../../index.html" class="header-archivebox" title="Go to Main Index...">
+                            <img src="../../static/archive.png" alt="Archive Icon">
+                            ArchiveBox
+                        </a>
+                    </div>
+                    <div class="col-lg-8">
+                        <div class="header-url">
+                            <a class="header-url-text" href="{{url}}" title="Open original URL in new window..." target="_blank" rel="noreferrer">
+                                {{url}}
+                            </a>
+                        </div>
+                        <div class="badge badge-{{status_color}}" style="float: left">
+                            <a href="/admin/core/snapshot/?id__startswith={{snapshot_id}}" title="Click to see options to pull, re-snapshot, or delete this Snapshot">
+                                {{status|upper}}
+                            </a>
+                        </div>
+                        <div class="badge badge-default" style="float: left; font-weight: 200">
+                            {{num_outputs}}
+                            {% if num_failures %}
+                                + {{num_failures}} <small>errors</small>
+                            {% endif %}
+                        </div>
+                        <div class="badge badge-info" style="float: right">
+                            <a href="/admin/core/snapshot/{{snapshot_id}}/change/" title="Click to edit this Snapshot in the Admin UI">
+                                {{size}}
+                            </a>
+                        </div>
+                        <div class="badge badge-default" style="float: right">
+                            <a href="/admin/core/snapshot/{{snapshot_id}}/change/" title="Click to edit this Snapshot in the Admin UI">
+                                {{extension}}
+                            </a>
+                        </div>
+                        <small class="header-title header-toggle-trigger">
+                            <img src="favicon.ico" onerror="this.style.opacity=0" alt="Favicon" class="favicon"/>
+                            {{title|truncatechars:120|safe}} <a href="#" class="header-toggle header-toggle-trigger">▾</a>
+                            <br/>
+                            {% for tag in tags_str|split:',' %}
+                                <div class="badge badge-default tag" style="word-break: break-all;">{{tag}}</div>
+                            {% endfor %}
+                        </small>
+                    </div>
+                    <div class="col-lg-2" style="padding-top: 4px">
+                        <a href="/archive/{{url}}" title="Date Added: {{bookmarked_date}}  |  First Archived: {{oldest_archive_date|default:updated_date}}  |  Last Checked: {{updated_date}}   (UTC)">
+                            {{oldest_archive_date|default:updated_date|default:bookmarked_date}}
+                        </a>
+                        <br/>
+                        <div class="external-links">
+                            ↗️ &nbsp;
+                            <a href="https://web.archive.org/web/{{url}}" title="Search for a copy of the URL saved in Archive.org" target="_blank" rel="noreferrer">Archive.org</a> &nbsp;|&nbsp; 
+                            <a href="https://archive.md/{{url}}" title="Search for a copy of the URL saved in Archive.today" target="_blank" rel="noreferrer">Archive.today</a>  &nbsp;|&nbsp; 
+                            <a href="{{warc_path}}" title="Download the ArchiveBox-generated WARC file" target="_blank">WARC</a>
+                            <!--<a href="https://ghostarchive.org/search?term={{url|urlencode}}" title="Search for a copy of the URL saved in GhostArchive.org" target="_blank" rel="noreferrer">More...</a>-->
+                        </div>
+                    </div>
+                </div>
+            </div>
+            <div class="header-bottom container-fluid">
+                <div class="row header-bottom-frames">
+                    {% for result in archiveresults %}
+                        <div class="col-lg-2">
+                            <div class="card {% if forloop.first %}selected-card{% endif %}">
+                                <div class="card-body">
+                                    <a href="{{result.path}}" target="preview" title="./{{result.path}} (downloaded {{result.ts}})">
+                                        <h4>{{result.name}}</h4>
+                                        <!-- <p class="card-text" ><code>./{{result.path|truncatechars:30}}</code></p> -->
+                                    </a>
+                                    <!--<a href="{{result.path}}" target="preview"><h4 class="card-title">{{result.name}}</h4></a>-->
+                                </div>
+                                <iframe class="card-img-top" src="{{result.path}}" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" scrolling="no" loading="lazy"></iframe>
+                            </div>
+                        </div>
+                    {% endfor %}
+                    <div class="col-lg-2">
+                        <div class="card">
+                            <div class="card-body">
+                                <a href="./" target="preview">
+                                    <h4>Headers, JSON, etc.</h4>
+                                </a>
+                                <!--<a href="{{result.path}}" target="preview"><h4 class="card-title">{{result.name}}</h4></a>-->
+                            </div>
+                            <iframe class="card-img-top" src="./" sandbox="" scrolling="no" loading="lazy"></iframe>
+                        </div>
+                    </div>
+                </div>
+            </div>
+        </header>
+        <iframe id="main-frame" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" class="full-page-iframe" src="{{best_result.path}}" name="preview"></iframe>
+    
+        <script src="{% static 'jquery.min.js' %}" type="text/javascript"></script>
+
+        <script>
+            // un-sandbox iframes showing pdfs (required for the built-in pdf viewer to render)
+            jQuery('iframe').each(function() {
+                if (this.src.endsWith('.pdf')) {
+                    this.removeAttribute('sandbox')
+                    this.src = this.src + '#toolbar=0'
+                }
+                this.onload = function() {
+                    this.contentWindow.scrollTo(0, 0);
+                    // re-check after load in case the iframe navigated to a pdf
+                    if (this.src.endsWith('.pdf')) {
+                        this.removeAttribute('sandbox')
+                        this.src = this.src + '#toolbar=0'
+                    }
+                }
+            })
+
+            function getPreviewTypeFromPath(link) {
+                if (link.getAttribute('href') == './') {
+                    return 'all'
+                }
+                return link.getAttribute('href')
+            }
+
+            const iframe_elem = document.getElementById('main-frame')
+
+            for (const card of [...document.querySelectorAll('.card')]) {
+                card.addEventListener('click', function(event) {
+                    const target = event.currentTarget.querySelector('a').href
+
+                    jQuery('.selected-card').removeClass('selected-card')
+                    jQuery(event.currentTarget).closest('.card').addClass('selected-card')
+
+                    if (target.endsWith('.pdf')) {
+                        jQuery('#main-frame')[0].removeAttribute('sandbox')
+                    } else {
+                        jQuery('#main-frame')[0].sandbox = "allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms"
+                    }
+                    window.location.hash = getPreviewTypeFromPath(event.currentTarget.querySelector('a'))
+
+                    iframe_elem.src = target
+                })
+            }
+
+
+            function hideSnapshotHeader() {
+                console.log('Collapsing Snapshot header...')
+                jQuery('.header-toggle').text('▸')
+                jQuery('.header-bottom').hide()
+                jQuery('#main-frame').addClass('iframe-large')
+                try {
+                    localStorage.setItem("archivebox-snapshot-header-visible", "false")
+                } catch (e) {
+                    console.log('Could not use localStorage to persist header collapse state', e)
+                }
+            }
+            function showSnapshotHeader() {
+                console.log('Expanding Snapshot header...')
+                jQuery('.header-toggle').text('▾')
+                jQuery('.header-bottom').show()
+                jQuery('#main-frame').removeClass('iframe-large')
+                try {
+                    localStorage.setItem("archivebox-snapshot-header-visible", "true")
+                } catch (e) {
+                    console.log('Could not use localStorage to persist header collapse state', e)
+                }
+            }
+            function loadSnapshotHeaderState() {
+                // the header starts collapsed unless the user previously left it expanded
+                let snapshotHeaderIsVisible = 'false'
+                try {
+                    snapshotHeaderIsVisible = localStorage.getItem("archivebox-snapshot-header-visible") || 'false'
+                } catch (e) {
+                    console.log('Could not use localStorage to get header collapse state', e)
+                }
+                if (snapshotHeaderIsVisible === 'false') {
+                    hideSnapshotHeader()
+                }
+            }
+            function handleSnapshotHeaderToggle() {
+                if (jQuery('.header-toggle').text().includes('▾')) {
+                    hideSnapshotHeader()
+                } else {
+                    showSnapshotHeader()
+                }
+                return true
+            }
+
+            // toggle the header when the collapse icon or title is clicked
+            jQuery('.header-toggle').on('click', handleSnapshotHeaderToggle)
+            jQuery('.header-toggle-trigger').on('click', handleSnapshotHeaderToggle)
+
+            // check URL for hash e.g. #git and load relevant preview
+            jQuery(document).ready(function() {
+                if (window.location.hash) {
+                    for (const link of jQuery('a[target=preview]')) {
+                        if (getPreviewTypeFromPath(link) == window.location.hash.slice(1).toLowerCase()) {
+                            jQuery(link).closest('.card').click()   // select the matching card and load its preview
+                            link.click()
+                        }
+                    }
+                }
+                loadSnapshotHeaderState()
+            })
+
+            
+
+            // hide all preview iframes on small screens
+            // if (window.innerWidth < 1091) {
+            //     jQuery('.card a[target=preview]').attr('target', '_self')
+            // }
+        </script>
+    </body>
+</html>
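
Example (not part of this diff): a sketch of the context keys this template reads, with illustrative values only; the real context is assembled in archivebox/core/views.py, changed earlier in this diff:

    context = {
        'title': 'Example Page',
        'url': 'https://example.com/page',
        'status': 'archived',
        'status_color': 'success',
        'snapshot_id': '0190...',                 # used to build the /admin/core/snapshot/ links
        'num_outputs': 9,
        'num_failures': 0,
        'size': '1.4 MB',
        'extension': 'html',
        'tags_str': 'docs,examples',
        'bookmarked_date': '2024-05-13 11:43',
        'updated_date': '2024-05-13 11:43',
        'oldest_archive_date': None,
        'warc_path': 'warc/',
        'best_result': {'path': 'singlefile.html'},
        'archiveresults': [
            {'name': 'singlefile', 'path': 'singlefile.html', 'ts': '2024-05-13 11:43'},
        ],
    }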

+ 21 - 0
pyproject.toml

@@ -37,6 +37,11 @@ dependencies = [
     #  - See Github issues for more...
     "django-signal-webhooks>=0.3.0",
     "django-admin-data-views>=0.3.1",
+    "ulid-py>=1.1.0",
+    "typeid-python>=0.3.0",
+    "django-charid-field>=0.4",
+    "django-pydantic-field>=0.3.9",
+    "django-jsonform>=2.22.0",
 ]
 
 homepage = "https://github.com/ArchiveBox/ArchiveBox"
@@ -155,6 +160,22 @@ plugins = ["mypy_django_plugin.main"]
 [tool.django-stubs]
 django_settings_module = "core.settings"
 
+[tool.pyright]
+include = ["archivebox"]
+exclude = ["**/node_modules",
+    "**/__pycache__",
+    "**/migrations",
+    "archivebox/vendor",
+]
+# ignore = ["src/oldstuff"]
+# defineConstant = { DEBUG = true }
+
+reportMissingImports = true
+reportMissingTypeStubs = false
+pythonVersion = "3.10"
+pythonPlatform = "Linux"
+
+
 
 [project.urls]
 Homepage = "https://github.com/ArchiveBox/ArchiveBox"