فهرست منبع

add ABID model check and fix model inheritance

Nick Sweeting 1 سال پیش
والد
کامیت
1ceaa1ac7a
3 فایل‌های تغییر یافته به همراه 190 افزوده شده و 20 حذف شده
  1. 126 7
      archivebox/base_models/models.py
  2. 59 12
      archivebox/core/models.py
  3. 5 1
      archivebox/core/statemachines.py

+ 126 - 7
archivebox/base_models/models.py

@@ -18,9 +18,14 @@ from django.utils.functional import classproperty
 from django.db.utils import OperationalError
 from django.db.utils import OperationalError
 from django.contrib.auth import get_user_model
 from django.contrib.auth import get_user_model
 from django.urls import reverse_lazy
 from django.urls import reverse_lazy
+from django.conf import settings
+# from django.contrib.contenttypes.models import ContentType
+# from django.contrib.contenttypes.fields import GenericForeignKey
+# from django.contrib.contenttypes.fields import GenericRelation
 
 
 from django_stubs_ext.db.models import TypedModelMeta
 from django_stubs_ext.db.models import TypedModelMeta
 
 
+
 from archivebox.index.json import to_json
 from archivebox.index.json import to_json
 
 
 from .abid import (
 from .abid import (
@@ -74,6 +79,89 @@ class ABIDError(Exception):
     pass
     pass
 
 
 
 
+# class LabelType:
+#     """
+#     A Many:1 reference to an object by a human-readable or machine-readable label, e.g.:
+#     """
+#
+#     name: str
+#     verbose_name: str
+#
+# class UUIDLabelType(LabelType):
+#     name = 'UUID'
+#     verbose_name = 'UUID'
+#
+# class ABIDLabelType(LabelType):
+#     name = 'ABID'
+#     verbose_name = 'ABID'
+#
+# class TimestampLabelType(LabelType):
+#     name = 'TIMESTAMP'
+#     verbose_name = 'Timestamp'
+
+
+# class Label(models.Model):
+#     """
+#     A 1:1 reference to an object by a human-readable or machine-readable label, e.g.:
+#
+#     Label(label='snp_01BJQMF54D093DXEAWZ6JYRPAQ', content_object=snapshot, reftype='ABID')
+#     """
+#     class RefTypeChoices(models.TextChoices):
+#         UUID = UUIDLabelType.name, UUIDLabelType.verbose_name
+#         ABID = ABIDLabelType.name, ABIDLabelType.verbose_name
+#         URI = URILabelType.name, URILabelType.verbose_name
+#         TAG = TagLabelType.name, TagLabelType.verbose_name
+#         TIMESTAMP = TimestampLabelType.name, TimestampLabelType.verbose_name
+#
+#     id = models.CharField(max_length=255, primary_key=True, null=False, blank=False, db_index=True)
+#     reftype = models.CharField(choices=RefTypeChoices.choices, default=RefTypeChoices.ABID, max_length=32)
+#
+#     content_type = models.ForeignKey(ContentType, on_delete=models.CASCADE)
+#     object_id = models.UUIDField(default=None, null=False, editable=False)
+#     content_object = GenericForeignKey("content_type", "object_id")
+#
+#     @property
+#     def created_by(self) -> User:
+#         return self.content_object.created_by
+#
+#     @property
+#     def created_by_id(self) -> int:
+#         return self.content_object.created_by_id
+#
+#     @created_by.setter
+#     def created_by(self, value: User) -> None:
+#         self.content_object.created_by = value
+#
+#     @created_by_id.setter
+#     def created_by_id(self, value: int) -> None:
+#         self.content_object.created_by_id = value
+#
+#     @property
+#     def abid_prefix(self) -> str:
+#         return self.content_object.abid_prefix
+#
+#     @property
+#     def ABID(self) -> ABID:
+#         return ABID.parse(self.abid_prefix + self.abid.split('_', 1)[-1])
+#
+#     def __str__(self):
+#         return self.tag
+#
+#     class Meta:
+#         indexes = [
+#             models.Index(fields=["content_type", "object_id"]),
+#         ]
+#
+# class ModelWithLabels(models.Model):
+#     labels = GenericRelation(Label)
+#
+#     def UUID(self) -> uuid4.UUID:
+#         return uuid4.UUID(self.labels.filter(reftype=Label.RefTypeChoices.UUID).first().id)
+#
+#     def ABID(self) -> ABID:
+#         return ABID.parse(self.labels.filter(reftype=Label.RefTypeChoices.ABID).first().id)
+
+
 class ABIDModel(models.Model):
 class ABIDModel(models.Model):
     """
     """
     Abstract Base Model for other models to depend on. Provides ArchiveBox ID (ABID) interface and other helper methods.
     Abstract Base Model for other models to depend on. Provides ArchiveBox ID (ABID) interface and other helper methods.
@@ -86,12 +174,14 @@ class ABIDModel(models.Model):
     abid_salt: str = DEFAULT_ABID_URI_SALT           # combined with self.uri to anonymize hashes on a per-install basis (default is shared globally with all users, means everyone will hash ABC to -> 123 the same around the world, makes it easy to share ABIDs across installs and see if they are for the same URI. Change this if you dont want your hashes to be guessable / in the same hash space as all other users)
     abid_salt: str = DEFAULT_ABID_URI_SALT           # combined with self.uri to anonymize hashes on a per-install basis (default is shared globally with all users, means everyone will hash ABC to -> 123 the same around the world, makes it easy to share ABIDs across installs and see if they are for the same URI. Change this if you dont want your hashes to be guessable / in the same hash space as all other users)
     abid_drift_allowed: bool = False                 # set to True to allow abid_field values to change after a fixed ABID has been issued (NOT RECOMMENDED: means values can drift out of sync from original ABID)
     abid_drift_allowed: bool = False                 # set to True to allow abid_field values to change after a fixed ABID has been issued (NOT RECOMMENDED: means values can drift out of sync from original ABID)
 
 
-    # id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
-    # abid = ABIDField(prefix=abid_prefix)
+    id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
+    abid = ABIDField(prefix=abid_prefix)
 
 
-    # created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False, db_index=True)
-    # created_at = AutoDateTimeField(default=None, null=False, db_index=True)
-    # modified_at = models.DateTimeField(auto_now=True)
+    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False, db_index=True)
+    created_at = AutoDateTimeField(default=None, null=False, db_index=True)
+    modified_at = models.DateTimeField(auto_now=True)
+    
+    # labels = GenericRelation(Label)
     
     
     # if ModelWithNotesMixin model:
     # if ModelWithNotesMixin model:
     #   notes = models.TextField(blank=True, null=False, default='', help_text='Any extra notes this snapshot should have')
     #   notes = models.TextField(blank=True, null=False, default='', help_text='Any extra notes this snapshot should have')
@@ -135,6 +225,14 @@ class ABIDModel(models.Model):
         # (ordinarily fields cant depend on other fields until the obj is saved to db and recalled)
         # (ordinarily fields cant depend on other fields until the obj is saved to db and recalled)
         self._init_timestamp = ts_from_abid(abid_part_from_ts(timezone.now()))
         self._init_timestamp = ts_from_abid(abid_part_from_ts(timezone.now()))
 
 
+    @classmethod
+    def check(cls, **kwargs):
+        """
+        Django system-check hook: verify the model defines every field the ABID code relies on.
+
+        Django invokes ``Model.check(**kwargs)`` on the model *class* and expects a list of
+        check messages back, so this is a classmethod that forwards ``**kwargs`` to the parent
+        implementation and returns the accumulated list instead of asserting.
+
+        Returns:
+            The messages from ``super().check(**kwargs)`` plus one ``checks.Error`` for each
+            required field (id, abid, created_at, modified_at, created_by) that is missing.
+        """
+        from django.core import checks  # local import: only needed while system checks run
+
+        errors = list(super().check(**kwargs))
+
+        # _meta.get_fields() yields Field objects, not strings, so membership must be
+        # tested against each field's .name (a bare `'id' in get_fields()` is always False).
+        defined_field_names = {field.name for field in cls._meta.get_fields()}
+
+        required_fields = {
+            'id': 'All ABIDModel subclasses must define an id field',
+            'abid': 'All ABIDModel subclasses must define an abid field',
+            'created_at': 'All ABIDModel subclasses must define a created_at field',
+            'modified_at': 'All ABIDModel subclasses must define a modified_at field',
+            'created_by': 'All ABIDModel subclasses must define a created_by field',
+        }
+        for field_name, message in required_fields.items():
+            if field_name not in defined_field_names:
+                errors.append(checks.Error(message, obj=cls, id='base_models.E001'))
+
+        return errors
+
     def clean(self, abid_drift_allowed: bool | None=None) -> None:
     def clean(self, abid_drift_allowed: bool | None=None) -> None:
         if self._state.adding:
         if self._state.adding:
             # only runs once when a new object is first saved to the DB
             # only runs once when a new object is first saved to the DB
@@ -386,6 +484,27 @@ class ModelWithHealthStats(models.Model):
         return round(success_pct)
         return round(success_pct)
 
 
 
 
+class ModelWithConfig(ABIDModel):
+    """
+    Abstract base model that adds a JSON ``config`` field to any ABIDModel.
+
+    The field is non-nullable and defaults to a new empty dict (``default=dict``).
+    This config is retrieved by abx.pm.hook.get_scope_config(...) later whenever this model is used.
+    """
+    # Per-object config overrides stored as JSON.
+    # NOTE(review): blank=False together with an empty-dict default may cause form / full_clean()
+    # validation to reject the default value — confirm this is intended.
+    config = models.JSONField(default=dict, null=False, blank=False, editable=True)
+    
+    class Meta:
+        # Abstract: this class only contributes the field to concrete subclasses.
+        abstract = True
+
+    # Disabled sketch of a helper that would diff this object's config against the default scope config:
+    # @property
+    # def unique_config(self) -> dict[str, Any]:
+    #     """Get the unique config that this model is adding to the default config"""
+    #     without_us = archivebox.pm.hook.get_scope_config()
+    #     with_us = archivebox.pm.hook.get_scope_config(extra_config=self.config)
+    #     return {
+    #         key: value
+    #         for key, value in with_us.items()
+    #         if key not in without_us
+    #         or without_us[key] != value
+    #     }
 
 
 
 
 class ModelWithOutputDir(ABIDModel):
 class ModelWithOutputDir(ABIDModel):
@@ -415,7 +534,7 @@ class ModelWithOutputDir(ABIDModel):
             self.write_indexes()  # write the index.html, merkle hashes, symlinks, send indexable texts to search backend, etc.
             self.write_indexes()  # write the index.html, merkle hashes, symlinks, send indexable texts to search backend, etc.
 
 
     @property
     @property
-    def output_dir_type(self) -> str:
+    def output_dir_parent(self) -> str:
         """Get the model type parent directory name that holds this object's data e.g. 'archiveresults'"""
         """Get the model type parent directory name that holds this object's data e.g. 'archiveresults'"""
         parent_dir = getattr(self, 'output_dir_parent', f'{self._meta.model_name}s')
         parent_dir = getattr(self, 'output_dir_parent', f'{self._meta.model_name}s')
         assert len(parent_dir) > 2, f'output_dir_parent must be a non-empty string, got: "{parent_dir}"'
         assert len(parent_dir) > 2, f'output_dir_parent must be a non-empty string, got: "{parent_dir}"'
@@ -430,7 +549,7 @@ class ModelWithOutputDir(ABIDModel):
     @property
     @property
     def output_dir_str(self) -> str:
     def output_dir_str(self) -> str:
         """Get relateive the filesystem directory Path that holds that data for this object e.g. 'snapshots/snp_2342353k2jn3j32l4324'"""
         """Get relateive the filesystem directory Path that holds that data for this object e.g. 'snapshots/snp_2342353k2jn3j32l4324'"""
-        return f'{self.output_dir_type}/{self.output_dir_name}'  # e.g. snapshots/snp_2342353k2jn3j32l4324
+        return f'{self.output_dir_parent}/{self.output_dir_name}'  # e.g. snapshots/snp_2342353k2jn3j32l4324
         
         
     @property
     @property
     def OUTPUT_DIR(self) -> Path:
     def OUTPUT_DIR(self) -> Path:

+ 59 - 12
archivebox/core/models.py

@@ -29,7 +29,7 @@ from archivebox.misc.util import parse_date, base_url
 from archivebox.index.schema import Link
 from archivebox.index.schema import Link
 from archivebox.index.html import snapshot_icons
 from archivebox.index.html import snapshot_icons
 from archivebox.extractors import ARCHIVE_METHODS_INDEXING_PRECEDENCE
 from archivebox.extractors import ARCHIVE_METHODS_INDEXING_PRECEDENCE
-from archivebox.base_models.models import ABIDModel, ABIDField, AutoDateTimeField, ModelWithOutputDir
+from archivebox.base_models.models import ABIDModel, ABIDField, AutoDateTimeField, ModelWithOutputDir, ModelWithConfig
 
 
 from workers.models import ModelWithStateMachine
 from workers.models import ModelWithStateMachine
 from workers.tasks import bg_archive_snapshot
 from workers.tasks import bg_archive_snapshot
@@ -145,22 +145,20 @@ class SnapshotManager(models.Manager):
     def get_queryset(self):
     def get_queryset(self):
         return super().get_queryset().prefetch_related('tags', 'archiveresult_set')  # .annotate(archiveresult_count=models.Count('archiveresult')).distinct()
         return super().get_queryset().prefetch_related('tags', 'archiveresult_set')  # .annotate(archiveresult_count=models.Count('archiveresult')).distinct()
 
 
-class Snapshot(ModelWithOutputDir, ModelWithStateMachine, ABIDModel):
+class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithStateMachine, ABIDModel):
     abid_prefix = 'snp_'
     abid_prefix = 'snp_'
     abid_ts_src = 'self.created_at'
     abid_ts_src = 'self.created_at'
     abid_uri_src = 'self.url'
     abid_uri_src = 'self.url'
     abid_subtype_src = '"01"'
     abid_subtype_src = '"01"'
     abid_rand_src = 'self.id'
     abid_rand_src = 'self.id'
     abid_drift_allowed = True
     abid_drift_allowed = True
-
+    
     state_machine_name = 'core.statemachines.SnapshotMachine'
     state_machine_name = 'core.statemachines.SnapshotMachine'
     state_field_name = 'status'
     state_field_name = 'status'
     retry_at_field_name = 'retry_at'
     retry_at_field_name = 'retry_at'
     StatusChoices = ModelWithStateMachine.StatusChoices
     StatusChoices = ModelWithStateMachine.StatusChoices
     active_state = StatusChoices.STARTED
     active_state = StatusChoices.STARTED
-    
-    output_dir_parent = 'snapshots'
-    
+
     id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
     id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
     abid = ABIDField(prefix=abid_prefix)
     abid = ABIDField(prefix=abid_prefix)
 
 
@@ -168,9 +166,8 @@ class Snapshot(ModelWithOutputDir, ModelWithStateMachine, ABIDModel):
     created_at = AutoDateTimeField(default=None, null=False, db_index=True)  # loaded from self._init_timestamp
     created_at = AutoDateTimeField(default=None, null=False, db_index=True)  # loaded from self._init_timestamp
     modified_at = models.DateTimeField(auto_now=True)
     modified_at = models.DateTimeField(auto_now=True)
     
     
-    status = ModelWithStateMachine.StatusField(choices=StatusChoices, default=StatusChoices.QUEUED)
     retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)
     retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)
-    
+    status = ModelWithStateMachine.StatusField(choices=StatusChoices, default=StatusChoices.QUEUED)
     notes = models.TextField(blank=True, null=False, default='', help_text='Any extra notes this snapshot should have')
     notes = models.TextField(blank=True, null=False, default='', help_text='Any extra notes this snapshot should have')
 
 
     bookmarked_at = AutoDateTimeField(default=None, null=False, editable=True, db_index=True)
     bookmarked_at = AutoDateTimeField(default=None, null=False, editable=True, db_index=True)
@@ -183,13 +180,61 @@ class Snapshot(ModelWithOutputDir, ModelWithStateMachine, ABIDModel):
     tags = models.ManyToManyField(Tag, blank=True, through=SnapshotTag, related_name='snapshot_set', through_fields=('snapshot', 'tag'))
     tags = models.ManyToManyField(Tag, blank=True, through=SnapshotTag, related_name='snapshot_set', through_fields=('snapshot', 'tag'))
     title = models.CharField(max_length=512, null=True, blank=True, db_index=True)
     title = models.CharField(max_length=512, null=True, blank=True, db_index=True)
 
 
-    # config = models.JSONField(default=dict, null=False, blank=False, editable=True)
 
 
     keys = ('url', 'timestamp', 'title', 'tags', 'downloaded_at', 'created_at', 'status', 'retry_at', 'abid', 'id')
     keys = ('url', 'timestamp', 'title', 'tags', 'downloaded_at', 'created_at', 'status', 'retry_at', 'abid', 'id')
 
 
     archiveresult_set: models.Manager['ArchiveResult']
     archiveresult_set: models.Manager['ArchiveResult']
 
 
     objects = SnapshotManager()
     objects = SnapshotManager()
+    
+    ### Inherited from ModelWithStateMachine #################################
+    # StatusChoices = ModelWithStateMachine.StatusChoices
+    #
+    # status = ModelWithStateMachine.StatusField(choices=StatusChoices, default=StatusChoices.QUEUED)
+    # retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)
+    #
+    # state_machine_name = 'core.statemachines.SnapshotMachine'
+    # state_field_name = 'status'
+    # retry_at_field_name = 'retry_at'
+    # active_state = StatusChoices.STARTED
+    ########################################################################
+    
+    ### Inherited from ModelWithConfig #######################################
+    # config = models.JSONField(default=dict, null=False, blank=False, editable=True)
+    ########################################################################
+    
+    ### Inherited from ModelWithOutputDir:
+    # output_dir = models.FilePathField(path=CONSTANTS.ARCHIVE_DIR, recursive=True, match='.*', default=None, null=True, blank=True, editable=True)
+    
+    # self.save(): creates OUTPUT_DIR, writes index.json, writes indexes
+    # self.output_dir_parent -> str 'archive/snapshots/<YYYY-MM-DD>/<example.com>'
+    # self.output_dir_name -> '<abid>'
+    # self.output_dir_str -> 'archive/snapshots/<YYYY-MM-DD>/<example.com>/<abid>'
+    # self.OUTPUT_DIR -> Path('/data/archive/snapshots/<YYYY-MM-DD>/<example.com>/<abid>')
+    
+    ### Inherited from ABIDModel:
+    # id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
+    # abid = ABIDField(prefix=abid_prefix)
+    # created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False, related_name='snapshot_set', db_index=True)
+    # created_at = AutoDateTimeField(default=None, null=False, db_index=True)  # loaded from self._init_timestamp
+    # modified_at = models.DateTimeField(auto_now=True)
+    
+    # abid_prefix = 'snp_'
+    # abid_ts_src = 'self.created_at'
+    # abid_uri_src = 'self.url'
+    # abid_subtype_src = '"01"'
+    # abid_rand_src = 'self.id'
+    # abid_drift_allowed = True
+    # self.clean() -> sets self._timestamp
+    # self.save() -> issues new ABID if creating new, otherwise uses existing ABID
+    # self.ABID -> ABID
+    # self.api_url -> '/api/v1/core/snapshot/{uulid}'
+    # self.api_docs_url -> '/api/v1/docs#/Core%20Models/api_v1_core_get_snapshot'
+    # self.admin_change_url -> '/admin/core/snapshot/{pk}/change/'
+    # self.get_absolute_url() -> '/{self.archive_path}'
+    # self.update_for_workers() -> bool
+    # self.as_json() -> dict[str, Any]
+    
 
 
     def save(self, *args, **kwargs):
     def save(self, *args, **kwargs):
         print(f'Snapshot[{self.ABID}].save()')
         print(f'Snapshot[{self.ABID}].save()')
@@ -551,7 +596,7 @@ class ArchiveResultManager(models.Manager):
             ).order_by('indexing_precedence')
             ).order_by('indexing_precedence')
         return qs
         return qs
 
 
-class ArchiveResult(ModelWithOutputDir, ModelWithStateMachine, ABIDModel):
+class ArchiveResult(ModelWithConfig, ModelWithOutputDir, ModelWithStateMachine, ABIDModel):
     abid_prefix = 'res_'
     abid_prefix = 'res_'
     abid_ts_src = 'self.snapshot.created_at'
     abid_ts_src = 'self.snapshot.created_at'
     abid_uri_src = 'self.snapshot.url'
     abid_uri_src = 'self.snapshot.url'
@@ -573,8 +618,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithStateMachine, ABIDModel):
     state_field_name = 'status'
     state_field_name = 'status'
     active_state = StatusChoices.STARTED
     active_state = StatusChoices.STARTED
     
     
-    output_dir_parent = 'archiveresults'
-
     EXTRACTOR_CHOICES = (
     EXTRACTOR_CHOICES = (
         ('htmltotext', 'htmltotext'),
         ('htmltotext', 'htmltotext'),
         ('git', 'git'),
         ('git', 'git'),
@@ -681,6 +724,10 @@ class ArchiveResult(ModelWithOutputDir, ModelWithStateMachine, ABIDModel):
     def extractor_module(self) -> Any | None:
     def extractor_module(self) -> Any | None:
         return abx.as_dict(abx.pm.hook.get_EXTRACTORS()).get(self.extractor, None)
         return abx.as_dict(abx.pm.hook.get_EXTRACTORS()).get(self.extractor, None)
 
 
+    @property
+    def EXTRACTOR(self) -> object:
+        """
+        Return the result of calling ``self.extractor_module`` with this object passed as ``archiveresult``.
+
+        NOTE(review): presumably ``extractor_module`` is a property (its decorator is not visible
+        here) and is annotated ``Any | None``; if it yields None for an unregistered extractor
+        name, this call raises TypeError — confirm callers expect that.
+        """
+        # return self.extractor_module
+        return self.extractor_module(archiveresult=self)
 
 
     def embed_path(self) -> str | None:
     def embed_path(self) -> str | None:
         """
         """

+ 5 - 1
archivebox/core/statemachines.py

@@ -93,6 +93,11 @@ class SnapshotMachine(StateMachine, strict_states=True):
             status=Snapshot.StatusChoices.STARTED,
             status=Snapshot.StatusChoices.STARTED,
         )
         )
         
         
+        # run_subcommand([
+        #     'archivebox', 'snapshot', self.snapshot.ABID,
+        #     '--start',
+        # ])
+        
     @sealed.enter
     @sealed.enter
     def enter_sealed(self):
     def enter_sealed(self):
         print(f'{self}.on_sealed() ↳ snapshot.retry_at=None')
         print(f'{self}.on_sealed() ↳ snapshot.retry_at=None')
@@ -183,7 +188,6 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
     
     
     def is_finished(self) -> bool:
     def is_finished(self) -> bool:
         return self.is_failed() or self.is_succeeded()
         return self.is_failed() or self.is_succeeded()
-    
 
 
     @queued.enter
     @queued.enter
     def enter_queued(self):
     def enter_queued(self):