Browse Source

fix ABID generation migrations to be historically consistent

Nick Sweeting 1 year ago
parent
commit
9273db528e

+ 3 - 3
archivebox/core/migrations/0024_auto_20240513_1143.py

@@ -2,7 +2,7 @@
 
 from django.db import migrations
 from datetime import datetime
-from abid_utils.abid import abid_from_values
+from abid_utils.abid import abid_from_values, DEFAULT_ABID_URI_SALT
 
 
 def calculate_abid(self):
@@ -41,6 +41,7 @@ def calculate_abid(self):
         uri=uri,
         subtype=subtype,
         rand=rand,
+        salt=DEFAULT_ABID_URI_SALT,
     )
     assert abid.ulid and abid.uuid and abid.typeid, f'Failed to calculate {prefix}_ABID for {self.__class__.__name__}'
     return abid
@@ -65,8 +66,7 @@ def generate_snapshot_abids(apps, schema_editor):
 
         snapshot.abid = calculate_abid(snapshot)
         snapshot.uuid = snapshot.abid.uuid
-        snapshot.id = snapshot.abid.uuid
-        snapshot.save(update_fields=["abid", "uuid", "id"])
+        snapshot.save(update_fields=["abid", "uuid"])
 
 def generate_archiveresult_abids(apps, schema_editor):
     print('   Generating ArchiveResult.abid values... (may take an hour or longer for large collections...)')

+ 65 - 5
archivebox/core/migrations/0027_update_snapshot_ids.py

@@ -4,29 +4,89 @@ from django.db import migrations
 
 from django.db import migrations
 from datetime import datetime
-from abid_utils.abid import ABID
+from abid_utils.abid import ABID, abid_from_values, DEFAULT_ABID_URI_SALT
 
 
+def calculate_abid(self):
+    """
+    Return a freshly derived ABID (assembled from attrs defined in ABIDModel.abid_*_src).
+    """
+    prefix = self.abid_prefix
+    ts = eval(self.abid_ts_src)
+    uri = eval(self.abid_uri_src)
+    subtype = eval(self.abid_subtype_src)
+    rand = eval(self.abid_rand_src)
+
+    if (not prefix) or prefix == 'obj_':
+        suggested_abid = self.__class__.__name__[:3].lower()
+        raise Exception(f'{self.__class__.__name__}.abid_prefix must be defined to calculate ABIDs (suggested: {suggested_abid})')
+
+    if not ts:
+        ts = datetime.utcfromtimestamp(0)
+        print(f'[!] WARNING: Generating ABID with ts=0000000000 placeholder because {self.__class__.__name__}.abid_ts_src={self.abid_ts_src} is unset!', ts.isoformat())
+
+    if not uri:
+        uri = str(self)
+        print(f'[!] WARNING: Generating ABID with uri=str(self) placeholder because {self.__class__.__name__}.abid_uri_src={self.abid_uri_src} is unset!', uri)
+
+    if not subtype:
+        subtype = self.__class__.__name__
+        print(f'[!] WARNING: Generating ABID with subtype={subtype} placeholder because {self.__class__.__name__}.abid_subtype_src={self.abid_subtype_src} is unset!', subtype)
+
+    if not rand:
+        rand = getattr(self, 'uuid', None) or getattr(self, 'id', None) or getattr(self, 'pk')
+        print(f'[!] WARNING: Generating ABID with rand=self.id placeholder because {self.__class__.__name__}.abid_rand_src={self.abid_rand_src} is unset!', rand)
+
+    abid = abid_from_values(
+        prefix=prefix,
+        ts=ts,
+        uri=uri,
+        subtype=subtype,
+        rand=rand,
+        salt=DEFAULT_ABID_URI_SALT,
+    )
+    assert abid.ulid and abid.uuid and abid.typeid, f'Failed to calculate {prefix}_ABID for {self.__class__.__name__}'
+    return abid
+
 def update_snapshot_ids(apps, schema_editor):
     Snapshot = apps.get_model("core", "Snapshot")
     num_total = Snapshot.objects.all().count()
     print(f'   Updating {num_total} Snapshot.id, Snapshot.uuid values in place...')
     for idx, snapshot in enumerate(Snapshot.objects.all().only('abid').iterator()):
         assert snapshot.abid
-        snapshot.uuid = ABID.parse(snapshot.abid).uuid
-        snapshot.save(update_fields=["uuid"])
+        snapshot.abid_prefix = 'snp_'
+        snapshot.abid_ts_src = 'self.added'
+        snapshot.abid_uri_src = 'self.url'
+        snapshot.abid_subtype_src = '"01"'
+        snapshot.abid_rand_src = 'self.uuid'
+
+        snapshot.abid = calculate_abid(snapshot)
+        snapshot.uuid = snapshot.abid.uuid
+        snapshot.save(update_fields=["abid", "uuid"])
         assert str(ABID.parse(snapshot.abid).uuid) == str(snapshot.uuid)
         if idx % 1000 == 0:
             print(f'Migrated {idx}/{num_total} Snapshot objects...')
 
 def update_archiveresult_ids(apps, schema_editor):
+    Snapshot = apps.get_model("core", "Snapshot")
     ArchiveResult = apps.get_model("core", "ArchiveResult")
     num_total = ArchiveResult.objects.all().count()
     print(f'   Updating {num_total} ArchiveResult.id, ArchiveResult.uuid values in place... (may take an hour or longer for large collections...)')
-    for idx, result in enumerate(ArchiveResult.objects.all().only('abid').iterator()):
+    for idx, result in enumerate(ArchiveResult.objects.all().only('abid', 'snapshot_id').iterator()):
         assert result.abid
+        result.abid_prefix = 'res_'
+        result.snapshot = Snapshot.objects.get(pk=result.snapshot_id)
+        result.snapshot_added = result.snapshot.added
+        result.snapshot_url = result.snapshot.url
+        result.abid_ts_src = 'self.snapshot_added'
+        result.abid_uri_src = 'self.snapshot_url'
+        result.abid_subtype_src = 'self.extractor'
+        result.abid_rand_src = 'self.id'
+
+        result.abid = calculate_abid(result)
+        result.uuid = result.abid.uuid
         result.uuid = ABID.parse(result.abid).uuid
-        result.save(update_fields=["uuid"])
+        result.save(update_fields=["abid", "uuid"])
         assert str(ABID.parse(result.abid).uuid) == str(result.uuid)
         if idx % 5000 == 0:
             print(f'Migrated {idx}/{num_total} ArchiveResult objects...')

+ 2 - 2
archivebox/core/migrations/0040_archiveresult_snapshot.py

@@ -8,9 +8,9 @@ def update_archiveresult_snapshot_ids(apps, schema_editor):
     Snapshot = apps.get_model("core", "Snapshot")
     num_total = ArchiveResult.objects.all().count()
     print(f'   Updating {num_total} ArchiveResult.snapshot_id values in place... (may take an hour or longer for large collections...)')
-    for idx, result in enumerate(ArchiveResult.objects.all().only('snapshot_old_id').iterator()):
+    for idx, result in enumerate(ArchiveResult.objects.all().only('snapshot_old_id').iterator(chunk_size=5000)):
         assert result.snapshot_old_id
-        snapshot = Snapshot.objects.get(old_id=result.snapshot_old_id)
+        snapshot = Snapshot.objects.only('id').get(old_id=result.snapshot_old_id)
         result.snapshot_id = snapshot.id
         result.save(update_fields=["snapshot_id"])
         assert str(result.snapshot_id) == str(snapshot.id)

+ 27 - 20
archivebox/core/models.py

@@ -17,7 +17,6 @@ from django.utils.text import slugify
 from django.core.cache import cache
 from django.urls import reverse, reverse_lazy
 from django.db.models import Case, When, Value, IntegerField
-from django.contrib.auth.models import User   # noqa
 
 from abid_utils.models import ABIDModel, ABIDField
 
@@ -36,6 +35,8 @@ STATUS_CHOICES = [
     ("skipped", "skipped")
 ]
 
+def rand_int_id():
+    return random.getrandbits(32)
 
 
 # class BaseModel(models.Model):
@@ -49,24 +50,26 @@ STATUS_CHOICES = [
 #         abstract = True
 
 
+
+
 class Tag(ABIDModel):
     """
     Based on django-taggit model + ABID base.
     """
     abid_prefix = 'tag_'
     abid_ts_src = 'self.created'          # TODO: add created/modified time
-    abid_uri_src = 'self.name'
+    abid_uri_src = 'self.slug'
     abid_subtype_src = '"03"'
-    abid_rand_src = 'self.id'
+    abid_rand_src = 'self.old_id'
+
+    old_id = models.BigIntegerField(unique=True, default=rand_int_id, serialize=False, verbose_name='Old ID')  # legacy PK
 
-    # id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=True)
-    id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID')
-    uuid = models.UUIDField(default=uuid.uuid4, null=True, unique=True)
+    id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False, unique=True)
     abid = ABIDField(prefix=abid_prefix)
 
 
     name = models.CharField(unique=True, blank=False, max_length=100)
-    slug = models.SlugField(unique=True, blank=True, max_length=100)
+    slug = models.SlugField(unique=True, blank=False, max_length=100, editable=False)
     # slug is autoset on save from name, never set it manually
 
 
@@ -77,9 +80,9 @@ class Tag(ABIDModel):
     def __str__(self):
         return self.name
 
-    @property
-    def old_id(self):
-        return self.id
+    # @property
+    # def old_id(self):
+    #     return self.id
 
     def slugify(self, tag, i=None):
         slug = slugify(tag)
@@ -156,16 +159,19 @@ class Snapshot(ABIDModel):
         return self.id
 
     def __repr__(self) -> str:
-        title = self.title or '-'
-        return f'[{self.timestamp}] {self.url[:64]} ({title[:64]})'
+        title = (self.title_stripped or '-')[:64]
+        return f'[{self.timestamp}] {self.url[:64]} ({title})'
 
     def __str__(self) -> str:
-        title = self.title or '-'
-        return f'[{self.timestamp}] {self.url[:64]} ({title[:64]})'
+        title = (self.title_stripped or '-')[:64]
+        return f'[{self.timestamp}] {self.url[:64]} ({title})'
 
     def save(self, *args, **kwargs):
         super().save(*args, **kwargs)
-        assert str(self.id) == str(self.abid.uuid) == str(self.uuid)
+        try:
+            assert str(self.id) == str(self.ABID.uuid) == str(self.uuid), f'Snapshot.id ({self.id}) does not match .ABID.uuid ({self.ABID.uuid})'
+        except AssertionError as e:
+            print(e)
 
     @classmethod
     def from_json(cls, info: dict):
@@ -357,9 +363,6 @@ class ArchiveResultManager(models.Manager):
             qs = qs.annotate(indexing_precedence=Case(*precedence, default=Value(1000),output_field=IntegerField())).order_by('indexing_precedence')
         return qs
 
-def rand_int_id():
-    return random.getrandbits(32)
-
 class ArchiveResult(ABIDModel):
     abid_prefix = 'res_'
     abid_ts_src = 'self.snapshot.added'
@@ -387,7 +390,8 @@ class ArchiveResult(ABIDModel):
     objects = ArchiveResultManager()
 
     class Meta(TypedModelMeta):
-        verbose_name = 'Result'
+        verbose_name = 'Archive Result'
+        verbose_name_plural = 'Archive Results Log'
         
 
     def __str__(self):
@@ -395,7 +399,10 @@ class ArchiveResult(ABIDModel):
 
     def save(self, *args, **kwargs):
         super().save(*args, **kwargs)
-        assert str(self.id) == str(self.abid.uuid) == str(self.uuid)
+        try:
+            assert str(self.id) == str(self.ABID.uuid) == str(self.uuid), f'ArchiveResult.id ({self.id}) does not match .ABID.uuid ({self.ABID.uuid})'
+        except AssertionError as e:
+            print(e)
 
     @property
     def uuid(self):

+ 1 - 1
archivebox/core/settings.py

@@ -83,7 +83,7 @@ INSTALLED_APPS = [
     'django.contrib.staticfiles',
     'django.contrib.admin',
     'django_jsonform',
-
+    
     'signal_webhooks',
     'abid_utils',
     'plugantic',

+ 1 - 0
archivebox/core/views.py

@@ -181,6 +181,7 @@ class SnapshotView(View):
         except (IndexError, ValueError):
             slug, archivefile = path.split('/', 1)[0], 'index.html'
 
+
         # slug is a timestamp
         if slug.replace('.','').isdigit():