Browse Source

refactor: Remove django-taggit and replace it with a local tags setup

Cristian 5 years ago
parent
commit
62c78e1d10

+ 0 - 1
archivebox.egg-info/requires.txt

@@ -4,7 +4,6 @@ mypy-extensions==0.4.3
 base32-crockford==0.3.0
 base32-crockford==0.3.0
 django==3.0.8
 django==3.0.8
 django-extensions==3.0.3
 django-extensions==3.0.3
-django-taggit==1.3.0
 dateparser
 dateparser
 ipython
 ipython
 youtube-dl
 youtube-dl

+ 29 - 1
archivebox/core/admin.py

@@ -9,9 +9,10 @@ from django.utils.html import format_html
 from django.utils.safestring import mark_safe
 from django.utils.safestring import mark_safe
 from django.shortcuts import render, redirect
 from django.shortcuts import render, redirect
 from django.contrib.auth import get_user_model
 from django.contrib.auth import get_user_model
+from django import forms
 
 
 from core.models import Snapshot
 from core.models import Snapshot
-from core.forms import AddLinkForm
+from core.forms import AddLinkForm, TagField
 from core.utils import get_icons
 from core.utils import get_icons
 
 
 from util import htmldecode, urldecode, ansi_to_html
 from util import htmldecode, urldecode, ansi_to_html
@@ -55,6 +56,32 @@ def delete_snapshots(modeladmin, request, queryset):
 delete_snapshots.short_description = "Delete"
 delete_snapshots.short_description = "Delete"
 
 
 
 
+class SnapshotAdminForm(forms.ModelForm):
+    """Admin form for ``Snapshot`` whose tags are edited through a ``TagField``.
+
+    The ``tags`` form field replaces the default many-to-many form field and
+    the cleaned tag names are written back with ``Snapshot.save_tags``.
+    """
+    tags = TagField(required=False)
+
+    class Meta:
+        model = Snapshot
+        fields = "__all__"
+
+    def save(self, commit=True):
+        """Save the snapshot and store (or defer storing) its tags.
+
+        With ``commit=False`` the instance is returned unsaved and the tags
+        are written when the caller later invokes ``self.save_m2m()``.
+        With ``commit=True`` the instance is saved and the tags are written
+        immediately.
+
+        Returns the ``Snapshot`` instance.
+        """
+        # Based on: https://stackoverflow.com/a/49933068/3509554
+
+        # Get the unsaved instance
+        instance = super().save(commit=False)
+        tags = self.cleaned_data.pop("tags")
+
+        # Replace save_m2m so that tags go through Snapshot.save_tags
+        # instead of the default many-to-many handling.
+        def new_save_m2m():
+            instance.save_tags(tags)
+
+        self.save_m2m = new_save_m2m
+
+        # Do we need to save all changes now?
+        if commit:
+            instance.save()
+            # The instance now has a row to attach relations to; without
+            # this call the tags would be dropped when commit=True.
+            self.save_m2m()
+
+        return instance
+
+
 class SnapshotAdmin(admin.ModelAdmin):
 class SnapshotAdmin(admin.ModelAdmin):
     list_display = ('added', 'title_str', 'url_str', 'files', 'size')
     list_display = ('added', 'title_str', 'url_str', 'files', 'size')
     sort_fields = ('title_str', 'url_str', 'added')
     sort_fields = ('title_str', 'url_str', 'added')
@@ -65,6 +92,7 @@ class SnapshotAdmin(admin.ModelAdmin):
     ordering = ['-added']
     ordering = ['-added']
     actions = [delete_snapshots, overwrite_snapshots, update_snapshots, update_titles, verify_snapshots]
     actions = [delete_snapshots, overwrite_snapshots, update_snapshots, update_titles, verify_snapshots]
     actions_template = 'admin/actions_as_select.html'
     actions_template = 'admin/actions_as_select.html'
+    form = SnapshotAdminForm
 
 
     def get_queryset(self, request):
     def get_queryset(self, request):
         return super().get_queryset(request).prefetch_related('tags')
         return super().get_queryset(request).prefetch_related('tags')

+ 42 - 0
archivebox/core/forms.py

@@ -3,6 +3,7 @@ __package__ = 'archivebox.core'
 from django import forms
 from django import forms
 
 
 from ..util import URL_REGEX
 from ..util import URL_REGEX
+from .utils_taggit import edit_string_for_tags, parse_tags
 
 
 CHOICES = (
 CHOICES = (
     ('0', 'depth = 0 (archive just these URLs)'),
     ('0', 'depth = 0 (archive just these URLs)'),
@@ -12,3 +13,44 @@ CHOICES = (
 class AddLinkForm(forms.Form):
 class AddLinkForm(forms.Form):
     url = forms.RegexField(label="URLs (one per line)", regex=URL_REGEX, min_length='6', strip=True, widget=forms.Textarea, required=True)
     url = forms.RegexField(label="URLs (one per line)", regex=URL_REGEX, min_length='6', strip=True, widget=forms.Textarea, required=True)
     depth = forms.ChoiceField(label="Archive depth", choices=CHOICES, widget=forms.RadioSelect, initial='0')
     depth = forms.ChoiceField(label="Archive depth", choices=CHOICES, widget=forms.RadioSelect, initial='0')
+
+
+class TagWidgetMixin:
+    """Widget mixin that turns a collection of tags into editable text."""
+
+    def format_value(self, value):
+        """Format ``value`` for display.
+
+        ``None`` and plain strings are handed to the parent widget as they
+        are; anything else is first converted with ``edit_string_for_tags``.
+        """
+        passes_through = value is None or isinstance(value, str)
+        if not passes_through:
+            value = edit_string_for_tags(value)
+        return super().format_value(value)
+
+class TagWidget(TagWidgetMixin, forms.TextInput):
+    # Text input that gets its value formatting from TagWidgetMixin.
+    pass
+
+class TagField(forms.CharField):
+    """Character field whose cleaned value is the result of ``parse_tags``."""
+    widget = TagWidget
+
+    def clean(self, value):
+        """Run the ``CharField`` cleaning, then split the text into tag names.
+
+        Raises ``forms.ValidationError`` if ``parse_tags`` raises ``ValueError``.
+        """
+        text = super().clean(value)
+        try:
+            names = parse_tags(text)
+        except ValueError:
+            raise forms.ValidationError(
+                "Please provide a comma-separated list of tags."
+            )
+        return names
+
+    def has_changed(self, initial_value, data_value):
+        """Tell whether the submitted tags differ from the initial ones.
+
+        ``initial_value`` is ``None`` or an iterable of objects with a
+        ``name`` attribute; ``data_value`` is the raw submitted value.
+        """
+        # A disabled field never changes: self.bound_data always uses the
+        # initial value in that case.
+        if self.disabled:
+            return False
+
+        try:
+            data_value = self.clean(data_value)
+        except forms.ValidationError:
+            # Input that fails cleaning is compared as submitted.
+            pass
+
+        existing = [] if initial_value is None else initial_value
+        initial_names = sorted(tag.name for tag in existing)
+
+        return initial_names != data_value

+ 0 - 90
archivebox/core/migrations/0006_auto_20200915_2006.py

@@ -1,90 +0,0 @@
-# Generated by Django 3.0.8 on 2020-09-15 20:06
-
-from django.db import migrations, models
-from django.contrib.contenttypes.models import ContentType
-from django.utils.text import slugify 
-import django.db.models.deletion
-import taggit.managers
-
-def forwards_func(apps, schema_editor):
-    SnapshotModel = apps.get_model("core", "Snapshot")
-    TaggedItemModel = apps.get_model("core", "TaggedItem")
-    TagModel = apps.get_model("taggit", "Tag")
-    contents = ContentType.objects.all()
-    try:
-        ct = ContentType.objects.filter(app_label="core", model="snapshot")
-    except model.DoesNotExist:  # Be explicit about exceptions
-        ct = None
-
-    db_alias = schema_editor.connection.alias
-    snapshots = SnapshotModel.objects.all()
-    for snapshot in snapshots:
-        tags = snapshot.tags
-        tag_set = (
-            set(tag.strip() for tag in (snapshot.tags_old or '').split(','))
-        )
-        tag_set.discard("")
-
-        for tag in tag_set:
-            new_tag, created = TagModel.objects.get_or_create(name=tag, slug=slugify(tag))
-            TaggedItemModel.objects.get_or_create(
-                content_type_id=ct[0].id,
-                object_id=snapshot.id,
-                tag=new_tag
-            )
-
-
-def reverse_func(apps, schema_editor):
-    SnapshotModel = apps.get_model("core", "Snapshot")
-    TaggedItemModel = apps.get_model("core", "TaggedItem")
-    TagModel = apps.get_model("taggit", "Tag")
-    ct = ContentType.objects.get(app_label="core", model="snapshot")
-
-    db_alias = schema_editor.connection.alias
-    snapshots = SnapshotModel.objects.all()
-    for snapshot in snapshots:       
-        tags = TaggedItemModel.objects.filter(
-            object_id=snapshot.id,
-        )
-        snapshot.tags_old = ",".join([tag.tag.name for tag in tags])
-        snapshot.save()
-
-
-class Migration(migrations.Migration):
-
-    dependencies = [
-        ('contenttypes', '0002_remove_content_type_name'),
-        ('taggit', '0003_taggeditem_add_unique_index'),
-        ('core', '0005_auto_20200728_0326'),
-    ]
-
-    operations = [
-        migrations.RenameField(
-            model_name='snapshot',
-            old_name='tags',
-            new_name='tags_old',
-        ),
-        migrations.CreateModel(
-            name='TaggedItem',
-            fields=[
-                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
-                ('object_id', models.UUIDField(db_index=True, verbose_name='object ID')),
-                ('content_type', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='core_taggeditem_tagged_items', to='contenttypes.ContentType', verbose_name='content type')),
-                ('tag', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='core_taggeditem_items', to='taggit.Tag')),
-            ],
-            options={
-                'verbose_name': 'Tag',
-                'verbose_name_plural': 'Tags',
-            },
-        ),
-        migrations.AddField(
-            model_name='snapshot',
-            name='tags',
-            field=taggit.managers.TaggableManager(help_text='A comma-separated list of tags.', through='core.TaggedItem', to='taggit.Tag', verbose_name='Tags'),
-        ),
-        migrations.RunPython(forwards_func, reverse_func),
-        migrations.RemoveField(
-            model_name='snapshot',
-            name='tags_old',
-        ),
-    ]

+ 70 - 0
archivebox/core/migrations/0006_auto_20201012_1520.py

@@ -0,0 +1,70 @@
+# Generated by Django 3.0.8 on 2020-10-12 15:20
+
+from django.db import migrations, models
+from django.utils.text import slugify
+
+def forwards_func(apps, schema_editor):
+    """Convert each snapshot's comma-separated ``tags_old`` text into ``Tag`` rows.
+
+    For every snapshot the old text is split on commas, names are stripped
+    and empty names discarded; each remaining name is looked up (or created)
+    as a ``Tag`` and attached to the snapshot's ``tags`` relation.
+    """
+    SnapshotModel = apps.get_model("core", "Snapshot")
+    TagModel = apps.get_model("core", "Tag")
+
+    def unique_slug(name):
+        # Distinct names can slugify to the same value (including the empty
+        # string), which would violate the unique constraint on ``slug``.
+        # Append ``_<n>`` until the slug is free, like ``Tag.save`` does.
+        base = slugify(name)
+        candidate, n = base, 0
+        while TagModel.objects.filter(slug=candidate).exists():
+            n += 1
+            candidate = "%s_%d" % (base, n)
+        return candidate
+
+    for snapshot in SnapshotModel.objects.all():
+        tag_set = {tag.strip() for tag in (snapshot.tags_old or '').split(',')}
+        tag_set.discard("")
+
+        for tag in tag_set:
+            # Look up by the unique name only; the slug is derived data.
+            try:
+                to_add = TagModel.objects.get(name=tag)
+            except TagModel.DoesNotExist:
+                to_add = TagModel.objects.create(name=tag, slug=unique_slug(tag))
+            snapshot.tags.add(to_add)
+
+
+def reverse_func(apps, schema_editor):
+    """Write each snapshot's related tag names back into ``tags_old``.
+
+    The names are joined with ``","`` and the snapshot is saved.
+    """
+    SnapshotModel = apps.get_model("core", "Snapshot")
+
+    for snapshot in SnapshotModel.objects.all():
+        names = snapshot.tags.values_list("name", flat=True)
+        snapshot.tags_old = ",".join(names)
+        snapshot.save()
+
+
+class Migration(migrations.Migration):
+    # Replaces the old ``tags`` column on Snapshot with a many-to-many
+    # relation to a new ``Tag`` model, copying the data across in between.
+
+    dependencies = [
+        ('core', '0005_auto_20200728_0326'),
+    ]
+
+    operations = [
+        # 1. Move the existing column out of the way so the name is free.
+        migrations.RenameField(
+            model_name='snapshot',
+            old_name='tags',
+            new_name='tags_old',
+        ),
+        # 2. Create the Tag table (unique name and unique slug).
+        migrations.CreateModel(
+            name='Tag',
+            fields=[
+                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
+                ('name', models.CharField(max_length=100, unique=True, verbose_name='name')),
+                ('slug', models.SlugField(max_length=100, unique=True, verbose_name='slug')),
+            ],
+            options={
+                'verbose_name': 'Tag',
+                'verbose_name_plural': 'Tags',
+            },
+        ),
+        # 3. Add the new many-to-many ``tags`` relation.
+        migrations.AddField(
+            model_name='snapshot',
+            name='tags',
+            field=models.ManyToManyField(to='core.Tag'),
+        ),
+        # 4. Copy data from ``tags_old`` into the relation (and back on reverse).
+        migrations.RunPython(forwards_func, reverse_func),
+        # 5. Drop the old column once its data has been copied.
+        migrations.RemoveField(
+            model_name='snapshot',
+            name='tags_old',
+        ),
+    ]

+ 46 - 7
archivebox/core/models.py

@@ -2,22 +2,55 @@ __package__ = 'archivebox.core'
 
 
 import uuid
 import uuid
 
 
-from django.db import models
+from django.db import models, transaction
 from django.utils.functional import cached_property
 from django.utils.functional import cached_property
-
-from taggit.managers import TaggableManager
-from taggit.models import GenericUUIDTaggedItemBase, TaggedItemBase
+from django.utils.text import slugify
 
 
 from ..util import parse_date
 from ..util import parse_date
 from ..index.schema import Link
 from ..index.schema import Link
 
 
 
 
+class Tag(models.Model):
+    """
+    Based on django-taggit model
+    """
+    name = models.CharField(verbose_name="name", unique=True, blank=False, max_length=100)
+    slug = models.SlugField(verbose_name="slug", unique=True, max_length=100)
 
 
-class TaggedItem(GenericUUIDTaggedItemBase, TaggedItemBase):
     class Meta:
     class Meta:
         verbose_name = "Tag"
         verbose_name = "Tag"
         verbose_name_plural = "Tags"
         verbose_name_plural = "Tags"
 
 
+    def __str__(self):
+        return self.name
+
+    def slugify(self, tag, i=None):
+        """Return the slug for ``tag``; when ``i`` is given, ``_<i>`` is appended."""
+        base = slugify(tag)
+        if i is None:
+            return base
+        return base + "_%d" % i
+
+    def save(self, *args, **kwargs):
+        """Save the tag, generating a slug for new tags that have none.
+
+        When the instance is being added and ``slug`` is empty, the slug is
+        derived from ``name``; if that slug is already present, ``_1``,
+        ``_2``, ... is appended until an unused one is found. In every other
+        case the instance is saved as it is.
+        """
+        if self._state.adding and not self.slug:
+            self.slug = self.slugify(self.name)
+
+            # The lookup of existing slugs and the save share one transaction.
+            with transaction.atomic():
+                # All existing slugs that start with the base slug, i.e. every
+                # candidate the loop below could collide with.
+                slugs = set(
+                    type(self)
+                    ._default_manager.filter(slug__startswith=self.slug)
+                    .values_list("slug", flat=True)
+                )
+
+                # Try the plain slug first (i is None), then suffixes 1, 2, ...
+                i = None
+                while True:
+                    slug = self.slugify(self.name, i)
+                    if slug not in slugs:
+                        self.slug = slug
+                        return super().save(*args, **kwargs)
+                    i = 1 if i is None else i+1
+        else:
+            return super().save(*args, **kwargs)
+
 class Snapshot(models.Model):
 class Snapshot(models.Model):
     id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)
     id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)
 
 
@@ -25,11 +58,10 @@ class Snapshot(models.Model):
     timestamp = models.CharField(max_length=32, unique=True, db_index=True)
     timestamp = models.CharField(max_length=32, unique=True, db_index=True)
 
 
     title = models.CharField(max_length=128, null=True, blank=True, db_index=True)
     title = models.CharField(max_length=128, null=True, blank=True, db_index=True)
-    tags = TaggableManager(through=TaggedItem)
 
 
     added = models.DateTimeField(auto_now_add=True, db_index=True)
     added = models.DateTimeField(auto_now_add=True, db_index=True)
     updated = models.DateTimeField(null=True, blank=True, db_index=True)
     updated = models.DateTimeField(null=True, blank=True, db_index=True)
-    # bookmarked = models.DateTimeField()
+    tags = models.ManyToManyField(Tag)
 
 
     keys = ('url', 'timestamp', 'title', 'tags', 'updated')
     keys = ('url', 'timestamp', 'title', 'tags', 'updated')
 
 
@@ -113,3 +145,10 @@ class Snapshot(models.Model):
             and self.history['title'][-1].output.strip()):
             and self.history['title'][-1].output.strip()):
             return self.history['title'][-1].output.strip()
             return self.history['title'][-1].output.strip()
         return None
         return None
+
+    def save_tags(self, tags=()):
+        """Replace this snapshot's tags with the tags named in ``tags``.
+
+        ``tags`` is an iterable of tag names. ``None`` is treated as "no
+        tags", and a single string is treated as a comma-separated list of
+        names. Empty names are skipped. A ``Tag`` row is created for every
+        name that does not exist yet; the snapshot's current tags are then
+        cleared and the given ones attached.
+        """
+        if tags is None:
+            tags = ()
+        elif isinstance(tags, str):
+            # Iterating a bare string would create one tag per character.
+            tags = [name.strip() for name in tags.split(",")]
+
+        tags_id = [
+            Tag.objects.get_or_create(name=name)[0].id
+            for name in tags
+            if name
+        ]
+        self.tags.clear()
+        self.tags.add(*tags_id)

+ 0 - 1
archivebox/core/settings.py

@@ -31,7 +31,6 @@ INSTALLED_APPS = [
     'core',
     'core',
 
 
     'django_extensions',
     'django_extensions',
-    'taggit',
 ]
 ]
 
 
 
 

+ 113 - 0
archivebox/core/utils_taggit.py

@@ -0,0 +1,113 @@
+# Taken from https://github.com/jazzband/django-taggit/blob/3b56adb637ab95aca5036c37a358402c825a367c/taggit/utils.py
+
+def parse_tags(tagstring):
+    """
+    Parses tag input, with multiple word input being activated and
+    delineated by commas and double quotes. Quotes take precedence, so
+    they may contain commas.
+
+    Returns a sorted list of unique tag names. An empty (falsy)
+    ``tagstring`` gives an empty list.
+
+    Ported from Jonathan Buchanan's `django-tagging
+    <http://django-tagging.googlecode.com/>`_
+    """
+    if not tagstring:
+        return []
+
+    # Special case - if there are no commas or double quotes in the
+    # input, the tags can only be separated by spaces, so a plain split
+    # on spaces is enough.
+    if "," not in tagstring and '"' not in tagstring:
+        words = list(set(split_strip(tagstring, " ")))
+        words.sort()
+        return words
+
+    # Finished (quoted) tag names.
+    words = []
+    # Characters collected for the section currently being read.
+    buffer = []
+    # Defer splitting of non-quoted sections until we know if there are
+    # any unquoted commas.
+    to_be_split = []
+    saw_loose_comma = False
+    open_quote = False
+    i = iter(tagstring)
+    try:
+        while True:
+            c = next(i)
+            if c == '"':
+                # A quote ends the unquoted section collected so far.
+                if buffer:
+                    to_be_split.append("".join(buffer))
+                    buffer = []
+                # Find the matching quote
+                open_quote = True
+                c = next(i)
+                while c != '"':
+                    buffer.append(c)
+                    c = next(i)
+                # The quoted text becomes one tag, kept whole (commas included).
+                if buffer:
+                    word = "".join(buffer).strip()
+                    if word:
+                        words.append(word)
+                    buffer = []
+                open_quote = False
+            else:
+                if not saw_loose_comma and c == ",":
+                    saw_loose_comma = True
+                buffer.append(c)
+    except StopIteration:
+        # End of input reached. If we were parsing an open quote which was
+        # never closed treat the buffer as unquoted.
+        if buffer:
+            if open_quote and "," in buffer:
+                saw_loose_comma = True
+            to_be_split.append("".join(buffer))
+    if to_be_split:
+        # Unquoted sections are split on commas if any unquoted comma was
+        # seen, otherwise on spaces.
+        if saw_loose_comma:
+            delimiter = ","
+        else:
+            delimiter = " "
+        for chunk in to_be_split:
+            words.extend(split_strip(chunk, delimiter))
+    # Remove duplicates and return the names in sorted order.
+    words = list(set(words))
+    words.sort()
+    return words
+
+
+def split_strip(string, delimiter=","):
+    """
+    Breaks ``string`` apart at every ``delimiter`` and trims surrounding
+    whitespace from each piece; pieces that end up empty are dropped.
+    An empty (falsy) ``string`` gives an empty list.
+
+    Ported from Jonathan Buchanan's `django-tagging
+    <http://django-tagging.googlecode.com/>`_
+    """
+    if not string:
+        return []
+
+    trimmed = (piece.strip() for piece in string.split(delimiter))
+    return [piece for piece in trimmed if piece]
+
+
+def edit_string_for_tags(tags):
+    """
+    Given list of ``Tag`` instances, creates a string representation of
+    the list suitable for editing by the user. The intent is that
+    submitting the string back without changing it gives the same list
+    of tags.
+
+    Tag names which contain a comma or a space are wrapped in double
+    quotes.
+
+    The (possibly quoted) names are sorted and always joined with
+    ``", "``; an empty ``tags`` gives an empty string.
+
+    Ported from Jonathan Buchanan's `django-tagging
+    <http://django-tagging.googlecode.com/>`_
+    """
+    names = []
+    for tag in tags:
+        name = tag.name
+        # Quote names that would otherwise be split apart when parsed back.
+        if "," in name or " " in name:
+            names.append('"%s"' % name)
+        else:
+            names.append(name)
+    return ", ".join(sorted(names))

+ 8 - 4
archivebox/index/sql.py

@@ -34,14 +34,19 @@ def remove_from_sql_main_index(snapshots: QuerySet, out_dir: Path=OUTPUT_DIR) ->
 def write_link_to_sql_index(link: Link):
 def write_link_to_sql_index(link: Link):
     from core.models import Snapshot
     from core.models import Snapshot
     info = {k: v for k, v in link._asdict().items() if k in Snapshot.keys}
     info = {k: v for k, v in link._asdict().items() if k in Snapshot.keys}
+    tags = info.pop("tags")
+    if tags is None:
+        tags = []
+
     try:
     try:
         info["timestamp"] = Snapshot.objects.get(url=link.url).timestamp
         info["timestamp"] = Snapshot.objects.get(url=link.url).timestamp
     except Snapshot.DoesNotExist:
     except Snapshot.DoesNotExist:
         while Snapshot.objects.filter(timestamp=info["timestamp"]).exists():
         while Snapshot.objects.filter(timestamp=info["timestamp"]).exists():
             info["timestamp"] = str(float(info["timestamp"]) + 1.0)
             info["timestamp"] = str(float(info["timestamp"]) + 1.0)
 
 
-    Snapshot.objects.update_or_create(url=link.url, defaults=info)
-    return Snapshot.objects.get(url=link.url)
+    snapshot, _ = Snapshot.objects.update_or_create(url=link.url, defaults=info)
+    snapshot.save_tags(tags)
+    return snapshot
 
 
 
 
 @enforce_types
 @enforce_types
@@ -72,9 +77,8 @@ def write_sql_link_details(link: Link, out_dir: Path=OUTPUT_DIR) -> None:
         )
         )
         tag_list = list(tag_set) or []
         tag_list = list(tag_set) or []
 
 
-        for tag in tag_list:
-            snap.tags.add(tag)
         snap.save()
         snap.save()
+        snap.save_tags(tag_list)
 
 
 
 
 
 

+ 0 - 1
setup.py

@@ -80,7 +80,6 @@ setuptools.setup(
         "base32-crockford==0.3.0",
         "base32-crockford==0.3.0",
         "django==3.0.8",
         "django==3.0.8",
         "django-extensions==3.0.3",
         "django-extensions==3.0.3",
-        "django-taggit==1.3.0",
 
 
         "dateparser",
         "dateparser",
         "ipython",
         "ipython",

+ 5 - 7
tests/test_init.py

@@ -157,18 +157,16 @@ def test_tags_migration(tmp_path, disable_extractors_dict):
     conn.row_factory = sqlite3.Row
     conn.row_factory = sqlite3.Row
     c = conn.cursor()
     c = conn.cursor()
     c.execute("""
     c.execute("""
-        SELECT snapshot.id snapshot, tags.name tag
-        FROM core_snapshot snapshot, core_taggeditem snapshot_tagged, taggit_tag tags
-        WHERE
-        snapshot.id = snapshot_tagged.object_id
-        AND tags.id = snapshot_tagged.tag_id
+        SELECT core_snapshot.id, core_tag.name from core_snapshot
+        JOIN core_snapshot_tags on core_snapshot_tags.snapshot_id=core_snapshot.id
+        JOIN core_tag on core_tag.id=core_snapshot_tags.tag_id
     """)
     """)
     tags = c.fetchall()
     tags = c.fetchall()
     conn.commit()
     conn.commit()
     conn.close()
     conn.close()
 
 
     for tag in tags:
     for tag in tags:
-        snapshot_id = tag['snapshot']
-        tag_name = tag['tag']
+        snapshot_id = tag["id"]
+        tag_name = tag["name"]
         # Check each tag migrated is in the previous field
         # Check each tag migrated is in the previous field
         assert tag_name in snapshots_dict[snapshot_id]
         assert tag_name in snapshots_dict[snapshot_id]