瀏覽代碼

feat: initial functional version with icons calculated based on archive results

Cristian 5 年之前
父節點
當前提交
b3e0400bc0
共有 3 個文件被更改，包括 106 次插入和 31 次删除
  1. 37 0
      archivebox/core/migrations/0007_archiveresult.py
  2. 5 1
      archivebox/core/models.py
  3. 64 30
      archivebox/core/utils.py

+ 37 - 0
archivebox/core/migrations/0007_archiveresult.py

@@ -1,8 +1,43 @@
 # Generated by Django 3.0.8 on 2020-11-04 12:25
 # Generated by Django 3.0.8 on 2020-11-04 12:25
 
 
+import json
+from pathlib import Path
+
 from django.db import migrations, models
 from django.db import migrations, models
 import django.db.models.deletion
 import django.db.models.deletion
 
 
+from config import CONFIG
+
+
+def forwards_func(apps, schema_editor):
+    """Backfill ArchiveResult rows from each snapshot's on-disk ``index.json``.
+
+    For every Snapshot, ``<ARCHIVE_DIR>/<snapshot.timestamp>/index.json`` is
+    read and one ArchiveResult is created per entry found under its
+    ``"history"`` key.  Snapshots whose index file is missing, unreadable, not
+    valid JSON, or has no history are skipped so the migration stays
+    best-effort.
+
+    Args:
+        apps: historical app registry passed in by ``migrations.RunPython``.
+        schema_editor: passed in by ``migrations.RunPython``; not used here.
+    """
+    # Only historical models obtained through ``apps`` are used here; nothing
+    # from core.models is needed by this function.
+    Snapshot = apps.get_model("core", "Snapshot")
+    ArchiveResult = apps.get_model("core", "ArchiveResult")
+
+    for snapshot in Snapshot.objects.all():
+        index_path = Path(CONFIG['ARCHIVE_DIR']) / snapshot.timestamp / "index.json"
+
+        try:
+            with open(index_path, "r") as f:
+                fs_index = json.load(f)
+        except (OSError, ValueError):
+            # OSError: file missing/unreadable; ValueError: invalid JSON or bad encoding.
+            continue
+
+        if not isinstance(fs_index, dict):
+            continue
+        history = fs_index.get("history") or {}
+
+        for extractor, results in history.items():
+            for result in results:
+                ArchiveResult.objects.create(
+                    extractor=extractor,
+                    snapshot=snapshot,
+                    cmd=json.dumps(result["cmd"]),
+                    # NOTE(review): cmd_version/output are non-null CharFields with
+                    # default '' in this migration; None is coerced to '' on the
+                    # assumption that older indexes may store null here - confirm.
+                    cmd_version=result.get("cmd_version") or "",
+                    start_ts=result["start_ts"],
+                    end_ts=result["end_ts"],
+                    status=result["status"],
+                    pwd=result["pwd"],
+                    output=result.get("output") or "",
+                )
+
+
+
+def reverse_func(apps, schema_editor):
+    """Reverse of forwards_func: delete every ArchiveResult row.
+
+    Args:
+        apps: historical app registry passed in by ``migrations.RunPython``.
+        schema_editor: passed in by ``migrations.RunPython``; not used here.
+    """
+    apps.get_model("core", "ArchiveResult").objects.all().delete()
+
 
 
 class Migration(migrations.Migration):
 class Migration(migrations.Migration):
 
 
@@ -18,6 +53,7 @@ class Migration(migrations.Migration):
                 ('cmd', models.CharField(default='', max_length=500)),
                 ('cmd', models.CharField(default='', max_length=500)),
                 ('pwd', models.CharField(default='', max_length=200)),
                 ('pwd', models.CharField(default='', max_length=200)),
                 ('cmd_version', models.CharField(default='', max_length=20)),
                 ('cmd_version', models.CharField(default='', max_length=20)),
+                ('status', models.CharField(max_length=10)),
                 ('output', models.CharField(default='', max_length=500)),
                 ('output', models.CharField(default='', max_length=500)),
                 ('start_ts', models.DateTimeField()),
                 ('start_ts', models.DateTimeField()),
                 ('end_ts', models.DateTimeField()),
                 ('end_ts', models.DateTimeField()),
@@ -25,4 +61,5 @@ class Migration(migrations.Migration):
                 ('snapshot', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='core.Snapshot')),
                 ('snapshot', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='core.Snapshot')),
             ],
             ],
         ),
         ),
+        migrations.RunPython(forwards_func, reverse_func),
     ]
     ]

+ 5 - 1
archivebox/core/models.py

@@ -161,4 +161,8 @@ class ArchiveResult(models.Model):
     output = models.CharField(max_length=500, default="")
     output = models.CharField(max_length=500, default="")
     start_ts = models.DateTimeField()
     start_ts = models.DateTimeField()
     end_ts = models.DateTimeField()
     end_ts = models.DateTimeField()
-    extractor = models.CharField(choices=EXTRACTORS, blank=False, max_length=20)
+    status = models.CharField(max_length=10)
+    extractor = models.CharField(choices=EXTRACTORS, blank=False, max_length=20)
+
+    def __str__(self):
+        # The string form of a result is the value of its ``extractor`` field.
+        return self.extractor

+ 64 - 30
archivebox/core/utils.py

@@ -2,38 +2,72 @@ from pathlib import Path
 
 
 from django.utils.html import format_html
 from django.utils.html import format_html
 
 
-from core.models import Snapshot
+from core.models import Snapshot, ArchiveResult, EXTRACTORS
 
 
 
 
 def get_icons(snapshot: Snapshot) -> str:
 def get_icons(snapshot: Snapshot) -> str:
+    """Render the row of per-extractor icon links for ``snapshot``.
+
+    One ``<a>`` is emitted for every entry in EXTRACTORS (except those listed
+    in ``exclude``) that has a ``<extractor>_path`` entry in the link's
+    canonical outputs.  Each anchor gets the CSS class ``exists-True`` or
+    ``exists-False`` depending on whether the snapshot has at least one
+    ArchiveResult for that extractor.
+
+    Returns:
+        The escaped HTML produced by ``format_html``: a ``<span>`` wrapping
+        all icon links.
+    """
+    # Local import so this function does not rely on changes to the module header.
+    from django.utils.html import format_html_join
+
+    # One query for all extractors instead of one ``exists()`` query per extractor.
+    archived = set(snapshot.archiveresult_set.values_list("extractor", flat=True))
     link = snapshot.as_link()
     link = snapshot.as_link()
     canon = link.canonical_outputs()
     canon = link.canonical_outputs()
-    out_dir = Path(link.link_dir)
-
-    # slow version: highlights icons based on whether files exist or not for that output
-    # link_tuple = lambda link, method: (link.archive_path, canon[method] or '', canon[method] and (out_dir / (canon[method] or 'notdone')).exists())
-    # fast version: all icons are highlighted without checking for outputs in filesystem
-    link_tuple = lambda link, method: (link.archive_path, canon[method] or '', canon[method] and (out_dir / (canon[method] or 'notdone')).exists())
-
-    return format_html(
-            '<span class="files-icons" style="font-size: 1.2em; opacity: 0.8">'
-                '<a href="/{}/{}" class="exists-{}" title="SingleFile">❶ </a>'
-                '<a href="/{}/{}" class="exists-{}" title="Wget clone">🆆 </a> '
-                '<a href="/{}/{}" class="exists-{}" title="HTML dump">🅷 </a> '
-                '<a href="/{}/{}" class="exists-{}" title="PDF">📄 </a> '
-                '<a href="/{}/{}" class="exists-{}" title="Screenshot">💻 </a> '
-                '<a href="/{}/{}" class="exists-{}" title="WARC">📦 </a> '
-                '<a href="/{}/{}/" class="exists-{}" title="Media files">📼 </a> '
-                '<a href="/{}/{}/" class="exists-{}" title="Git repos">🅶 </a> '
-                '<a href="{}" class="exists-{}" title="Archive.org snapshot">🏛 </a> '
-            '</span>',
-            *link_tuple(link, 'singlefile_path'),
-            *link_tuple(link, 'wget_path')[:2], any((out_dir / link.domain).glob('*')),
-            *link_tuple(link, 'pdf_path'),
-            *link_tuple(link, 'screenshot_path'),
-            *link_tuple(link, 'dom_path'),
-            *link_tuple(link, 'warc_path')[:2], any((out_dir / canon['warc_path']).glob('*.warc.gz')),
-            *link_tuple(link, 'media_path')[:2], any((out_dir / canon['media_path']).glob('*')),
-            *link_tuple(link, 'git_path')[:2], any((out_dir / canon['git_path']).glob('*')),
-            canon['archive_org_path'], (out_dir / 'archive.org.txt').exists(),
-        )
+    output_template = '<a href="{}" class="exists-{}" title="{}">{} </a>'
+    icons = {
+        "singlefile": "❶",
+        "wget": "🆆",
+        "dom": "🅷",
+        "pdf": "📄",
+        "screenshot": "💻",
+        "media": "📼",
+        "git": "🅶",
+        "archive_org": "🏛",
+        "readability": "🆁",
+        "mercury": "🅼",
+    }
+    exclude = ["favicon"]
+    # Missing specific entry for WARC
+
+    icon_args = []
+    for extractor in EXTRACTORS:
+        name = extractor[0]
+        if name in exclude:
+            continue
+
+        path_key = f"{name}_path"
+        if path_key not in canon:
+            # No canonical output is defined for this extractor, so there is nothing to link to.
+            continue
+
+        path = canon[path_key] or ""
+        if name == "archive_org":
+            # The previous implementation linked archive.org without the local
+            # archive prefix (href="{}"); presumably the value is already a
+            # full URL - TODO confirm.
+            href = path
+        else:
+            href = f"/{link.archive_path}/{path}"
+
+        icon_args.append((href, name in archived, name, icons.get(name, "?")))
+
+    # format_html/format_html_join escape every interpolated value; the wrapper
+    # is closed with </span> (the earlier f-string version opened a second <span>).
+    return format_html(
+        '<span class="files-icons" style="font-size: 1.2em; opacity: 0.8">{}</span>',
+        format_html_join("", output_template, icon_args),
+    )
+
+#def get_icons(snapshot: Snapshot) -> str:
+#    link = snapshot.as_link()
+#    canon = link.canonical_outputs()
+#    out_dir = Path(link.link_dir)
+#
+#    # slow version: highlights icons based on whether files exist or not for that output
+#    # link_tuple = lambda link, method: (link.archive_path, canon[method] or '', canon[method] and (out_dir / (canon[method] or 'notdone')).exists())
+#    # fast version: all icons are highlighted without checking for outputs in filesystem
+#    link_tuple = lambda link, method: (link.archive_path, canon[method] or '', canon[method] and (out_dir / (canon[method] or 'notdone')).exists())
+#
+#    return format_html(
+#            '<span class="files-icons" style="font-size: 1.2em; opacity: 0.8">'
+#                '<a href="/{}/{}" class="exists-{}" title="SingleFile">❶ </a>'
+#                '<a href="/{}/{}" class="exists-{}" title="Wget clone">🆆 </a> '
+#                '<a href="/{}/{}" class="exists-{}" title="HTML dump">🅷 </a> '
+#                '<a href="/{}/{}" class="exists-{}" title="PDF">📄 </a> '
+#                '<a href="/{}/{}" class="exists-{}" title="Screenshot">💻 </a> '
+#                '<a href="/{}/{}" class="exists-{}" title="WARC">📦 </a> '
+#                '<a href="/{}/{}/" class="exists-{}" title="Media files">📼 </a> '
+#                '<a href="/{}/{}/" class="exists-{}" title="Git repos">🅶 </a> '
+#                '<a href="{}" class="exists-{}" title="Archive.org snapshot">🏛 </a> '
+#            '</span>',
+#            *link_tuple(link, 'singlefile_path'),
+#            *link_tuple(link, 'wget_path')[:2], any((out_dir / link.domain).glob('*')),
+#            *link_tuple(link, 'pdf_path'),
+#            *link_tuple(link, 'screenshot_path'),
+#            *link_tuple(link, 'dom_path'),
+#            *link_tuple(link, 'warc_path')[:2], any((out_dir / canon['warc_path']).glob('*.warc.gz')),
+#            *link_tuple(link, 'media_path')[:2], any((out_dir / canon['media_path']).glob('*')),
+#            *link_tuple(link, 'git_path')[:2], any((out_dir / canon['git_path']).glob('*')),
+#            canon['archive_org_path'], (out_dir / 'archive.org.txt').exists(),
+#        )
+#