5 年之前 · b3e0400bc0
--- a/archivebox/core/migrations/0007_archiveresult.py
+++ b/archivebox/core/migrations/0007_archiveresult.py
@@ -1,8 +1,43 @@
 
															 # Generated by Django 3.0.8 on 2020-11-04 12:25
														
 
															+import json
														
 
															+from pathlib import Path
														
 
															+
														
 
															 from django.db import migrations, models
														
 
															 import django.db.models.deletion
														
 
															+from config import CONFIG
														
 
															+
														
 
															+
														
 
															+def forwards_func(apps, schema_editor):
														
 
															+    from core.models import EXTRACTORS
														
 
															+
														
 
															+    Snapshot = apps.get_model("core", "Snapshot")
														
 
															+    ArchiveResult = apps.get_model("core", "ArchiveResult")
														
 
															+
														
 
															+    snapshots = Snapshot.objects.all()
														
 
															+    for snapshot in snapshots:
														
 
															+        out_dir = Path(CONFIG['ARCHIVE_DIR']) / snapshot.timestamp
														
 
															+
														
 
															+        try:
														
 
															+            with open(out_dir / "index.json", "r") as f:
														
 
															+                fs_index = json.load(f)
														
 
															+        except Exception as e:
														
 
															+            continue
														
 
															+
														
 
															+        history = fs_index["history"]
														
 
															+
														
 
															+        for extractor in history:
														
 
															+            for result in history[extractor]:
														
 
															+                ArchiveResult.objects.create(extractor=extractor, snapshot=snapshot, cmd=json.dumps(result["cmd"]), cmd_version=result["cmd_version"], 
														
 
															+                start_ts=result["start_ts"], end_ts=result["end_ts"], status=result["status"], pwd=result["pwd"], output=result["output"])
														
 
															+
														
 
															+
														
 
															+
														
 
															+def reverse_func(apps, schema_editor):
														
 
															+    ArchiveResult = apps.get_model("core", "ArchiveResult")
														
 
															+    ArchiveResult.objects.all().delete()
														
 
															+
														
 
															 class Migration(migrations.Migration):
														
@@ -18,6 +53,7 @@ class Migration(migrations.Migration):
 
															                 ('cmd', models.CharField(default='', max_length=500)),
														
 
															                 ('pwd', models.CharField(default='', max_length=200)),
														
 
															                 ('cmd_version', models.CharField(default='', max_length=20)),
														
 
															+                ('status', models.CharField(max_length=10)),
														
 
															                 ('output', models.CharField(default='', max_length=500)),
														
 
															                 ('start_ts', models.DateTimeField()),
														
 
															                 ('end_ts', models.DateTimeField()),
														
@@ -25,4 +61,5 @@ class Migration(migrations.Migration):
 
															                 ('snapshot', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='core.Snapshot')),
														
 
															             ],
														
 
															         ),
														
 
															+        migrations.RunPython(forwards_func, reverse_func),
														
 
															     ]
														
--- a/archivebox/core/models.py
+++ b/archivebox/core/models.py
@@ -161,4 +161,8 @@ class ArchiveResult(models.Model):
 
															     output = models.CharField(max_length=500, default="")
														
 
															     start_ts = models.DateTimeField()
														
 
															     end_ts = models.DateTimeField()
														
 
															-    extractor = models.CharField(choices=EXTRACTORS, blank=False, max_length=20)
														
 
															+    status = models.CharField(max_length=10)
														
 
															+    extractor = models.CharField(choices=EXTRACTORS, blank=False, max_length=20)
														
 
															+
														
 
															+    def __str__(self):
														
 
															+        return self.extractor
														
--- a/archivebox/core/utils.py
+++ b/archivebox/core/utils.py
@@ -2,38 +2,72 @@ from pathlib import Path
 
															 from django.utils.html import format_html
														
 
															-from core.models import Snapshot
														
 
															+from core.models import Snapshot, ArchiveResult, EXTRACTORS
														
 
															 def get_icons(snapshot: Snapshot) -> str:
														
 
															+    archive_results = snapshot.archiveresult_set
														
 
															     link = snapshot.as_link()
														
 
															     canon = link.canonical_outputs()
														
 
															-    out_dir = Path(link.link_dir)
														
 
															-
														
 
															-    # slow version: highlights icons based on whether files exist or not for that output
														
 
															-    # link_tuple = lambda link, method: (link.archive_path, canon[method] or '', canon[method] and (out_dir / (canon[method] or 'notdone')).exists())
														
 
															-    # fast version: all icons are highlighted without checking for outputs in filesystem
														
 
															-    link_tuple = lambda link, method: (link.archive_path, canon[method] or '', canon[method] and (out_dir / (canon[method] or 'notdone')).exists())
														
 
															-
														
 
															-    return format_html(
														
 
															-            '<span class="files-icons" style="font-size: 1.2em; opacity: 0.8">'
														
 
															-                '<a href="/{}/{}" class="exists-{}" title="SingleFile">❶ </a>'
														
 
															-                '<a href="/{}/{}" class="exists-{}" title="Wget clone">🆆 </a> '
														
 
															-                '<a href="/{}/{}" class="exists-{}" title="HTML dump">🅷 </a> '
														
 
															-                '<a href="/{}/{}" class="exists-{}" title="PDF">📄 </a> '
														
 
															-                '<a href="/{}/{}" class="exists-{}" title="Screenshot">💻 </a> '
														
 
															-                '<a href="/{}/{}" class="exists-{}" title="WARC">📦 </a> '
														
 
															-                '<a href="/{}/{}/" class="exists-{}" title="Media files">📼 </a> '
														
 
															-                '<a href="/{}/{}/" class="exists-{}" title="Git repos">🅶 </a> '
														
 
															-                '<a href="{}" class="exists-{}" title="Archive.org snapshot">🏛 </a> '
														
 
															-            '</span>',
														
 
															-            *link_tuple(link, 'singlefile_path'),
														
 
															-            *link_tuple(link, 'wget_path')[:2], any((out_dir / link.domain).glob('*')),
														
 
															-            *link_tuple(link, 'pdf_path'),
														
 
															-            *link_tuple(link, 'screenshot_path'),
														
 
															-            *link_tuple(link, 'dom_path'),
														
 
															-            *link_tuple(link, 'warc_path')[:2], any((out_dir / canon['warc_path']).glob('*.warc.gz')),
														
 
															-            *link_tuple(link, 'media_path')[:2], any((out_dir / canon['media_path']).glob('*')),
														
 
															-            *link_tuple(link, 'git_path')[:2], any((out_dir / canon['git_path']).glob('*')),
														
 
															-            canon['archive_org_path'], (out_dir / 'archive.org.txt').exists(),
														
 
															-        )
														
 
															+    output = ""
														
 
															+    output_template = '<a href="/{}/{}" class="exists-{}" title="{}">{} </a>'
														
 
															+    icons = {
														
 
															+        "singlefile": "❶",
														
 
															+        "wget": "🆆",
														
 
															+        "dom": "🅷",
														
 
															+        "pdf": "📄",
														
 
															+        "screenshot": "💻",
														
 
															+        "media": "📼",
														
 
															+        "git": "🅶",
														
 
															+        "archive_org": "🏛",
														
 
															+        "readability": "🆁",
														
 
															+        "mercury": "🅼",
														
 
															+    }
														
 
															+    exclude = ["favicon"]
														
 
															+    # Missing specific entry for WARC
														
 
															+
														
 
															+
														
 
															+    for extractor in EXTRACTORS:
														
 
															+        result = archive_results.filter(extractor=extractor[0])
														
 
															+        try:
														
 
															+            if extractor[0] not in exclude:
														
 
															+                output += output_template.format(link.archive_path, canon[f"{extractor[0]}_path"],
														
 
															+                                                 result.exists(), extractor[0], icons.get(extractor[0], "?"))
														
 
															+        except Exception as e:
														
 
															+            print(e)
														
 
															+
														
 
															+    return format_html(f'<span class="files-icons" style="font-size: 1.2em; opacity: 0.8">{output}<span>')
														
 
															+
														
 
															+#def get_icons(snapshot: Snapshot) -> str:
														
 
															+#    link = snapshot.as_link()
														
 
															+#    canon = link.canonical_outputs()
														
 
															+#    out_dir = Path(link.link_dir)
														
 
															+#
														
 
															+#    # slow version: highlights icons based on whether files exist or not for that output
														
 
															+#    # link_tuple = lambda link, method: (link.archive_path, canon[method] or '', canon[method] and (out_dir / (canon[method] or 'notdone')).exists())
														
 
															+#    # fast version: all icons are highlighted without checking for outputs in filesystem
														
 
															+#    link_tuple = lambda link, method: (link.archive_path, canon[method] or '', canon[method] and (out_dir / (canon[method] or 'notdone')).exists())
														
 
															+#
														
 
															+#    return format_html(
														
 
															+#            '<span class="files-icons" style="font-size: 1.2em; opacity: 0.8">'
														
 
															+#                '<a href="/{}/{}" class="exists-{}" title="SingleFile">❶ </a>'
														
 
															+#                '<a href="/{}/{}" class="exists-{}" title="Wget clone">🆆 </a> '
														
 
															+#                '<a href="/{}/{}" class="exists-{}" title="HTML dump">🅷 </a> '
														
 
															+#                '<a href="/{}/{}" class="exists-{}" title="PDF">📄 </a> '
														
 
															+#                '<a href="/{}/{}" class="exists-{}" title="Screenshot">💻 </a> '
														
 
															+#                '<a href="/{}/{}" class="exists-{}" title="WARC">📦 </a> '
														
 
															+#                '<a href="/{}/{}/" class="exists-{}" title="Media files">📼 </a> '
														
 
															+#                '<a href="/{}/{}/" class="exists-{}" title="Git repos">🅶 </a> '
														
 
															+#                '<a href="{}" class="exists-{}" title="Archive.org snapshot">🏛 </a> '
														
 
															+#            '</span>',
														
 
															+#            *link_tuple(link, 'singlefile_path'),
														
 
															+#            *link_tuple(link, 'wget_path')[:2], any((out_dir / link.domain).glob('*')),
														
 
															+#            *link_tuple(link, 'pdf_path'),
														
 
															+#            *link_tuple(link, 'screenshot_path'),
														
 
															+#            *link_tuple(link, 'dom_path'),
														
 
															+#            *link_tuple(link, 'warc_path')[:2], any((out_dir / canon['warc_path']).glob('*.warc.gz')),
														
 
															+#            *link_tuple(link, 'media_path')[:2], any((out_dir / canon['media_path']).glob('*')),
														
 
															+#            *link_tuple(link, 'git_path')[:2], any((out_dir / canon['git_path']).glob('*')),
														
 
															+#            canon['archive_org_path'], (out_dir / 'archive.org.txt').exists(),
														
 
															+#        )
														
 
															+#