|
|
@@ -6,6 +6,7 @@ from collections import defaultdict
|
|
|
from typing import List, Optional, Iterator, Mapping
|
|
|
|
|
|
from django.utils.html import format_html, mark_safe
|
|
|
+from django.core.cache import cache
|
|
|
|
|
|
from .schema import Link
|
|
|
from ..system import atomic_write
|
|
|
@@ -115,74 +116,78 @@ def render_django_template(template: str, context: Mapping[str, str]) -> str:
|
|
|
|
|
|
|
|
|
def snapshot_icons(snapshot) -> str:
    """Return the HTML strip of per-extractor output icons for ``snapshot``.

    The markup is one ``<a>`` per extractor listed in ``core.models.EXTRACTORS``
    (minus the excluded ones), each carrying an ``exists-True`` / ``exists-False``
    CSS class derived from the snapshot's succeeded ArchiveResults in the
    database.  The rendered string is stored in the Django cache under a key
    built from the snapshot id and its ``updated`` (or ``added``) timestamp.

    Args:
        snapshot: object exposing ``id``, ``updated``, ``added``,
            ``archiveresult_set`` and ``as_link()``.

    Returns:
        The cached or freshly rendered HTML for the icon strip.
    """
    # NOTE(review): the key only changes when ``updated``/``added`` changes, so
    # results saved without touching the snapshot presumably stay stale until
    # the cache entry expires -- confirm this is acceptable.
    cache_key = f'{str(snapshot.id)[:12]}-{(snapshot.updated or snapshot.added).timestamp()}-snapshot-icons'

    def calc_snapshot_icons():
        """Build the icon markup from the database (no filesystem access)."""
        # Kept as a function-local import, as in the original code
        # (presumably to avoid a circular import -- confirm).
        from core.models import EXTRACTORS

        archive_results = snapshot.archiveresult_set.filter(status="succeeded", output__isnull=False)
        link = snapshot.as_link()
        path = link.archive_path
        canon = link.canonical_outputs()
        link_template = '<a href="/{}/{}" class="exists-{}" title="{}">{}</a> '
        icons = {
            "singlefile": "❶",
            "wget": "🆆",
            "dom": "🅷",
            "pdf": "📄",
            "screenshot": "💻",
            "media": "📼",
            "git": "🅶",
            "archive_org": "🏛",
            "readability": "🆁",
            "mercury": "🅼",
            "warc": "📦",
        }
        # archive_org is excluded from the generic branch because its link is
        # not rooted at the local archive path; it is rendered separately below.
        exclude = ["favicon", "title", "headers", "archive_org"]

        # Single pass over the results; when an extractor has several results
        # the last one iterated wins, matching the previous nested-loop logic.
        results_by_extractor = {result.extractor: result for result in archive_results}

        pieces = []
        for extractor, _ in EXTRACTORS:
            result = results_by_extractor.get(extractor)
            # The queryset is already restricted to succeeded results, so only
            # a non-empty output needs checking here.  This reflects the
            # database state, not what is actually present on disk.
            exists = str(result is not None and bool(result.output))

            if extractor not in exclude:
                pieces.append(format_html(
                    link_template, path, canon[f"{extractor}_path"], exists,
                    extractor, icons.get(extractor, "?"),
                ))
                if extractor == "wget":
                    # warc is not its own extractor: it is reported right after
                    # wget and shares wget's result status.
                    pieces.append(format_html(
                        link_template, path, canon["warc_path"], exists,
                        "warc", icons.get("warc", "?"),
                    ))

            if extractor == "archive_org":
                # The href is used as-is (no local archive prefix).  format_html
                # escapes the value, and the class is a real boolean rather
                # than the raw output string.
                pieces.append(format_html(
                    '<a href="{}" class="exists-{}" title="{}">{}</a> ',
                    canon["archive_org_path"], exists,
                    "archive_org", icons.get("archive_org", "?"),
                ))

        # Every piece was produced by format_html, so the joined string is safe.
        return format_html(
            '<span class="files-icons" style="font-size: 1.1em; opacity: 0.8; min-width: 240px; display: inline-block">{}</span>',
            mark_safe("".join(pieces)),
        )

    return cache.get_or_set(cache_key, calc_snapshot_icons)
|
|
|
|
|
|
|