4 years ago · 8b236b9367
--- a/archivebox/core/models.py
+++ b/archivebox/core/models.py
@@ -5,9 +5,11 @@ import uuid
 
				 from django.db import models, transaction
			
 
				 from django.utils.functional import cached_property
			
 
				 from django.utils.text import slugify
			
 
				+from django.core.cache import cache
			
 
				 from django.db.models import Case, When, Value, IntegerField
			
 
				 
			
 
				-from ..config import ARCHIVE_DIR
			
 
				+from ..config import ARCHIVE_DIR, ARCHIVE_DIR_NAME
			
 
				+from ..system import get_dir_size
			
 
				 from ..util import parse_date, base_url, hashurl
			
 
				 from ..index.schema import Link
			
 
				 from ..extractors import get_default_archive_methods, ARCHIVE_METHODS_INDEXING_PRECEDENCE
			
@@ -111,7 +113,9 @@ class Snapshot(models.Model):
 
				         return load_link_details(self.as_link())
			
 
				 
			
 
				     def tags_str(self) -> str:
			
 
				-        return ','.join(self.tags.order_by('name').values_list('name', flat=True))
			
 
				+        cache_key = f'{self.id}-{(self.updated or self.added).timestamp()}-tags'
			
 
				+        calc_tags_str = lambda: ','.join(self.tags.order_by('name').values_list('name', flat=True))
			
 
				+        return cache.get_or_set(cache_key, calc_tags_str)
			
 
				 
			
 
				     @cached_property
			
 
				     def bookmarked(self):
			
@@ -148,10 +152,15 @@ class Snapshot(models.Model):
 
				 
			
 
				     @cached_property
			
 
				     def archive_size(self):
			
 
				-        try:
			
 
				-            return get_dir_size(self.link_dir)[0]
			
 
				-        except Exception:
			
 
				-            return 0
			
 
				+        cache_key = f'{str(self.id)[:12]}-{(self.updated or self.added).timestamp()}-size'
			
 
				+
			
 
				+        def calc_dir_size():
			
 
				+            try:
			
 
				+                return get_dir_size(self.link_dir)[0]
			
 
				+            except Exception:
			
 
				+                return 0
			
 
				+
			
 
				+        return cache.get_or_set(cache_key, calc_dir_size)
			
 
				 
			
 
				     @cached_property
			
 
				     def history(self):
			
--- a/archivebox/index/html.py
+++ b/archivebox/index/html.py
@@ -6,6 +6,7 @@ from collections import defaultdict
 
				 from typing import List, Optional, Iterator, Mapping
			
 
				 
			
 
				 from django.utils.html import format_html, mark_safe
			
 
				+from django.core.cache import cache
			
 
				 
			
 
				 from .schema import Link
			
 
				 from ..system import atomic_write
			
@@ -115,74 +116,78 @@ def render_django_template(template: str, context: Mapping[str, str]) -> str:
 
				 
			
 
				 
			
 
				 def snapshot_icons(snapshot) -> str:
			
 
				-    from core.models import EXTRACTORS
			
 
				-    # start = datetime.now()
			
 
				-
			
 
				-    archive_results = snapshot.archiveresult_set.filter(status="succeeded", output__isnull=False)
			
 
				-    link = snapshot.as_link()
			
 
				-    path = link.archive_path
			
 
				-    canon = link.canonical_outputs()
			
 
				-    output = ""
			
 
				-    output_template = '<a href="/{}/{}" class="exists-{}" title="{}">{}</a> &nbsp;'
			
 
				-    icons = {
			
 
				-        "singlefile": "❶",
			
 
				-        "wget": "🆆",
			
 
				-        "dom": "🅷",
			
 
				-        "pdf": "📄",
			
 
				-        "screenshot": "💻",
			
 
				-        "media": "📼",
			
 
				-        "git": "🅶",
			
 
				-        "archive_org": "🏛",
			
 
				-        "readability": "🆁",
			
 
				-        "mercury": "🅼",
			
 
				-        "warc": "📦"
			
 
				-    }
			
 
				-    exclude = ["favicon", "title", "headers", "archive_org"]
			
 
				-    # Missing specific entry for WARC
			
 
				-
			
 
				-    extractor_outputs = defaultdict(lambda: None)
			
 
				-    for extractor, _ in EXTRACTORS:
			
 
				-        for result in archive_results:
			
 
				-            if result.extractor == extractor and result:
			
 
				-                extractor_outputs[extractor] = result
			
 
				-
			
 
				-    for extractor, _ in EXTRACTORS:
			
 
				-        if extractor not in exclude:
			
 
				-            existing = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
			
 
				-            # Check filesystsem to see if anything is actually present (too slow, needs optimization/caching)
			
 
				-            # if existing:
			
 
				-            #     existing = (Path(path) / existing)
			
 
				-            #     if existing.is_file():
			
 
				-            #         existing = True
			
 
				-            #     elif existing.is_dir():
			
 
				-            #         existing = any(existing.glob('*.*'))
			
 
				-            output += format_html(output_template, path, canon[f"{extractor}_path"], str(bool(existing)),
			
 
				-                                         extractor, icons.get(extractor, "?"))
			
 
				-        if extractor == "wget":
			
 
				-            # warc isn't technically it's own extractor, so we have to add it after wget
			
 
				-            
			
 
				-            # get from db (faster but less thurthful)
			
 
				-            exists = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
			
 
				-            # get from filesystem (slower but more accurate)
			
 
				-            # exists = list((Path(path) / canon["warc_path"]).glob("*.warc.gz"))
			
 
				-            output += format_html(output_template, path, canon["warc_path"], str(bool(exists)), "warc", icons.get("warc", "?"))
			
 
				-
			
 
				-        if extractor == "archive_org":
			
 
				-            # The check for archive_org is different, so it has to be handled separately
			
 
				-
			
 
				-            # get from db (faster)
			
 
				-            exists = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
			
 
				-            # get from filesystem (slower)
			
 
				-            # target_path = Path(path) / "archive.org.txt"
			
 
				-            # exists = target_path.exists()
			
 
				-            output += '<a href="{}" class="exists-{}" title="{}">{}</a> '.format(canon["archive_org_path"], str(exists),
			
 
				-                                                                                        "archive_org", icons.get("archive_org", "?"))
			
 
				-
			
 
				-    result = format_html('<span class="files-icons" style="font-size: 1.1em; opacity: 0.8; min-width: 240px; display: inline-block">{}<span>', mark_safe(output))
			
 
				-    # end = datetime.now()
			
 
				-    # print(((end - start).total_seconds()*1000) // 1, 'ms')
			
 
				-    return result
			
 
				-
			
 
				-    # return cache.get_or_set(cache_key, calc_snapshot_icons)
			
 
				+    cache_key = f'{str(snapshot.id)[:12]}-{(snapshot.updated or snapshot.added).timestamp()}-snapshot-icons'
			
 
				+    
			
 
				+    def calc_snapshot_icons():
			
 
				+        from core.models import EXTRACTORS
			
 
				+        # start = datetime.now()
			
 
				+
			
 
				+        archive_results = snapshot.archiveresult_set.filter(status="succeeded", output__isnull=False)
			
 
				+        link = snapshot.as_link()
			
 
				+        path = link.archive_path
			
 
				+        canon = link.canonical_outputs()
			
 
				+        output = ""
			
 
				+        output_template = '<a href="/{}/{}" class="exists-{}" title="{}">{}</a> &nbsp;'
			
 
				+        icons = {
			
 
				+            "singlefile": "❶",
			
 
				+            "wget": "🆆",
			
 
				+            "dom": "🅷",
			
 
				+            "pdf": "📄",
			
 
				+            "screenshot": "💻",
			
 
				+            "media": "📼",
			
 
				+            "git": "🅶",
			
 
				+            "archive_org": "🏛",
			
 
				+            "readability": "🆁",
			
 
				+            "mercury": "🅼",
			
 
				+            "warc": "📦"
			
 
				+        }
			
 
				+        exclude = ["favicon", "title", "headers", "archive_org"]
			
 
				+        # Missing specific entry for WARC
			
 
				+
			
 
				+        extractor_outputs = defaultdict(lambda: None)
			
 
				+        for extractor, _ in EXTRACTORS:
			
 
				+            for result in archive_results:
			
 
				+                if result.extractor == extractor and result:
			
 
				+                    extractor_outputs[extractor] = result
			
 
				+
			
 
				+        for extractor, _ in EXTRACTORS:
			
 
				+            if extractor not in exclude:
			
 
				+                existing = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
			
 
				+                # Check filesystem to see if anything is actually present (too slow, needs optimization/caching)
			
 
				+                # if existing:
			
 
				+                #     existing = (Path(path) / existing)
			
 
				+                #     if existing.is_file():
			
 
				+                #         existing = True
			
 
				+                #     elif existing.is_dir():
			
 
				+                #         existing = any(existing.glob('*.*'))
			
 
				+                output += format_html(output_template, path, canon[f"{extractor}_path"], str(bool(existing)),
			
 
				+                                             extractor, icons.get(extractor, "?"))
			
 
				+            if extractor == "wget":
			
 
				+                # warc isn't technically its own extractor, so we have to add it after wget
			
 
				+                
			
 
				+                # get from db (faster but less truthful)
			
 
				+                exists = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
			
 
				+                # get from filesystem (slower but more accurate)
			
 
				+                # exists = list((Path(path) / canon["warc_path"]).glob("*.warc.gz"))
			
 
				+                output += format_html(output_template, path, canon["warc_path"], str(bool(exists)), "warc", icons.get("warc", "?"))
			
 
				+
			
 
				+            if extractor == "archive_org":
			
 
				+                # The check for archive_org is different, so it has to be handled separately
			
 
				+
			
 
				+                # get from db (faster)
			
 
				+                exists = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
			
 
				+                # get from filesystem (slower)
			
 
				+                # target_path = Path(path) / "archive.org.txt"
			
 
				+                # exists = target_path.exists()
			
 
				+                output += '<a href="{}" class="exists-{}" title="{}">{}</a> '.format(canon["archive_org_path"], str(exists),
			
 
				+                                                                                            "archive_org", icons.get("archive_org", "?"))
			
 
				+
			
 
				+        result = format_html('<span class="files-icons" style="font-size: 1.1em; opacity: 0.8; min-width: 240px; display: inline-block">{}<span>', mark_safe(output))
			
 
				+        # end = datetime.now()
			
 
				+        # print(((end - start).total_seconds()*1000) // 1, 'ms')
			
 
				+        return result
			
 
				+
			
 
				+    return cache.get_or_set(cache_key, calc_snapshot_icons)
			
 
				+    # return calc_snapshot_icons()
			
 
				 
			
 
				    
			
--- a/archivebox/index/schema.py
+++ b/archivebox/index/schema.py
@@ -16,6 +16,7 @@ from typing import List, Dict, Any, Optional, Union
 
				 
			
 
				 from dataclasses import dataclass, asdict, field, fields
			
 
				 
			
 
				+from django.utils.functional import cached_property
			
 
				 
			
 
				 from ..system import get_dir_size
			
 
				 
			
@@ -133,7 +134,6 @@ class Link:
 
				     updated: Optional[datetime] = None
			
 
				     schema: str = 'Link'
			
 
				 
			
 
				-
			
 
				     def __str__(self) -> str:
			
 
				         return f'[{self.timestamp}] {self.url} "{self.title}"'