Browse Source

automatically create storage directories and symlinks based on ulid

Nick Sweeting 1 year ago
parent
commit
ce833e8ead
1 changed files with 57 additions and 4 deletions
  1. 57 4
      archivebox/core/models.py

+ 57 - 4
archivebox/core/models.py

@@ -21,7 +21,7 @@ from django.contrib.auth.models import User   # noqa
 
 from ..config import ARCHIVE_DIR, ARCHIVE_DIR_NAME
 from ..system import get_dir_size
-from ..util import parse_date, base_url, hashurl
+from ..util import parse_date, base_url, hashurl, domain
 from ..index.schema import Link
 from ..index.html import snapshot_icons
 from ..extractors import get_default_archive_methods, ARCHIVE_METHODS_INDEXING_PRECEDENCE, EXTRACTORS
@@ -206,9 +206,7 @@ class Snapshot(models.Model):
     @cached_property
     def url_hash(self):
         # return hashurl(self.url)
-        url_hash = hashlib.new('sha256')
-        url_hash.update(self.url.encode('utf-8'))
-        return url_hash.hexdigest()[:16]
+        return hashlib.sha256(self.url.encode('utf-8')).hexdigest()[:16].upper()
 
     @cached_property
     def base_url(self):
@@ -301,6 +299,31 @@ class Snapshot(models.Model):
         self.tags.add(*tags_id)
 
 
+    def get_storage_dir(self, create=True, symlink=True) -> Path:
+        date_str = self.added.strftime('%Y%m%d')
+        domain_str = domain(self.url)
+        abs_storage_dir = Path(ARCHIVE_DIR) / 'snapshots' / date_str / domain_str / str(self.ulid)
+
+        if create and not abs_storage_dir.is_dir():
+            abs_storage_dir.mkdir(parents=True, exist_ok=True)
+
+        if symlink:
+            LINK_PATHS = [
+                Path(ARCHIVE_DIR).parent / 'index' / 'all_by_id' / str(self.ulid),
+                Path(ARCHIVE_DIR).parent / 'index' / 'snapshots_by_id' / str(self.ulid),
+                Path(ARCHIVE_DIR).parent / 'index' / 'snapshots_by_domain' / domain_str / date_str / str(self.ulid),
+                Path(ARCHIVE_DIR).parent / 'index' / 'snapshots_by_date' / date_str / domain_str / str(self.ulid),
+            ]
+            for link_path in LINK_PATHS:
+                link_path.parent.mkdir(parents=True, exist_ok=True)
+                try:
+                    link_path.symlink_to(abs_storage_dir)
+                except FileExistsError:
+                    link_path.unlink()
+                    link_path.symlink_to(abs_storage_dir)
+
+        return abs_storage_dir
+
 class ArchiveResultManager(models.Manager):
     def indexable(self, sorted: bool = True):
         INDEXABLE_METHODS = [ r[0] for r in ARCHIVE_METHODS_INDEXING_PRECEDENCE ]
@@ -397,3 +420,33 @@ class ArchiveResult(models.Model):
 
     def output_exists(self) -> bool:
         return Path(self.output_path()).exists()
+
+
+    def get_storage_dir(self, create=True, symlink=True):
+        date_str = self.snapshot.added.strftime('%Y%m%d')
+        domain_str = domain(self.snapshot.url)
+        abs_storage_dir = Path(ARCHIVE_DIR) / 'results' / date_str / domain_str / str(self.ulid)
+
+        if create and not abs_storage_dir.is_dir():
+            abs_storage_dir.mkdir(parents=True, exist_ok=True)
+
+        if symlink:
+            LINK_PATHS = [
+                Path(ARCHIVE_DIR).parent / 'index' / 'all_by_id' / str(self.ulid),
+                Path(ARCHIVE_DIR).parent / 'index' / 'results_by_id' / str(self.ulid),
+                Path(ARCHIVE_DIR).parent / 'index' / 'results_by_date' / date_str / domain_str / self.extractor / str(self.ulid),
+                Path(ARCHIVE_DIR).parent / 'index' / 'results_by_domain' / domain_str / date_str / self.extractor / str(self.ulid),
+                Path(ARCHIVE_DIR).parent / 'index' / 'results_by_type' / self.extractor / date_str / domain_str / str(self.ulid),
+            ]
+            for link_path in LINK_PATHS:
+                link_path.parent.mkdir(parents=True, exist_ok=True)
+                try:
+                    link_path.symlink_to(abs_storage_dir)
+                except FileExistsError:
+                    link_path.unlink()
+                    link_path.symlink_to(abs_storage_dir)
+
+        return abs_storage_dir
+
+    def symlink_index(self, create=True):
+        abs_result_dir = self.get_storage_dir(create=create)