6 سال پیش · 717e390ef6
--- a/archivebox/legacy/index.py
+++ b/archivebox/legacy/index.py
@@ -47,16 +47,6 @@ TITLE_LOADING_MSG = 'Not yet archived...'
 
				 
			
 
				 ### Link filtering and checking
			
 
				 
			
 
				-@enforce_types
			
 
				-def derived_link_info(link: Link) -> dict:
			
 
				-    """extend link info with the archive urls and other derived data"""
			
 
				-
			
 
				-    info = link._asdict(extended=True)
			
 
				-    info.update(link.canonical_outputs())
			
 
				-
			
 
				-    return info
			
 
				-
			
 
				-
			
 
				 @enforce_types
			
 
				 def merge_links(a: Link, b: Link) -> Link:
			
 
				     """deterministically merge two links, favoring longer field values over shorter,
			
--- a/archivebox/legacy/schema.py
+++ b/archivebox/legacy/schema.py
@@ -142,19 +142,27 @@ class Link:
 
				             info.update({
			
 
				                 'link_dir': self.link_dir,
			
 
				                 'archive_path': self.archive_path,
			
 
				-                'bookmarked_date': self.bookmarked_date,
			
 
				-                'updated_date': self.updated_date,
			
 
				+                
			
 
				+                'hash': self.url_hash,
			
 
				+                'base_url': self.base_url,
			
 
				+                'scheme': self.scheme,
			
 
				                 'domain': self.domain,
			
 
				                 'path': self.path,
			
 
				                 'basename': self.basename,
			
 
				                 'extension': self.extension,
			
 
				-                'base_url': self.base_url,
			
 
				                 'is_static': self.is_static,
			
 
				+
			
 
				+                'bookmarked_date': self.bookmarked_date,
			
 
				+                'updated_date': self.updated_date,
			
 
				+                'oldest_archive_date': self.oldest_archive_date,
			
 
				+                'newest_archive_date': self.newest_archive_date,
			
 
				+        
			
 
				                 'is_archived': self.is_archived,
			
 
				                 'num_outputs': self.num_outputs,
			
 
				                 'num_failures': self.num_failures,
			
 
				-                'oldest_archive_date': self.oldest_archive_date,
			
 
				-                'newest_archive_date': self.newest_archive_date,
			
 
				+                
			
 
				+                'latest': self.latest_outputs(),
			
 
				+                'canonical': self.canonical_outputs(),
			
 
				             })
			
 
				         return info
			
 
				 
			
@@ -211,11 +219,16 @@ class Link:
 
				     
			
 
				     ### URL Helpers
			
 
				     @property
			
 
				-    def urlhash(self):
			
 
				+    def url_hash(self):
			
 
				         from .util import hashurl
			
 
				 
			
 
				         return hashurl(self.url)
			
 
				 
			
 
				+    @property
			
 
				+    def scheme(self) -> str:
			
 
				+        from .util import scheme
			
 
				+        return scheme(self.url)
			
 
				+
			
 
				     @property
			
 
				     def extension(self) -> str:
			
 
				         from .util import extension
			
@@ -319,32 +332,35 @@ class Link:
 
				 
			
 
				         return latest
			
 
				 
			
 
				+
			
 
				     def canonical_outputs(self) -> Dict[str, Optional[str]]:
			
 
				+        """predict the expected output paths that should be present after archiving"""
			
 
				+
			
 
				         from .util import wget_output_path
			
 
				         canonical = {
			
 
				-            'index_url': 'index.html',
			
 
				-            'favicon_url': 'favicon.ico',
			
 
				-            'google_favicon_url': 'https://www.google.com/s2/favicons?domain={}'.format(self.domain),
			
 
				-            'archive_url': wget_output_path(self),
			
 
				-            'warc_url': 'warc',
			
 
				-            'pdf_url': 'output.pdf',
			
 
				-            'screenshot_url': 'screenshot.png',
			
 
				-            'dom_url': 'output.html',
			
 
				-            'archive_org_url': 'https://web.archive.org/web/{}'.format(self.base_url),
			
 
				-            'git_url': 'git',
			
 
				-            'media_url': 'media',
			
 
				+            'index_path': 'index.html',
			
 
				+            'favicon_path': 'favicon.ico',
			
 
				+            'google_favicon_path': 'https://www.google.com/s2/favicons?domain={}'.format(self.domain),
			
 
				+            'wget_path': wget_output_path(self),
			
 
				+            'warc_path': 'warc',
			
 
				+            'pdf_path': 'output.pdf',
			
 
				+            'screenshot_path': 'screenshot.png',
			
 
				+            'dom_path': 'output.html',
			
 
				+            'archive_org_path': 'https://web.archive.org/web/{}'.format(self.base_url),
			
 
				+            'git_path': 'git',
			
 
				+            'media_path': 'media',
			
 
				         }
			
 
				         if self.is_static:
			
 
				             # static binary files like PDF and images are handled slightly differently.
			
 
				             # they're just downloaded once and aren't archived separately multiple times, 
			
 
				             # so the wget, screenshot, pdf, & dom paths should all point to the same file
			
 
				 
			
 
				-            static_url = wget_output_path(self)
			
 
				+            static_path = wget_output_path(self)
			
 
				             canonical.update({
			
 
				                 'title': self.basename,
			
 
				-                'archive_url': static_url,
			
 
				-                'pdf_url': static_url,
			
 
				-                'screenshot_url': static_url,
			
 
				-                'dom_url': static_url,
			
 
				+                'wget_path': static_path,
			
 
				+                'pdf_path': static_path,
			
 
				+                'screenshot_path': static_path,
			
 
				+                'dom_path': static_path,
			
 
				             })
			
 
				         return canonical
			
--- a/archivebox/legacy/templates/index_row.html
+++ b/archivebox/legacy/templates/index_row.html
@@ -2,7 +2,7 @@
 
				     <td title="$timestamp">$bookmarked_date</td>
			
 
				     <td class="title-col">
			
 
				         <a href="$archive_path/$index_url"><img src="$favicon_url" class="link-favicon" decoding="async"></a>
			
 
				-        <a href="$archive_path/$archive_url" title="$title">
			
 
				+        <a href="$archive_path/$wget_url" title="$title">
			
 
				             <span data-title-for="$url" data-archived="$is_archived">$title</span>
			
 
				             <small style="float:right">$tags</small>
			
 
				         </a>
			
--- a/archivebox/legacy/util.py
+++ b/archivebox/legacy/util.py
@@ -60,7 +60,6 @@ base_url = lambda url: without_scheme(url)  # uniq base url used to dedupe links
 
				 
			
 
				 without_www = lambda url: url.replace('://www.', '://', 1)
			
 
				 without_trailing_slash = lambda url: url[:-1] if url[-1] == '/' else url.replace('/?', '?')
			
 
				-fuzzy_url = lambda url: without_trailing_slash(without_www(without_scheme(url.lower())))
			
 
				 hashurl = lambda url: base32_encode(int(sha256(base_url(url).encode('utf-8')).hexdigest(), 16))[:20]
			
 
				 
			
 
				 urlencode = lambda s: s and quote(s, encoding='utf-8', errors='replace')
			
@@ -393,6 +392,7 @@ def parse_date(date: Any) -> Optional[datetime]:
 
				                 pass
			
 
				 
			
 
				         if '-' in date:
			
 
				+            # 2019-04-07T05:44:39.227520
			
 
				             try:
			
 
				                 return datetime.fromisoformat(date)
			
 
				             except Exception: