
working runtime type casting and enforcement for a wide range of types

Nick Sweeting, 6 years ago
parent commit ab09560f14
3 changed files with 162 additions and 61 deletions
  1. archivebox/index.py (+18, -11)
  2. archivebox/schema.py (+13, -4)
  3. archivebox/util.py (+131, -46)

archivebox/index.py (+18, -11)

@@ -1,7 +1,6 @@
 import os
 import json
 
-from itertools import chain
 from datetime import datetime
 from string import Template
 from typing import List, Tuple, Iterator, Optional
@@ -20,13 +19,13 @@ from config import (
     FOOTER_INFO,
 )
 from util import (
+    merge_links,
     chmod_file,
     urlencode,
     derived_link_info,
     wget_output_path,
     ExtendedEncoder,
-    check_link_structure,
-    check_links_structure,
+    enforce_types,
 )
 from parse import parse_links
 from links import validate_links
@@ -43,6 +42,7 @@ TITLE_LOADING_MSG = 'Not yet archived...'
 
 ### Homepage index for all the links
 
+@enforce_types
 def write_links_index(out_dir: str, links: List[Link], finished: bool=False) -> None:
     """create index.html file for a given list of links"""
 
@@ -55,8 +55,9 @@ def write_links_index(out_dir: str, links: List[Link], finished: bool=False) ->
     log_indexing_started(out_dir, 'index.html')
     write_html_links_index(out_dir, links, finished=finished)
     log_indexing_finished(out_dir, 'index.html')
-    
 
+
+@enforce_types
 def load_links_index(out_dir: str=OUTPUT_DIR, import_path: str=None) -> Tuple[List[Link], List[Link]]:
     """parse and load existing index with any new links from import_path merged in"""
 
@@ -81,6 +82,7 @@ def load_links_index(out_dir: str=OUTPUT_DIR, import_path: str=None) -> Tuple[Li
     return all_links, new_links
 
 
+@enforce_types
 def write_json_links_index(out_dir: str, links: List[Link]) -> None:
     """write the json link index to a given path"""
 
@@ -114,6 +116,7 @@ def write_json_links_index(out_dir: str, links: List[Link]) -> None:
     chmod_file(path)
 
 
+@enforce_types
 def parse_json_links_index(out_dir: str=OUTPUT_DIR) -> Iterator[Link]:
     """parse a archive index json file and return the list of links"""
 
@@ -121,13 +124,13 @@ def parse_json_links_index(out_dir: str=OUTPUT_DIR) -> Iterator[Link]:
     if os.path.exists(index_path):
         with open(index_path, 'r', encoding='utf-8') as f:
             links = json.load(f)['links']
-            check_links_structure(links)
             for link in links:
                 yield Link(**link)
 
     return ()
 
 
+@enforce_types
 def write_html_links_index(out_dir: str, links: List[Link], finished: bool=False) -> None:
     """write the html link index to a given path"""
 
@@ -151,6 +154,7 @@ def write_html_links_index(out_dir: str, links: List[Link], finished: bool=False
                 link.title
                 or (link.base_url if link.is_archived else TITLE_LOADING_MSG)
             ),
+            'tags': link.tags or '',
             'favicon_url': (
                 os.path.join('archive', link.timestamp, 'favicon.ico')
                 # if link['is_archived'] else 'data:image/gif;base64,R0lGODlhAQABAAD/ACwAAAAAAQABAAACADs='
@@ -179,6 +183,7 @@ def write_html_links_index(out_dir: str, links: List[Link], finished: bool=False
     chmod_file(path)
 
 
+@enforce_types
 def patch_links_index(link: Link, out_dir: str=OUTPUT_DIR) -> None:
     """hack to in-place update one row's info in the generated index html"""
 
@@ -218,11 +223,13 @@ def patch_links_index(link: Link, out_dir: str=OUTPUT_DIR) -> None:
 
 ### Individual link index
 
+@enforce_types
 def write_link_index(out_dir: str, link: Link) -> None:
     write_json_link_index(out_dir, link)
     write_html_link_index(out_dir, link)
 
 
+@enforce_types
 def write_json_link_index(out_dir: str, link: Link) -> None:
     """write a json file with some info about the link"""
     
@@ -234,29 +241,29 @@ def write_json_link_index(out_dir: str, link: Link) -> None:
     chmod_file(path)
 
 
+@enforce_types
 def parse_json_link_index(out_dir: str) -> Optional[Link]:
     """load the json link index from a given directory"""
     existing_index = os.path.join(out_dir, 'index.json')
     if os.path.exists(existing_index):
         with open(existing_index, 'r', encoding='utf-8') as f:
             link_json = json.load(f)
-            check_link_structure(link_json)
             return Link(**link_json)
     return None
 
 
+@enforce_types
 def load_json_link_index(out_dir: str, link: Link) -> Link:
     """check for an existing link archive in the given directory, 
        and load+merge it into the given link dict
     """
-
     existing_link = parse_json_link_index(out_dir)
-    existing_link = existing_link._asdict() if existing_link else {}
-    new_link = link._asdict()
-
-    return Link(**{**existing_link, **new_link})
+    if existing_link:
+        return merge_links(existing_link, link)
+    return link
 
 
+@enforce_types
 def write_html_link_index(out_dir: str, link: Link) -> None:
     with open(os.path.join(TEMPLATES_DIR, 'link_index.html'), 'r', encoding='utf-8') as f:
         link_html = f.read()
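
As a quick illustration of the decorator now applied throughout index.py, here is a minimal sketch of how @enforce_types behaves at call time (the repeat() function and its arguments are hypothetical, not part of this commit). Note that the decorator only validates parameters annotated with plain classes like str or Link; parameterized generics such as List[Link] have an annotation whose class is not type, so they pass through unchecked:

    from util import enforce_types  # as defined in archivebox/util.py below

    @enforce_types
    def repeat(text: str, times: int=1) -> str:
        # both annotations are plain classes, so both get checked at runtime
        return text * times

    repeat('na', times=2)   # ok -> 'nana'
    repeat(None)            # raises TypeError:
    # repeat(..., text: str) got unexpected NoneType argument text=None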

archivebox/schema.py (+13, -4)

@@ -39,15 +39,24 @@ class Link:
     tags: Optional[str]
     sources: List[str]
     history: Dict[str, List[ArchiveResult]] = field(default_factory=lambda: {})
-    updated: Optional[str] = None
+    updated: Optional[datetime] = None
 
-    def __hash__(self):
-        return self.urlhash
+    def __post_init__(self):
+        """fix any history result items to be type-checked ArchiveResults"""
+        cast_history = {}
+        for method, method_history in self.history.items():
+            cast_history[method] = []
+            for result in method_history:
+                if isinstance(result, dict):
+                    result = ArchiveResult(**result)
+                cast_history[method].append(result)
+
+        object.__setattr__(self, 'history', cast_history)
 
     def __eq__(self, other):
         if not isinstance(other, Link):
             return NotImplemented
-        return self.urlhash == other.urlhash
+        return self.url == other.url
 
     def __gt__(self, other):
         if not isinstance(other, Link):
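
The __post_init__ hook above writes through object.__setattr__, which suggests Link is a frozen dataclass (plain attribute assignment would raise FrozenInstanceError there). A minimal sketch of the same casting pattern, using hypothetical Page/Result stand-ins for Link/ArchiveResult:

    from dataclasses import dataclass, field
    from typing import Dict, List

    @dataclass(frozen=True)
    class Result:
        status: str

    @dataclass(frozen=True)
    class Page:
        history: Dict[str, List[Result]] = field(default_factory=dict)

        def __post_init__(self):
            # re-cast any raw dicts (e.g. loaded from JSON) into Results
            cast = {
                method: [Result(**r) if isinstance(r, dict) else r
                         for r in results]
                for method, results in self.history.items()
            }
            # frozen dataclasses forbid self.history = ..., so bypass
            # __setattr__ the same way Link.__post_init__ does above
            object.__setattr__(self, 'history', cast)

    page = Page(history={'wget': [{'status': 'succeeded'}]})
    assert page.history['wget'][0] == Result(status='succeeded')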

archivebox/util.py (+131, -46)

@@ -4,7 +4,9 @@ import sys
 import time
 
 from json import JSONEncoder
-from typing import List, Optional, Iterable
+from typing import List, Optional, Any
+from inspect import signature, _empty
+from functools import wraps
 from hashlib import sha256
 from urllib.request import Request, urlopen
 from urllib.parse import urlparse, quote, unquote
@@ -22,7 +24,7 @@ from subprocess import (
 
 from base32_crockford import encode as base32_encode
 
-from schema import Link, LinkDict, ArchiveResult
+from schema import Link
 from config import (
     ANSI,
     TERM_WIDTH,
@@ -55,26 +57,13 @@ fragment = lambda url: urlparse(url).fragment
 extension = lambda url: basename(url).rsplit('.', 1)[-1].lower() if '.' in basename(url) else ''
 base_url = lambda url: without_scheme(url)  # uniq base url used to dedupe links
 
-
 without_www = lambda url: url.replace('://www.', '://', 1)
 without_trailing_slash = lambda url: url[:-1] if url[-1] == '/' else url.replace('/?', '?')
 fuzzy_url = lambda url: without_trailing_slash(without_www(without_scheme(url.lower())))
 
-short_ts = lambda ts: (
-    str(ts.timestamp()).split('.')[0]
-    if isinstance(ts, datetime) else
-    str(ts).split('.')[0]
-)
-ts_to_date = lambda ts: (
-    ts.strftime('%Y-%m-%d %H:%M')
-    if isinstance(ts, datetime) else
-    datetime.fromtimestamp(float(ts)).strftime('%Y-%m-%d %H:%M')
-)
-ts_to_iso = lambda ts: (
-    ts.isoformat()
-    if isinstance(ts, datetime) else
-    datetime.fromtimestamp(float(ts)).isoformat()
-)
+short_ts = lambda ts: str(parse_date(ts).timestamp()).split('.')[0]
+ts_to_date = lambda ts: parse_date(ts).strftime('%Y-%m-%d %H:%M')
+ts_to_iso = lambda ts: parse_date(ts).isoformat()
 
 urlencode = lambda s: s and quote(s, encoding='utf-8', errors='replace')
 urldecode = lambda s: s and unquote(s)
@@ -122,23 +111,46 @@ STATICFILE_EXTENSIONS = {
 
 ### Checks & Tests
 
-def check_link_structure(link: LinkDict) -> None:
-    """basic sanity check invariants to make sure the data is valid"""
-    assert isinstance(link, dict)
-    assert isinstance(link.get('url'), str)
-    assert len(link['url']) > 2
-    assert len(re.findall(URL_REGEX, link['url'])) == 1
-    if 'history' in link:
-        assert isinstance(link['history'], dict), 'history must be a Dict'
-        for key, val in link['history'].items():
-            assert isinstance(key, str)
-            assert isinstance(val, list), 'history must be a Dict[str, List], got: {}'.format(link['history'])
-    
-def check_links_structure(links: Iterable[LinkDict]) -> None:
-    """basic sanity check invariants to make sure the data is valid"""
-    assert isinstance(links, list)
-    if links:
-        check_link_structure(links[0])
+def enforce_types(func):
+    """
+    Checks parameters type signatures against arg and kwarg type hints.
+    """
+
+    @wraps(func)
+    def typechecked_function(*args, **kwargs):
+        sig = signature(func)
+
+        def check_argument_type(arg_key, arg_val):
+            try:
+                annotation = sig.parameters[arg_key].annotation
+            except KeyError:
+                annotation = _empty
+
+            if annotation is not _empty and annotation.__class__ is type:
+                if not isinstance(arg_val, annotation):
+                    raise TypeError(
+                        '{}(..., {}: {}) got unexpected {} argument {}={}'.format(
+                            func.__name__,
+                            arg_key,
+                            annotation.__name__,
+                            type(arg_val).__name__,
+                            arg_key,
+                            arg_val,
+                        )
+                    )
+
+        # check args
+        for arg_val, arg_key in zip(args, sig.parameters):
+            check_argument_type(arg_key, arg_val)
+
+        # check kwargs
+        for arg_key, arg_val in kwargs.items():
+            check_argument_type(arg_key, arg_val)
+
+        return func(*args, **kwargs)
+
+    return typechecked_function
+
 
 def check_url_parsing_invariants() -> None:
     """Check that plain text regex URL parsing works as expected"""
@@ -329,25 +341,98 @@ def str_between(string: str, start: str, end: str=None) -> str:
     return content
 
 
+def parse_date(date: Any) -> Optional[datetime]:
+    """Parse unix timestamps, iso format, and human-readable strings"""
+    
+    if isinstance(date, datetime):
+        return date
+
+    if date is None:
+        return None
+    
+    if isinstance(date, (float, int)):
+        date = str(date)
+
+    if isinstance(date, str):
+        if date.replace('.', '').isdigit():
+            timestamp = float(date)
+
+            EARLIEST_POSSIBLE = 473403600.0  # 1985
+            LATEST_POSSIBLE = 1735707600.0   # 2025
+
+            if EARLIEST_POSSIBLE < timestamp < LATEST_POSSIBLE:
+                # number is seconds
+                return datetime.fromtimestamp(timestamp)
+            elif EARLIEST_POSSIBLE * 1000 < timestamp < LATEST_POSSIBLE * 1000:
+                # number is milliseconds
+                return datetime.fromtimestamp(timestamp / 1000)
+
+            elif EARLIEST_POSSIBLE * 1000*1000 < timestamp < LATEST_POSSIBLE * 1000*1000:
+                # number is microseconds
+                return datetime.fromtimestamp(timestamp / (1000*1000))
+
+        if '-' in date:
+            try:
+                return datetime.fromisoformat(date)
+            except Exception:
+                try:
+                    return datetime.strptime(date, '%Y-%m-%d %H:%M')
+                except Exception:
+                    pass
+    
+    raise ValueError('Tried to parse invalid date! {}'.format(date))
+
+
+
 ### Link Helpers
 
+@enforce_types
 def merge_links(a: Link, b: Link) -> Link:
     """deterministially merge two links, favoring longer field values over shorter,
     and "cleaner" values over worse ones.
     """
-    a, b = a._asdict(), b._asdict()
-    longer = lambda key: (a[key] if len(a[key]) > len(b[key]) else b[key]) if (a[key] and b[key]) else (a[key] or b[key])
-    earlier = lambda key: a[key] if a[key] < b[key] else b[key]
-    
-    url = longer('url')
-    longest_title = longer('title')
-    cleanest_title = a['title'] if '://' not in (a['title'] or '') else b['title']
+    assert a.base_url == b.base_url, 'Cannot merge two links with different URLs'
+
+    url = a.url if len(a.url) > len(b.url) else b.url
+
+    possible_titles = [
+        title
+        for title in (a.title, b.title)
+        if title and title.strip() and '://' not in title
+    ]
+    title = None
+    if len(possible_titles) == 2:
+        title = max(possible_titles, key=lambda t: len(t))
+    elif len(possible_titles) == 1:
+        title = possible_titles[0]
+
+    timestamp = (
+        a.timestamp
+        if float(a.timestamp or 0) < float(b.timestamp or 0) else
+        b.timestamp
+    )
+
+    tags_set = (
+        set(tag.strip() for tag in (a.tags or '').split(','))
+        | set(tag.strip() for tag in (b.tags or '').split(','))
+    )
+    tags = ','.join(tags_set) or None
+
+    sources = list(set(a.sources + b.sources))
+
+    all_methods = (set(a.history.keys()) | set(b.history.keys()))
+    history = {
+        method: (a.history.get(method) or []) + (b.history.get(method) or [])
+        for method in all_methods
+    }
+
     return Link(
         url=url,
-        timestamp=earlier('timestamp'),
-        title=longest_title if '://' not in (longest_title or '') else cleanest_title,
-        tags=longer('tags'),
-        sources=list(set(a.get('sources', []) + b.get('sources', []))),
+        timestamp=timestamp,
+        title=title,
+        tags=tags,
+        sources=sources,
+        history=history,
     )
 
 def is_static_file(url: str) -> bool:
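
The magnitude windows in parse_date() deserve a quick demonstration: the same instant written as a unix timestamp in seconds, milliseconds, or microseconds falls into exactly one of the 1985-2025 ranges, so all three spellings parse to the same datetime. A hedged sketch (the import path is illustrative, and results are in local time since datetime.fromtimestamp is timezone-naive):

    from util import parse_date

    same_moment = [
        parse_date('1556134132'),        # seconds
        parse_date('1556134132000'),     # milliseconds
        parse_date('1556134132000000'),  # microseconds
    ]
    assert len(set(same_moment)) == 1    # one instant, three spellings

    parse_date('2019-04-24 19:08')       # handled by the '-' branch fallbacks
    parse_date(None)                     # passes through as None
    parse_date('not a date')             # raises ValueError: Tried to parse invalid date!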