
working runtime type casting and enforcement for a wide range of types

Nick Sweeting, 6 years ago (commit ab09560f14)

3 changed files with 162 additions and 61 deletions
  1. archivebox/index.py   (+18 -11)
  2. archivebox/schema.py  (+13 -4)
  3. archivebox/util.py    (+131 -46)

archivebox/index.py  (+18 -11)

@@ -1,7 +1,6 @@
 import os
 import json
 
-from itertools import chain
 from datetime import datetime
 from string import Template
 from typing import List, Tuple, Iterator, Optional
@@ -20,13 +19,13 @@ from config import (
     FOOTER_INFO,
 )
 from util import (
+    merge_links,
     chmod_file,
     urlencode,
     derived_link_info,
     wget_output_path,
     ExtendedEncoder,
-    check_link_structure,
-    check_links_structure,
+    enforce_types,
 )
 from parse import parse_links
 from links import validate_links
@@ -43,6 +42,7 @@ TITLE_LOADING_MSG = 'Not yet archived...'
 
 ### Homepage index for all the links
 
+@enforce_types
 def write_links_index(out_dir: str, links: List[Link], finished: bool=False) -> None:
     """create index.html file for a given list of links"""
 
@@ -55,8 +55,9 @@ def write_links_index(out_dir: str, links: List[Link], finished: bool=False) -> None:
     log_indexing_started(out_dir, 'index.html')
     write_html_links_index(out_dir, links, finished=finished)
     log_indexing_finished(out_dir, 'index.html')
-    
 
+
+@enforce_types
 def load_links_index(out_dir: str=OUTPUT_DIR, import_path: str=None) -> Tuple[List[Link], List[Link]]:
     """parse and load existing index with any new links from import_path merged in"""
 
@@ -81,6 +82,7 @@ def load_links_index(out_dir: str=OUTPUT_DIR, import_path: str=None) -> Tuple[List[Link], List[Link]]:
     return all_links, new_links
 
 
+@enforce_types
 def write_json_links_index(out_dir: str, links: List[Link]) -> None:
     """write the json link index to a given path"""
 
@@ -114,6 +116,7 @@ def write_json_links_index(out_dir: str, links: List[Link]) -> None:
     chmod_file(path)
 
 
+@enforce_types
 def parse_json_links_index(out_dir: str=OUTPUT_DIR) -> Iterator[Link]:
     """parse a archive index json file and return the list of links"""
 
@@ -121,13 +124,13 @@ def parse_json_links_index(out_dir: str=OUTPUT_DIR) -> Iterator[Link]:
     if os.path.exists(index_path):
         with open(index_path, 'r', encoding='utf-8') as f:
             links = json.load(f)['links']
-            check_links_structure(links)
             for link in links:
                 yield Link(**link)
 
     return ()
 
 
+@enforce_types
 def write_html_links_index(out_dir: str, links: List[Link], finished: bool=False) -> None:
     """write the html link index to a given path"""
 
@@ -151,6 +154,7 @@ def write_html_links_index(out_dir: str, links: List[Link], finished: bool=False) -> None:
                 link.title
                 or (link.base_url if link.is_archived else TITLE_LOADING_MSG)
             ),
+            'tags': link.tags or '',
             'favicon_url': (
                 os.path.join('archive', link.timestamp, 'favicon.ico')
                 # if link['is_archived'] else 'data:image/gif;base64,R0lGODlhAQABAAD/ACwAAAAAAQABAAACADs='
@@ -179,6 +183,7 @@ def write_html_links_index(out_dir: str, links: List[Link], finished: bool=False) -> None:
     chmod_file(path)
 
 
+@enforce_types
 def patch_links_index(link: Link, out_dir: str=OUTPUT_DIR) -> None:
     """hack to in-place update one row's info in the generated index html"""
 
@@ -218,11 +223,13 @@ def patch_links_index(link: Link, out_dir: str=OUTPUT_DIR) -> None:
 
 ### Individual link index
 
+@enforce_types
 def write_link_index(out_dir: str, link: Link) -> None:
     write_json_link_index(out_dir, link)
     write_html_link_index(out_dir, link)
 
 
+@enforce_types
 def write_json_link_index(out_dir: str, link: Link) -> None:
     """write a json file with some info about the link"""
 
@@ -234,29 +241,29 @@ def write_json_link_index(out_dir: str, link: Link) -> None:
     chmod_file(path)
 
 
+@enforce_types
 def parse_json_link_index(out_dir: str) -> Optional[Link]:
     """load the json link index from a given directory"""
     existing_index = os.path.join(out_dir, 'index.json')
     if os.path.exists(existing_index):
         with open(existing_index, 'r', encoding='utf-8') as f:
             link_json = json.load(f)
-            check_link_structure(link_json)
             return Link(**link_json)
     return None
 
 
+@enforce_types
 def load_json_link_index(out_dir: str, link: Link) -> Link:
     """check for an existing link archive in the given directory, 
        and load+merge it into the given link dict
     """
-
     existing_link = parse_json_link_index(out_dir)
-    existing_link = existing_link._asdict() if existing_link else {}
-    new_link = link._asdict()
-
-    return Link(**{**existing_link, **new_link})
+    if existing_link:
+        return merge_links(existing_link, link)
+    return link
 
 
+@enforce_types
 def write_html_link_index(out_dir: str, link: Link) -> None:
     with open(os.path.join(TEMPLATES_DIR, 'link_index.html'), 'r', encoding='utf-8') as f:
         link_html = f.read()
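
With check_link_structure and check_links_structure gone, index rows are now validated by constructing typed Link objects (Link(**row)) and by the @enforce_types decorator defined in archivebox/util.py below. A hedged sketch of the new failure mode, assuming an ArchiveBox checkout with archivebox/ on sys.path; the output path is a placeholder:

    from index import load_links_index, write_links_index

    # rows from index.json are cast via Link(**row) while parsing, so malformed
    # rows now fail at construction time instead of tripping asserts later
    all_links, new_links = load_links_index(out_dir='/path/to/archive')

    write_links_index('/path/to/archive', all_links)  # ok: out_dir matches its str annotation
    write_links_index(42, all_links)                  # TypeError from @enforce_types, before any I/O

Note that only plain-class annotations like str are enforced; parameterized hints such as List[Link] pass through unchecked (see the decorator definition below).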

archivebox/schema.py  (+13 -4)

@@ -39,15 +39,24 @@ class Link:
     tags: Optional[str]
     sources: List[str]
     history: Dict[str, List[ArchiveResult]] = field(default_factory=lambda: {})
-    updated: Optional[str] = None
+    updated: Optional[datetime] = None
 
-    def __hash__(self):
-        return self.urlhash
+    def __post_init__(self):
+        """fix any history result items to be type-checked ArchiveResults"""
+        cast_history = {}
+        for method, method_history in self.history.items():
+            cast_history[method] = []
+            for result in method_history:
+                if isinstance(result, dict):
+                    result = ArchiveResult(**result)
+                cast_history[method].append(result)
+
+        object.__setattr__(self, 'history', cast_history)
 
     def __eq__(self, other):
         if not isinstance(other, Link):
             return NotImplemented
-        return self.url == other.url
+        return self.url == other.url
 
     def __gt__(self, other):
         if not isinstance(other, Link):
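
The new __post_init__ upcasts raw dicts (e.g. a history freshly loaded from JSON) into ArchiveResult instances. The object.__setattr__ call suggests Link is a frozen dataclass, which forbids ordinary attribute assignment; the removed __hash__ returned self.urlhash, a string, which would have raised TypeError whenever a Link was actually hashed. A minimal self-contained sketch of the same casting pattern, using stand-in classes since ArchiveResult's fields are not shown in this diff:

    from dataclasses import dataclass, field
    from typing import Dict, List

    @dataclass(frozen=True)
    class Result:                  # stand-in for ArchiveResult; real fields not shown here
        cmd: str
        status: str

    @dataclass(frozen=True)
    class Page:                    # stand-in for Link, same casting pattern
        url: str
        history: Dict[str, List[Result]] = field(default_factory=dict)

        def __post_init__(self):
            cast = {
                method: [Result(**r) if isinstance(r, dict) else r for r in results]
                for method, results in self.history.items()
            }
            object.__setattr__(self, 'history', cast)  # bypass the frozen __setattr__

    page = Page('https://example.com', {'wget': [{'cmd': 'wget -p ...', 'status': 'succeeded'}]})
    assert isinstance(page.history['wget'][0], Result)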

archivebox/util.py  (+131 -46)

@@ -4,7 +4,9 @@ import sys
 import time
 
 from json import JSONEncoder
-from typing import List, Optional, Iterable
+from typing import List, Optional, Any
+from inspect import signature, _empty
+from functools import wraps
 from hashlib import sha256
 from urllib.request import Request, urlopen
 from urllib.parse import urlparse, quote, unquote
@@ -22,7 +24,7 @@ from subprocess import (
 
 from base32_crockford import encode as base32_encode
 
-from schema import Link, LinkDict, ArchiveResult
+from schema import Link
 from config import (
     ANSI,
     TERM_WIDTH,
@@ -55,26 +57,13 @@ fragment = lambda url: urlparse(url).fragment
 extension = lambda url: basename(url).rsplit('.', 1)[-1].lower() if '.' in basename(url) else ''
 base_url = lambda url: without_scheme(url)  # uniq base url used to dedupe links
 
-
 without_www = lambda url: url.replace('://www.', '://', 1)
 without_trailing_slash = lambda url: url[:-1] if url[-1] == '/' else url.replace('/?', '?')
 fuzzy_url = lambda url: without_trailing_slash(without_www(without_scheme(url.lower())))
 
-short_ts = lambda ts: (
-    str(ts.timestamp()).split('.')[0]
-    if isinstance(ts, datetime) else
-    str(ts).split('.')[0]
-)
-ts_to_date = lambda ts: (
-    ts.strftime('%Y-%m-%d %H:%M')
-    if isinstance(ts, datetime) else
-    datetime.fromtimestamp(float(ts)).strftime('%Y-%m-%d %H:%M')
-)
-ts_to_iso = lambda ts: (
-    ts.isoformat()
-    if isinstance(ts, datetime) else
-    datetime.fromtimestamp(float(ts)).isoformat()
-)
+short_ts = lambda ts: str(parse_date(ts).timestamp()).split('.')[0]
+ts_to_date = lambda ts: parse_date(ts).strftime('%Y-%m-%d %H:%M')
+ts_to_iso = lambda ts: parse_date(ts).isoformat()
 
 urlencode = lambda s: s and quote(s, encoding='utf-8', errors='replace')
 urldecode = lambda s: s and unquote(s)
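
The three timestamp helpers previously each carried their own isinstance branching; they now all funnel through parse_date (added further down in this file), so each one accepts a datetime, an epoch number, or a numeric string interchangeably. Illustrative calls, assuming the helpers are imported from util; exact epoch values depend on the local timezone:

    from datetime import datetime
    from util import short_ts, ts_to_date, ts_to_iso

    dt = datetime(2019, 2, 11, 19, 15)
    short_ts(dt)                    # epoch seconds as a string, e.g. '1549912500'
    ts_to_date(short_ts(dt))        # round-trips back to '2019-02-11 19:15'
    ts_to_iso(float(short_ts(dt)))  # ISO 8601, e.g. '2019-02-11T19:15:00'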
@@ -122,23 +111,46 @@ STATICFILE_EXTENSIONS = {
 
 ### Checks & Tests
 
-def check_link_structure(link: LinkDict) -> None:
-    """basic sanity check invariants to make sure the data is valid"""
-    assert isinstance(link, dict)
-    assert isinstance(link.get('url'), str)
-    assert len(link['url']) > 2
-    assert len(re.findall(URL_REGEX, link['url'])) == 1
-    if 'history' in link:
-        assert isinstance(link['history'], dict), 'history must be a Dict'
-        for key, val in link['history'].items():
-            assert isinstance(key, str)
-            assert isinstance(val, list), 'history must be a Dict[str, List], got: {}'.format(link['history'])
-    
-def check_links_structure(links: Iterable[LinkDict]) -> None:
-    """basic sanity check invariants to make sure the data is valid"""
-    assert isinstance(links, list)
-    if links:
-        check_link_structure(links[0])
+def enforce_types(func):
+    """
+    Checks parameters type signatures against arg and kwarg type hints.
+    """
+
+    @wraps(func)
+    def typechecked_function(*args, **kwargs):
+        sig = signature(func)
+
+        def check_argument_type(arg_key, arg_val):
+            try:
+                annotation = sig.parameters[arg_key].annotation
+            except KeyError:
+                annotation = _empty
+
+            if annotation is not _empty and annotation.__class__ is type:
+                if not isinstance(arg_val, annotation):
+                    raise TypeError(
+                        '{}(..., {}: {}) got unexpected {} argument {}={}'.format(
+                            func.__name__,
+                            arg_key,
+                            annotation.__name__,
+                            type(arg_val).__name__,
+                            arg_key,
+                            arg_val,
+                        )
+                    )
+
+        # check args
+        for arg_val, arg_key in zip(args, sig.parameters):
+            check_argument_type(arg_key, arg_val)
+
+        # check kwargs
+        for arg_key, arg_val in kwargs.items():
+            check_argument_type(arg_key, arg_val)
+
+        return func(*args, **kwargs)
+
+    return typechecked_function
+
 
 def check_url_parsing_invariants() -> None:
     """Check that plain text regex URL parsing works as expected"""
@@ -329,25 +341,98 @@ def str_between(string: str, start: str, end: str=None) -> str:
     return content
 
 
+def parse_date(date: Any) -> Optional[datetime]:
+    """Parse unix timestamps, iso format, and human-readable strings"""
+
+    if isinstance(date, datetime):
+        return date
+
+    if date is None:
+        return None
+
+    if isinstance(date, (float, int)):
+        date = str(date)
+
+    if isinstance(date, str):
+        if date.replace('.', '').isdigit():
+            timestamp = float(date)
+
+            EARLIEST_POSSIBLE = 473403600.0  # 1985
+            LATEST_POSSIBLE = 1735707600.0   # 2025
+
+            if EARLIEST_POSSIBLE < timestamp < LATEST_POSSIBLE:
+                # number is seconds
+                return datetime.fromtimestamp(timestamp)
+            elif EARLIEST_POSSIBLE * 1000 < timestamp < LATEST_POSSIBLE * 1000:
+                # number is milliseconds
+                return datetime.fromtimestamp(timestamp / 1000)
+            elif EARLIEST_POSSIBLE * 1000*1000 < timestamp < LATEST_POSSIBLE * 1000*1000:
+                # number is microseconds
+                return datetime.fromtimestamp(timestamp / (1000*1000))
+
+        if '-' in date:
+            try:
+                return datetime.fromisoformat(date)
+            except Exception:
+                try:
+                    return datetime.strptime(date, '%Y-%m-%d %H:%M')
+                except Exception:
+                    pass
+
+    raise ValueError('Tried to parse invalid date! {}'.format(date))
+
+
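
parse_date guesses the unit of bare numbers by magnitude: values landing between 1985 and 2025 are taken as seconds, a thousand-fold larger as milliseconds, a million-fold larger as microseconds. Note the hardcoded window: epoch seconds after 2025-01-01 fall through every branch and raise ValueError. Illustrative calls, assuming parse_date as defined above (datetime.fromtimestamp renders in the local timezone):

    from datetime import datetime

    parse_date(datetime(2019, 2, 11))  # datetimes pass through unchanged
    parse_date(None)                   # None passes through
    parse_date('1549912500')           # inside the 1985-2025 window: seconds
    parse_date(1549912500 * 1000)      # same instant, detected as milliseconds
    parse_date('2019-02-11T19:15:00')  # ISO 8601 via datetime.fromisoformat
    parse_date('not a date')           # raises ValueError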
 ### Link Helpers
 
+@enforce_types
 def merge_links(a: Link, b: Link) -> Link:
     """deterministically merge two links, favoring longer field values over shorter,
     and "cleaner" values over worse ones.
     """
-    a, b = a._asdict(), b._asdict()
-    longer = lambda key: (a[key] if len(a[key]) > len(b[key]) else b[key]) if (a[key] and b[key]) else (a[key] or b[key])
-    earlier = lambda key: a[key] if a[key] < b[key] else b[key]
-    
-    url = longer('url')
-    longest_title = longer('title')
-    cleanest_title = a['title'] if '://' not in (a['title'] or '') else b['title']
+    assert a.base_url == b.base_url, 'Cannot merge two links with different URLs'
+
+    url = a.url if len(a.url) > len(b.url) else b.url
+
+    possible_titles = [
+        title
+        for title in (a.title, b.title)
+        if title and title.strip() and '://' not in title
+    ]
+    title = None
+    if len(possible_titles) == 2:
+        title = max(possible_titles, key=lambda t: len(t))
+    elif len(possible_titles) == 1:
+        title = possible_titles[0]
+
+    timestamp = (
+        a.timestamp
+        if float(a.timestamp or 0) < float(b.timestamp or 0) else
+        b.timestamp
+    )
+
+    tags_set = (
+        set(tag.strip() for tag in (a.tags or '').split(',') if tag.strip())
+        | set(tag.strip() for tag in (b.tags or '').split(',') if tag.strip())
+    )
+    tags = ','.join(tags_set) or None
+
+    sources = list(set(a.sources + b.sources))
+
+    all_methods = (set(a.history.keys()) | set(b.history.keys()))
+    history = {
+        method: (a.history.get(method) or []) + (b.history.get(method) or [])
+        for method in all_methods
+    }
+
     return Link(
         url=url,
-        timestamp=earlier('timestamp'),
-        title=longest_title if '://' not in (longest_title or '') else cleanest_title,
-        tags=longer('tags'),
-        sources=list(set(a.get('sources', []) + b.get('sources', []))),
+        timestamp=timestamp,
+        title=title,
+        tags=tags,
+        sources=sources,
+        history=history,
     )
 
 def is_static_file(url: str) -> bool:
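
merge_links now operates on Link objects directly instead of round-tripping through _asdict(): it keeps the longer URL, the longest title that is not itself a URL, the earlier timestamp, and the union of tags, sources, and per-method history. A hedged usage sketch, assuming only the Link fields visible in this diff and placeholder values:

    from schema import Link
    from util import merge_links

    a = Link(url='http://example.com/page', timestamp='1549912500.0',
             title=None, tags='news', sources=['feed.rss'], history={})
    b = Link(url='https://example.com/page', timestamp='1549912600.0',
             title='Example Page', tags='archive', sources=['bookmarks.html'], history={})

    merged = merge_links(a, b)
    merged.url                      # 'https://example.com/page' (longer URL wins)
    merged.title                    # 'Example Page' (the only non-empty, non-URL title)
    merged.timestamp                # '1549912500.0' (earlier of the two)
    sorted(merged.tags.split(','))  # ['archive', 'news']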