
better link corruption guards, remove title prefetching, save index after run

Nick Sweeting, 6 years ago
commit b03e9fade8
6 changed files with 165 additions and 93 deletions
  1. archivebox/archive.py (+49, -15)
  2. archivebox/archive_methods.py (+21, -39)
  3. archivebox/index.py (+36, -2)
  4. archivebox/links.py (+13, -9)
  5. archivebox/parse.py (+9, -13)
  6. archivebox/util.py (+37, -15)

archivebox/archive.py (+49, -15)

@@ -7,34 +7,31 @@ import os
 import sys
 
 from datetime import datetime
-from subprocess import run
+from peekable import Peekable
+
 
 from parse import parse_links
-from links import validate_links
-from archive_methods import archive_links, _RESULTS_TOTALS
+from links import validate_links, links_after_timestamp
+from archive_methods import archive_link, _RESULTS_TOTALS
 from index import (
     write_links_index,
-    write_link_index,
     parse_json_links_index,
-    parse_json_link_index,
 )
 from config import (
+    ARCHIVE_DIR,
     ONLY_NEW,
-    OUTPUT_PERMISSIONS,
     OUTPUT_DIR,
     REPO_DIR,
     ANSI,
-    TIMEOUT,
-    SHOW_PROGRESS,
     GIT_SHA,
 )
 from util import (
+    check_dependencies,
     download_url,
     save_source,
-    progress,
-    cleanup_archive,
     pretty_path,
     migrate_data,
+    check_links_structure,
 )
 
 __AUTHOR__ = 'Nick Sweeting <[email protected]>'
@@ -42,6 +39,7 @@ __VERSION__ = GIT_SHA
 __DESCRIPTION__ = 'ArchiveBox Usage:  Create a browsable html archive of a list of links.'
 __DOCUMENTATION__ = 'https://github.com/pirate/ArchiveBox/wiki'
 
+
 def print_help():
     print(__DESCRIPTION__)
     print("Documentation:     {}\n".format(__DOCUMENTATION__))
@@ -55,21 +53,22 @@ def print_help():
 
 def load_links(archive_path=OUTPUT_DIR, import_path=None):
     """get new links from file and optionally append them to links in existing archive"""
-    
+
     existing_links = []
     if archive_path:
         existing_links = parse_json_links_index(archive_path)
+        check_links_structure(existing_links)
 
     new_links = []
     if import_path:
         # parse and validate the import file
         raw_links, parser_name = parse_links(import_path)
         new_links = validate_links(raw_links)
-        if SHOW_PROGRESS:
-            print()
+        check_links_structure(new_links)
 
     # merge existing links in archive_path and new links
     all_links = validate_links(existing_links + new_links)
+    check_links_structure(all_links)
     num_new_links = len(all_links) - len(existing_links)
 
     if import_path and parser_name:
@@ -81,6 +80,7 @@ def load_links(archive_path=OUTPUT_DIR, import_path=None):
 
     return all_links, new_links
 
+
 def update_archive(archive_path, links, source=None, resume=None, append=True):
     """update or create index.html+json given a path to an export file containing new links"""
 
@@ -99,8 +99,38 @@ def update_archive(archive_path, links, source=None, resume=None, append=True):
              **ANSI,
         ))
 
+    check_links_structure(links)
+
+    # prefetch the first link off the generator so that if we pause or fail
+    # immediately we can show that we paused on the first link and not just None
+    to_archive = Peekable(links_after_timestamp(links, resume))
+    idx, link = 0, to_archive.peek(0)
+
     # loop over links and archive them
-    archive_links(archive_path, links, source=source, resume=resume)
+    try:
+        check_dependencies()
+        for idx, link in enumerate(to_archive):
+            link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
+            archive_link(link_dir, link)
+
+    except (KeyboardInterrupt, SystemExit, Exception) as e:
+        print('\n{lightyellow}[X] [{now}] Downloading paused on link {timestamp} ({idx}/{total}){reset}'.format(
+            **ANSI,
+            now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
+            idx=idx+1,
+            timestamp=link['timestamp'],
+            total=len(links),
+        ))
+        print('    To view your archive, open: {}/index.html'.format(OUTPUT_DIR.replace(REPO_DIR + '/', '')))
+        print('    Continue where you left off by running:')
+        print('        {} {}'.format(
+            pretty_path(sys.argv[0]),
+            link['timestamp'],
+        ))
+        if not isinstance(e, KeyboardInterrupt):
+            print()
+            raise e
+        raise SystemExit(1)
 
     # print timing information & summary
     end_ts = datetime.now().timestamp()
@@ -135,7 +165,7 @@ if __name__ == '__main__':
     source = sys.argv[1] if argc > 1 else None  # path of links file to import
     resume = sys.argv[2] if argc > 2 else None  # timestamp to resume downloading from
    
-    stdin_raw_text = []
+    stdin_raw_text = ''
 
     if not sys.stdin.isatty():
         stdin_raw_text = sys.stdin.read()
@@ -192,3 +222,7 @@ if __name__ == '__main__':
         update_archive(out_dir, new_links, source=source, resume=resume, append=True)
     else:
         update_archive(out_dir, all_links, source=source, resume=resume, append=True)
+
+    # Step 5: Re-write links index with updated titles, icons, and resources
+    all_links, _ = load_links(archive_path=out_dir)
+    write_links_index(out_dir=out_dir, links=all_links)

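The archiving loop now lives in update_archive() itself: it wraps the resumable link stream in a Peekable and peeks at the first link before iterating, so an interrupt that lands before (or during) the first iteration can still report which timestamp it paused on. The local peekable module isn't part of this diff, so the stand-in class below is an assumption; it only sketches the peek-then-iterate pattern.

```python
import itertools

class PeekableSketch:
    """minimal stand-in for the peekable helper (assumption: the real one
    supports .peek(default) plus ordinary iteration without losing items)"""
    def __init__(self, iterable):
        self._it = iter(iterable)

    def peek(self, default=None):
        # pull the next item, then push it back onto the front of the iterator
        try:
            first = next(self._it)
        except StopIteration:
            return default
        self._it = itertools.chain([first], self._it)
        return first

    def __iter__(self):
        return self._it

# mirrors update_archive(): grab the first link up front so the except-block
# can print link['timestamp'] even if archiving fails immediately
links = [{'timestamp': '1544666239', 'url': 'https://example.com'}]  # hypothetical link
to_archive = PeekableSketch(links)
idx, link = 0, to_archive.peek(None)
for idx, link in enumerate(to_archive):
    print(idx, link['timestamp'])  # archive_link(link_dir, link) in the real code
```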
archivebox/archive_methods.py (+21, -39)

@@ -1,16 +1,17 @@
 import os
-import re
-import sys
 
 from functools import wraps
 from collections import defaultdict
 from datetime import datetime
 
-from peekable import Peekable
-
-from index import wget_output_path, parse_json_link_index, write_link_index
-from links import links_after_timestamp
+from index import (
+    wget_output_path,
+    parse_json_link_index,
+    write_link_index,
+    patch_index_title_hack,
+)
 from config import (
+    OUTPUT_DIR,
     CURL_BINARY,
     GIT_BINARY,
     WGET_BINARY,
@@ -42,12 +43,12 @@ from config import (
 )
 from util import (
     without_fragment,
-    check_dependencies,
     fetch_page_title,
     progress,
     chmod_file,
     pretty_path,
-    run, PIPE, DEVNULL
+    check_link_structure,
+    run, PIPE, DEVNULL,
 )
 
 
@@ -57,38 +58,12 @@ _RESULTS_TOTALS = {   # globals are bad, mmkay
     'failed': 0,
 }
 
-def archive_links(archive_path, links, source=None, resume=None):
-    check_dependencies()
-
-    to_archive = Peekable(links_after_timestamp(links, resume))
-    idx, link = 0, to_archive.peek(0)
-
-    try:
-        for idx, link in enumerate(to_archive):
-            link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
-            archive_link(link_dir, link)
-    
-    except (KeyboardInterrupt, SystemExit, Exception) as e:
-        print('{lightyellow}[X] [{now}] Downloading paused on link {timestamp} ({idx}/{total}){reset}'.format(
-            **ANSI,
-            now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
-            idx=idx+1,
-            timestamp=link['timestamp'],
-            total=len(links),
-        ))
-        print('    Continue where you left off by running:')
-        print('        {} {}'.format(
-            pretty_path(sys.argv[0]),
-            link['timestamp'],
-        ))
-        if not isinstance(e, KeyboardInterrupt):
-            raise e
-        raise SystemExit(1)
-
 
 def archive_link(link_dir, link, overwrite=True):
     """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp"""
 
+    check_link_structure(link)
+
     try:
         update_existing = os.path.exists(link_dir)
         if update_existing:
@@ -99,7 +74,7 @@ def archive_link(link_dir, link, overwrite=True):
         else:
             os.makedirs(link_dir)
         
-        log_link_archive(link_dir, link, update_existing)
+        print_link_status_line(link_dir, link, update_existing)
 
         if FETCH_FAVICON:
             link = fetch_favicon(link_dir, link, overwrite=overwrite)
@@ -135,7 +110,7 @@ def archive_link(link_dir, link, overwrite=True):
     
     return link
 
-def log_link_archive(link_dir, link, update_existing):
+def print_link_status_line(link_dir, link, update_existing):
     print('[{symbol_color}{symbol}{reset}] [{now}] "{title}"\n    {blue}{url}{reset}'.format(
         symbol='*' if update_existing else '+',
         symbol_color=ANSI['black' if update_existing else 'green'],
@@ -518,7 +493,7 @@ def fetch_title(link_dir, link, timeout=TIMEOUT):
 
     # if link already has valid title, skip it
     if link['title'] and not link['title'].lower().startswith('http'):
-        return {'output': link['title'], 'cmd': 'fetch_page_title("{}")'.format(link['url'])}
+        return {'output': link['title'], 'status': 'skipped'}
 
     end = progress(timeout, prefix='      ')
     try:
@@ -530,6 +505,13 @@ def fetch_title(link_dir, link, timeout=TIMEOUT):
         print('        {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
         output = e
 
+    # titles should show up in the global index immediately for better UX,
+    # do a hacky immediate replacement to add them in as we're archiving
+    # TODO: figure out how to do this without gnarly string replacement
+    if title:
+        link['title'] = title
+        patch_index_title_hack(link['url'], title)
+
     return {
         'cmd': 'fetch_page_title("{}")'.format(link['url']),
         'output': output,

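The net effect of the fetch_title() changes: links that already carry a usable title are skipped outright, and a freshly fetched title is pushed into the global index right away via patch_index_title_hack() rather than waiting for the end-of-run index rewrite. A stripped-down sketch of that control flow (timeout handling and progress output omitted), assuming fetch_page_title() returns a string or None:

```python
def fetch_title_sketch(link, fetch_page_title, patch_index_title_hack):
    # skip links whose title is already set and isn't just the URL echoed back
    if link['title'] and not link['title'].lower().startswith('http'):
        return {'output': link['title'], 'status': 'skipped'}

    title = fetch_page_title(link['url'])  # returns None if the fetch or regex match fails

    # hacky immediate index update so the title shows up without a full re-index
    if title:
        link['title'] = title
        patch_index_title_hack(link['url'], title)

    return {
        'cmd': 'fetch_page_title("{}")'.format(link['url']),
        'output': title,
    }
```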
archivebox/index.py (+36, -2)

@@ -6,6 +6,7 @@ from string import Template
 from distutils.dir_util import copy_tree
 
 from config import (
+    OUTPUT_DIR,
     TEMPLATES_DIR,
     OUTPUT_PERMISSIONS,
     ANSI,
@@ -17,6 +18,8 @@ from util import (
     wget_output_path,
     derived_link_info,
     pretty_path,
+    check_link_structure,
+    check_links_structure,
 )
 
 
@@ -25,6 +28,8 @@ from util import (
 def write_links_index(out_dir, links):
     """create index.html file for a given list of links"""
 
+    check_links_structure(links)
+
     if not os.path.exists(out_dir):
         os.makedirs(out_dir)
 
@@ -42,6 +47,8 @@ def write_links_index(out_dir, links):
 def write_json_links_index(out_dir, links):
     """write the json link index to a given path"""
 
+    check_links_structure(links)
+
     path = os.path.join(out_dir, 'index.json')
 
     index_json = {
@@ -63,13 +70,17 @@ def parse_json_links_index(out_dir):
     index_path = os.path.join(out_dir, 'index.json')
     if os.path.exists(index_path):
         with open(index_path, 'r', encoding='utf-8') as f:
-            return json.load(f)['links']
+            links = json.load(f)['links']
+            check_links_structure(links)
+            return links
 
     return []
 
 def write_html_links_index(out_dir, links):
     """write the html link index to a given path"""
 
+    check_links_structure(links)
+
     path = os.path.join(out_dir, 'index.html')
 
     copy_tree(os.path.join(TEMPLATES_DIR, 'static'), os.path.join(out_dir, 'static'))
@@ -104,6 +115,25 @@ def write_html_links_index(out_dir, links):
     chmod_file(path)
 
 
+def patch_index_title_hack(link_url, new_title):
+    """hack to update just one link's title in the link index json"""
+
+    json_path = os.path.join(OUTPUT_DIR, 'index.json')
+
+    links = parse_json_links_index(OUTPUT_DIR)
+
+    changed = False
+    for link in links:
+        if link['url'] == link_url:
+            link['title'] = new_title
+            changed = True
+            break
+
+    if changed:
+        write_json_links_index(OUTPUT_DIR, links)
+
+
+
 ### Individual link index
 
 def write_link_index(out_dir, link):
@@ -114,6 +144,7 @@ def write_link_index(out_dir, link):
 def write_json_link_index(out_dir, link):
     """write a json file with some info about the link"""
     
+    check_link_structure(link)
     path = os.path.join(out_dir, 'index.json')
 
     print('      √ index.json')
@@ -128,10 +159,13 @@ def parse_json_link_index(out_dir):
     existing_index = os.path.join(out_dir, 'index.json')
     if os.path.exists(existing_index):
         with open(existing_index, 'r', encoding='utf-8') as f:
-            return json.load(f)
+            link_json = json.load(f)
+            check_link_structure(link_json)
+            return link_json
     return {}
 
 def write_html_link_index(out_dir, link):
+    check_link_structure(link)
     with open(os.path.join(TEMPLATES_DIR, 'link_index_fancy.html'), 'r', encoding='utf-8') as f:
         link_html = f.read()
 

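Both index readers now validate what they load before handing it back, and patch_index_title_hack() gives archive_methods a way to rewrite a single title in OUTPUT_DIR/index.json in place. Rough usage inside the repo, with an illustrative URL and title:

```python
from index import parse_json_links_index, patch_index_title_hack
from config import OUTPUT_DIR

# fails fast via assert if the stored links aren't a list of dicts with string urls
links = parse_json_links_index(OUTPUT_DIR)

# update one link's title in OUTPUT_DIR/index.json without rewriting anything else
patch_index_title_hack('https://example.com', 'Example Domain')  # hypothetical values
```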
archivebox/links.py (+13, -9)

@@ -32,34 +32,33 @@ Link {
 
 """
 
-import datetime
 from html import unescape
 from collections import OrderedDict
 
 from util import (
-    domain,
-    base_url,
-    str_between,
-    get_link_type,
     merge_links,
     wget_output_path,
+    check_link_structure,
+    check_links_structure,
 )
-from config import ANSI
 
 
 def validate_links(links):
+    check_links_structure(links)
     links = archivable_links(links)  # remove chrome://, about:, mailto: etc.
     links = uniquefied_links(links)  # merge/dedupe duplicate timestamps & urls
 links = sorted_links(links)      # deterministically sort the links based on timestamp, url
-    
+
     if not links:
         print('[X] No links found :(')
         raise SystemExit(1)
 
     for link in links:
+        check_link_structure(link)
+
         link['title'] = unescape(link['title']) if link['title'] else None
         link['latest'] = link.get('latest') or {}
-        
+
         latest = link['latest']
         if not link['latest'].get('wget'):
             link['latest']['wget'] = wget_output_path(link)
@@ -81,14 +80,16 @@ def validate_links(links):
 
     return list(links)
 
+
 def archivable_links(links):
     """remove chrome://, about:// or other schemed links that cant be archived"""
     return (
         link
         for link in links
-        if any(link['url'].startswith(s) for s in ('http://', 'https://', 'ftp://'))
+        if any(link['url'].lower().startswith(s) for s in ('http://', 'https://', 'ftp://'))
     )
 
+
 def uniquefied_links(sorted_links):
     """
     ensures that all non-duplicate links have monotonically increasing timestamps
@@ -114,10 +115,12 @@ def uniquefied_links(sorted_links):
 
     return unique_timestamps.values()
 
+
 def sorted_links(links):
     sort_func = lambda link: (link['timestamp'].split('.', 1)[0], link['url'])
     return sorted(links, key=sort_func, reverse=True)
 
+
 def links_after_timestamp(links, timestamp=None):
     if not timestamp:
         yield from links
@@ -130,6 +133,7 @@ def links_after_timestamp(links, timestamp=None):
         except (ValueError, TypeError):
             print('Resume value and all timestamp values must be valid numbers.')
 
+
 def lowest_uniq_timestamp(used_timestamps, timestamp):
     """resolve duplicate timestamps by appending a decimal 1234, 1234 -> 1234.1, 1234.2"""
 

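lowest_uniq_timestamp() isn't expanded in this hunk, but its docstring pins down the behavior: duplicates of a base timestamp get the lowest unused decimal suffix (1234, 1234 -> 1234.1, 1234.2). One way that could look, purely as an illustration:

```python
def lowest_uniq_timestamp_sketch(used_timestamps, timestamp):
    """illustrative only: return the lowest unused 'base.N' variant of a timestamp"""
    base = timestamp.split('.', 1)[0]
    if base not in used_timestamps:
        return base

    nonce = 1
    while '{}.{}'.format(base, nonce) in used_timestamps:
        nonce += 1
    return '{}.{}'.format(base, nonce)

# three links sharing a timestamp end up as 1234, 1234.1, 1234.2
used = {}
for ts in ('1234', '1234', '1234'):
    uniq = lowest_uniq_timestamp_sketch(used, ts)
    used[uniq] = True
print(list(used))  # ['1234', '1234.1', '1234.2']
```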
archivebox/parse.py (+9, -13)

@@ -20,7 +20,6 @@ Parsed link schema: {
 import re
 import sys
 import json
-import urllib
 from collections import OrderedDict
 import xml.etree.ElementTree as etree
 
@@ -32,7 +31,6 @@ from util import (
     base_url,
     str_between,
     get_link_type,
-    fetch_page_title,
     URL_REGEX,
 )
 
@@ -56,13 +54,11 @@ def parse_links(path):
     
     links = []
     with open(path, 'r', encoding='utf-8') as file:
-        print('{green}[*] [{}] Parsing new links from output/sources/{} and fetching titles...{reset}'.format(
+        print('{green}[*] [{}] Parsing new links from output/sources/{}...{reset}'.format(
             datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
             path.rsplit('/', 1)[-1],
             **ANSI,
         ))
-        if SHOW_PROGRESS:
-            sys.stdout.write('    ')
 
         for parser_name, parser_func in get_parsers(file).items():
             # otherwise try all parsers until one works
@@ -98,7 +94,7 @@ def parse_pocket_html_export(html_file):
                 'base_url': base_url(fixed_url),
                 'timestamp': str(time.timestamp()),
                 'tags': match.group(3),
-                'title': match.group(4).replace(' — Readability', '').replace('http://www.readability.com/read?url=', '') or fetch_page_title(fixed_url),
+                'title': match.group(4).replace(' — Readability', '').replace('http://www.readability.com/read?url=', '') or None,
                 'sources': [html_file.name],
             }
             info['type'] = get_link_type(info)
@@ -135,7 +131,7 @@ def parse_pinboard_json_export(json_file):
                 'base_url': base_url(url),
                 'timestamp': timestamp,
                 'tags': erg.get('tags') or '',
-                'title': title or fetch_page_title(url),
+                'title': title or None,
                 'sources': [json_file.name],
             }
             info['type'] = get_link_type(info)
@@ -174,7 +170,7 @@ def parse_rss_export(rss_file):
             'base_url': base_url(url),
             'timestamp': str(time.timestamp()),
             'tags': '',
-            'title': title or fetch_page_title(url),
+            'title': title or None,
             'sources': [rss_file.name],
         }
         info['type'] = get_link_type(info)
@@ -215,7 +211,7 @@ def parse_shaarli_rss_export(rss_file):
             'base_url': base_url(url),
             'timestamp': str(time.timestamp()),
             'tags': '',
-            'title': title or fetch_page_title(url),
+            'title': title or None,
             'sources': [rss_file.name],
         }
         info['type'] = get_link_type(info)
@@ -242,7 +238,7 @@ def parse_netscape_html_export(html_file):
                 'base_url': base_url(url),
                 'timestamp': str(time.timestamp()),
                 'tags': "",
-                'title': match.group(3).strip() or fetch_page_title(url),
+                'title': match.group(3).strip() or None,
                 'sources': [html_file.name],
             }
             info['type'] = get_link_type(info)
@@ -275,7 +271,7 @@ def parse_pinboard_rss_export(rss_file):
             'base_url': base_url(url),
             'timestamp': str(time.timestamp()),
             'tags': tags,
-            'title': title or fetch_page_title(url),
+            'title': title or None,
             'sources': [rss_file.name],
         }
         info['type'] = get_link_type(info)
@@ -300,7 +296,7 @@ def parse_medium_rss_export(rss_file):
             'base_url': base_url(url),
             'timestamp': str(time.timestamp()),
             'tags': '',
-            'title': title or fetch_page_title(url),
+            'title': title or None,
             'sources': [rss_file.name],
         }
         info['type'] = get_link_type(info)
@@ -324,7 +320,7 @@ def parse_plain_text_export(text_file):
                     'base_url': base_url(url),
                     'timestamp': str(datetime.now().timestamp()),
                     'tags': '',
-                    'title': fetch_page_title(url),
+                    'title': None,
                     'sources': [text_file.name],
                 }
                 info['type'] = get_link_type(info)

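Every parser now emits 'title': None when the export has no title, instead of calling fetch_page_title() inline, so parsing no longer makes network requests; titles are filled in later by fetch_title() during archiving. A hypothetical parsed entry after this change (values are illustrative, and the real dicts also gain a 'type' field via get_link_type()):

```python
info = {
    'url': 'https://example.com/article',
    'domain': 'example.com',
    'base_url': 'example.com/article',
    'timestamp': '1544666239',
    'tags': '',
    'title': None,  # was: title or fetch_page_title(url); now deferred to fetch_title()
    'sources': ['bookmarks_export.html'],
}
```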
archivebox/util.py (+37, -15)

@@ -3,8 +3,7 @@ import re
 import sys
 import time
 import json
-import signal
-from urllib.request import urlopen
+from urllib.request import Request, urlopen
 from urllib.parse import urlparse
 
 from decimal import Decimal
@@ -25,6 +24,7 @@ from config import (
     TIMEOUT,
     SHOW_PROGRESS,
     CHECK_SSL_VALIDITY,
+    WGET_USER_AGENT,
     CURL_BINARY,
     WGET_BINARY,
     CHROME_BINARY,
@@ -219,7 +219,21 @@ def save_source(raw_text):
     return source_path
 
 
-def download_url(url):
+def fetch_page_content(url, timeout=TIMEOUT):
+    req = Request(url, headers={'User-Agent': WGET_USER_AGENT})
+
+    if CHECK_SSL_VALIDITY:
+        resp = urlopen(req, timeout=timeout)
+    else:
+        import ssl
+        insecure = ssl._create_unverified_context()
+        resp = urlopen(req, timeout=timeout, context=insecure)
+
+    encoding = resp.headers.get_content_charset() or 'utf-8'
+    return resp.read().decode(encoding)
+
+
+def download_url(url, timeout=TIMEOUT):
     """download a given url's content into downloads/domain.txt"""
 
     if not os.path.exists(SOURCES_DIR):
@@ -236,7 +250,7 @@ def download_url(url):
     ))
     end = progress(TIMEOUT, prefix='      ')
     try:
-        downloaded_xml = urlopen(url).read().decode('utf-8')
+        downloaded_xml = fetch_page_content(url, timeout=timeout)
         end()
     except Exception as e:
         end()
@@ -260,19 +274,15 @@ def fetch_page_title(url, timeout=10, progress=SHOW_PROGRESS):
             sys.stdout.write('.')
             sys.stdout.flush()
 
-        if CHECK_SSL_VALIDITY:
-            html_content = urlopen(url, timeout=timeout)
-        else:
-            try:
-                import ssl
-                insecure = ssl._create_unverified_context()
-                html_content = urlopen(url, timeout=timeout, context=insecure)
-            except ImportError:
-                html_content = urlopen(url, timeout=timeout)
+        html = fetch_page_content(url, timeout=timeout)
 
-        match = re.search(HTML_TITLE_REGEX, html_content.read().decode('utf-8'))
+        match = re.search(HTML_TITLE_REGEX, html)
         return match.group(1).strip() if match else None
-    except Exception:
+    except Exception as err:
+        # print('[!] Failed to fetch title because of {}: {}'.format(
+        #     err.__class__.__name__,
+        #     err,
+        # ))
         return None
 
 
@@ -603,3 +613,15 @@ def run(*popenargs, input=None, capture_output=False, timeout=None, check=False,
             raise CalledProcessError(retcode, process.args,
                                      output=stdout, stderr=stderr)
     return CompletedProcess(process.args, retcode, stdout, stderr)
+
+
+def check_link_structure(link):
+    assert isinstance(link, dict)
+    assert isinstance(link.get('url'), str)
+    assert len(link['url']) > 2
+
+
+def check_links_structure(links):
+    assert isinstance(links, list)
+    if links:
+        check_link_structure(links[0])
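util.py now concentrates HTTP fetching in fetch_page_content() (shared by download_url() and fetch_page_title(), sent with the wget user agent and respecting CHECK_SSL_VALIDITY) and defines the two structure guards the rest of the commit calls at every index read and write boundary. A short usage sketch with an illustrative URL:

```python
from util import fetch_page_content, fetch_page_title, check_link_structure

# raw HTML via urllib, sent with WGET_USER_AGENT; may raise on network errors
html = fetch_page_content('https://example.com', timeout=10)

# regex-based <title> extraction on top of the same helper; swallows errors and returns None
title = fetch_page_title('https://example.com')

# the guards just assert the minimal shape a link dict must have
check_link_structure({'url': 'https://example.com', 'timestamp': '1544666239'})

try:
    check_link_structure({'url': 42})  # url is not a str -> AssertionError
except AssertionError:
    print('corrupted link detected')
```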