
better link corruption guards, remove title prefetching, save index after run

Nick Sweeting, 6 years ago
commit b03e9fade8
6 changed files with 165 additions and 93 deletions
  1. archivebox/archive.py (+49, -15)
  2. archivebox/archive_methods.py (+21, -39)
  3. archivebox/index.py (+36, -2)
  4. archivebox/links.py (+13, -9)
  5. archivebox/parse.py (+9, -13)
  6. archivebox/util.py (+37, -15)

archivebox/archive.py (+49, -15)

@@ -7,34 +7,31 @@ import os
 import sys
 
 from datetime import datetime
-from subprocess import run
+from peekable import Peekable
+
 
 from parse import parse_links
-from links import validate_links
-from archive_methods import archive_links, _RESULTS_TOTALS
+from links import validate_links, links_after_timestamp
+from archive_methods import archive_link, _RESULTS_TOTALS
 from index import (
     write_links_index,
-    write_link_index,
     parse_json_links_index,
-    parse_json_link_index,
 )
 from config import (
+    ARCHIVE_DIR,
     ONLY_NEW,
-    OUTPUT_PERMISSIONS,
     OUTPUT_DIR,
     REPO_DIR,
     ANSI,
-    TIMEOUT,
-    SHOW_PROGRESS,
     GIT_SHA,
 )
 from util import (
+    check_dependencies,
     download_url,
     save_source,
-    progress,
-    cleanup_archive,
     pretty_path,
     migrate_data,
+    check_links_structure,
 )
 
 __AUTHOR__ = 'Nick Sweeting <[email protected]>'
@@ -42,6 +39,7 @@ __VERSION__ = GIT_SHA
 __DESCRIPTION__ = 'ArchiveBox Usage:  Create a browsable html archive of a list of links.'
 __DOCUMENTATION__ = 'https://github.com/pirate/ArchiveBox/wiki'
 
+
 def print_help():
     print(__DESCRIPTION__)
     print("Documentation:     {}\n".format(__DOCUMENTATION__))
@@ -55,21 +53,22 @@ def print_help():
 
 def load_links(archive_path=OUTPUT_DIR, import_path=None):
     """get new links from file and optionally append them to links in existing archive"""
-    
+
     existing_links = []
     if archive_path:
         existing_links = parse_json_links_index(archive_path)
+        check_links_structure(existing_links)
 
     new_links = []
     if import_path:
         # parse and validate the import file
         raw_links, parser_name = parse_links(import_path)
         new_links = validate_links(raw_links)
-        if SHOW_PROGRESS:
-            print()
+        check_links_structure(new_links)
 
     # merge existing links in archive_path and new links
     all_links = validate_links(existing_links + new_links)
+    check_links_structure(all_links)
     num_new_links = len(all_links) - len(existing_links)
 
     if import_path and parser_name:
@@ -81,6 +80,7 @@ def load_links(archive_path=OUTPUT_DIR, import_path=None):
 
     return all_links, new_links
 
+
 def update_archive(archive_path, links, source=None, resume=None, append=True):
     """update or create index.html+json given a path to an export file containing new links"""
 
@@ -99,8 +99,38 @@ def update_archive(archive_path, links, source=None, resume=None, append=True):
              **ANSI,
         ))
 
+    check_links_structure(links)
+
+    # prefetch the first link off the generator so that if we pause or fail
+    # immediately we can show that we paused on the first link and not just None
+    to_archive = Peekable(links_after_timestamp(links, resume))
+    idx, link = 0, to_archive.peek(0)
+
     # loop over links and archive them
-    archive_links(archive_path, links, source=source, resume=resume)
+    try:
+        check_dependencies()
+        for idx, link in enumerate(to_archive):
+            link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
+            archive_link(link_dir, link)
+
+    except (KeyboardInterrupt, SystemExit, Exception) as e:
+        print('\n{lightyellow}[X] [{now}] Downloading paused on link {timestamp} ({idx}/{total}){reset}'.format(
+            **ANSI,
+            now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
+            idx=idx+1,
+            timestamp=link['timestamp'],
+            total=len(links),
+        ))
+        print('    To view your archive, open: {}/index.html'.format(OUTPUT_DIR.replace(REPO_DIR + '/', '')))
+        print('    Continue where you left off by running:')
+        print('        {} {}'.format(
+            pretty_path(sys.argv[0]),
+            link['timestamp'],
+        ))
+        if not isinstance(e, KeyboardInterrupt):
+            print()
+            raise e
+        raise SystemExit(1)
 
     # print timing information & summary
     end_ts = datetime.now().timestamp()
@@ -135,7 +165,7 @@ if __name__ == '__main__':
     source = sys.argv[1] if argc > 1 else None  # path of links file to import
     resume = sys.argv[2] if argc > 2 else None  # timestamp to resume downloading from
    
-    stdin_raw_text = []
+    stdin_raw_text = ''
 
     if not sys.stdin.isatty():
         stdin_raw_text = sys.stdin.read()
@@ -192,3 +222,7 @@ if __name__ == '__main__':
         update_archive(out_dir, new_links, source=source, resume=resume, append=True)
     else:
         update_archive(out_dir, all_links, source=source, resume=resume, append=True)
+
+    # Step 5: Re-write links index with updated titles, icons, and resources
+    all_links, _ = load_links(archive_path=out_dir)
+    write_links_index(out_dir=out_dir, links=all_links)

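The archiving loop now lives in update_archive() itself: it wraps the resumable link stream in a Peekable and peeks at the first link before iterating, so an interrupt that lands before (or during) the first iteration can still report which timestamp it paused on. The local peekable module isn't part of this diff, so the stand-in class below is an assumption; it only sketches the peek-then-iterate pattern.

```python
import itertools

class PeekableSketch:
    """minimal stand-in for the peekable helper (assumption: the real one
    supports .peek(default) plus ordinary iteration without losing items)"""
    def __init__(self, iterable):
        self._it = iter(iterable)

    def peek(self, default=None):
        # pull the next item, then push it back onto the front of the iterator
        try:
            first = next(self._it)
        except StopIteration:
            return default
        self._it = itertools.chain([first], self._it)
        return first

    def __iter__(self):
        return self._it

# mirrors update_archive(): grab the first link up front so the except-block
# can print link['timestamp'] even if archiving fails immediately
links = [{'timestamp': '1544666239', 'url': 'https://example.com'}]  # hypothetical link
to_archive = PeekableSketch(links)
idx, link = 0, to_archive.peek(None)
for idx, link in enumerate(to_archive):
    print(idx, link['timestamp'])  # archive_link(link_dir, link) in the real code
```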
archivebox/archive_methods.py (+21, -39)

@@ -1,16 +1,17 @@
 import os
-import re
-import sys
 
 from functools import wraps
 from collections import defaultdict
 from datetime import datetime
 
-from peekable import Peekable
-
-from index import wget_output_path, parse_json_link_index, write_link_index
-from links import links_after_timestamp
+from index import (
+    wget_output_path,
+    parse_json_link_index,
+    write_link_index,
+    patch_index_title_hack,
+)
 from config import (
+    OUTPUT_DIR,
     CURL_BINARY,
     GIT_BINARY,
     WGET_BINARY,
@@ -42,12 +43,12 @@ from config import (
 )
 from util import (
     without_fragment,
-    check_dependencies,
     fetch_page_title,
     progress,
     chmod_file,
     pretty_path,
-    run, PIPE, DEVNULL
+    check_link_structure,
+    run, PIPE, DEVNULL,
 )
 
 
@@ -57,38 +58,12 @@ _RESULTS_TOTALS = {   # globals are bad, mmkay
     'failed': 0,
 }
 
-def archive_links(archive_path, links, source=None, resume=None):
-    check_dependencies()
-
-    to_archive = Peekable(links_after_timestamp(links, resume))
-    idx, link = 0, to_archive.peek(0)
-
-    try:
-        for idx, link in enumerate(to_archive):
-            link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
-            archive_link(link_dir, link)
-    
-    except (KeyboardInterrupt, SystemExit, Exception) as e:
-        print('{lightyellow}[X] [{now}] Downloading paused on link {timestamp} ({idx}/{total}){reset}'.format(
-            **ANSI,
-            now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
-            idx=idx+1,
-            timestamp=link['timestamp'],
-            total=len(links),
-        ))
-        print('    Continue where you left off by running:')
-        print('        {} {}'.format(
-            pretty_path(sys.argv[0]),
-            link['timestamp'],
-        ))
-        if not isinstance(e, KeyboardInterrupt):
-            raise e
-        raise SystemExit(1)
-
 
 def archive_link(link_dir, link, overwrite=True):
     """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp"""
 
+    check_link_structure(link)
+
     try:
         update_existing = os.path.exists(link_dir)
         if update_existing:
@@ -99,7 +74,7 @@ def archive_link(link_dir, link, overwrite=True):
         else:
             os.makedirs(link_dir)
         
-        log_link_archive(link_dir, link, update_existing)
+        print_link_status_line(link_dir, link, update_existing)
 
         if FETCH_FAVICON:
             link = fetch_favicon(link_dir, link, overwrite=overwrite)
@@ -135,7 +110,7 @@ def archive_link(link_dir, link, overwrite=True):
     
     return link
 
-def log_link_archive(link_dir, link, update_existing):
+def print_link_status_line(link_dir, link, update_existing):
     print('[{symbol_color}{symbol}{reset}] [{now}] "{title}"\n    {blue}{url}{reset}'.format(
         symbol='*' if update_existing else '+',
         symbol_color=ANSI['black' if update_existing else 'green'],
@@ -518,7 +493,7 @@ def fetch_title(link_dir, link, timeout=TIMEOUT):
 
     # if link already has valid title, skip it
     if link['title'] and not link['title'].lower().startswith('http'):
-        return {'output': link['title'], 'cmd': 'fetch_page_title("{}")'.format(link['url'])}
+        return {'output': link['title'], 'status': 'skipped'}
 
     end = progress(timeout, prefix='      ')
     try:
@@ -530,6 +505,13 @@ def fetch_title(link_dir, link, timeout=TIMEOUT):
         print('        {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
         output = e
 
+    # titles should show up in the global index immediately for better UX,
+    # do a hacky immediate replacement to add them in as we're archiving
+    # TODO: figure out how to do this without gnarly string replacement
+    if title:
+        link['title'] = title
+        patch_index_title_hack(link['url'], title)
+
     return {
         'cmd': 'fetch_page_title("{}")'.format(link['url']),
         'output': output,

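The net effect of the fetch_title() changes: links that already carry a usable title are skipped outright, and a freshly fetched title is pushed into the global index right away via patch_index_title_hack() rather than waiting for the end-of-run index rewrite. A stripped-down sketch of that control flow (timeout handling and progress output omitted), assuming fetch_page_title() returns a string or None:

```python
def fetch_title_sketch(link, fetch_page_title, patch_index_title_hack):
    # skip links whose title is already set and isn't just the URL echoed back
    if link['title'] and not link['title'].lower().startswith('http'):
        return {'output': link['title'], 'status': 'skipped'}

    title = fetch_page_title(link['url'])  # returns None if the fetch or regex match fails

    # hacky immediate index update so the title shows up without a full re-index
    if title:
        link['title'] = title
        patch_index_title_hack(link['url'], title)

    return {
        'cmd': 'fetch_page_title("{}")'.format(link['url']),
        'output': title,
    }
```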
archivebox/index.py (+36, -2)

@@ -6,6 +6,7 @@ from string import Template
 from distutils.dir_util import copy_tree
 
 from config import (
+    OUTPUT_DIR,
     TEMPLATES_DIR,
     OUTPUT_PERMISSIONS,
     ANSI,
@@ -17,6 +18,8 @@ from util import (
     wget_output_path,
     derived_link_info,
     pretty_path,
+    check_link_structure,
+    check_links_structure,
 )
 
 
@@ -25,6 +28,8 @@ from util import (
 def write_links_index(out_dir, links):
     """create index.html file for a given list of links"""
 
+    check_links_structure(links)
+
     if not os.path.exists(out_dir):
         os.makedirs(out_dir)
 
@@ -42,6 +47,8 @@ def write_links_index(out_dir, links):
 def write_json_links_index(out_dir, links):
     """write the json link index to a given path"""
 
+    check_links_structure(links)
+
     path = os.path.join(out_dir, 'index.json')
 
     index_json = {
@@ -63,13 +70,17 @@ def parse_json_links_index(out_dir):
     index_path = os.path.join(out_dir, 'index.json')
     if os.path.exists(index_path):
         with open(index_path, 'r', encoding='utf-8') as f:
-            return json.load(f)['links']
+            links = json.load(f)['links']
+            check_links_structure(links)
+            return links
 
     return []
 
 def write_html_links_index(out_dir, links):
     """write the html link index to a given path"""
 
+    check_links_structure(links)
+
     path = os.path.join(out_dir, 'index.html')
 
     copy_tree(os.path.join(TEMPLATES_DIR, 'static'), os.path.join(out_dir, 'static'))
@@ -104,6 +115,25 @@ def write_html_links_index(out_dir, links):
     chmod_file(path)
 
 
+def patch_index_title_hack(link_url, new_title):
+    """hack to update just one link's title in the link index json"""
+
+    json_path = os.path.join(OUTPUT_DIR, 'index.json')
+
+    links = parse_json_links_index(OUTPUT_DIR)
+
+    changed = False
+    for link in links:
+        if link['url'] == link_url:
+            link['title'] = new_title
+            changed = True
+            break
+
+    if changed:
+        write_json_links_index(OUTPUT_DIR, links)
+
+
+
 ### Individual link index
 
 def write_link_index(out_dir, link):
@@ -114,6 +144,7 @@ def write_link_index(out_dir, link):
 def write_json_link_index(out_dir, link):
     """write a json file with some info about the link"""
     
+    check_link_structure(link)
     path = os.path.join(out_dir, 'index.json')
 
     print('      √ index.json')
@@ -128,10 +159,13 @@ def parse_json_link_index(out_dir):
     existing_index = os.path.join(out_dir, 'index.json')
     if os.path.exists(existing_index):
         with open(existing_index, 'r', encoding='utf-8') as f:
-            return json.load(f)
+            link_json = json.load(f)
+            check_link_structure(link_json)
+            return link_json
     return {}
 
 def write_html_link_index(out_dir, link):
+    check_link_structure(link)
     with open(os.path.join(TEMPLATES_DIR, 'link_index_fancy.html'), 'r', encoding='utf-8') as f:
         link_html = f.read()
 

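Both index readers now validate what they load before handing it back, and patch_index_title_hack() gives archive_methods a way to rewrite a single title in OUTPUT_DIR/index.json in place. Rough usage inside the repo, with an illustrative URL and title:

```python
from index import parse_json_links_index, patch_index_title_hack
from config import OUTPUT_DIR

# fails fast via assert if the stored links aren't a list of dicts with string urls
links = parse_json_links_index(OUTPUT_DIR)

# update one link's title in OUTPUT_DIR/index.json without rewriting anything else
patch_index_title_hack('https://example.com', 'Example Domain')  # hypothetical values
```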
archivebox/links.py (+13, -9)

@@ -32,34 +32,33 @@ Link {
 
 """
 
-import datetime
 from html import unescape
 from collections import OrderedDict
 
 from util import (
-    domain,
-    base_url,
-    str_between,
-    get_link_type,
     merge_links,
     wget_output_path,
+    check_link_structure,
+    check_links_structure,
 )
-from config import ANSI
 
 
 def validate_links(links):
+    check_links_structure(links)
     links = archivable_links(links)  # remove chrome://, about:, mailto: etc.
     links = uniquefied_links(links)  # merge/dedupe duplicate timestamps & urls
 links = sorted_links(links)      # deterministically sort the links based on timestamp, url
-    
+
     if not links:
         print('[X] No links found :(')
         raise SystemExit(1)
 
     for link in links:
+        check_link_structure(link)
+
         link['title'] = unescape(link['title']) if link['title'] else None
         link['latest'] = link.get('latest') or {}
-        
+
         latest = link['latest']
         if not link['latest'].get('wget'):
             link['latest']['wget'] = wget_output_path(link)
@@ -81,14 +80,16 @@ def validate_links(links):
 
     return list(links)
 
+
 def archivable_links(links):
     """remove chrome://, about:// or other schemed links that cant be archived"""
     return (
         link
         for link in links
-        if any(link['url'].startswith(s) for s in ('http://', 'https://', 'ftp://'))
+        if any(link['url'].lower().startswith(s) for s in ('http://', 'https://', 'ftp://'))
     )
 
+
 def uniquefied_links(sorted_links):
     """
     ensures that all non-duplicate links have monotonically increasing timestamps
@@ -114,10 +115,12 @@ def uniquefied_links(sorted_links):
 
     return unique_timestamps.values()
 
+
 def sorted_links(links):
     sort_func = lambda link: (link['timestamp'].split('.', 1)[0], link['url'])
     return sorted(links, key=sort_func, reverse=True)
 
+
 def links_after_timestamp(links, timestamp=None):
     if not timestamp:
         yield from links
@@ -130,6 +133,7 @@ def links_after_timestamp(links, timestamp=None):
         except (ValueError, TypeError):
             print('Resume value and all timestamp values must be valid numbers.')
 
+
 def lowest_uniq_timestamp(used_timestamps, timestamp):
     """resolve duplicate timestamps by appending a decimal 1234, 1234 -> 1234.1, 1234.2"""
 

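lowest_uniq_timestamp() isn't expanded in this hunk, but its docstring pins down the behavior: duplicates of a base timestamp get the lowest unused decimal suffix (1234, 1234 -> 1234.1, 1234.2). One way that could look, purely as an illustration:

```python
def lowest_uniq_timestamp_sketch(used_timestamps, timestamp):
    """illustrative only: return the lowest unused 'base.N' variant of a timestamp"""
    base = timestamp.split('.', 1)[0]
    if base not in used_timestamps:
        return base

    nonce = 1
    while '{}.{}'.format(base, nonce) in used_timestamps:
        nonce += 1
    return '{}.{}'.format(base, nonce)

# three links sharing a timestamp end up as 1234, 1234.1, 1234.2
used = {}
for ts in ('1234', '1234', '1234'):
    uniq = lowest_uniq_timestamp_sketch(used, ts)
    used[uniq] = True
print(list(used))  # ['1234', '1234.1', '1234.2']
```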
archivebox/parse.py (+9, -13)

@@ -20,7 +20,6 @@ Parsed link schema: {
 import re
 import sys
 import json
-import urllib
 from collections import OrderedDict
 import xml.etree.ElementTree as etree
 
@@ -32,7 +31,6 @@ from util import (
     base_url,
     str_between,
     get_link_type,
-    fetch_page_title,
     URL_REGEX,
 )
 
@@ -56,13 +54,11 @@ def parse_links(path):
     
     links = []
     with open(path, 'r', encoding='utf-8') as file:
-        print('{green}[*] [{}] Parsing new links from output/sources/{} and fetching titles...{reset}'.format(
+        print('{green}[*] [{}] Parsing new links from output/sources/{}...{reset}'.format(
             datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
             path.rsplit('/', 1)[-1],
             **ANSI,
         ))
-        if SHOW_PROGRESS:
-            sys.stdout.write('    ')
 
         for parser_name, parser_func in get_parsers(file).items():
             # otherwise try all parsers until one works
@@ -98,7 +94,7 @@ def parse_pocket_html_export(html_file):
                 'base_url': base_url(fixed_url),
                 'timestamp': str(time.timestamp()),
                 'tags': match.group(3),
-                'title': match.group(4).replace(' — Readability', '').replace('http://www.readability.com/read?url=', '') or fetch_page_title(fixed_url),
+                'title': match.group(4).replace(' — Readability', '').replace('http://www.readability.com/read?url=', '') or None,
                 'sources': [html_file.name],
             }
             info['type'] = get_link_type(info)
@@ -135,7 +131,7 @@ def parse_pinboard_json_export(json_file):
                 'base_url': base_url(url),
                 'timestamp': timestamp,
                 'tags': erg.get('tags') or '',
-                'title': title or fetch_page_title(url),
+                'title': title or None,
                 'sources': [json_file.name],
             }
             info['type'] = get_link_type(info)
@@ -174,7 +170,7 @@ def parse_rss_export(rss_file):
             'base_url': base_url(url),
             'timestamp': str(time.timestamp()),
             'tags': '',
-            'title': title or fetch_page_title(url),
+            'title': title or None,
             'sources': [rss_file.name],
         }
         info['type'] = get_link_type(info)
@@ -215,7 +211,7 @@ def parse_shaarli_rss_export(rss_file):
             'base_url': base_url(url),
             'timestamp': str(time.timestamp()),
             'tags': '',
-            'title': title or fetch_page_title(url),
+            'title': title or None,
             'sources': [rss_file.name],
         }
         info['type'] = get_link_type(info)
@@ -242,7 +238,7 @@ def parse_netscape_html_export(html_file):
                 'base_url': base_url(url),
                 'timestamp': str(time.timestamp()),
                 'tags': "",
-                'title': match.group(3).strip() or fetch_page_title(url),
+                'title': match.group(3).strip() or None,
                 'sources': [html_file.name],
             }
             info['type'] = get_link_type(info)
@@ -275,7 +271,7 @@ def parse_pinboard_rss_export(rss_file):
             'base_url': base_url(url),
             'timestamp': str(time.timestamp()),
             'tags': tags,
-            'title': title or fetch_page_title(url),
+            'title': title or None,
             'sources': [rss_file.name],
         }
         info['type'] = get_link_type(info)
@@ -300,7 +296,7 @@ def parse_medium_rss_export(rss_file):
             'base_url': base_url(url),
             'timestamp': str(time.timestamp()),
             'tags': '',
-            'title': title or fetch_page_title(url),
+            'title': title or None,
             'sources': [rss_file.name],
         }
         info['type'] = get_link_type(info)
@@ -324,7 +320,7 @@ def parse_plain_text_export(text_file):
                     'base_url': base_url(url),
                     'timestamp': str(datetime.now().timestamp()),
                     'tags': '',
-                    'title': fetch_page_title(url),
+                    'title': None,
                     'sources': [text_file.name],
                 }
                 info['type'] = get_link_type(info)

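Every parser now emits 'title': None when the export has no title, instead of calling fetch_page_title() inline, so parsing no longer makes network requests; titles are filled in later by fetch_title() during archiving. A hypothetical parsed entry after this change (values are illustrative, and the real dicts also gain a 'type' field via get_link_type()):

```python
info = {
    'url': 'https://example.com/article',
    'domain': 'example.com',
    'base_url': 'example.com/article',
    'timestamp': '1544666239',
    'tags': '',
    'title': None,  # was: title or fetch_page_title(url); now deferred to fetch_title()
    'sources': ['bookmarks_export.html'],
}
```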
archivebox/util.py (+37, -15)

@@ -3,8 +3,7 @@ import re
 import sys
 import time
 import json
-import signal
-from urllib.request import urlopen
+from urllib.request import Request, urlopen
 from urllib.parse import urlparse
 
 from decimal import Decimal
@@ -25,6 +24,7 @@ from config import (
     TIMEOUT,
     SHOW_PROGRESS,
     CHECK_SSL_VALIDITY,
+    WGET_USER_AGENT,
     CURL_BINARY,
     WGET_BINARY,
     CHROME_BINARY,
@@ -219,7 +219,21 @@ def save_source(raw_text):
     return source_path
 
 
-def download_url(url):
+def fetch_page_content(url, timeout=TIMEOUT):
+    req = Request(url, headers={'User-Agent': WGET_USER_AGENT})
+
+    if CHECK_SSL_VALIDITY:
+        resp = urlopen(req, timeout=timeout)
+    else:
+        import ssl
+        insecure = ssl._create_unverified_context()
+        resp = urlopen(req, timeout=timeout, context=insecure)
+
+    encoding = resp.headers.get_content_charset() or 'utf-8'
+    return resp.read().decode(encoding)
+
+
+def download_url(url, timeout=TIMEOUT):
     """download a given url's content into downloads/domain.txt"""
 
     if not os.path.exists(SOURCES_DIR):
@@ -236,7 +250,7 @@ def download_url(url):
     ))
     end = progress(TIMEOUT, prefix='      ')
     try:
-        downloaded_xml = urlopen(url).read().decode('utf-8')
+        downloaded_xml = fetch_page_content(url, timeout=timeout)
         end()
     except Exception as e:
         end()
@@ -260,19 +274,15 @@ def fetch_page_title(url, timeout=10, progress=SHOW_PROGRESS):
             sys.stdout.write('.')
             sys.stdout.flush()
 
-        if CHECK_SSL_VALIDITY:
-            html_content = urlopen(url, timeout=timeout)
-        else:
-            try:
-                import ssl
-                insecure = ssl._create_unverified_context()
-                html_content = urlopen(url, timeout=timeout, context=insecure)
-            except ImportError:
-                html_content = urlopen(url, timeout=timeout)
+        html = fetch_page_content(url, timeout=timeout)
 
-        match = re.search(HTML_TITLE_REGEX, html_content.read().decode('utf-8'))
+        match = re.search(HTML_TITLE_REGEX, html)
         return match.group(1).strip() if match else None
-    except Exception:
+    except Exception as err:
+        # print('[!] Failed to fetch title because of {}: {}'.format(
+        #     err.__class__.__name__,
+        #     err,
+        # ))
         return None
 
 
@@ -603,3 +613,15 @@ def run(*popenargs, input=None, capture_output=False, timeout=None, check=False,
             raise CalledProcessError(retcode, process.args,
                                      output=stdout, stderr=stderr)
     return CompletedProcess(process.args, retcode, stdout, stderr)
+
+
+def check_link_structure(link):
+    assert isinstance(link, dict)
+    assert isinstance(link.get('url'), str)
+    assert len(link['url']) > 2
+
+
+def check_links_structure(links):
+    assert isinstance(links, list)
+    if links:
+        check_link_structure(links[0])
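util.py now concentrates HTTP fetching in fetch_page_content() (shared by download_url() and fetch_page_title(), sent with the wget user agent and respecting CHECK_SSL_VALIDITY) and defines the two structure guards the rest of the commit calls at every index read and write boundary. A short usage sketch with an illustrative URL:

```python
from util import fetch_page_content, fetch_page_title, check_link_structure

# raw HTML via urllib, sent with WGET_USER_AGENT; may raise on network errors
html = fetch_page_content('https://example.com', timeout=10)

# regex-based <title> extraction on top of the same helper; swallows errors and returns None
title = fetch_page_title('https://example.com')

# the guards just assert the minimal shape a link dict must have
check_link_structure({'url': 'https://example.com', 'timestamp': '1544666239'})

try:
    check_link_structure({'url': 42})  # url is not a str -> AssertionError
except AssertionError:
    print('corrupted link detected')
```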