@@ -2,22 +2,18 @@ import os
 import re
 import sys
 import time
-import json
-from urllib.request import Request, urlopen
-from urllib.parse import urlparse
 
+from urllib.request import Request, urlopen
+from urllib.parse import urlparse, quote
 from decimal import Decimal
-from urllib.parse import quote
 from datetime import datetime
-from subprocess import TimeoutExpired, Popen, PIPE, DEVNULL, CompletedProcess, CalledProcessError
 from multiprocessing import Process
+from subprocess import TimeoutExpired, Popen, PIPE, DEVNULL, CompletedProcess, CalledProcessError
 
 from config import (
     ANSI,
-    IS_TTY,
     TERM_WIDTH,
     REPO_DIR,
-    OUTPUT_DIR,
     SOURCES_DIR,
     ARCHIVE_DIR,
     OUTPUT_PERMISSIONS,
@@ -42,7 +38,9 @@ from config import (
     SUBMIT_ARCHIVE_DOT_ORG,
 )
 
-# URL helpers: https://docs.python.org/3/library/urllib.parse.html#url-parsing
+### Parsing Helpers
+
+# URL Parsing: https://docs.python.org/3/library/urllib.parse.html#url-parsing
 scheme = lambda url: urlparse(url).scheme
 without_scheme = lambda url: urlparse(url)._replace(scheme='').geturl().strip('//')
 without_query = lambda url: urlparse(url)._replace(query='').geturl().strip('//')
@@ -72,6 +70,20 @@ HTML_TITLE_REGEX = re.compile(
     re.IGNORECASE,
 )
 
+### Checks & Tests
+
+def check_link_structure(link):
+    """basic sanity check invariants to make sure the data is valid"""
+    assert isinstance(link, dict)
+    assert isinstance(link.get('url'), str)
+    assert len(link['url']) > 2
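+    # the url must match the plain-text URL_REGEX exactly once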
+    assert len(re.findall(URL_REGEX, link['url'])) == 1
+
+def check_links_structure(links):
+    """basic sanity check invariants to make sure the data is valid"""
+    assert isinstance(links, list)
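+    # spot-check only the first link, assuming the rest share its structure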
+    if links:
+        check_link_structure(links[0])
 
 def check_dependencies():
     """Check that all necessary dependencies are installed, and have valid versions"""
@@ -134,7 +146,6 @@ def check_dependencies():
         print('    See https://github.com/pirate/ArchiveBox for help.')
         raise SystemExit(1)
 
-
 def check_url_parsing():
     """Check that plain text regex URL parsing works as expected"""
     test_urls = '''
@@ -159,111 +170,7 @@ def check_url_parsing():
     assert len(re.findall(URL_REGEX, test_urls)) == 12


-def print_error_hints(cmd, pwd, err=None, hints=None, prefix='    '):
-    """quote the argument with whitespace in a command so the user can
-    copy-paste the outputted string directly to run the cmd
-    """
-
-    quoted_cmd = ' '.join(
-        '"{}"'.format(arg) if ' ' in arg else arg
-        for arg in cmd
-    )
-
-    output_lines = [
-        '{}Failed: {} {}{}'.format(ANSI['red'], err.__class__.__name__, err, ANSI['reset']),
-        '    {}{}{}'.format(ANSI['lightyellow'], hints, ANSI['reset']) if hints else None,
-        'Run to see full output:'
-        '    cd {};'.format(pwd),
-        '    {}'.format(quoted_cmd),
-    ]
-
-    return '\n'.join(
-        '{}{}'.format(prefix, line)
-        for line in output_lines
-        if line
-    )
-
-
-def chmod_file(path, cwd='.', permissions=OUTPUT_PERMISSIONS, timeout=30):
-    """chmod -R <permissions> <cwd>/<path>"""
-
-    if not os.path.exists(os.path.join(cwd, path)):
-        raise Exception('Failed to chmod: {} does not exist (did the previous step fail?)'.format(path))
-
-    chmod_result = run(['chmod', '-R', permissions, path], cwd=cwd, stdout=DEVNULL, stderr=PIPE, timeout=timeout)
-    if chmod_result.returncode == 1:
-        print('    ', chmod_result.stderr.decode())
-        raise Exception('Failed to chmod {}/{}'.format(cwd, path))
-
-
-def progress(seconds=TIMEOUT, prefix=''):
-    """Show a (subprocess-controlled) progress bar with a <seconds> timeout,
-    returns end() function to instantly finish the progress
-    """
-
-    if not SHOW_PROGRESS:
-        return lambda: None
-
-    def progress_bar(seconds, prefix):
-        """show timer in the form of progress bar, with percentage and seconds remaining"""
-        chunk = '█' if sys.stdout.encoding == 'UTF-8' else '#'
-        chunks = TERM_WIDTH - len(prefix) - 20  # number of progress chunks to show (aka max bar width)
-        try:
-            for s in range(seconds * chunks):
-                progress = s / chunks / seconds * 100
-                bar_width = round(progress/(100/chunks))
-
-                # ████████████████████ 0.9% (1/60sec)
-                sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)'.format(
-                    prefix,
-                    ANSI['green'],
-                    (chunk * bar_width).ljust(chunks),
-                    ANSI['reset'],
-                    round(progress, 1),
-                    round(s/chunks),
-                    seconds,
-                ))
-                sys.stdout.flush()
-                time.sleep(1 / chunks)
-
-            # ██████████████████████████████████ 100.0% (60/60sec)
-            sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)\n'.format(
-                prefix,
-                ANSI['red'],
-                chunk * chunks,
-                ANSI['reset'],
-                100.0,
-                seconds,
-                seconds,
-            ))
-            sys.stdout.flush()
-        except KeyboardInterrupt:
-            print()
-            pass
-
-    p = Process(target=progress_bar, args=(seconds, prefix))
-    p.start()
-
-    def end():
-        """immediately finish progress and clear the progressbar line"""
-
-        # protect from double termination
-        #if p is None or not hasattr(p, 'kill'):
-        #    return
-        nonlocal p
-        if p is not None:
-            p.terminate()
-        p = None
-
-        sys.stdout.write('\r{}{}\r'.format((' ' * TERM_WIDTH), ANSI['reset']))  # clear whole terminal line
-        sys.stdout.flush()
-
-    return end
-
-def pretty_path(path):
-    """convert paths like .../ArchiveBox/archivebox/../output/abc into output/abc"""
-    return path.replace(REPO_DIR + '/', '')
-
+### Random Helpers
 
 def save_stdin_source(raw_text):
     if not os.path.exists(SOURCES_DIR):
@@ -278,23 +185,8 @@ def save_stdin_source(raw_text):
 
     return source_path
 
-
-def fetch_page_content(url, timeout=TIMEOUT):
-    req = Request(url, headers={'User-Agent': WGET_USER_AGENT})
-
-    if CHECK_SSL_VALIDITY:
-        resp = urlopen(req, timeout=timeout)
-    else:
-        import ssl
-        insecure = ssl._create_unverified_context()
-        resp = urlopen(req, timeout=timeout, context=insecure)
-
-    encoding = resp.headers.get_content_charset() or 'utf-8'
-    return resp.read().decode(encoding)
-
-
 def save_remote_source(url, timeout=TIMEOUT):
-    """download a given url's content into downloads/domain.txt"""
+    """download a given url's content into output/sources/domain-<timestamp>.txt"""
 
     if not os.path.exists(SOURCES_DIR):
         os.makedirs(SOURCES_DIR)
@@ -311,7 +203,7 @@ def save_remote_source(url, timeout=TIMEOUT):
     ))
     end = progress(TIMEOUT, prefix='    ')
     try:
-        downloaded_xml = fetch_page_content(url, timeout=timeout)
+        downloaded_xml = download_url(url, timeout=timeout)
         end()
     except Exception as e:
         end()
@@ -330,7 +222,6 @@ def save_remote_source(url, timeout=TIMEOUT):
 
     return source_path
 
-
 def fetch_page_title(url, timeout=10, progress=SHOW_PROGRESS):
     """Attempt to guess a page's title by downloading the html"""
     if not FETCH_TITLE:
@@ -341,7 +232,7 @@ def fetch_page_title(url, timeout=10, progress=SHOW_PROGRESS):
         sys.stdout.write('.')
         sys.stdout.flush()
 
-        html = fetch_page_content(url, timeout=timeout)
+        html = download_url(url, timeout=timeout)
 
         match = re.search(HTML_TITLE_REGEX, html)
         return match.group(1).strip() if match else None
@@ -352,215 +243,6 @@ def fetch_page_title(url, timeout=10, progress=SHOW_PROGRESS):
         #     ))
         return None
 
-
-def str_between(string, start, end=None):
-    """(<abc>12345</def>, <abc>, </def>) -> 12345"""
-
-    content = string.split(start, 1)[-1]
-    if end is not None:
-        content = content.rsplit(end, 1)[0]
-
-    return content
-
-def get_link_type(link):
-    """Certain types of links need to be handled specially, this figures out when that's the case"""
-
-    if extension(link['url']) == 'pdf':
-        return 'PDF'
-    elif extension(link['url']) in ('pdf', 'png', 'jpg', 'jpeg', 'svg', 'bmp', 'gif', 'tiff', 'webp'):
-        return 'image'
-    elif 'wikipedia.org' in domain(link['url']).lower():
-        return 'wiki'
-    elif 'youtube.com' in domain(link['url']).lower():
-        return 'youtube'
-    elif 'soundcloud.com' in domain(link['url']).lower():
-        return 'soundcloud'
-    elif 'youku.com' in domain(link['url']).lower():
-        return 'youku'
-    elif 'vimeo.com' in domain(link['url']).lower():
-        return 'vimeo'
-    return None
-
-def merge_links(a, b):
-    """deterministially merge two links, favoring longer field values over shorter,
-    and "cleaner" values over worse ones.
-    """
-    longer = lambda key: (a[key] if len(a[key]) > len(b[key]) else b[key]) if (a[key] and b[key]) else (a[key] or b[key])
-    earlier = lambda key: a[key] if a[key] < b[key] else b[key]
-
-    url = longer('url')
-    longest_title = longer('title')
-    cleanest_title = a['title'] if '://' not in (a['title'] or '') else b['title']
-    link = {
-        'timestamp': earlier('timestamp'),
-        'url': url,
-        'domain': domain(url),
-        'base_url': base_url(url),
-        'tags': longer('tags'),
-        'title': longest_title if '://' not in (longest_title or '') else cleanest_title,
-        'sources': list(set(a.get('sources', []) + b.get('sources', []))),
-    }
-    link['type'] = get_link_type(link)
-    return link
-
-def find_link(folder, links):
-    """for a given archive folder, find the corresponding link object in links"""
-    url = parse_url(folder)
-    if url:
-        for link in links:
-            if (base_url(link['url']) in url) or (url in link['url']):
-                return link
-
-    timestamp = folder.split('.')[0]
-    for link in links:
-        if link['timestamp'].startswith(timestamp):
-            if domain(link['url']) in os.listdir(os.path.join(ARCHIVE_DIR, folder)):
-                return link  # careful now, this isn't safe for most ppl
-            if domain(link['url']) in parse_url(folder):
-                return link
-    return None
-
-
-def parse_url(folder):
-    """for a given archive folder, figure out what url it's for"""
-    link_json = os.path.join(ARCHIVE_DIR, folder, 'index.json')
-    if os.path.exists(link_json):
-        with open(link_json, 'r') as f:
-            try:
-                link_json = f.read().strip()
-                if link_json:
-                    link = json.loads(link_json)
-                    return base_url(link['url'])
-            except ValueError:
-                print('File contains invalid JSON: {}!'.format(link_json))
-
-    archive_org_txt = os.path.join(ARCHIVE_DIR, folder, 'archive.org.txt')
-    if os.path.exists(archive_org_txt):
-        with open(archive_org_txt, 'r') as f:
-            original_link = f.read().strip().split('/http', 1)[-1]
-            with_scheme = 'http{}'.format(original_link)
-            return with_scheme
-
-    return ''
-
-def manually_merge_folders(source, target):
-    """prompt for user input to resolve a conflict between two archive folders"""
-
-    if not IS_TTY:
-        return
-
-    fname = lambda path: path.split('/')[-1]
-
-    print('    {} and {} have conflicting files, which do you want to keep?'.format(fname(source), fname(target)))
-    print('      - [enter]: do nothing (keep both)')
-    print('      - a: prefer files from {}'.format(source))
-    print('      - b: prefer files from {}'.format(target))
-    print('      - q: quit and resolve the conflict manually')
-    try:
-        answer = input('> ').strip().lower()
-    except KeyboardInterrupt:
-        answer = 'q'
-
-    assert answer in ('', 'a', 'b', 'q'), 'Invalid choice.'
-
-    if answer == 'q':
-        print('\nJust run ArchiveBox again to pick up where you left off.')
-        raise SystemExit(0)
-    elif answer == '':
-        return
-
-    files_in_source = set(os.listdir(source))
-    files_in_target = set(os.listdir(target))
-    for file in files_in_source:
-        if file in files_in_target:
-            to_delete = target if answer == 'a' else source
-            run(['rm', '-Rf', os.path.join(to_delete, file)])
-        run(['mv', os.path.join(source, file), os.path.join(target, file)])
-
-    if not set(os.listdir(source)):
-        run(['rm', '-Rf', source])
-
-def fix_folder_path(archive_path, link_folder, link):
-    """given a folder, merge it to the canonical 'correct' path for the given link object"""
-    source = os.path.join(archive_path, link_folder)
-    target = os.path.join(archive_path, link['timestamp'])
-
-    url_in_folder = parse_url(source)
-    if not (url_in_folder in base_url(link['url'])
-            or base_url(link['url']) in url_in_folder):
-        raise ValueError('The link does not match the url for this folder.')
-
-    if not os.path.exists(target):
-        # target doesn't exist so nothing needs merging, simply move A to B
-        run(['mv', source, target])
-    else:
-        # target folder exists, check for conflicting files and attempt manual merge
-        files_in_source = set(os.listdir(source))
-        files_in_target = set(os.listdir(target))
-        conflicting_files = files_in_source & files_in_target
-
-        if not conflicting_files:
-            for file in files_in_source:
-                run(['mv', os.path.join(source, file), os.path.join(target, file)])
-
-    if os.path.exists(source):
-        files_in_source = set(os.listdir(source))
-        if files_in_source:
-            manually_merge_folders(source, target)
-        else:
-            run(['rm', '-R', source])
-
-
-def migrate_data():
-    # migrate old folder to new OUTPUT folder
-    old_dir = os.path.join(REPO_DIR, 'html')
-    if os.path.exists(old_dir):
-        print('[!] WARNING: Moved old output folder "html" to new location: {}'.format(OUTPUT_DIR))
-        run(['mv', old_dir, OUTPUT_DIR], timeout=10)
-
-
-def cleanup_archive(archive_path, links):
-    """move any incorrectly named folders to their canonical locations"""
-
-    # for each folder that exists, see if we can match it up with a known good link
-    # if we can, then merge the two folders (TODO: if not, move it to lost & found)
-
-    unmatched = []
-    bad_folders = []
-
-    if not os.path.exists(archive_path):
-        return
-
-    for folder in os.listdir(archive_path):
-        try:
-            files = os.listdir(os.path.join(archive_path, folder))
-        except NotADirectoryError:
-            continue
-
-        if files:
-            link = find_link(folder, links)
-            if link is None:
-                unmatched.append(folder)
-                continue
-
-            if folder != link['timestamp']:
-                bad_folders.append((folder, link))
-        else:
-            # delete empty folders
-            run(['rm', '-R', os.path.join(archive_path, folder)])
-
-    if bad_folders and IS_TTY and input('[!] Cleanup archive? y/[n]: ') == 'y':
-        print('[!] Fixing {} improperly named folders in archive...'.format(len(bad_folders)))
-        for folder, link in bad_folders:
-            fix_folder_path(archive_path, folder, link)
-    elif bad_folders:
-        print('[!] Warning! {} folders need to be merged, fix by running ArchiveBox.'.format(len(bad_folders)))
-
-    if unmatched:
-        print('[!] Warning! {} unrecognized folders in html/archive/'.format(len(unmatched)))
-        print('    ' + '\n    '.join(unmatched))
-
-
 def wget_output_path(link, look_in=None):
     """calculate the path to the wgetted .html file, since wget may
     adjust some paths to be different than the base_url path.
@@ -615,6 +297,88 @@ def wget_output_path(link, look_in=None):
     #     return urlencode('#'.join([without_scheme + '.html', *split_url[1:]]))
     #     return urlencode(base_url(link['url']) + '/index.html')
 
+### String Manipulation & Logging Helpers
+
+def str_between(string, start, end=None):
+    """(<abc>12345</def>, <abc>, </def>) -> 12345"""
+
+    content = string.split(start, 1)[-1]
+    if end is not None:
+        content = content.rsplit(end, 1)[0]
+
+    return content
+
+def pretty_path(path):
+    """convert paths like .../ArchiveBox/archivebox/../output/abc into output/abc"""
+    return path.replace(REPO_DIR + '/', '')
+
+def print_error_hints(cmd, pwd, err=None, hints=None, prefix='    '):
+    """format an error message for a failed command, quoting any arguments
+    with whitespace so the user can copy-paste the output to re-run the cmd
+    """
+
+    quoted_cmd = ' '.join(
+        '"{}"'.format(arg) if ' ' in arg else arg
+        for arg in cmd
+    )
+
+    output_lines = [
+        '{}Failed: {} {}{}'.format(ANSI['red'], err.__class__.__name__, err, ANSI['reset']),
+        '    {}{}{}'.format(ANSI['lightyellow'], hints, ANSI['reset']) if hints else None,
+        'Run to see full output:',
+        '    cd {};'.format(pwd),
+        '    {}'.format(quoted_cmd),
+    ]
+
+    return '\n'.join(
+        '{}{}'.format(prefix, line)
+        for line in output_lines
+        if line
+    )
+
+
+### Link Helpers
+
+def merge_links(a, b):
+    """deterministically merge two links, favoring longer field values over shorter,
+    and "cleaner" values over worse ones.
+    """
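+    # prefer the longer of the two values, or whichever one is non-empty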
+    longer = lambda key: (a[key] if len(a[key]) > len(b[key]) else b[key]) if (a[key] and b[key]) else (a[key] or b[key])
+    earlier = lambda key: a[key] if a[key] < b[key] else b[key]
+
+    url = longer('url')
+    longest_title = longer('title')
+    cleanest_title = a['title'] if '://' not in (a['title'] or '') else b['title']
+    link = {
+        'timestamp': earlier('timestamp'),
+        'url': url,
+        'domain': domain(url),
+        'base_url': base_url(url),
+        'tags': longer('tags'),
+        'title': longest_title if '://' not in (longest_title or '') else cleanest_title,
+        'sources': list(set(a.get('sources', []) + b.get('sources', []))),
+    }
+    link['type'] = get_link_type(link)
+    return link
+
+def get_link_type(link):
+    """Certain types of links need to be handled specially, this figures out when that's the case"""
+
+    if extension(link['url']) == 'pdf':
+        return 'PDF'
+    elif extension(link['url']) in ('png', 'jpg', 'jpeg', 'svg', 'bmp', 'gif', 'tiff', 'webp'):
+        return 'image'
+    elif 'wikipedia.org' in domain(link['url']).lower():
+        return 'wiki'
+    elif 'youtube.com' in domain(link['url']).lower():
+        return 'youtube'
+    elif 'soundcloud.com' in domain(link['url']).lower():
+        return 'soundcloud'
+    elif 'youku.com' in domain(link['url']).lower():
+        return 'youku'
+    elif 'vimeo.com' in domain(link['url']).lower():
+        return 'vimeo'
+    return None
 
 def derived_link_info(link):
     """extend link info with the archive urls and other derived data"""
@@ -625,15 +389,12 @@ def derived_link_info(link):
 
     extended_info = {
         **link,
-        'title': link['title'] or base_url(url),
-        'date': to_date_str(link['timestamp']),
+        'bookmarked_date': to_date_str(link['timestamp']),
         'updated_date': to_date_str(link['updated']) if 'updated' in link else None,
-        'base_url': base_url(url),
         'domain': domain(url),
-        'basename': basename(url),
         'path': path(url),
-        'type': link['type'] or 'website',
-        'tags': link['tags'] or 'untagged',
+        'basename': basename(url),
+        'base_url': base_url(url),
     }
 
     # Archive Method Output URLs
@@ -667,6 +428,96 @@ def derived_link_info(link):
     return extended_info


+### Python / System Helpers
+
+def progress(seconds=TIMEOUT, prefix=''):
+    """Show a (subprocess-controlled) progress bar with a <seconds> timeout,
+    returns end() function to instantly finish the progress
+    """
+
+    if not SHOW_PROGRESS:
+        return lambda: None
+
+    def progress_bar(seconds, prefix):
+        """show timer in the form of progress bar, with percentage and seconds remaining"""
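+        # fall back to plain ascii '#' when stdout can't render unicode block characters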
+        chunk = '█' if sys.stdout.encoding == 'UTF-8' else '#'
+        chunks = TERM_WIDTH - len(prefix) - 20  # number of progress chunks to show (aka max bar width)
+        try:
+            for s in range(seconds * chunks):
+                progress = s / chunks / seconds * 100
+                bar_width = round(progress/(100/chunks))
+
+                # ████████████████████ 0.9% (1/60sec)
+                sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)'.format(
+                    prefix,
+                    ANSI['green'],
+                    (chunk * bar_width).ljust(chunks),
+                    ANSI['reset'],
+                    round(progress, 1),
+                    round(s/chunks),
+                    seconds,
+                ))
+                sys.stdout.flush()
+                time.sleep(1 / chunks)
+
+            # ██████████████████████████████████ 100.0% (60/60sec)
+            sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)\n'.format(
+                prefix,
+                ANSI['red'],
+                chunk * chunks,
+                ANSI['reset'],
+                100.0,
+                seconds,
+                seconds,
+            ))
+            sys.stdout.flush()
+        except KeyboardInterrupt:
+            print()
+            pass
+
+    p = Process(target=progress_bar, args=(seconds, prefix))
+    p.start()
+
+    def end():
+        """immediately finish progress and clear the progressbar line"""
+
+        # protect from double termination
+        #if p is None or not hasattr(p, 'kill'):
+        #    return
+        nonlocal p
+        if p is not None:
+            p.terminate()
+        p = None
+
+        sys.stdout.write('\r{}{}\r'.format((' ' * TERM_WIDTH), ANSI['reset']))  # clear whole terminal line
+        sys.stdout.flush()
+
+    return end
+
+def download_url(url, timeout=TIMEOUT):
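+    """download the contents of a remote url and return the decoded text"""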
+    req = Request(url, headers={'User-Agent': WGET_USER_AGENT})
+
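+    # only skip SSL certificate verification if the user has disabled CHECK_SSL_VALIDITY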
+    if CHECK_SSL_VALIDITY:
+        resp = urlopen(req, timeout=timeout)
+    else:
+        import ssl
+        insecure = ssl._create_unverified_context()
+        resp = urlopen(req, timeout=timeout, context=insecure)
+
+    encoding = resp.headers.get_content_charset() or 'utf-8'
+    return resp.read().decode(encoding)
+
+def chmod_file(path, cwd='.', permissions=OUTPUT_PERMISSIONS, timeout=30):
+    """chmod -R <permissions> <cwd>/<path>"""
+
+    if not os.path.exists(os.path.join(cwd, path)):
+        raise Exception('Failed to chmod: {} does not exist (did the previous step fail?)'.format(path))
+
+    chmod_result = run(['chmod', '-R', permissions, path], cwd=cwd, stdout=DEVNULL, stderr=PIPE, timeout=timeout)
+    if chmod_result.returncode == 1:
+        print('    ', chmod_result.stderr.decode())
+        raise Exception('Failed to chmod {}/{}'.format(cwd, path))
+
 def run(*popenargs, input=None, capture_output=False, timeout=None, check=False, **kwargs):
     """Patched version of subprocess.run to fix blocking IO making timeout ineffective"""
 
@@ -701,16 +552,3 @@
         raise CalledProcessError(retcode, process.args,
                                  output=stdout, stderr=stderr)
     return CompletedProcess(process.args, retcode, stdout, stderr)
-
-
-def check_link_structure(link):
-    assert isinstance(link, dict)
-    assert isinstance(link.get('url'), str)
-    assert len(link['url']) > 2
-    assert len(re.findall(URL_REGEX, link['url'])) == 1
-
-
-def check_links_structure(links):
-    assert isinstance(links, list)
-    if links:
-        check_link_structure(links[0])