@@ -3,18 +3,18 @@ import os
 from functools import wraps
 from collections import defaultdict
 from datetime import datetime
+from stdlib_patches import run, PIPE, DEVNULL

 from index import (
-    parse_json_link_index,
     write_link_index,
-    update_main_index,
+    patch_links_index,
+    load_json_link_index,
 )
 from config import (
     CURL_BINARY,
     GIT_BINARY,
     WGET_BINARY,
     YOUTUBEDL_BINARY,
-    CHROME_BINARY,
     FETCH_FAVICON,
     FETCH_TITLE,
     FETCH_WGET,
@@ -25,62 +25,37 @@ from config import (
     FETCH_WARC,
     FETCH_GIT,
     FETCH_MEDIA,
-    RESOLUTION,
-    CHECK_SSL_VALIDITY,
     SUBMIT_ARCHIVE_DOT_ORG,
-    COOKIES_FILE,
-    WGET_USER_AGENT,
-    CHROME_USER_AGENT,
-    CHROME_USER_DATA_DIR,
-    CHROME_HEADLESS,
-    CHROME_SANDBOX,
     TIMEOUT,
     MEDIA_TIMEOUT,
     ANSI,
-    ARCHIVE_DIR,
+    OUTPUT_DIR,
     GIT_DOMAINS,
     GIT_SHA,
+    WGET_USER_AGENT,
+    CHECK_SSL_VALIDITY,
+    COOKIES_FILE,
 )
 from util import (
     domain,
+    extension,
     without_query,
     without_fragment,
     fetch_page_title,
     is_static_file,
     progress,
     chmod_file,
-    pretty_path,
-    print_error_hints,
     check_link_structure,
     wget_output_path,
-    run, PIPE, DEVNULL,
+    chrome_args,
+)
+from logs import (
+    _LAST_RUN_STATS,
+    log_link_archiving_started,
+    log_link_archiving_failed,
 )


-_RESULTS_TOTALS = { # globals are bad, mmkay
-    'skipped': 0,
-    'succeded': 0,
-    'failed': 0,
-}
-
-def load_link_index(link_dir, link):
-    """check for an existing link archive in the given directory,
-    and load+merge it into the given link dict
-    """
-    is_new = not os.path.exists(link_dir)
-    if is_new:
-        os.makedirs(link_dir)
-    else:
-        link = {
-            **parse_json_link_index(link_dir),
-            **link,
-        }
-
-    check_link_structure(link)
-    print_link_status_line(link_dir, link, is_new)
-
-    return link
-

 class ArchiveError(Exception):
     def __init__(self, message, hints=None):
@@ -105,32 +80,24 @@ def archive_link(link_dir, link, overwrite=True):
     active_methods = [method for toggle, method in ARCHIVE_METHODS if toggle]

     try:
-        link = load_link_index(link_dir, link)
+        is_new = not os.path.exists(link_dir)
+        if is_new:
+            os.makedirs(link_dir)
+
+        link = load_json_link_index(link_dir, link)
+        log_link_archiving_started(link_dir, link, is_new)

         for archive_method in active_methods:
             archive_method(link_dir, link, overwrite=overwrite)

-
         write_link_index(link_dir, link)
-        update_main_index(link)
+        patch_links_index(link)

     except Exception as err:
         print(' ! Failed to archive link: {}: {}'.format(err.__class__.__name__, err))

     return link

-def print_link_status_line(link_dir, link, is_new):
-    print('[{symbol_color}{symbol}{reset}] [{now}] "{title}"\n {blue}{url}{reset}'.format(
-        symbol='+' if is_new else '*',
-        symbol_color=ANSI['green' if is_new else 'black'],
-        now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
-        **{**link, 'title': link['title'] or link['url']},
-        **ANSI,
-    ))
-
-    print(' > {}{}'.format(pretty_path(link_dir), ' (new)' if is_new else ''))
-
-
 def attach_result_to_link(method):
     """
@@ -178,15 +145,75 @@ def attach_result_to_link(method):
             link['history'][method].append(history_entry)
             link['latest'][method] = result['output']

-            _RESULTS_TOTALS[history_entry['status']] += 1
+            _LAST_RUN_STATS[history_entry['status']] += 1

             return link
         return timed_fetch_func
     return decorator

+@attach_result_to_link('title')
+def fetch_title(link_dir, link, timeout=TIMEOUT):
+    """try to guess the page's title from its content"""
+
+    # if link already has valid title, skip it
+    if link['title'] and not link['title'].lower().startswith('http'):
+        return {'output': link['title'], 'status': 'skipped'}
+
+    if is_static_file(link['url']):
+        return {'output': None, 'status': 'skipped'}
+
+    end = progress(timeout, prefix=' ')
+    try:
+        title = fetch_page_title(link['url'], timeout=timeout, progress=False)
+        end()
+        output = title
+    except Exception as e:
+        end()
+        output = e
+        print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
+
+    if title and title.strip():
+        link['title'] = title
+        output = title
+
+    return {
+        'cmd': 'fetch_page_title("{}")'.format(link['url']),
+        'output': output,
+    }
+
+@attach_result_to_link('favicon')
+def fetch_favicon(link_dir, link, timeout=TIMEOUT):
+    """download site favicon from google's favicon api"""
+
+    output = 'favicon.ico'
+    if os.path.exists(os.path.join(link_dir, output)):
+        return {'output': output, 'status': 'skipped'}
+
+    CMD = [
+        CURL_BINARY,
+        '--max-time', str(timeout),
+        '--location',
+        '--output', output,
+        *(() if CHECK_SSL_VALIDITY else ('--insecure',)),
+        'https://www.google.com/s2/favicons?domain={}'.format(domain(link['url'])),
+    ]
+    end = progress(timeout, prefix=' ')
+    try:
+        run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)
+        end()
+        chmod_file(output, cwd=link_dir)
+    except Exception as e:
+        end()
+        output = e
+        print_error_hints(cmd=CMD, pwd=link_dir, err=e)
+
+    return {
+        'cmd': CMD,
+        'output': output,
+    }

 @attach_result_to_link('wget')
-def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, warc=FETCH_WARC, timeout=TIMEOUT):
+def fetch_wget(link_dir, link, timeout=TIMEOUT):
     """download full site using wget"""

     domain_dir = os.path.join(link_dir, domain(link['url']))
@@ -194,7 +221,7 @@ def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, warc=FETCH_WARC
     if os.path.exists(domain_dir) and existing_file:
         return {'output': existing_file, 'status': 'skipped'}

-    if warc:
+    if FETCH_WARC:
         warc_dir = os.path.join(link_dir, 'warc')
         os.makedirs(warc_dir, exist_ok=True)
         warc_path = os.path.join('warc', str(int(datetime.now().timestamp())))
@@ -213,8 +240,8 @@ def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, warc=FETCH_WARC
         '-e', 'robots=off',
         '--restrict-file-names=unix',
         '--timeout={}'.format(timeout),
-        *(() if warc else ('--timestamping',)),
-        *(('--warc-file={}'.format(warc_path),) if warc else ()),
+        *(() if FETCH_WARC else ('--timestamping',)),
+        *(('--warc-file={}'.format(warc_path),) if FETCH_WARC else ()),
         *(('--page-requisites',) if FETCH_WGET_REQUISITES else ()),
         *(('--user-agent={}'.format(WGET_USER_AGENT),) if WGET_USER_AGENT else ()),
         *(('--load-cookies', COOKIES_FILE) if COOKIES_FILE else ()),
@@ -233,7 +260,8 @@ def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, warc=FETCH_WARC
         if line.strip()
     ]

-    # parse out number of files downloaded from "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
+    # parse out number of files downloaded from last line of stderr:
+    # "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
     files_downloaded = (
         int(output_tail[-1].strip().split(' ', 2)[1] or 0)
         if 'Downloaded:' in output_tail[-1]
@@ -263,20 +291,19 @@ def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, warc=FETCH_WARC
         'output': output,
     }

-
 @attach_result_to_link('pdf')
-def fetch_pdf(link_dir, link, timeout=TIMEOUT, **chrome_kwargs):
+def fetch_pdf(link_dir, link, timeout=TIMEOUT):
     """print PDF of site to file using chrome --headless"""

     if is_static_file(link['url']):
-        return {'output': wget_output_path(link), 'status': 'skipped'}
+        return {'output': None, 'status': 'skipped'}

     output = 'output.pdf'
     if os.path.exists(os.path.join(link_dir, output)):
         return {'output': output, 'status': 'skipped'}

     CMD = [
-        *chrome_headless(timeout=timeout, **chrome_kwargs),
+        *chrome_args(timeout=timeout),
         '--print-to-pdf',
         link['url']
     ]
@@ -302,18 +329,18 @@ def fetch_pdf(link_dir, link, timeout=TIMEOUT, **chrome_kwargs):
     }

 @attach_result_to_link('screenshot')
-def fetch_screenshot(link_dir, link, timeout=TIMEOUT, **chrome_kwargs):
+def fetch_screenshot(link_dir, link, timeout=TIMEOUT):
     """take screenshot of site using chrome --headless"""

     if is_static_file(link['url']):
-        return {'output': wget_output_path(link), 'status': 'skipped'}
+        return {'output': None, 'status': 'skipped'}

     output = 'screenshot.png'
     if os.path.exists(os.path.join(link_dir, output)):
         return {'output': output, 'status': 'skipped'}

     CMD = [
-        *chrome_headless(timeout=timeout, **chrome_kwargs),
+        *chrome_args(timeout=timeout),
         '--screenshot',
         link['url'],
     ]
@@ -337,18 +364,19 @@ def fetch_screenshot(link_dir, link, timeout=TIMEOUT, **chrome_kwargs):
     }

 @attach_result_to_link('dom')
-def fetch_dom(link_dir, link, timeout=TIMEOUT, **chrome_kwargs):
+def fetch_dom(link_dir, link, timeout=TIMEOUT):
     """print HTML of site to file using chrome --dump-html"""

     if is_static_file(link['url']):
-        return {'output': wget_output_path(link), 'status': 'skipped'}
+        return {'output': None, 'status': 'skipped'}

     output = 'output.html'
-    if os.path.exists(os.path.join(link_dir, output)):
+    output_path = os.path.join(link_dir, output)
+    if os.path.exists(output_path):
         return {'output': output, 'status': 'skipped'}

     CMD = [
-        *chrome_headless(timeout=timeout, **chrome_kwargs),
+        *chrome_args(timeout=timeout),
         '--dump-dom',
         link['url']
     ]
@@ -372,100 +400,43 @@ def fetch_dom(link_dir, link, timeout=TIMEOUT, **chrome_kwargs):
         'output': output,
     }

-def parse_archive_dot_org_response(response):
-    # Parse archive.org response headers
-    headers = defaultdict(list)
-
-    # lowercase all the header names and store in dict
-    for header in response.splitlines():
-        if b':' not in header or not header.strip():
-            continue
-        name, val = header.decode().split(':', 1)
-        headers[name.lower().strip()].append(val.strip())
-
-    # Get successful archive url in "content-location" header or any errors
-    content_location = headers['content-location']
-    errors = headers['x-archive-wayback-runtime-error']
-    return content_location, errors
-
-@attach_result_to_link('archive_org')
-def archive_dot_org(link_dir, link, timeout=TIMEOUT):
-    """submit site to archive.org for archiving via their service, save returned archive url"""
-
-    output = 'archive.org.txt'
-    archive_org_url = None
-
-    path = os.path.join(link_dir, output)
-    if os.path.exists(path):
-        archive_org_url = open(path, 'r').read().strip()
-        return {'output': archive_org_url, 'status': 'skipped'}
-
-    submit_url = 'https://web.archive.org/save/{}'.format(link['url'])
-    CMD = [
-        CURL_BINARY,
-        '--location',
-        '--head',
-        '--user-agent', 'ArchiveBox/{} (+https://github.com/pirate/ArchiveBox/)'.format(GIT_SHA), # be nice to the Archive.org people and show them where all this ArchiveBox traffic is coming from
-        '--max-time', str(timeout),
-        *(() if CHECK_SSL_VALIDITY else ('--insecure',)),
-        submit_url,
-    ]
-    end = progress(timeout, prefix=' ')
-    try:
-        result = run(CMD, stdout=PIPE, stderr=DEVNULL, cwd=link_dir, timeout=timeout)
-        end()
-        content_location, errors = parse_archive_dot_org_response(result.stdout)
-        if content_location:
-            archive_org_url = 'https://web.archive.org{}'.format(content_location[0])
-        elif len(errors) == 1 and 'RobotAccessControlException' in errors[0]:
-            archive_org_url = None
-            # raise ArchiveError('Archive.org denied by {}/robots.txt'.format(domain(link['url'])))
-        elif errors:
-            raise ArchiveError(', '.join(errors))
-        else:
-            raise ArchiveError('Failed to find "content-location" URL header in Archive.org response.')
-    except Exception as e:
-        end()
-        output = e
-        print_error_hints(cmd=CMD, pwd=link_dir, err=e)
-
-    if not isinstance(output, Exception):
-        # instead of writing None when archive.org rejects the url write the
-        # url to resubmit it to archive.org. This is so when the user visits
-        # the URL in person, it will attempt to re-archive it, and it'll show the
-        # nicer error message explaining why the url was rejected if it fails.
-        archive_org_url = archive_org_url or submit_url
-        with open(os.path.join(link_dir, output), 'w', encoding='utf-8') as f:
-            f.write(archive_org_url)
-        chmod_file('archive.org.txt', cwd=link_dir)
-        output = archive_org_url
+@attach_result_to_link('git')
+def fetch_git(link_dir, link, timeout=TIMEOUT):
+    """download full site using git"""

-    return {
-        'cmd': CMD,
-        'output': output,
-    }
+    is_clonable_url = (
+        domain(link['url']) in GIT_DOMAINS
+        or extension(link['url']) == 'git'
+    )
+    if is_static_file(link['url']) or not is_clonable_url:
+        return {'output': None, 'status': 'skipped'}

-@attach_result_to_link('favicon')
-def fetch_favicon(link_dir, link, timeout=TIMEOUT):
-    """download site favicon from google's favicon api"""
+    output = 'git'
+    output_path = os.path.join(link_dir, 'git')

-    output = 'favicon.ico'
-    if os.path.exists(os.path.join(link_dir, output)):
+    if os.path.exists(output_path):
         return {'output': output, 'status': 'skipped'}

+    os.makedirs(output_path, exist_ok=True)
     CMD = [
-        CURL_BINARY,
-        '--max-time', str(timeout),
-        '--location',
-        '--output', output,
-        *(() if CHECK_SSL_VALIDITY else ('--insecure',)),
-        'https://www.google.com/s2/favicons?domain={}'.format(domain(link['url'])),
+        GIT_BINARY,
+        'clone',
+        '--mirror',
+        '--recursive',
+        *(() if CHECK_SSL_VALIDITY else ('-c', 'http.sslVerify=false')),
+        without_query(without_fragment(link['url'])),
     ]
     end = progress(timeout, prefix=' ')
     try:
-        run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)
+        result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)
         end()
-        chmod_file(output, cwd=link_dir)
+
+        if result.returncode == 128:
+            # ignore failed re-download when the folder already exists
+            pass
+        elif result.returncode > 0:
+            hints = 'got git response code {}:'.format(result.returncode)
+            raise ArchiveError('Failed git download', hints)
     except Exception as e:
         end()
         output = e
@@ -476,36 +447,6 @@ def fetch_favicon(link_dir, link, timeout=TIMEOUT):
         'output': output,
     }

-@attach_result_to_link('title')
-def fetch_title(link_dir, link, timeout=TIMEOUT):
-    """try to guess the page's title from its content"""
-
-    # if link already has valid title, skip it
-    if link['title'] and not link['title'].lower().startswith('http'):
-        return {'output': link['title'], 'status': 'skipped'}
-
-    if is_static_file(link['url']):
-        return {'output': None, 'status': 'skipped'}
-
-    end = progress(timeout, prefix=' ')
-    try:
-        title = fetch_page_title(link['url'], timeout=timeout, progress=False)
-        end()
-        output = title
-    except Exception as e:
-        end()
-        output = e
-        print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
-
-    if title and title.strip():
-        link['title'] = title
-        output = title
-
-    return {
-        'cmd': 'fetch_page_title("{}")'.format(link['url']),
-        'output': output,
-    }
-
 @attach_result_to_link('media')
 def fetch_media(link_dir, link, timeout=MEDIA_TIMEOUT, overwrite=False):
     """Download playlists or individual video, audio, and subtitles using youtube-dl"""
@@ -569,102 +510,77 @@ def fetch_media(link_dir, link, timeout=MEDIA_TIMEOUT, overwrite=False):
         'output': output,
     }

+def parse_archive_dot_org_response(response):
+    # Parse archive.org response headers
+    headers = defaultdict(list)

-@attach_result_to_link('git')
-def fetch_git(link_dir, link, timeout=TIMEOUT):
-    """download full site using git"""
+    # lowercase all the header names and store in dict
+    for header in response.splitlines():
+        if b':' not in header or not header.strip():
+            continue
+        name, val = header.decode().split(':', 1)
+        headers[name.lower().strip()].append(val.strip())

-    url_is_clonable = (
-        domain(link['url']) in GIT_DOMAINS
-        or link['url'].endswith('.git')
-    )
-    if not url_is_clonable or is_static_file(link['url']):
-        return {'output': None, 'status': 'skipped'}
+    # Get successful archive url in "content-location" header or any errors
+    content_location = headers['content-location']
+    errors = headers['x-archive-wayback-runtime-error']
+    return content_location, errors

-    output = 'git'
-    output_path = os.path.join(link_dir, 'git')
+
+@attach_result_to_link('archive_org')
+def archive_dot_org(link_dir, link, timeout=TIMEOUT):
+    """submit site to archive.org for archiving via their service, save returned archive url"""

-    if os.path.exists(output_path):
-        return {'output': output, 'status': 'skipped'}
+    output = 'archive.org.txt'
+    archive_org_url = None

-    os.makedirs(output_path, exist_ok=True)
+    path = os.path.join(link_dir, output)
+    if os.path.exists(path):
+        archive_org_url = open(path, 'r').read().strip()
+        return {'output': archive_org_url, 'status': 'skipped'}
+
+    submit_url = 'https://web.archive.org/save/{}'.format(link['url'])
     CMD = [
-        GIT_BINARY,
-        'clone',
-        '--mirror',
-        '--recursive',
-        *(() if CHECK_SSL_VALIDITY else ('-c', 'http.sslVerify=false')),
-        without_query(without_fragment(link['url'])),
+        CURL_BINARY,
+        '--location',
+        '--head',
+        '--user-agent', 'ArchiveBox/{} (+https://github.com/pirate/ArchiveBox/)'.format(GIT_SHA), # be nice to the Archive.org people and show them where all this ArchiveBox traffic is coming from
+        '--max-time', str(timeout),
+        *(() if CHECK_SSL_VALIDITY else ('--insecure',)),
+        submit_url,
     ]
     end = progress(timeout, prefix=' ')
     try:
-        result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)
+        result = run(CMD, stdout=PIPE, stderr=DEVNULL, cwd=link_dir, timeout=timeout)
         end()
-
-        if result.returncode == 128:
-            # ignore failed re-download when the folder already exists
-            pass
-        elif result.returncode > 0:
-            hints = 'got git response code {}:'.format(result.returncode)
-            raise ArchiveError('Failed git download', hints)
+        content_location, errors = parse_archive_dot_org_response(result.stdout)
+        if content_location:
+            archive_org_url = 'https://web.archive.org{}'.format(content_location[0])
+        elif len(errors) == 1 and 'RobotAccessControlException' in errors[0]:
+            archive_org_url = None
+            # raise ArchiveError('Archive.org denied by {}/robots.txt'.format(domain(link['url'])))
+        elif errors:
+            raise ArchiveError(', '.join(errors))
+        else:
+            raise ArchiveError('Failed to find "content-location" URL header in Archive.org response.')
     except Exception as e:
         end()
         output = e
         print_error_hints(cmd=CMD, pwd=link_dir, err=e)

+    if not isinstance(output, Exception):
+        # instead of writing None when archive.org rejects the url write the
+        # url to resubmit it to archive.org. This is so when the user visits
+        # the URL in person, it will attempt to re-archive it, and it'll show the
+        # nicer error message explaining why the url was rejected if it fails.
+        archive_org_url = archive_org_url or submit_url
+        with open(os.path.join(link_dir, output), 'w', encoding='utf-8') as f:
+            f.write(archive_org_url)
+        chmod_file('archive.org.txt', cwd=link_dir)
+        output = archive_org_url
+
     return {
         'cmd': CMD,
         'output': output,
     }

-def chrome_headless(binary=CHROME_BINARY, user_data_dir=CHROME_USER_DATA_DIR, headless=CHROME_HEADLESS, sandbox=CHROME_SANDBOX, check_ssl_validity=CHECK_SSL_VALIDITY, user_agent=CHROME_USER_AGENT, resolution=RESOLUTION, timeout=TIMEOUT):
-    global CACHED_USER_DATA_DIR
-    user_data_dir = user_data_dir or CACHED_USER_DATA_DIR
-    cmd_args = [binary]
-
-    if headless:
-        cmd_args += ('--headless',)
-
-    if not sandbox:
-        # dont use GPU or sandbox when running inside docker container
-        cmd_args += ('--no-sandbox', '--disable-gpu')
-
-    if not check_ssl_validity:
-        cmd_args += ('--disable-web-security', '--ignore-certificate-errors')
-
-    if user_agent:
-        cmd_args += ('--user-agent={}'.format(user_agent),)
-
-    if resolution:
-        cmd_args += ('--window-size={}'.format(RESOLUTION),)
-
-    if timeout:
-        cmd_args += ('--timeout={}'.format((timeout) * 1000),)
-
-    # Find chrome user data directory
-    default_profile_paths = (
-        '~/.config/chromium',
-        '~/.config/google-chrome',
-        '~/.config/google-chrome-beta',
-        '~/.config/google-chrome-unstable',
-        '~/Library/Application Support/Chromium',
-        '~/Library/Application Support/Google/Chrome',
-        '~/Library/Application Support/Google/Chrome Canary',
-        '~/AppData/Local/Chromium/User Data',
-        '~/AppData/Local/Google/Chrome/User Data',
-        '~/AppData/Local/Google/Chrome SxS/User Data',
-    )
-    if user_data_dir:
-        cmd_args.append('--user-data-dir={}'.format(user_data_dir))
-    else:
-        for path in default_profile_paths:
-            full_path = os.path.expanduser(path)
-            if os.path.exists(full_path):
-                CACHED_USER_DATA_DIR = full_path
-                cmd_args.append('--user-data-dir={}'.format(full_path))
-                break
-
-    return cmd_args
-
-
-CACHED_USER_DATA_DIR = CHROME_USER_DATA_DIR