
major codebase-wide code cleanups

Nick Sweeting 6 years ago
parent
commit
e6bd1f8ca8

+ 83 - 176
archivebox/archive.py

@@ -1,225 +1,132 @@
 #!/usr/bin/env python3
-# ArchiveBox
-# Nick Sweeting 2017 | MIT License
-# https://github.com/pirate/ArchiveBox
+"""
+ArchiveBox command line application.

-import os
-import sys
+./archive and ./bin/archivebox both point to this file, 
+but you can also run it directly using `python3 archive.py`

-from datetime import datetime
-from peekable import Peekable
+Usage & Documentation:
+    https://github.com/pirate/ArchiveBox/Wiki
+"""

+import os
+import sys

-from parse import parse_links
-from links import validate_links, links_after_timestamp
-from archive_methods import archive_link, _RESULTS_TOTALS
-from index import (
-    write_links_index,
-    parse_json_links_index,
-)
+from links import links_after_timestamp
+from index import write_links_index, load_links_index
+from archive_methods import archive_link
 from config import (
     ARCHIVE_DIR,
     ONLY_NEW,
     OUTPUT_DIR,
-    REPO_DIR,
-    ANSI,
     GIT_SHA,
 )
 from util import (
     check_dependencies,
     save_remote_source,
     save_stdin_source,
-    pretty_path,
-    check_links_structure,
+)
+from logs import (
+    log_archiving_started,
+    log_archiving_paused,
+    log_archiving_finished,
 )

 __AUTHOR__ = 'Nick Sweeting <[email protected]>'
 __VERSION__ = GIT_SHA
-__DESCRIPTION__ = 'ArchiveBox Usage:  Create a browsable html archive of a list of links.'
+__DESCRIPTION__ = 'ArchiveBox: The self-hosted internet archive.'
 __DOCUMENTATION__ = 'https://github.com/pirate/ArchiveBox/wiki'


 def print_help():
-    print(__DESCRIPTION__)
-    print("Documentation:     {}\n".format(__DOCUMENTATION__))
+    print('ArchiveBox: The self-hosted internet archive.\n')
+    print("Documentation:")
+    print("    https://github.com/pirate/ArchiveBox/wiki\n")
     print("Usage:")
+    print("    echo 'https://example.com' | ./bin/archivebox\n")
     print("    ./bin/archivebox ~/Downloads/bookmarks_export.html\n")
-    print("")
     print("    ./bin/archivebox https://example.com/feed.rss\n")
-    print("")
-    print("    echo 'https://example.com' | ./bin/archivebox\n")
+    print("    ./bin/archivebox 15109948213.123\n")


-def load_links(archive_path=OUTPUT_DIR, import_path=None):
-    """get new links from file and optionally append them to links in existing archive"""
-
-    existing_links = []
-    if archive_path:
-        existing_links = parse_json_links_index(archive_path)
-        check_links_structure(existing_links)
+def main(*args):
+    if set(args).intersection(('-h', '--help', 'help')) or len(args) > 2:
+        print_help()
+        raise SystemExit(0)
 
 
-    new_links = []
-    if import_path:
-        # parse and validate the import file
-        raw_links, parser_name = parse_links(import_path)
-        new_links = validate_links(raw_links)
-        check_links_structure(new_links)
+    ### Handle CLI arguments
+    #     ./archive bookmarks.html
+    #     ./archive 1523422111.234
+    import_path, resume = None, None
+    if len(args) == 2:
+        # if the argument is a string, it's a import_path file to import
+        # if it's a number, it's a timestamp to resume archiving from
+        if args[1].replace('.', '').isdigit():
+            import_path, resume = None, args[1]
+        else:
+            import_path, resume = args[1], None
 
 
-    # merge existing links in archive_path and new links
-    all_links = validate_links(existing_links + new_links)
-    check_links_structure(all_links)
-    num_new_links = len(all_links) - len(existing_links)
+    ### Set up output folder
+    if not os.path.exists(OUTPUT_DIR):
+        os.makedirs(OUTPUT_DIR)
 
 
-    if import_path and parser_name:
-        print('    > Adding {} new links to index (parsed import as {})'.format(
-            num_new_links,
-            parser_name,
-        ))
+    ### Handle ingesting urls piped in through stdin
+    # (e.g. if user does cat example_urls.txt | ./archive)
+    if not sys.stdin.isatty():
+        stdin_raw_text = sys.stdin.read()
+        if stdin_raw_text and import_path:
+            print(
+                '[X] You should pass either a path as an argument, '
+                'or pass a list of links via stdin, but not both.\n'
+            )
+            print_help()
+            raise SystemExit(1)
 
 
-    return all_links, new_links
+        import_path = save_stdin_source(stdin_raw_text)
 
 
+    ### Handle ingesting urls from a remote file/feed
+    # (e.g. if an RSS feed URL is used as the import path) 
+    if import_path and any(import_path.startswith(s) for s in ('http://', 'https://', 'ftp://')):
+        import_path = save_remote_source(import_path)
 
 
-def update_archive(archive_path, links, source=None, resume=None, append=True):
-    """update or create index.html+json given a path to an export file containing new links"""
+    ### Run the main archive update process
+    update_archive_data(import_path=import_path, resume=resume)
 
 
-    start_ts = datetime.now().timestamp()
 
 
-    if resume:
-        print('{green}[▶] [{}] Resuming archive downloading from {}...{reset}'.format(
-             datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
-             resume,
-             **ANSI,
-        ))
-    else:
-        print('{green}[▶] [{}] Updating content for {} pages in archive...{reset}'.format(
-             datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
-             len(links),
-             **ANSI,
-        ))
+def update_archive_data(import_path=None, resume=None):
+    """The main ArchiveBox entrypoint.  Everything starts here."""
+    check_dependencies()
 
 
-    check_links_structure(links)
+    # Step 1: Load list of links from the existing index
+    #         merge in and dedupe new links from import_path
+    all_links, new_links = load_links_index(out_dir=OUTPUT_DIR, import_path=import_path)
 
 
-    # prefetch the first link off the generator so that if we pause or fail
-    # immediately we can show that we paused on the first link and not just None
-    to_archive = Peekable(links_after_timestamp(links, resume))
-    idx, link = 0, to_archive.peek(0)
+    # Step 2: Write updated index with deduped old and new links back to disk
+    write_links_index(out_dir=OUTPUT_DIR, links=all_links)
 
 
-    # loop over links and archive them
+    # Step 3: Run the archive methods for each link
+    links = new_links if ONLY_NEW else all_links
+    log_archiving_started(len(links), resume)
+    idx, link = 0, 0
     try:
-        check_dependencies()
-        for idx, link in enumerate(to_archive):
+        for idx, link in enumerate(links_after_timestamp(links, resume)):
             link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
             archive_link(link_dir, link)

-    except (KeyboardInterrupt, SystemExit, Exception) as e:
-        # if isinstance(e, KeyboardInterrupt):
-        #     # Step 4: Re-write links index with updated titles, icons, and resources
-        #     all_links, _ = load_links(archive_path=out_dir)
-        #     write_links_index(out_dir=out_dir, links=all_links, finished=True)
-        print()
-        print('\n{lightyellow}[X] [{now}] Downloading paused on link {timestamp} ({idx}/{total}){reset}'.format(
-            **ANSI,
-            now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
-            idx=idx+1,
-            timestamp=link['timestamp'],
-            total=len(links),
-        ))
-        print('    To view your archive, open: {}/index.html'.format(OUTPUT_DIR.replace(REPO_DIR + '/', '')))
-        print('    Continue where you left off by running:')
-        print('        {} {}'.format(
-            pretty_path(sys.argv[0]),
-            link['timestamp'],
-        ))
-        if not isinstance(e, KeyboardInterrupt):
-            print()
-            raise e
-        raise SystemExit(1)
-
-    # print timing information & summary
-    end_ts = datetime.now().timestamp()
-    seconds = end_ts - start_ts
-    if seconds > 60:
-        duration = '{0:.2f} min'.format(seconds / 60, 2)
-    else:
-        duration = '{0:.2f} sec'.format(seconds, 2)
-
-    print('{}[√] [{}] Update of {} pages complete ({}){}'.format(
-        ANSI['green'],
-        datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
-        len(links),
-        duration,
-        ANSI['reset'],
-    ))
-    print('    - {} entries skipped'.format(_RESULTS_TOTALS['skipped']))
-    print('    - {} entries updated'.format(_RESULTS_TOTALS['succeded']))
-    print('    - {} errors'.format(_RESULTS_TOTALS['failed']))
-    print('    To view your archive, open: {}/index.html'.format(OUTPUT_DIR.replace(REPO_DIR + '/', '')))
-
-
-if __name__ == '__main__':
-    argc = len(sys.argv)
-
-    if set(sys.argv).intersection(('-h', '--help', 'help')):
-        print_help()
+    except KeyboardInterrupt:
+        log_archiving_paused(len(links), idx, link and link['timestamp'])
         raise SystemExit(0)

-    source = sys.argv[1] if argc > 1 else None  # path of links file to import
-    resume = sys.argv[2] if argc > 2 else None  # timestamp to resume dowloading from
-   
-    stdin_raw_text = ''
-
-    if not sys.stdin.isatty():
-        stdin_raw_text = sys.stdin.read()
-
-    if source and stdin_raw_text:
-        print(
-            '[X] You should pass either a path as an argument, '
-            'or pass a list of links via stdin, but not both.\n'
-        )
-        print_help()
-        raise SystemExit(1)
-
-
-    if argc == 1:
-        source, resume = None, None
-    elif argc == 2:
-        if all(d.isdigit() for d in sys.argv[1].split('.')):
-            # argv[1] is a resume timestamp
-            source, resume = None, sys.argv[1]
-        else:
-            # argv[1] is a path to a file to import
-            source, resume = sys.argv[1].strip(), None
-    elif argc == 3:
-        source, resume = sys.argv[1].strip(), sys.argv[2]
-    else:
-        print_help()
-        raise SystemExit(1)
-
-    # See if archive folder already exists
-    for out_dir in (OUTPUT_DIR, 'bookmarks', 'pocket', 'pinboard', 'html'):
-        if os.path.exists(out_dir):
-            break
-    else:
-        out_dir = OUTPUT_DIR
-
-    # Step 0: Download url to local file (only happens if a URL is specified instead of local path) 
-    if source and any(source.startswith(s) for s in ('http://', 'https://', 'ftp://')):
-        source = save_remote_source(source)
-    elif stdin_raw_text:
-        source = save_stdin_source(stdin_raw_text)
+    except:
+        print()
+        raise    
 
 
-    # Step 1: Parse the links and dedupe them with existing archive
-    all_links, new_links = load_links(archive_path=out_dir, import_path=source)
+    log_archiving_finished(len(links))
 
 
-    # Step 2: Write new index
-    write_links_index(out_dir=out_dir, links=all_links)
+    # Step 4: Re-write links index with updated titles, icons, and resources
+    all_links, _ = load_links_index(out_dir=OUTPUT_DIR)
+    write_links_index(out_dir=OUTPUT_DIR, links=all_links, finished=True)
 
 
-    # Step 3: Run the archive methods for each link
-    if ONLY_NEW:
-        update_archive(out_dir, new_links, source=source, resume=resume, append=True)
-    else:
-        update_archive(out_dir, all_links, source=source, resume=resume, append=True)
 
 
-    # Step 4: Re-write links index with updated titles, icons, and resources
-    all_links, _ = load_links(archive_path=out_dir)
-    write_links_index(out_dir=out_dir, links=all_links, finished=True)
+if __name__ == '__main__':
+    main(*sys.argv)
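Note: the new main() above treats a single CLI argument as either a resume timestamp or an import path. A minimal, self-contained sketch of just that dispatch logic (hypothetical helper name, stdlib only, not part of the commit):

import sys

def parse_cli_args(args):
    # Mirrors main() above: a purely numeric argument (dots allowed) is a
    # timestamp to resume from, anything else is a path or URL to import.
    import_path, resume = None, None
    if len(args) == 2:
        if args[1].replace('.', '').isdigit():
            resume = args[1]        # e.g. ./archive 1523422111.234
        else:
            import_path = args[1]   # e.g. ./archive bookmarks.html
    return import_path, resume

if __name__ == '__main__':
    print(parse_cli_args(sys.argv))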

+ 179 - 263
archivebox/archive_methods.py

@@ -3,18 +3,18 @@ import os
 from functools import wraps
 from collections import defaultdict
 from datetime import datetime
+from stdlib_patches import run, PIPE, DEVNULL
 
 
 from index import (
-    parse_json_link_index,
     write_link_index,
-    update_main_index,
+    patch_links_index,
+    load_json_link_index,
 )
 from config import (
     CURL_BINARY,
     GIT_BINARY,
     WGET_BINARY,
     YOUTUBEDL_BINARY,
-    CHROME_BINARY,
     FETCH_FAVICON,
     FETCH_TITLE,
     FETCH_WGET,
@@ -25,62 +25,37 @@ from config import (
     FETCH_WARC,
     FETCH_GIT,
     FETCH_MEDIA,
-    RESOLUTION,
-    CHECK_SSL_VALIDITY,
     SUBMIT_ARCHIVE_DOT_ORG,
-    COOKIES_FILE,
-    WGET_USER_AGENT,
-    CHROME_USER_AGENT,
-    CHROME_USER_DATA_DIR,
-    CHROME_HEADLESS,
-    CHROME_SANDBOX,
     TIMEOUT,
     MEDIA_TIMEOUT,
     ANSI,
-    ARCHIVE_DIR,
+    OUTPUT_DIR,
     GIT_DOMAINS,
     GIT_SHA,
+    WGET_USER_AGENT,
+    CHECK_SSL_VALIDITY,
+    COOKIES_FILE,
 )
 from util import (
     domain,
+    extension,
     without_query,
     without_fragment,
     fetch_page_title,
     is_static_file,
     progress,
     chmod_file,
-    pretty_path,
-    print_error_hints,
     check_link_structure,
     wget_output_path,
-    run, PIPE, DEVNULL,
+    chrome_args,
+)
+from logs import (
+    _LAST_RUN_STATS,
+    log_link_archiving_started,
+    log_link_archiving_failed,
 )


-_RESULTS_TOTALS = {   # globals are bad, mmkay
-    'skipped': 0,
-    'succeded': 0,
-    'failed': 0,
-}
-
-def load_link_index(link_dir, link):
-    """check for an existing link archive in the given directory, 
-       and load+merge it into the given link dict
-    """
-    is_new = not os.path.exists(link_dir)
-    if is_new:
-        os.makedirs(link_dir)
-    else:
-        link = {
-            **parse_json_link_index(link_dir),
-            **link,
-        }
-
-    check_link_structure(link)
-    print_link_status_line(link_dir, link, is_new)
-
-    return link
-
 
 
 class ArchiveError(Exception):
     def __init__(self, message, hints=None):
@@ -105,32 +80,24 @@ def archive_link(link_dir, link, overwrite=True):
     active_methods = [method for toggle, method in ARCHIVE_METHODS if toggle]
     
     try:
-        link = load_link_index(link_dir, link)
+        is_new = not os.path.exists(link_dir)
+        if is_new:
+            os.makedirs(link_dir)
+
+        link = load_json_link_index(link_dir, link)
+        log_link_archiving_started(link_dir, link, is_new)
 
 
         for archive_method in active_methods:
             archive_method(link_dir, link, overwrite=overwrite)

-
         write_link_index(link_dir, link)
-        update_main_index(link)
+        patch_links_index(link)
 
 
     except Exception as err:
         print('    ! Failed to archive link: {}: {}'.format(err.__class__.__name__, err))
     
     return link
 
 
-def print_link_status_line(link_dir, link, is_new):
-    print('[{symbol_color}{symbol}{reset}] [{now}] "{title}"\n    {blue}{url}{reset}'.format(
-        symbol='+' if is_new else '*',
-        symbol_color=ANSI['green' if is_new else 'black'],
-        now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
-        **{**link, 'title': link['title'] or link['url']},
-        **ANSI,
-    ))
-
-    print('    > {}{}'.format(pretty_path(link_dir), ' (new)' if is_new else ''))
-
-
 
 
 def attach_result_to_link(method):
     """
@@ -178,15 +145,75 @@ def attach_result_to_link(method):
                 link['history'][method].append(history_entry)
                 link['latest'][method] = result['output']
 
 
-            _RESULTS_TOTALS[history_entry['status']] += 1
+            _LAST_RUN_STATS[history_entry['status']] += 1
             
             
             return link
         return timed_fetch_func
     return decorator
 
 
+@attach_result_to_link('title')
+def fetch_title(link_dir, link, timeout=TIMEOUT):
+    """try to guess the page's title from its content"""
+
+    # if link already has valid title, skip it
+    if link['title'] and not link['title'].lower().startswith('http'):
+        return {'output': link['title'], 'status': 'skipped'}
+
+    if is_static_file(link['url']):
+        return {'output': None, 'status': 'skipped'}
+
+    end = progress(timeout, prefix='      ')
+    try:
+        title = fetch_page_title(link['url'], timeout=timeout, progress=False)
+        end()
+        output = title
+    except Exception as e:
+        end()
+        output = e
+        print('        {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
+
+    if title and title.strip():
+        link['title'] = title
+        output = title
+
+    return {
+        'cmd': 'fetch_page_title("{}")'.format(link['url']),
+        'output': output,
+    }
+
+@attach_result_to_link('favicon')
+def fetch_favicon(link_dir, link, timeout=TIMEOUT):
+    """download site favicon from google's favicon api"""
+
+    output = 'favicon.ico'
+    if os.path.exists(os.path.join(link_dir, output)):
+        return {'output': output, 'status': 'skipped'}
+
+    CMD = [
+        CURL_BINARY,
+        '--max-time', str(timeout),
+        '--location',
+        '--output', output,
+        *(() if CHECK_SSL_VALIDITY else ('--insecure',)),
+        'https://www.google.com/s2/favicons?domain={}'.format(domain(link['url'])),
+    ]
+    end = progress(timeout, prefix='      ')
+    try:
+        run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)
+        end()
+        chmod_file(output, cwd=link_dir)
+    except Exception as e:
+        end()
+        output = e
+        print_error_hints(cmd=CMD, pwd=link_dir, err=e)
+
+    return {
+        'cmd': CMD,
+        'output': output,
+    }
 
 
 @attach_result_to_link('wget')
 @attach_result_to_link('wget')
-def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, warc=FETCH_WARC, timeout=TIMEOUT):
-def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, warc=FETCH_WARC, timeout=TIMEOUT):
+def fetch_wget(link_dir, link, timeout=TIMEOUT):
     """download full site using wget"""
 
     domain_dir = os.path.join(link_dir, domain(link['url']))
     domain_dir = os.path.join(link_dir, domain(link['url']))
@@ -194,7 +221,7 @@ def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, warc=FETCH_WARC
     if os.path.exists(domain_dir) and existing_file:
     if os.path.exists(domain_dir) and existing_file:
         return {'output': existing_file, 'status': 'skipped'}
         return {'output': existing_file, 'status': 'skipped'}
 
 
-    if warc:
+    if FETCH_WARC:
         warc_dir = os.path.join(link_dir, 'warc')
         warc_dir = os.path.join(link_dir, 'warc')
         os.makedirs(warc_dir, exist_ok=True)
         os.makedirs(warc_dir, exist_ok=True)
         warc_path = os.path.join('warc', str(int(datetime.now().timestamp())))
         warc_path = os.path.join('warc', str(int(datetime.now().timestamp())))
@@ -213,8 +240,8 @@ def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, warc=FETCH_WARC
         '-e', 'robots=off',
         '-e', 'robots=off',
         '--restrict-file-names=unix',
         '--restrict-file-names=unix',
         '--timeout={}'.format(timeout),
         '--timeout={}'.format(timeout),
-        *(() if warc else ('--timestamping',)),
-        *(('--warc-file={}'.format(warc_path),) if warc else ()),
+        *(() if FETCH_WARC else ('--timestamping',)),
+        *(('--warc-file={}'.format(warc_path),) if FETCH_WARC else ()),
         *(('--page-requisites',) if FETCH_WGET_REQUISITES else ()),
         *(('--page-requisites',) if FETCH_WGET_REQUISITES else ()),
         *(('--user-agent={}'.format(WGET_USER_AGENT),) if WGET_USER_AGENT else ()),
         *(('--user-agent={}'.format(WGET_USER_AGENT),) if WGET_USER_AGENT else ()),
         *(('--load-cookies', COOKIES_FILE) if COOKIES_FILE else ()),
         *(('--load-cookies', COOKIES_FILE) if COOKIES_FILE else ()),
@@ -233,7 +260,8 @@ def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, warc=FETCH_WARC
             if line.strip()
             if line.strip()
         ]
         ]
 
 
-        # parse out number of files downloaded from "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
-        # parse out number of files downloaded from "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
+        # parse out number of files downloaded from last line of stderr:
+        #  "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
         files_downloaded = (
             int(output_tail[-1].strip().split(' ', 2)[1] or 0)
             if 'Downloaded:' in output_tail[-1]
         'output': output,
         'output': output,
     }
     }
 
 
-
 @attach_result_to_link('pdf')
 @attach_result_to_link('pdf')
-def fetch_pdf(link_dir, link, timeout=TIMEOUT, **chrome_kwargs):
-def fetch_pdf(link_dir, link, timeout=TIMEOUT, **chrome_kwargs):
+def fetch_pdf(link_dir, link, timeout=TIMEOUT):
     """print PDF of site to file using chrome --headless"""

     if is_static_file(link['url']):
-        return {'output': wget_output_path(link), 'status': 'skipped'}
+        return {'output': None, 'status': 'skipped'}
     
     output = 'output.pdf'
     if os.path.exists(os.path.join(link_dir, output)):
         return {'output': output, 'status': 'skipped'}

     CMD = [
+        *chrome_args(timeout=timeout),
         '--print-to-pdf',
         '--print-to-pdf',
         link['url']
         link['url']
     ]
     ]
@@ -302,18 +329,18 @@ def fetch_pdf(link_dir, link, timeout=TIMEOUT, **chrome_kwargs):
     }
     }
 
 
 @attach_result_to_link('screenshot')
 @attach_result_to_link('screenshot')
-def fetch_screenshot(link_dir, link, timeout=TIMEOUT, **chrome_kwargs):
+def fetch_screenshot(link_dir, link, timeout=TIMEOUT):
     """take screenshot of site using chrome --headless"""
     """take screenshot of site using chrome --headless"""
 
 
     if is_static_file(link['url']):
     if is_static_file(link['url']):
-        return {'output': wget_output_path(link), 'status': 'skipped'}
+        return {'output': None, 'status': 'skipped'}
     
     
     output = 'screenshot.png'
     output = 'screenshot.png'
     if os.path.exists(os.path.join(link_dir, output)):
     if os.path.exists(os.path.join(link_dir, output)):
         return {'output': output, 'status': 'skipped'}
         return {'output': output, 'status': 'skipped'}
 
 
     CMD = [
     CMD = [
-        *chrome_headless(timeout=timeout, **chrome_kwargs),
+        *chrome_args(timeout=timeout),
         '--screenshot',
         '--screenshot',
         link['url'],
         link['url'],
     ]
     ]
@@ -337,18 +364,19 @@ def fetch_screenshot(link_dir, link, timeout=TIMEOUT, **chrome_kwargs):
     }
     }
     
     
 @attach_result_to_link('dom')
 @attach_result_to_link('dom')
-def fetch_dom(link_dir, link, timeout=TIMEOUT, **chrome_kwargs):
+def fetch_dom(link_dir, link, timeout=TIMEOUT):
     """print HTML of site to file using chrome --dump-html"""
     """print HTML of site to file using chrome --dump-html"""
 
 
     if is_static_file(link['url']):
     if is_static_file(link['url']):
-        return {'output': wget_output_path(link), 'status': 'skipped'}
+        return {'output': None, 'status': 'skipped'}
     
     
     output = 'output.html'
     output = 'output.html'
-    if os.path.exists(os.path.join(link_dir, output)):
+    output_path = os.path.join(link_dir, output)
+    if os.path.exists(output_path):
         return {'output': output, 'status': 'skipped'}
         return {'output': output, 'status': 'skipped'}
 
 
     CMD = [
     CMD = [
-        *chrome_headless(timeout=timeout, **chrome_kwargs),
+        *chrome_args(timeout=timeout),
         '--dump-dom',
         '--dump-dom',
         link['url']
         link['url']
     ]
     ]
@@ -372,100 +400,43 @@ def fetch_dom(link_dir, link, timeout=TIMEOUT, **chrome_kwargs):
         'output': output,
         'output': output,
     }
     }
 
 
-def parse_archive_dot_org_response(response):
-    # Parse archive.org response headers
-    headers = defaultdict(list)
-
-    # lowercase all the header names and store in dict
-    for header in response.splitlines():
-        if b':' not in header or not header.strip():
-            continue
-        name, val = header.decode().split(':', 1)
-        headers[name.lower().strip()].append(val.strip())
-
-    # Get successful archive url in "content-location" header or any errors
-    content_location = headers['content-location']
-    errors = headers['x-archive-wayback-runtime-error']
-    return content_location, errors
-
-@attach_result_to_link('archive_org')
-def archive_dot_org(link_dir, link, timeout=TIMEOUT):
-    """submit site to archive.org for archiving via their service, save returned archive url"""
-
-    output = 'archive.org.txt'
-    archive_org_url = None
-
-    path = os.path.join(link_dir, output)
-    if os.path.exists(path):
-        archive_org_url = open(path, 'r').read().strip()
-        return {'output': archive_org_url, 'status': 'skipped'}
-
-    submit_url = 'https://web.archive.org/save/{}'.format(link['url'])
-    CMD = [
-        CURL_BINARY,
-        '--location',
-        '--head',
-        '--user-agent', 'ArchiveBox/{} (+https://github.com/pirate/ArchiveBox/)'.format(GIT_SHA),  # be nice to the Archive.org people and show them where all this ArchiveBox traffic is coming from
-        '--max-time', str(timeout),
-        *(() if CHECK_SSL_VALIDITY else ('--insecure',)),
-        submit_url,
-    ]
-    end = progress(timeout, prefix='      ')
-    try:
-        result = run(CMD, stdout=PIPE, stderr=DEVNULL, cwd=link_dir, timeout=timeout)
-        end()
-        content_location, errors = parse_archive_dot_org_response(result.stdout)
-        if content_location:
-            archive_org_url = 'https://web.archive.org{}'.format(content_location[0])
-        elif len(errors) == 1 and 'RobotAccessControlException' in errors[0]:
-            archive_org_url = None
-            # raise ArchiveError('Archive.org denied by {}/robots.txt'.format(domain(link['url'])))
-        elif errors:
-            raise ArchiveError(', '.join(errors))
-        else:
-            raise ArchiveError('Failed to find "content-location" URL header in Archive.org response.')
-    except Exception as e:
-        end()
-        output = e
-        print_error_hints(cmd=CMD, pwd=link_dir, err=e)
-
-    if not isinstance(output, Exception):
-        # instead of writing None when archive.org rejects the url write the
-        # url to resubmit it to archive.org. This is so when the user visits
-        # the URL in person, it will attempt to re-archive it, and it'll show the
-        # nicer error message explaining why the url was rejected if it fails.
-        archive_org_url = archive_org_url or submit_url
-        with open(os.path.join(link_dir, output), 'w', encoding='utf-8') as f:
-            f.write(archive_org_url)
-        chmod_file('archive.org.txt', cwd=link_dir)
-        output = archive_org_url
+@attach_result_to_link('git')
+def fetch_git(link_dir, link, timeout=TIMEOUT):
+    """download full site using git"""
 
 
-    return {
-        'cmd': CMD,
-        'output': output,
-    }
+    is_clonable_url = (
+        domain(link['url']) in GIT_DOMAINS
+        or extension(link['url']) == 'git'
+    )
+    if is_static_file(link['url']) or not is_clonable_url:
+        return {'output': None, 'status': 'skipped'}
 
 
-@attach_result_to_link('favicon')
-def fetch_favicon(link_dir, link, timeout=TIMEOUT):
-    """download site favicon from google's favicon api"""
+    output = 'git'
+    output_path = os.path.join(link_dir, 'git')
 
 
-    output = 'favicon.ico'
-    if os.path.exists(os.path.join(link_dir, output)):
+    if os.path.exists(output_path):
         return {'output': output, 'status': 'skipped'}
 
 
+    os.makedirs(output_path, exist_ok=True)
     CMD = [
-        CURL_BINARY,
-        '--max-time', str(timeout),
-        '--location',
-        '--output', output,
-        *(() if CHECK_SSL_VALIDITY else ('--insecure',)),
-        'https://www.google.com/s2/favicons?domain={}'.format(domain(link['url'])),
+        GIT_BINARY,
+        'clone',
+        '--mirror',
+        '--recursive',
+        *(() if CHECK_SSL_VALIDITY else ('-c', 'http.sslVerify=false')),
+        without_query(without_fragment(link['url'])),
     ]
     ]
     end = progress(timeout, prefix='      ')
     end = progress(timeout, prefix='      ')
     try:
     try:
-        run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)
+        result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)
         end()
         end()
-        chmod_file(output, cwd=link_dir)
+
+        if result.returncode == 128:
+            # ignore failed re-download when the folder already exists
+            pass
+        elif result.returncode > 0:
+            hints = 'got git response code {}:'.format(result.returncode)
+            raise ArchiveError('Failed git download', hints)
     except Exception as e:
     except Exception as e:
         end()
         end()
         output = e
         output = e
@@ -476,36 +447,6 @@ def fetch_favicon(link_dir, link, timeout=TIMEOUT):
         'output': output,
         'output': output,
     }
     }
 
 
-@attach_result_to_link('title')
-def fetch_title(link_dir, link, timeout=TIMEOUT):
-    """try to guess the page's title from its content"""
-
-    # if link already has valid title, skip it
-    if link['title'] and not link['title'].lower().startswith('http'):
-        return {'output': link['title'], 'status': 'skipped'}
-
-    if is_static_file(link['url']):
-        return {'output': None, 'status': 'skipped'}
-
-    end = progress(timeout, prefix='      ')
-    try:
-        title = fetch_page_title(link['url'], timeout=timeout, progress=False)
-        end()
-        output = title
-    except Exception as e:
-        end()
-        output = e
-        print('        {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
-
-    if title and title.strip():
-        link['title'] = title
-        output = title
-
-    return {
-        'cmd': 'fetch_page_title("{}")'.format(link['url']),
-        'output': output,
-    }
-
 @attach_result_to_link('media')
 @attach_result_to_link('media')
 def fetch_media(link_dir, link, timeout=MEDIA_TIMEOUT, overwrite=False):
 def fetch_media(link_dir, link, timeout=MEDIA_TIMEOUT, overwrite=False):
     """Download playlists or individual video, audio, and subtitles using youtube-dl"""
     """Download playlists or individual video, audio, and subtitles using youtube-dl"""
@@ -569,102 +510,77 @@ def fetch_media(link_dir, link, timeout=MEDIA_TIMEOUT, overwrite=False):
         'output': output,
     }
 
 
+def parse_archive_dot_org_response(response):
+    # Parse archive.org response headers
+    headers = defaultdict(list)
 
 
-@attach_result_to_link('git')
-def fetch_git(link_dir, link, timeout=TIMEOUT):
-    """download full site using git"""
+    # lowercase all the header names and store in dict
+    for header in response.splitlines():
+        if b':' not in header or not header.strip():
+            continue
+        name, val = header.decode().split(':', 1)
+        headers[name.lower().strip()].append(val.strip())
 
 
-    url_is_clonable = (
-        domain(link['url']) in GIT_DOMAINS
-        or link['url'].endswith('.git')
-    )
-    if not url_is_clonable or is_static_file(link['url']):
-        return {'output': None, 'status': 'skipped'}
+    # Get successful archive url in "content-location" header or any errors
+    content_location = headers['content-location']
+    errors = headers['x-archive-wayback-runtime-error']
+    return content_location, errors
 
 
-    output = 'git'
-    output_path = os.path.join(link_dir, 'git')
+@attach_result_to_link('archive_org')
+def archive_dot_org(link_dir, link, timeout=TIMEOUT):
+    """submit site to archive.org for archiving via their service, save returned archive url"""
 
 
-    if os.path.exists(output_path):
-        return {'output': output, 'status': 'skipped'}
+    output = 'archive.org.txt'
+    archive_org_url = None
 
 
-    os.makedirs(output_path, exist_ok=True)
+    path = os.path.join(link_dir, output)
+    if os.path.exists(path):
+        archive_org_url = open(path, 'r').read().strip()
+        return {'output': archive_org_url, 'status': 'skipped'}
+
+    submit_url = 'https://web.archive.org/save/{}'.format(link['url'])
     CMD = [
     CMD = [
-        GIT_BINARY,
-        'clone',
-        '--mirror',
-        '--recursive',
-        *(() if CHECK_SSL_VALIDITY else ('-c', 'http.sslVerify=false')),
-        without_query(without_fragment(link['url'])),
+        CURL_BINARY,
+        '--location',
+        '--head',
+        '--user-agent', 'ArchiveBox/{} (+https://github.com/pirate/ArchiveBox/)'.format(GIT_SHA),  # be nice to the Archive.org people and show them where all this ArchiveBox traffic is coming from
+        '--max-time', str(timeout),
+        *(() if CHECK_SSL_VALIDITY else ('--insecure',)),
+        submit_url,
     ]
     ]
     end = progress(timeout, prefix='      ')
     end = progress(timeout, prefix='      ')
     try:
     try:
-        result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)
+        result = run(CMD, stdout=PIPE, stderr=DEVNULL, cwd=link_dir, timeout=timeout)
         end()
         end()
-
-        if result.returncode == 128:
-            # ignore failed re-download when the folder already exists
-            pass
-        elif result.returncode > 0:
-            hints = 'got git response code {}:'.format(result.returncode)
-            raise ArchiveError('Failed git download', hints)
+        content_location, errors = parse_archive_dot_org_response(result.stdout)
+        if content_location:
+            archive_org_url = 'https://web.archive.org{}'.format(content_location[0])
+        elif len(errors) == 1 and 'RobotAccessControlException' in errors[0]:
+            archive_org_url = None
+            # raise ArchiveError('Archive.org denied by {}/robots.txt'.format(domain(link['url'])))
+        elif errors:
+            raise ArchiveError(', '.join(errors))
+        else:
+            raise ArchiveError('Failed to find "content-location" URL header in Archive.org response.')
     except Exception as e:
     except Exception as e:
         end()
         end()
         output = e
         output = e
         print_error_hints(cmd=CMD, pwd=link_dir, err=e)
         print_error_hints(cmd=CMD, pwd=link_dir, err=e)
 
 
+    if not isinstance(output, Exception):
+        # instead of writing None when archive.org rejects the url write the
+        # url to resubmit it to archive.org. This is so when the user visits
+        # the URL in person, it will attempt to re-archive it, and it'll show the
+        # nicer error message explaining why the url was rejected if it fails.
+        archive_org_url = archive_org_url or submit_url
+        with open(os.path.join(link_dir, output), 'w', encoding='utf-8') as f:
+            f.write(archive_org_url)
+        chmod_file('archive.org.txt', cwd=link_dir)
+        output = archive_org_url
+
     return {
     return {
         'cmd': CMD,
         'cmd': CMD,
         'output': output,
         'output': output,
     }
     }
 
 
-def chrome_headless(binary=CHROME_BINARY, user_data_dir=CHROME_USER_DATA_DIR, headless=CHROME_HEADLESS, sandbox=CHROME_SANDBOX, check_ssl_validity=CHECK_SSL_VALIDITY, user_agent=CHROME_USER_AGENT, resolution=RESOLUTION, timeout=TIMEOUT):
-    global CACHED_USER_DATA_DIR
-    user_data_dir = user_data_dir or CACHED_USER_DATA_DIR
-    cmd_args = [binary]
-
-    if headless:
-        cmd_args += ('--headless',)
-    
-    if not sandbox:
-        # dont use GPU or sandbox when running inside docker container
-        cmd_args += ('--no-sandbox', '--disable-gpu')
-
-    if not check_ssl_validity:
-        cmd_args += ('--disable-web-security', '--ignore-certificate-errors')
-
-    if user_agent:
-        cmd_args += ('--user-agent={}'.format(user_agent),)
-
-    if resolution:
-        cmd_args += ('--window-size={}'.format(RESOLUTION),)
-
-    if timeout:
-        cmd_args += ('--timeout={}'.format((timeout) * 1000),)
-
-    # Find chrome user data directory
-    default_profile_paths = (
-        '~/.config/chromium',
-        '~/.config/google-chrome',
-        '~/.config/google-chrome-beta',
-        '~/.config/google-chrome-unstable',
-        '~/Library/Application Support/Chromium',
-        '~/Library/Application Support/Google/Chrome',
-        '~/Library/Application Support/Google/Chrome Canary',
-        '~/AppData/Local/Chromium/User Data',
-        '~/AppData/Local/Google/Chrome/User Data',
-        '~/AppData/Local/Google/Chrome SxS/User Data',
-    )
-    if user_data_dir:
-        cmd_args.append('--user-data-dir={}'.format(user_data_dir))
-    else:
-        for path in default_profile_paths:
-            full_path = os.path.expanduser(path)
-            if os.path.exists(full_path):
-                CACHED_USER_DATA_DIR = full_path
-                cmd_args.append('--user-data-dir={}'.format(full_path))
-                break
-    
-    return cmd_args
-
 
 
-CACHED_USER_DATA_DIR = CHROME_USER_DATA_DIR
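For context, every fetch_* method above follows the same contract: check whether its output already exists in link_dir and return {'output': ..., 'status': 'skipped'} if so, otherwise run a command and return {'cmd': ..., 'output': ...} (with the output set to the exception on failure). A rough, self-contained illustration of that pattern, assuming plain subprocess and curl in place of the project's stdlib_patches.run, progress, and chmod_file helpers:

import os
import subprocess

def fetch_example(link_dir, domain_name, timeout=60):
    # hypothetical stand-in for a fetch_* method; not part of the commit
    output = 'favicon.ico'
    if os.path.exists(os.path.join(link_dir, output)):
        # output from a previous run already exists, so skip the work
        return {'output': output, 'status': 'skipped'}
    cmd = ['curl', '--max-time', str(timeout), '--location', '--output', output,
           'https://www.google.com/s2/favicons?domain=' + domain_name]
    try:
        subprocess.run(cmd, cwd=link_dir, timeout=timeout, check=True,
                       stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        result = output
    except Exception as e:
        # on failure the exception itself becomes the recorded output
        result = e
    return {'cmd': cmd, 'output': result}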

+ 63 - 27
archivebox/index.py

@@ -12,18 +12,24 @@ except ImportError:
 from config import (
     OUTPUT_DIR,
     TEMPLATES_DIR,
-    ANSI,
     GIT_SHA,
     FOOTER_INFO,
 )
 from util import (
     chmod_file,
     derived_link_info,
-    pretty_path,
     check_link_structure,
     check_links_structure,
     wget_output_path,
 )
+from parse import parse_links
+from links import validate_links
+from logs import (
+    log_indexing_started,
+    log_indexing_finished,
+    log_parsing_started,
+    log_parsing_finished,
+)
 
 
 TITLE_LOADING_MSG = 'Not yet archived...'
 
 
@@ -33,21 +39,40 @@ TITLE_LOADING_MSG = 'Not yet archived...'
 def write_links_index(out_dir, links, finished=False):
     """create index.html file for a given list of links"""

+    log_indexing_started()
     check_links_structure(links)
 
 
-    if not os.path.exists(out_dir):
-        os.makedirs(out_dir)
-
-    print('{green}[*] [{}] Saving main index files...{reset}'.format(
-        datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
-        **ANSI,
-    ))
     write_json_links_index(out_dir, links)
-    print('    > {}/index.json'.format(pretty_path(out_dir)))
+    log_indexing_finished(out_dir, 'index.json')
     
     write_html_links_index(out_dir, links, finished=finished)
-    print('    > {}/index.html'.format(pretty_path(out_dir)))
+    log_indexing_finished(out_dir, 'index.html')
     
+def load_links_index(out_dir=OUTPUT_DIR, import_path=None):
+    """parse and load existing index with any new links from import_path merged in"""
+
+    existing_links = []
+    if out_dir:
+        existing_links = parse_json_links_index(out_dir)
+        check_links_structure(existing_links)
+
+    new_links = []
+    if import_path:
+        # parse and validate the import file
+        log_parsing_started(import_path)
+        raw_links, parser_name = parse_links(import_path)
+        new_links = validate_links(raw_links)
+        check_links_structure(new_links)
+
+    # merge existing links in out_dir and new links
+    all_links = validate_links(existing_links + new_links)
+    check_links_structure(all_links)
+    num_new_links = len(all_links) - len(existing_links)
+
+    if import_path and parser_name:
+        log_parsing_finished(num_new_links, parser_name)
+
+    return all_links, new_links
 
 
 def write_json_links_index(out_dir, links):
     """write the json link index to a given path"""
@@ -70,8 +95,8 @@ def write_json_links_index(out_dir, links):
 
 
     chmod_file(path)
 
 
-def parse_json_links_index(out_dir):
-    """load the index in a given directory and merge it with the given link"""
+def parse_json_links_index(out_dir=OUTPUT_DIR):
+    """parse a archive index json file and return the list of links"""
     index_path = os.path.join(out_dir, 'index.json')
     if os.path.exists(index_path):
         with open(index_path, 'r', encoding='utf-8') as f:
@@ -136,31 +161,26 @@ def write_html_links_index(out_dir, links, finished=False):
     chmod_file(path)


-def update_main_index(link):
+def patch_links_index(link, out_dir=OUTPUT_DIR):
     """hack to in-place update one row's info in the generated index html"""
 
 
     title = link['latest']['title']
     successful = len([entry for entry in link['latest'].values() if entry])

     # Patch JSON index
-    json_path = os.path.join(OUTPUT_DIR, 'index.json')
-
-    links = parse_json_links_index(OUTPUT_DIR)
-
     changed = False
-    for json_link in links:
-        if json_link['url'] == link['url']:
-            json_link['title'] = title
-            json_link['latest'] = link['latest']
+    json_file_links = parse_json_links_index(out_dir)
+    for saved_link in json_file_links:
+        if saved_link['url'] == link['url']:
+            saved_link['title'] = title
+            saved_link['latest'] = link['latest']
             changed = True
             break
-
     if changed:
-        write_json_links_index(OUTPUT_DIR, links)
+        write_json_links_index(out_dir, json_file_links)
 
 
     # Patch HTML index
-    html_path = os.path.join(OUTPUT_DIR, 'index.html')
-
+    html_path = os.path.join(out_dir, 'index.html')
     html = open(html_path, 'r').read().split('\n')
     html = open(html_path, 'r').read().split('\n')
     for idx, line in enumerate(html):
         if title and ('<span data-title-for="{}"'.format(link['url']) in line):
     with open(html_path, 'w') as f:
     with open(html_path, 'w') as f:
         f.write('\n'.join(html))

+
 ### Individual link index

 def write_link_index(out_dir, link):
             return link_json
             return link_json
     return {}
 
+def load_json_link_index(out_dir, link):
+    """check for an existing link archive in the given directory, 
+       and load+merge it into the given link dict
+    """
+    link = {
+        **parse_json_link_index(out_dir),
+        **link,
+    }
+
+    check_link_structure(link)
+    return link
+
 def write_html_link_index(out_dir, link):
 def write_html_link_index(out_dir, link):
     check_link_structure(link)
     with open(os.path.join(TEMPLATES_DIR, 'link_index.html'), 'r', encoding='utf-8') as f:
                 wget_output_path(link)
                 wget_output_path(link)
                 or (link['domain'] if link['is_archived'] else 'about:blank')
             ),
+            'extension': link['extension'] or 'html',
+            'tags': link['tags'].strip() or 'untagged',
+            'status': 'Archived' if link['is_archived'] else 'Not yet archived',
+            'status_color': 'success' if link['is_archived'] else 'danger',
         }))
         }))

     chmod_file(path)
+ 161 - 0
archivebox/logs.py

@@ -0,0 +1,161 @@
+import sys
+from datetime import datetime
+from config import ANSI, REPO_DIR, OUTPUT_DIR
+
+
+# globals are bad, mmkay
+_LAST_RUN_STATS = {
+    'skipped': 0,
+    'succeded': 0,
+    'failed': 0,
+
+    'parsing_start_ts': 0,
+    'parsing_end_ts': 0,
+
+    'indexing_start_ts': 0,
+    'indexing_end_ts': 0,
+
+    'archiving_start_ts': 0,
+    'archiving_end_ts': 0,
+
+    'links': {},
+}
+
+def pretty_path(path):
+    """convert paths like .../ArchiveBox/archivebox/../output/abc into output/abc"""
+    return path.replace(REPO_DIR + '/', '')
+
+
+def log_link_archiving_started(link_dir, link, is_new):
+    print('[{symbol_color}{symbol}{reset}] [{now}] "{title}"\n    {blue}{url}{reset}'.format(
+        symbol='+' if is_new else '*',
+        symbol_color=ANSI['green' if is_new else 'black'],
+        now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
+        **{**link, 'title': link['title'] or link['url']},
+        **ANSI,
+    ))
+
+    print('    > {}{}'.format(pretty_path(link_dir), ' (new)' if is_new else ''))
+
+
+def log_link_archiving_failed(cmd, pwd, err=None, hints=None, prefix='        '):
+    """quote the argument with whitespace in a command so the user can 
+       copy-paste the outputted string directly to run the cmd
+    """
+
+    # Prettify CMD string and make it safe to copy-paste by quoting arguments
+    quoted_cmd = ' '.join(
+        '"{}"'.format(arg) if ' ' in arg else arg
+        for arg in cmd
+    )
+
+    # Prettify error output hints string and limit to five lines
+    hints = hints or getattr(err, 'hints', None)
+    if hints:
+        hints = hints if isinstance(hints, (list, tuple)) else hints.split('\n')
+        hints = (
+            '    {}{}{}'.format(ANSI['lightyellow'], line.strip(), ANSI['reset'])
+            for line in hints[:5] if line.strip()
+        )
+    else:
+        hints = ()
+
+    output_lines = [
+        '{}Failed: {} {}{}'.format(ANSI['red'], err.__class__.__name__, err, ANSI['reset']),
+        *hints,
+        'Run to see full output:',
+        '    cd {};'.format(pwd),
+        '    {}'.format(quoted_cmd),
+    ]
+
+    return '\n'.join(
+        '{}{}'.format(prefix, line)
+        for line in output_lines
+        if line
+    )
+
+### Logging Helpers
+
+def log_parsing_started(source_file):
+    start_ts = datetime.now()
+    _LAST_RUN_STATS['parse_start_ts'] = start_ts
+    print('{green}[*] [{}] Parsing new links from output/sources/{}...{reset}'.format(
+        start_ts.strftime('%Y-%m-%d %H:%M:%S'),
+        source_file.rsplit('/', 1)[-1],
+        **ANSI,
+    ))
+
+def log_parsing_finished(num_new_links, parser_name):
+    print('    > Adding {} new links to index (parsed import as {})'.format(
+        num_new_links,
+        parser_name,
+    ))
+
+def log_indexing_started():
+    start_ts = datetime.now()
+    _LAST_RUN_STATS['index_start_ts'] = start_ts
+    print('{green}[*] [{}] Saving main index files...{reset}'.format(
+        start_ts.strftime('%Y-%m-%d %H:%M:%S'),
+        **ANSI,
+    ))
+
+def log_indexing_finished(out_dir, out_file):
+    end_ts = datetime.now()
+    _LAST_RUN_STATS['index_end_ts'] = end_ts
+    print('    > {}/{}'.format(pretty_path(out_dir), out_file))
+
+def log_archiving_started(num_links, resume):
+    start_ts = datetime.now()
+    _LAST_RUN_STATS['start_ts'] = start_ts
+    if resume:
+        print('{green}[▶] [{}] Resuming archive updating for {} pages starting from {}...{reset}'.format(
+             start_ts.strftime('%Y-%m-%d %H:%M:%S'),
+             num_links,
+             resume,
+             **ANSI,
+        ))
+    else:
+        print('{green}[▶] [{}] Updating content for {} pages in archive...{reset}'.format(
+             start_ts.strftime('%Y-%m-%d %H:%M:%S'),
+             num_links,
+             **ANSI,
+        ))
+
+def log_archiving_paused(num_links, idx, timestamp):
+    end_ts = datetime.now()
+    _LAST_RUN_STATS['end_ts'] = end_ts
+    print()
+    print('\n{lightyellow}[X] [{now}] Downloading paused on link {timestamp} ({idx}/{total}){reset}'.format(
+        **ANSI,
+        now=end_ts.strftime('%Y-%m-%d %H:%M:%S'),
+        idx=idx+1,
+        timestamp=timestamp,
+        total=num_links,
+    ))
+    print('    To view your archive, open: {}/index.html'.format(OUTPUT_DIR.replace(REPO_DIR + '/', '')))
+    print('    Continue where you left off by running:')
+    print('        {} {}'.format(
+        pretty_path(sys.argv[0]),
+        timestamp,
+    ))
+
+def log_archiving_finished(num_links):
+    end_ts = datetime.now()
+    _LAST_RUN_STATS['end_ts'] = end_ts
+    seconds = end_ts.timestamp() - _LAST_RUN_STATS['start_ts'].timestamp()
+    if seconds > 60:
+        duration = '{0:.2f} min'.format(seconds / 60, 2)
+    else:
+        duration = '{0:.2f} sec'.format(seconds, 2)
+
+    print('{}[√] [{}] Update of {} pages complete ({}){}'.format(
+        ANSI['green'],
+        end_ts.strftime('%Y-%m-%d %H:%M:%S'),
+        num_links,
+        duration,
+        ANSI['reset'],
+    ))
+    print('    - {} entries skipped'.format(_LAST_RUN_STATS['skipped']))
+    print('    - {} entries updated'.format(_LAST_RUN_STATS['succeded']))
+    print('    - {} errors'.format(_LAST_RUN_STATS['failed']))
+    print('    To view your archive, open: {}/index.html'.format(OUTPUT_DIR.replace(REPO_DIR + '/', '')))

+ 101 - 91
archivebox/parse.py

@@ -1,17 +1,19 @@
-# coding: utf-8
-
 """
-Everything related to parsing links from bookmark services.
+Everything related to parsing links from input sources.
 
 
 For a list of supported services, see the README.md.
-For examples of supported files see examples/.
+For examples of supported import formats see tests/.

-Parsed link schema: {
+Link: {
     'url': 'https://example.com/example/?abc=123&xyc=345#lmnop',
-    'timestamp': '15442123124234',
+    'timestamp': '1544212312.4234',
     'title': 'Example.com Page Title',
-    'sources': ['ril_export.html', 'downloads/getpocket.com.txt'],
     'tags': 'abc,def',
+    'sources': [
+        'output/sources/ril_export.html',
+        'output/sources/getpocket.com-1523422111.txt',
+        'output/sources/stdin-234234112312.txt'
+    ]
 }
 """
 
 
@@ -19,45 +21,59 @@ import re
 import json

 from datetime import datetime
-from collections import OrderedDict
 import xml.etree.ElementTree as etree
 
 
-from config import ANSI
+from config import TIMEOUT
 from util import (
     str_between,
     URL_REGEX,
-    check_url_parsing,
+    check_url_parsing_invariants,
+    progress,
 )
 )
 
 
 
 
-def parse_links(path):
-    """parse a list of links dictionaries from a bookmark export file"""
-    
-    check_url_parsing()
-
-    links = []
-    with open(path, 'r', encoding='utf-8') as file:
-        print('{green}[*] [{}] Parsing new links from output/sources/{}...{reset}'.format(
-            datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
-            path.rsplit('/', 1)[-1],
-            **ANSI,
-        ))
+def parse_links(source_file):
+    """parse a list of URLs with their metadata from an 
+       RSS feed, bookmarks export, or text file
+    """
 
 
-        for parser_name, parser_func in PARSERS.items():
+    check_url_parsing_invariants()
+    PARSERS = (
+        # Specialized parsers
+        ('Pocket HTML', parse_pocket_html_export),
+        ('Pinboard RSS', parse_pinboard_rss_export),
+        ('Shaarli RSS', parse_shaarli_rss_export),
+        ('Medium RSS', parse_medium_rss_export),
+        
+        # General parsers
+        ('Netscape HTML', parse_netscape_html_export),
+        ('Generic RSS', parse_rss_export),
+        ('Generic JSON', parse_json_export),
+
+        # Fallback parser
+        ('Plain Text', parse_plain_text_export),
+    )
+    end = progress(TIMEOUT * 4, prefix='      ')
+    with open(source_file, 'r', encoding='utf-8') as file:
+        for parser_name, parser_func in PARSERS:
             try:
-                links += list(parser_func(file))
+                links = list(parser_func(file))
                 if links:
-                    break
+                    end()
+                    return links, parser_name
             except Exception as err:
             except Exception as err:
-                # we try each parser one by one, wong parsers will throw exeptions
-                # if unsupported and we accept the first one that passes
-                # uncomment the following line to see why the parser was unsupported for each attempted format
+                # Parsers are tried one by one down the list, and the first one
+                # that succeeds is used. To see why a certain parser was not used
+                # due to error or format incompatibility, uncomment this line:
                 # print('[!] Parser {} failed: {} {}'.format(parser_name, err.__class__.__name__, err))
                 # print('[!] Parser {} failed: {} {}'.format(parser_name, err.__class__.__name__, err))
                 pass
                 pass
 
 
-    return links, parser_name
+    end()
+    return [], 'Plain Text'
 
 
 
 
+### Import Parser Functions
+
 def parse_pocket_html_export(html_file):
 def parse_pocket_html_export(html_file):
     """Parse Pocket-format bookmarks export files (produced by getpocket.com/export/)"""
 
@@ -81,40 +97,57 @@ def parse_pocket_html_export(html_file):
                 'sources': [html_file.name],
                 'sources': [html_file.name],
             }
             }
 
 
-def parse_pinboard_json_export(json_file):
+
+def parse_json_export(json_file):
     """Parse JSON-format bookmarks export files (produced by pinboard.in/export/, or wallabag)"""
     """Parse JSON-format bookmarks export files (produced by pinboard.in/export/, or wallabag)"""
+
     json_file.seek(0)
     json_file.seek(0)
-    json_content = json.load(json_file)
-    for line in json_content:
+    links = json.load(json_file)
+    json_date = lambda s: datetime.strptime(s, '%Y-%m-%dT%H:%M:%S%z')
+
+    for link in links:
         # example line
         # example line
         # {"href":"http:\/\/www.reddit.com\/r\/example","description":"title here","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android"}]
         # {"href":"http:\/\/www.reddit.com\/r\/example","description":"title here","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android"}]
-        if line:
-            erg = line
-            if erg.get('timestamp'):
-                timestamp = str(erg['timestamp']/10000000)  # chrome/ff histories use a very precise timestamp
-            elif erg.get('time'):
-                timestamp = str(datetime.strptime(erg['time'].split(',', 1)[0], '%Y-%m-%dT%H:%M:%SZ').timestamp())
-            elif erg.get('created_at'):
-                timestamp = str(datetime.strptime(erg['created_at'], '%Y-%m-%dT%H:%M:%S%z').timestamp())
-            else:
-                timestamp = str(datetime.now().timestamp())
-            if erg.get('href'):
-                url = erg['href']
-            else:
-                url = erg['url']
-            if erg.get('description'):
-                title = (erg.get('description') or '').replace(' — Readability', '')
-            else:
-                title = erg['title'].strip()
-
-            info = {
+        if link:
+            # Parse URL
+            url = link.get('href') or link.get('url') or link.get('URL')
+            if not url:
+                raise Exception('JSON must contain URL in each entry [{"url": "http://...", ...}, ...]')
+
+            # Parse the timestamp
+            ts_str = str(datetime.now().timestamp())
+            if link.get('timestamp'):
+                # chrome/ff histories use a very precise timestamp
+                ts_str = str(link['timestamp'] / 10000000)  
+            elif link.get('time'):
+                ts_str = str(json_date(link['time'].split(',', 1)[0]).timestamp())
+            elif link.get('created_at'):
+                ts_str = str(json_date(link['created_at']).timestamp())
+            elif link.get('created'):
+                ts_str = str(json_date(link['created']).timestamp())
+            elif link.get('date'):
+                ts_str = str(json_date(link['date']).timestamp())
+            elif link.get('bookmarked'):
+                ts_str = str(json_date(link['bookmarked']).timestamp())
+            elif link.get('saved'):
+                ts_str = str(json_date(link['saved']).timestamp())
+            
+            # Parse the title
+            title = None
+            if link.get('title'):
+                title = link['title'].strip() or None
+            elif link.get('description'):
+                title = link['description'].replace(' — Readability', '').strip() or None
+            elif link.get('name'):
+                title = link['name'].strip() or None
+
+            yield {
                 'url': url,
                 'url': url,
-                'timestamp': timestamp,
-                'title': title or None,
-                'tags': erg.get('tags') or '',
+                'timestamp': ts_str,
+                'title': title,
+                'tags': link.get('tags') or '',
                 'sources': [json_file.name],
                 'sources': [json_file.name],
             }
             }
-            yield info
 
 
 
 
 def parse_rss_export(rss_file):
 def parse_rss_export(rss_file):
@@ -139,15 +172,15 @@ def parse_rss_export(rss_file):
         def get_row(key):
         def get_row(key):
             return [r for r in rows if r.strip().startswith('<{}>'.format(key))][0]
             return [r for r in rows if r.strip().startswith('<{}>'.format(key))][0]
 
 
-        title = str_between(get_row('title'), '<![CDATA[', ']]').strip()
         url = str_between(get_row('link'), '<link>', '</link>')
         url = str_between(get_row('link'), '<link>', '</link>')
         ts_str = str_between(get_row('pubDate'), '<pubDate>', '</pubDate>')
         ts_str = str_between(get_row('pubDate'), '<pubDate>', '</pubDate>')
         time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %z")
         time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %z")
+        title = str_between(get_row('title'), '<![CDATA[', ']]').strip() or None
 
 
         yield {
         yield {
             'url': url,
             'url': url,
             'timestamp': str(time.timestamp()),
             'timestamp': str(time.timestamp()),
-            'title': title or None,
+            'title': title,
             'tags': '',
             'tags': '',
             'sources': [rss_file.name],
             'sources': [rss_file.name],
         }
         }
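The row scanning above leans on str_between from util.py; a tiny sketch of what it extracts (the sample string is made up):

    from util import str_between

    row = '<link>https://example.com/post</link>'
    print(str_between(row, '<link>', '</link>'))   # -> 'https://example.com/post'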
@@ -224,9 +257,6 @@ def parse_pinboard_rss_export(rss_file):
         tags = item.find("{http://purl.org/dc/elements/1.1/}subject").text if item.find("{http://purl.org/dc/elements/1.1/}subject") else None
         tags = item.find("{http://purl.org/dc/elements/1.1/}subject").text if item.find("{http://purl.org/dc/elements/1.1/}subject") else None
         title = item.find("{http://purl.org/rss/1.0/}title").text.strip() if item.find("{http://purl.org/rss/1.0/}title").text.strip() else None
         title = item.find("{http://purl.org/rss/1.0/}title").text.strip() if item.find("{http://purl.org/rss/1.0/}title").text.strip() else None
         ts_str = item.find("{http://purl.org/dc/elements/1.1/}date").text if item.find("{http://purl.org/dc/elements/1.1/}date").text else None
         ts_str = item.find("{http://purl.org/dc/elements/1.1/}date").text if item.find("{http://purl.org/dc/elements/1.1/}date").text else None
-        #       = 🌈🌈🌈🌈
-        #        = 🌈🌈🌈🌈
-        #         = 🏆🏆🏆🏆
         
         
         # Pinboard includes a colon in its date stamp timezone offsets, which
         # Pinboard includes a colon in its date stamp timezone offsets, which
         # Python can't parse. Remove it:
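For illustration, the colon stripping this comment refers to comes down to something like the following (the actual line falls outside this hunk):

    from datetime import datetime

    ts_str = '2019-01-01T12:00:00+01:00'    # Pinboard-style offset with a colon
    ts_str = ts_str[:-3] + ts_str[-2:]      # -> '2019-01-01T12:00:00+0100'
    print(datetime.strptime(ts_str, '%Y-%m-%dT%H:%M:%S%z'))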
         # Python can't parse. Remove it:
@@ -254,8 +284,6 @@ def parse_medium_rss_export(rss_file):
     root = etree.parse(rss_file).getroot()
     root = etree.parse(rss_file).getroot()
     items = root.find("channel").findall("item")
     items = root.find("channel").findall("item")
     for item in items:
     for item in items:
-        # for child in item:
-        #     print(child.tag, child.text)
         url = item.find("link").text
         url = item.find("link").text
         title = item.find("title").text.strip()
         title = item.find("title").text.strip()
         ts_str = item.find("pubDate").text
         ts_str = item.find("pubDate").text
@@ -274,31 +302,13 @@ def parse_plain_text_export(text_file):
     """Parse raw links from each line in a text file"""
     """Parse raw links from each line in a text file"""
 
 
     text_file.seek(0)
     text_file.seek(0)
-    text_content = text_file.readlines()
-    for line in text_content:
-        if line:
-            urls = re.findall(URL_REGEX, line)
-            
-            for url in urls:
-                url = url.strip()
-                time = datetime.now()
-                
-                yield {
-                    'url': url,
-                    'timestamp': str(time.timestamp()),
-                    'title': None,
-                    'tags': '',
-                    'sources': [text_file.name],
-                }
-
-
-PARSERS = OrderedDict([
-    ('Pocket HTML', parse_pocket_html_export),
-    ('Pinboard JSON', parse_pinboard_json_export),
-    ('Netscape HTML', parse_netscape_html_export),
-    ('RSS', parse_rss_export),
-    ('Pinboard RSS', parse_pinboard_rss_export),
-    ('Shaarli RSS', parse_shaarli_rss_export),
-    ('Medium RSS', parse_medium_rss_export),
-    ('Plain Text', parse_plain_text_export),
-])
+    for line in text_file.readlines():
+        urls = re.findall(URL_REGEX, line) if line.strip() else ()
+        for url in urls:
+            yield {
+                'url': url,
+                'timestamp': str(datetime.now().timestamp()),
+                'title': None,
+                'tags': '',
+                'sources': [text_file.name],
+            }
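A quick sketch of the simplified plain-text parser, assuming URL_REGEX matches bare http(s) URLs up to whitespace (the file contents here are made up; two URLs on one line yield two entries):

    import tempfile
    from parse import parse_plain_text_export

    with tempfile.NamedTemporaryFile('w+', suffix='.txt') as f:
        f.write('notes https://example.com/a and https://example.com/b\n')
        f.flush()
        for link in parse_plain_text_export(f):
            print(link['url'], link['sources'])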

+ 55 - 3
archivebox/peekable.py → archivebox/stdlib_patches.py

@@ -1,10 +1,64 @@
+"""
+Patches, additions, and shortcuts for Python standard library functions.
+"""
+
+### subprocess
+
+from subprocess import (
+    Popen,
+    PIPE,
+    DEVNULL, 
+    CompletedProcess,
+    TimeoutExpired,
+    CalledProcessError,
+)
+
+def run(*popenargs, input=None, capture_output=False, timeout=None, check=False, **kwargs):
+    """Patched of subprocess.run to fix blocking io making timeout=innefective"""
+
+    if input is not None:
+        if 'stdin' in kwargs:
+            raise ValueError('stdin and input arguments may not both be used.')
+        kwargs['stdin'] = PIPE
+
+    if capture_output:
+        if ('stdout' in kwargs) or ('stderr' in kwargs):
+            raise ValueError('stdout and stderr arguments may not be used '
+                             'with capture_output.')
+        kwargs['stdout'] = PIPE
+        kwargs['stderr'] = PIPE
+
+    with Popen(*popenargs, **kwargs) as process:
+        try:
+            stdout, stderr = process.communicate(input, timeout=timeout)
+        except TimeoutExpired:
+            process.kill()
+            try:
+                stdout, stderr = process.communicate(input, timeout=2)
+            except:
+                pass
+            raise TimeoutExpired(popenargs[0][0], timeout)
+        except BaseException as err:
+            process.kill()
+            # We don't call process.wait() as .__exit__ does that for us.
+            raise 
+        retcode = process.poll()
+        if check and retcode:
+            raise CalledProcessError(retcode, process.args,
+                                     output=stdout, stderr=stderr)
+    return CompletedProcess(process.args, retcode, stdout, stderr)
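A minimal sketch of the patched helper in use, assuming a POSIX sleep binary is available (values are illustrative):

    from stdlib_patches import run, PIPE, TimeoutExpired

    try:
        run(['sleep', '10'], stdout=PIPE, stderr=PIPE, timeout=2)
    except TimeoutExpired:
        print('child killed after 2s instead of hanging on blocked pipes')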
+
+
+
+### collections
+
 from sys import maxsize
 from sys import maxsize
 from itertools import islice
 from itertools import islice
 from collections import deque
 from collections import deque
 
 
 _marker = object()
 _marker = object()
 
 
-class Peekable(object):
+class PeekableGenerator:
     """Peekable version of a normal python generator.
     """Peekable version of a normal python generator.
        Useful when you don't want to evaluate the entire iterable to look at
        Useful when you don't want to evaluate the entire iterable to look at
        a specific item at a given idx.
        a specific item at a given idx.
@@ -74,8 +128,6 @@ class Peekable(object):
 
 
         return next(self._it)
         return next(self._it)
 
 
-    next = __next__  # For Python 2 compatibility
-
     def _get_slice(self, index):
     def _get_slice(self, index):
         # Normalize the slice's arguments
         # Normalize the slice's arguments
         step = 1 if (index.step is None) else index.step
         step = 1 if (index.step is None) else index.step
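A rough sketch of what the renamed class is for, going off its docstring (the constructor and index lookup are assumed to mirror more_itertools-style peekables and are not shown in this hunk):

    from stdlib_patches import PeekableGenerator

    squares = PeekableGenerator(i * i for i in range(10**9))   # never fully realized
    print(next(squares))   # consumed lazily, one item at a time
    print(squares[3])      # assumed: look ahead by index without exhausting the generator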

+ 8 - 3
archivebox/templates/link_index.html

@@ -192,22 +192,27 @@
                     Bookmarked: <small title="Timestamp: $timestamp">$bookmarked_date</small>
                     Bookmarked: <small title="Timestamp: $timestamp">$bookmarked_date</small>
                     &nbsp; | &nbsp;
                     &nbsp; | &nbsp;
                     Last updated: <small title="Timestamp: $updated">$updated_date</small>
                     Last updated: <small title="Timestamp: $updated">$updated_date</small>
+                    &nbsp; | &nbsp;
+                    Total files: <small title="Archive methods">🗃 $num_outputs</small>
                 </div>
                 </div>
                 <div class="col-lg-4 alert well">
                 <div class="col-lg-4 alert well">
                     Type: 
                     Type: 
                     <span class="badge badge-default">$extension</span>
                     <span class="badge badge-default">$extension</span>
                     &nbsp; | &nbsp;
                     &nbsp; | &nbsp;
                     Tags:
                     Tags:
-                    <span class="badge badge-success">$tags</span> 
+                    <span class="badge badge-warning">$tags</span> 
+                    &nbsp; | &nbsp;
+                    Status:
+                    <span class="badge badge-$status_color">$status</span> 
                 </div>
                 </div>
                 <div class="col-lg-4 alert well">
                 <div class="col-lg-4 alert well">
-                    Download:
+                    Archive Methods:
                     <a href="index.json" title="JSON summary of archived link.">JSON</a> | 
                     <a href="index.json" title="JSON summary of archived link.">JSON</a> | 
                     <a href="warc/" title="Any WARC archives for the page">WARC</a> | 
                     <a href="warc/" title="Any WARC archives for the page">WARC</a> | 
                     <a href="media/" title="Audio, Video, and Subtitle files.">Media</a> | 
                     <a href="media/" title="Audio, Video, and Subtitle files.">Media</a> | 
                     <a href="git/" title="Any git repos at the url">Git Repos</a> | 
                     <a href="git/" title="Any git repos at the url">Git Repos</a> | 
                     <a href="favicon.ico" title="Any git repos at the url">Favicon</a> | 
                     <a href="favicon.ico" title="Any git repos at the url">Favicon</a> | 
-                    <a href="." title="Webserver-provided index of files directory.">More files...</a>
+                    <a href="." title="Webserver-provided index of files directory.">See all files...</a>
                 </div>
                 </div>
                 <hr/>
                 <hr/>
                 <div class="col-lg-2">
                 <div class="col-lg-2">
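The new fields are ordinary $-placeholders; a hedged sketch of filling them in, assuming a string.Template-style substitution (the actual rendering code lives elsewhere in the repo, and the values here are made up):

    from string import Template

    template = Template(open('archivebox/templates/link_index.html').read())
    html = template.safe_substitute(
        num_outputs=7,
        status='archived',
        status_color='success',
        tags='example-tag',
        # ...plus the existing $title, $url, $timestamp, $updated_date, etc.
    )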

+ 134 - 139
archivebox/util.py

@@ -8,8 +8,8 @@ from urllib.parse import urlparse, quote
 from decimal import Decimal
 from decimal import Decimal
 from datetime import datetime
 from datetime import datetime
 from multiprocessing import Process
 from multiprocessing import Process
-from subprocess import TimeoutExpired, Popen, PIPE, DEVNULL, CompletedProcess, CalledProcessError
 
 
+from stdlib_patches import run, PIPE, DEVNULL
 from config import (
 from config import (
     ANSI,
     ANSI,
     TERM_WIDTH,
     TERM_WIDTH,
@@ -19,8 +19,6 @@ from config import (
     OUTPUT_PERMISSIONS,
     OUTPUT_PERMISSIONS,
     TIMEOUT,
     TIMEOUT,
     SHOW_PROGRESS,
     SHOW_PROGRESS,
-    CHECK_SSL_VALIDITY,
-    WGET_USER_AGENT,
     CURL_BINARY,
     CURL_BINARY,
     WGET_BINARY,
     WGET_BINARY,
     CHROME_BINARY,
     CHROME_BINARY,
@@ -37,6 +35,13 @@ from config import (
     FETCH_MEDIA,
     FETCH_MEDIA,
     SUBMIT_ARCHIVE_DOT_ORG,
     SUBMIT_ARCHIVE_DOT_ORG,
     ARCHIVE_DIR_NAME,
     ARCHIVE_DIR_NAME,
+    RESOLUTION,
+    CHECK_SSL_VALIDITY,
+    WGET_USER_AGENT,
+    CHROME_USER_AGENT,
+    CHROME_USER_DATA_DIR,
+    CHROME_HEADLESS,
+    CHROME_SANDBOX,
 )
 )
 
 
 ### Parsing Helpers
 ### Parsing Helpers
@@ -56,6 +61,7 @@ extension = lambda url: basename(url).rsplit('.', 1)[-1].lower() if '.' in basen
 base_url = lambda url: without_scheme(url)  # uniq base url used to dedupe links
 base_url = lambda url: without_scheme(url)  # uniq base url used to dedupe links
 
 
 short_ts = lambda ts: ts.split('.')[0]
 short_ts = lambda ts: ts.split('.')[0]
+urlencode = lambda s: quote(s, encoding='utf-8', errors='replace')
 
 
 URL_REGEX = re.compile(
 URL_REGEX = re.compile(
     r'http[s]?://'                    # start matching from allowed schemes
     r'http[s]?://'                    # start matching from allowed schemes
@@ -109,66 +115,74 @@ def check_links_structure(links):
 def check_dependencies():
 def check_dependencies():
     """Check that all necessary dependencies are installed, and have valid versions"""
     """Check that all necessary dependencies are installed, and have valid versions"""
 
 
-    python_vers = float('{}.{}'.format(sys.version_info.major, sys.version_info.minor))
-    if python_vers < 3.5:
-        print('{}[X] Python version is not new enough: {} (>3.5 is required){}'.format(ANSI['red'], python_vers, ANSI['reset']))
-        print('    See https://github.com/pirate/ArchiveBox#troubleshooting for help upgrading your Python installation.')
-        raise SystemExit(1)
-
-    if FETCH_FAVICON or SUBMIT_ARCHIVE_DOT_ORG:
-        if run(['which', CURL_BINARY], stdout=DEVNULL).returncode or run([CURL_BINARY, '--version'], stdout=DEVNULL).returncode:
-            print('{red}[X] Missing dependency: curl{reset}'.format(**ANSI))
-            print('    Run ./setup.sh, then confirm it was installed with: {} --version'.format(CURL_BINARY))
-            print('    See https://github.com/pirate/ArchiveBox for help.')
+    try:
+        python_vers = float('{}.{}'.format(sys.version_info.major, sys.version_info.minor))
+        if python_vers < 3.5:
+            print('{}[X] Python version is not new enough: {} (>3.5 is required){}'.format(ANSI['red'], python_vers, ANSI['reset']))
+            print('    See https://github.com/pirate/ArchiveBox/wiki/Troubleshooting#python for help upgrading your Python installation.')
             raise SystemExit(1)
             raise SystemExit(1)
 
 
-    if FETCH_WGET or FETCH_WARC:
-        if run(['which', WGET_BINARY], stdout=DEVNULL).returncode or run([WGET_BINARY, '--version'], stdout=DEVNULL).returncode:
-            print('{red}[X] Missing dependency: wget{reset}'.format(**ANSI))
-            print('    Run ./setup.sh, then confirm it was installed with: {} --version'.format(WGET_BINARY))
-            print('    See https://github.com/pirate/ArchiveBox for help.')
-            raise SystemExit(1)
+        if FETCH_FAVICON or SUBMIT_ARCHIVE_DOT_ORG:
+            if run(['which', CURL_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([CURL_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode:
+                print('{red}[X] Missing dependency: curl{reset}'.format(**ANSI))
+                print('    Install it, then confirm it works with: {} --version'.format(CURL_BINARY))
+                print('    See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
+                raise SystemExit(1)
 
 
-    if FETCH_PDF or FETCH_SCREENSHOT or FETCH_DOM:
-        if run(['which', CHROME_BINARY], stdout=DEVNULL).returncode:
-            print('{}[X] Missing dependency: {}{}'.format(ANSI['red'], CHROME_BINARY, ANSI['reset']))
-            print('    Run ./setup.sh, then confirm it was installed with: {} --version'.format(CHROME_BINARY))
-            print('    See https://github.com/pirate/ArchiveBox for help.')
-            raise SystemExit(1)
+        if FETCH_WGET or FETCH_WARC:
+            if run(['which', WGET_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([WGET_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode:
+                print('{red}[X] Missing dependency: wget{reset}'.format(**ANSI))
+                print('    Install it, then confirm it works with: {} --version'.format(WGET_BINARY))
+                print('    See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
+                raise SystemExit(1)
 
 
-        # parse chrome --version e.g. Google Chrome 61.0.3114.0 canary / Chromium 59.0.3029.110 built on Ubuntu, running on Ubuntu 16.04
-        try:
-            result = run([CHROME_BINARY, '--version'], stdout=PIPE)
-            version_str = result.stdout.decode('utf-8')
-            version_lines = re.sub("(Google Chrome|Chromium) (\\d+?)\\.(\\d+?)\\.(\\d+?).*?$", "\\2", version_str).split('\n')
-            version = [l for l in version_lines if l.isdigit()][-1]
-            if int(version) < 59:
-                print(version_lines)
-                print('{red}[X] Chrome version must be 59 or greater for headless PDF, screenshot, and DOM saving{reset}'.format(**ANSI))
-                print('    See https://github.com/pirate/ArchiveBox for help.')
+        if FETCH_PDF or FETCH_SCREENSHOT or FETCH_DOM:
+            if run(['which', CHROME_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode:
+                print('{}[X] Missing dependency: {}{}'.format(ANSI['red'], CHROME_BINARY, ANSI['reset']))
+                print('    Install it, then confirm it works with: {} --version'.format(CHROME_BINARY))
+                print('    See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
                 raise SystemExit(1)
                 raise SystemExit(1)
-        except (IndexError, TypeError, OSError):
-            print('{red}[X] Failed to parse Chrome version, is it installed properly?{reset}'.format(**ANSI))
-            print('    Run ./setup.sh, then confirm it was installed with: {} --version'.format(CHROME_BINARY))
-            print('    See https://github.com/pirate/ArchiveBox for help.')
-            raise SystemExit(1)
 
 
-    if FETCH_GIT:
-        if run(['which', GIT_BINARY], stdout=DEVNULL).returncode or run([GIT_BINARY, '--version'], stdout=DEVNULL).returncode:
-            print('{red}[X] Missing dependency: git{reset}'.format(**ANSI))
-            print('    Run ./setup.sh, then confirm it was installed with: {} --version'.format(GIT_BINARY))
-            print('    See https://github.com/pirate/ArchiveBox for help.')
-            raise SystemExit(1)
+            # parse chrome --version e.g. Google Chrome 61.0.3114.0 canary / Chromium 59.0.3029.110 built on Ubuntu, running on Ubuntu 16.04
+            try:
+                result = run([CHROME_BINARY, '--version'], stdout=PIPE)
+                version_str = result.stdout.decode('utf-8')
+                version_lines = re.sub("(Google Chrome|Chromium) (\\d+?)\\.(\\d+?)\\.(\\d+?).*?$", "\\2", version_str).split('\n')
+                version = [l for l in version_lines if l.isdigit()][-1]
+                if int(version) < 59:
+                    print(version_lines)
+                    print('{red}[X] Chrome version must be 59 or greater for headless PDF, screenshot, and DOM saving{reset}'.format(**ANSI))
+                    print('    See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
+                    raise SystemExit(1)
+            except (IndexError, TypeError, OSError):
+                print('{red}[X] Failed to parse Chrome version, is it installed properly?{reset}'.format(**ANSI))
+                print('    Install it, then confirm it works with: {} --version'.format(CHROME_BINARY))
+                print('    See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
+                raise SystemExit(1)
 
 
-    if FETCH_MEDIA:
-        if run(['which', YOUTUBEDL_BINARY], stdout=DEVNULL).returncode or run([YOUTUBEDL_BINARY, '--version'], stdout=DEVNULL).returncode:
-            print('{red}[X] Missing dependency: youtube-dl{reset}'.format(**ANSI))
-            print('    Run ./setup.sh, then confirm it was installed with: {} --version'.format(YOUTUBEDL_BINARY))
-            print('    See https://github.com/pirate/ArchiveBox for help.')
-            raise SystemExit(1)
+        if FETCH_GIT:
+            if run(['which', GIT_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([GIT_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode:
+                print('{red}[X] Missing dependency: git{reset}'.format(**ANSI))
+                print('    Install it, then confirm it works with: {} --version'.format(GIT_BINARY))
+                print('    See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
+                raise SystemExit(1)
 
 
-def check_url_parsing():
+        if FETCH_MEDIA:
+            if run(['which', YOUTUBEDL_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([YOUTUBEDL_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode:
+                print('{red}[X] Missing dependency: youtube-dl{reset}'.format(**ANSI))
+                print('    Install it, then confirm it works with: {} --version'.format(YOUTUBEDL_BINARY))
+                print('    See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
+                raise SystemExit(1)
+    except (KeyboardInterrupt, Exception):
+        raise SystemExit(1)
+
+def check_url_parsing_invariants():
     """Check that plain text regex URL parsing works as expected"""
     """Check that plain text regex URL parsing works as expected"""
+
+    # this is a last line of defense to make sure the URL_REGEX isn't
+    # misbehaving, as the consequences could be disastrous and lead to many
+    # incorrect/badly parsed links being added to the archive
+
     test_urls = '''
     test_urls = '''
     https://example1.com/what/is/happening.html?what=1#how-about-this=1
     https://example1.com/what/is/happening.html?what=1#how-about-this=1
     https://example2.com/what/is/happening/?what=1#how-about-this=1
     https://example2.com/what/is/happening/?what=1#how-about-this=1
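A hedged sketch of the kind of invariant being guarded, reusing one of the test URLs above (the full URL_REGEX definition is longer than the excerpt shown earlier, so exact match boundaries are not asserted here):

    import re
    from util import URL_REGEX

    line = 'ftp://skipped.example and https://example1.com/what/is/happening.html?what=1#how-about-this=1'
    print(re.findall(URL_REGEX, line))    # expect exactly one match: the https URL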
@@ -276,22 +290,9 @@ def wget_output_path(link):
     if link.get('latest', {}).get('wget'):
     if link.get('latest', {}).get('wget'):
         return link['latest']['wget']
         return link['latest']['wget']
 
 
-    urlencode = lambda s: quote(s, encoding='utf-8', errors='replace')
-
     if is_static_file(link['url']):
     if is_static_file(link['url']):
         return urlencode(without_scheme(without_fragment(link['url'])))
         return urlencode(without_scheme(without_fragment(link['url'])))
 
 
-    # Since the wget algorithm to for -E (appending .html) is incredibly complex
-    # instead of trying to emulate it here, we just look in the output folder
-    # to see what html file wget actually created as the output
-    link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
-    full_path = without_fragment(without_query(path(link['url']))).strip('/')
-    search_dir = os.path.join(
-        link_dir,
-        domain(link['url']),
-        full_path,
-    )
-
     # Wget downloads can save in a number of different ways depending on the url
     # Wget downloads can save in a number of different ways depending on the url
     #    https://example.com
     #    https://example.com
     #       > output/archive/<timestamp>/example.com/index.html
     #       > output/archive/<timestamp>/example.com/index.html
@@ -304,6 +305,19 @@ def wget_output_path(link):
 
 
     # There's also lots of complexity around how the urlencoding and renaming
     # There's also lots of complexity around how the urlencoding and renaming
     # is done for pages with query and hash fragments or extensions like shtml / htm
     # is done for pages with query and hash fragments or extensions like shtml / htm
+
+    # Since the wget algorithm for -E (appending .html) is incredibly complex
+    # and there's no way to get the computed output path from wget itself,
+    # instead of trying to reverse-engineer how it calculates the name,
+    # we just look in the output folder and read the filename wget actually used from the filesystem
+    link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
+    full_path = without_fragment(without_query(path(link['url']))).strip('/')
+    search_dir = os.path.join(
+        link_dir,
+        domain(link['url']),
+        full_path,
+    )
+
     for _ in range(4):
     for _ in range(4):
         if os.path.exists(search_dir):
         if os.path.exists(search_dir):
             if os.path.isdir(search_dir):
             if os.path.isdir(search_dir):
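Concretely, for a link like https://example.com/what/is/happening/ bookmarked at timestamp 1523422111, the probe built above is roughly (paths illustrative):

    import os
    from config import ARCHIVE_DIR

    search_dir = os.path.join(ARCHIVE_DIR, '1523422111', 'example.com', 'what/is/happening')
    print(os.path.exists(search_dir))   # the .html name wget actually wrote is read from this directory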
@@ -356,47 +370,6 @@ def str_between(string, start, end=None):
 
 
     return content
     return content
 
 
-def pretty_path(path):
-    """convert paths like .../ArchiveBox/archivebox/../output/abc into output/abc"""
-    return path.replace(REPO_DIR + '/', '')
-
-
-def print_error_hints(cmd, pwd, err=None, hints=None, prefix='        '):
-    """quote the argument with whitespace in a command so the user can 
-       copy-paste the outputted string directly to run the cmd
-    """
-
-    # Prettify CMD string and make it save to copy-paste by quoting arguments
-    quoted_cmd = ' '.join(
-        '"{}"'.format(arg) if ' ' in arg else arg
-        for arg in cmd
-    )
-
-    # Prettify error output hints string and limit to five lines
-    hints = hints or getattr(err, 'hints', None)
-    if hints:
-        hints = hints if isinstance(hints, (list, tuple)) else hints.split('\n')
-        hints = (
-            '    {}{}{}'.format(ANSI['lightyellow'], line.strip(), ANSI['reset'])
-            for line in hints[:5] if line.strip()
-        )
-    else:
-        hints = ()
-
-    output_lines = [
-        '{}Failed: {} {}{}'.format(ANSI['red'], err.__class__.__name__, err, ANSI['reset']),
-        *hints,
-        'Run to see full output:'        
-        '    cd {};'.format(pwd),
-        '    {}'.format(quoted_cmd),
-    ]
-
-    return '\n'.join(
-        '{}{}'.format(prefix, line)
-        for line in output_lines
-        if line
-    )
-
 
 
 ### Link Helpers
 ### Link Helpers
 
 
@@ -571,37 +544,59 @@ def chmod_file(path, cwd='.', permissions=OUTPUT_PERMISSIONS, timeout=30):
         print('     ', chmod_result.stderr.decode())
         print('     ', chmod_result.stderr.decode())
         raise Exception('Failed to chmod {}/{}'.format(cwd, path))
         raise Exception('Failed to chmod {}/{}'.format(cwd, path))
 
 
-def run(*popenargs, input=None, capture_output=False, timeout=None, check=False, **kwargs):
-    """Patched of subprocess.run to fix blocking io making timeout=innefective"""
 
 
-    if input is not None:
-        if 'stdin' in kwargs:
-            raise ValueError('stdin and input arguments may not both be used.')
-        kwargs['stdin'] = PIPE
+CACHED_USER_DATA_DIR = CHROME_USER_DATA_DIR
 
 
-    if capture_output:
-        if ('stdout' in kwargs) or ('stderr' in kwargs):
-            raise ValueError('stdout and stderr arguments may not be used '
-                             'with capture_output.')
-        kwargs['stdout'] = PIPE
-        kwargs['stderr'] = PIPE
+def chrome_args(binary=CHROME_BINARY, user_data_dir=CHROME_USER_DATA_DIR,
+               headless=CHROME_HEADLESS, sandbox=CHROME_SANDBOX,
+               check_ssl_validity=CHECK_SSL_VALIDITY, user_agent=CHROME_USER_AGENT,
+               resolution=RESOLUTION, timeout=TIMEOUT):
+    """helper to build up a chrome shell command with arguments"""
 
 
-    with Popen(*popenargs, **kwargs) as process:
-        try:
-            stdout, stderr = process.communicate(input, timeout=timeout)
-        except TimeoutExpired:
-            process.kill()
-            try:
-                stdout, stderr = process.communicate(input, timeout=2)
-            except:
-                pass
-            raise TimeoutExpired(popenargs[0][0], timeout)
-        except BaseException as err:
-            process.kill()
-            # We don't call process.wait() as .__exit__ does that for us.
-            raise 
-        retcode = process.poll()
-        if check and retcode:
-            raise CalledProcessError(retcode, process.args,
-                                     output=stdout, stderr=stderr)
-    return CompletedProcess(process.args, retcode, stdout, stderr)
+    global CACHED_USER_DATA_DIR
+    user_data_dir = user_data_dir or CACHED_USER_DATA_DIR
+    cmd_args = [binary]
+
+    if headless:
+        cmd_args += ('--headless',)
+    
+    if not sandbox:
+        # don't use the GPU or sandbox when running inside a Docker container
+        cmd_args += ('--no-sandbox', '--disable-gpu')
+
+    if not check_ssl_validity:
+        cmd_args += ('--disable-web-security', '--ignore-certificate-errors')
+
+    if user_agent:
+        cmd_args += ('--user-agent={}'.format(user_agent),)
+
+    if resolution:
+        cmd_args += ('--window-size={}'.format(resolution),)
+
+    if timeout:
+        cmd_args += ('--timeout={}'.format(timeout * 1000),)
+
+    # Find chrome user data directory
+    default_profile_paths = (
+        '~/.config/chromium',
+        '~/.config/google-chrome',
+        '~/.config/google-chrome-beta',
+        '~/.config/google-chrome-unstable',
+        '~/Library/Application Support/Chromium',
+        '~/Library/Application Support/Google/Chrome',
+        '~/Library/Application Support/Google/Chrome Canary',
+        '~/AppData/Local/Chromium/User Data',
+        '~/AppData/Local/Google/Chrome/User Data',
+        '~/AppData/Local/Google/Chrome SxS/User Data',
+    )
+    if user_data_dir:
+        cmd_args.append('--user-data-dir={}'.format(user_data_dir))
+    else:
+        for path in default_profile_paths:
+            full_path = os.path.expanduser(path)
+            if os.path.exists(full_path):
+                CACHED_USER_DATA_DIR = full_path
+                cmd_args.append('--user-data-dir={}'.format(full_path))
+                break
+    
+    return cmd_args
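A short sketch of composing the helper into a full command, e.g. for headless PDF printing (the URL and output handling here are illustrative, not the archive method's actual wiring):

    from stdlib_patches import run, PIPE
    from util import chrome_args

    cmd = [*chrome_args(timeout=60), '--print-to-pdf', 'https://example.com']
    result = run(cmd, stdout=PIPE, stderr=PIPE, timeout=60)
    print(result.returncode)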