View source code

major refactor + ability to handle http downloads

Nick Sweeting 8 years ago
Parent
Commit
eb47155a12
12 changed files with 1328 additions and 662 deletions
  1. .gitignore (+2 -0)
  2. README.md (+1 -1)
  3. archive.py (+59 -52)
  4. archive_methods.py (+410 -0)
  5. config.py (+18 -129)
  6. fetch.py (+0 -299)
  7. index.py (+166 -17)
  8. links.py (+112 -0)
  9. parse.py (+84 -162)
  10. templates/index_row.html (+2 -2)
  11. templates/link_index.html (+258 -0)
  12. util.py (+216 -0)

+ 2 - 0
.gitignore

@@ -2,6 +2,8 @@
 pocket/
 bookmarks/
 pinboard/
+html/
+downloads/
 
 # Byte-compiled / optimized / DLL files
 __pycache__/

+ 1 - 1
README.md

@@ -174,7 +174,7 @@ My published archive as an example: [sweeting.me/pocket](https://home.sweeting.m
 
 If you don't like running random setup scripts off the internet (:+1:), you can follow these manual setup instructions.
 
-**1. Install dependencies:** `chromium >= 59`,` wget >= 1.16`, `python3 >= 3.5`  (google-chrome >= v59 also works well)
+**1. Install dependencies:** `chromium >= 59`, `wget >= 1.16`, `python3 >= 3.5`  (`google-chrome >= v59` works fine as well)
 
 If you already have Google Chrome installed, or wish to use that instead of Chromium, follow the [Google Chrome Instructions](#google-chrome-instructions).
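If in doubt, the minimum versions can be checked from a shell (the `chromium-browser` binary name is an assumption; it varies by platform):

    chromium-browser --version   # or: google-chrome --version
    wget --version | head -n1
    python3 --version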
 

+ 59 - 52
archive.py

@@ -3,74 +3,79 @@
 # Nick Sweeting 2017 | MIT License
 # https://github.com/pirate/bookmark-archiver
 
-import os
 import sys
 
 from datetime import datetime
 
+from links import validate_links
 from parse import parse_export
-from index import dump_index
-from fetch import dump_website
+from archive_methods import archive_links, _RESULTS_TOTALS
+from index import (
+    write_links_index,
+    write_link_index,
+    parse_json_links_index,
+    parse_json_link_index,
+)
 from config import (
     ARCHIVE_PERMISSIONS,
-    ARCHIVE_DIR,
+    HTML_FOLDER,
+    ARCHIVE_FOLDER,
     ANSI,
+    TIMEOUT,
+)
+from util import (
+    download_url,
     check_dependencies,
+    progress,
 )
 
 DESCRIPTION = 'Bookmark Archiver: Create a browsable html archive of a list of links.'
 __DOCUMENTATION__ = 'https://github.com/pirate/bookmark-archiver'
 
 
-def create_archive(export_file, service=None, resume=None):
+
+def update_archive(export_path, resume=None, append=True):
     """update or create index.html and download archive of all links"""
 
-    print('[*] [{}] Starting archive from {} export file.'.format(
+    start_ts = datetime.now().timestamp()
+
+    # parse and validate the export file
+    new_links = validate_links(parse_export(export_path))
+
+    # load existing links if archive folder is present
+    if append:
+        existing_links = parse_json_links_index(HTML_FOLDER)
+        links = validate_links(existing_links + new_links)
+    else:
+        existing_links = []
+        links = validate_links(new_links)
+
+    # count how many new links were added to the index
+    num_new_links = len(links) - len(existing_links)
+    print('[*] [{}] Adding {} new links from {} to index'.format(
         datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
-        export_file,
+        num_new_links,
+        export_path,
     ))
 
-    with open(export_file, 'r', encoding='utf-8') as f:
-        links, service = parse_export(f, service=service)
-
-    if resume:
-        try:
-            links = [
-                link
-                for link in links
-                if float(link['timestamp']) >= float(resume)
-            ]
-        except TypeError:
-            print('Resume value and all timestamp values must be valid numbers.')
-
-    if not links or not service:
-        print('[X] No links found in {}, is it a {} export file?'.format(export_file, service))
-        raise SystemExit(1)
-
-    if not os.path.exists(os.path.join(ARCHIVE_DIR, service)):
-        os.makedirs(os.path.join(ARCHIVE_DIR, service))
-
-    if not os.path.exists(os.path.join(ARCHIVE_DIR, service, 'archive')):
-        os.makedirs(os.path.join(ARCHIVE_DIR, service, 'archive'))
-
-    dump_index(links, service)
-    check_dependencies()
-    try:
-        for link in links:
-            dump_website(link, service)
-    except (KeyboardInterrupt, SystemExit, Exception) as e:
-        print('{red}[X] Archive creation stopped.{reset}'.format(**ANSI))
-        print('    Continue where you left off by running:')
-        print('       ./archive.py {} {} {}'.format(
-            export_file,
-            service,
-            link['timestamp'],
-        ))
-        if not isinstance(e, KeyboardInterrupt):
-            raise e
-        raise SystemExit(1)
-
-    print('{}[√] [{}] Archive update complete.{}'.format(ANSI['green'], datetime.now().strftime('%Y-%m-%d %H:%M:%S'), ANSI['reset']))
+    # write link index html & json
+    write_links_index(HTML_FOLDER, links)
+
+    # loop over links and archive them
+    archive_links(ARCHIVE_FOLDER, links, export_path, resume=resume)
+
+    # print timing information & summary
+    end_ts = datetime.now().timestamp()
+    seconds = round(end_ts - start_ts, 1)
+    duration = '{} min'.format(round(seconds / 60, 2)) if seconds > 60 else '{} sec'.format(seconds)
+    print('{}[√] [{}] Archive update complete ({}){}'.format(
+        ANSI['green'],
+        datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
+        duration,
+        ANSI['reset'],
+    ))
+    print('    - {} skipped'.format(_RESULTS_TOTALS['skipped']))
+    print('    - {} updates'.format(_RESULTS_TOTALS['succeeded']))
+    print('    - {} errors'.format(_RESULTS_TOTALS['failed']))
 
 
 if __name__ == '__main__':
@@ -85,8 +90,10 @@ if __name__ == '__main__':
         print("")
         raise SystemExit(0)
 
-    export_file = sys.argv[1]                                       # path to export file
-    export_type = sys.argv[2] if argc > 2 else None                 # select export_type for file format select
-    resume_from = sys.argv[3] if argc > 3 else None                 # timestamp to resume dowloading from
+    export_path = sys.argv[1]                        # path to export file
+    resume_from = sys.argv[2] if argc > 2 else None  # timestamp to resume downloading from
+
+    if any(export_path.startswith(s) for s in ('http://', 'https://', 'ftp://')):
+        export_path = download_url(export_path)
 
-    create_archive(export_file, service=export_type, resume=resume_from)
+    update_archive(export_path, resume=resume_from)
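With this refactor, archive.py takes just the export path (or URL) plus an optional resume timestamp, instead of the old export-type argument. A usage sketch (file name and timestamp are hypothetical):

    ./archive.py pocket_export.html                # archive a local export file
    ./archive.py https://example.com/export.html   # http(s)/ftp exports are fetched via download_url() first
    ./archive.py pocket_export.html 1503892020     # resume downloading from a given timestamp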

+ 410 - 0
archive_methods.py

@@ -0,0 +1,410 @@
+import os
+
+from functools import wraps
+from datetime import datetime
+from subprocess import run, PIPE, DEVNULL
+
+from index import html_appended_url, parse_json_link_index, write_link_index
+from links import links_after_timestamp
+from config import (
+    ARCHIVE_PERMISSIONS,
+    ARCHIVE_DIR,
+    CHROME_BINARY,
+    FETCH_WGET,
+    FETCH_WGET_REQUISITES,
+    FETCH_PDF,
+    FETCH_SCREENSHOT,
+    RESOLUTION,
+    SUBMIT_ARCHIVE_DOT_ORG,
+    FETCH_AUDIO,
+    FETCH_VIDEO,
+    FETCH_FAVICON,
+    WGET_USER_AGENT,
+    TIMEOUT,
+    ANSI,
+)
+from util import (
+    check_dependencies,
+    progress,
+    chmod_file,
+)
+
+_RESULTS_TOTALS = {
+    'skipped': 0,
+    'succeeded': 0,
+    'failed': 0,
+}
+
+def attach_result_to_link(method):
+    def decorator(fetch_func):
+        @wraps(fetch_func)
+        def timed_fetch_func(out_dir, link, overwrite=False, **kwargs):
+            # initialize methods and history json field on link
+            link['methods'] = link.get('methods') or {}
+            link['methods'][method] = link['methods'].get(method) or None
+            link['history'] = link.get('history') or {}
+            link['history'][method] = link['history'].get(method) or []
+
+            start_ts = datetime.now().timestamp()
+
+            # if a valid method output is already present, dont run the fetch function
+            if link['methods'][method] and not overwrite:
+                print('    √ Skipping: {}'.format(method))
+                result = None
+            else:
+                print('    - Fetching: {}'.format(method))
+                result = fetch_func(out_dir, link, **kwargs)
+
+            end_ts = datetime.now().timestamp()
+            duration = str(end_ts * 1000 - start_ts * 1000).split('.')[0]
+
+            # append a history item recording fail/success
+            history_entry = {
+                'timestamp': str(start_ts).split('.')[0],
+            }
+            if result is None:
+                history_entry['status'] = 'skipped'
+            elif isinstance(result.get('output'), Exception):
+                history_entry['status'] = 'failed'
+                history_entry['duration'] = duration
+                history_entry.update(result or {})
+                link['history'][method].append(history_entry)
+            else:
+                history_entry['status'] = 'succeeded'
+                history_entry['duration'] = duration
+                history_entry.update(result or {})
+                link['history'][method].append(history_entry)
+                link['methods'][method] = result['output']
+            
+            _RESULTS_TOTALS[history_entry['status']] += 1
+            
+            return link
+        return timed_fetch_func
+    return decorator
+
+
+@attach_result_to_link('wget')
+def fetch_wget(out_dir, link, requisites=FETCH_WGET_REQUISITES, timeout=TIMEOUT):
+    """download full site using wget"""
+
+    if os.path.exists(os.path.join(out_dir, link['domain'])):
+        return {'output': html_appended_url(link), 'status': 'skipped'}
+
+    CMD = [
+        *'wget --timestamping --adjust-extension --no-parent'.split(' '),                # Docs: https://www.gnu.org/software/wget/manual/wget.html
+        *(('--page-requisites', '--convert-links') if requisites else ()),
+        *(('--user-agent="{}"'.format(WGET_USER_AGENT),) if WGET_USER_AGENT else ()),
+        link['url'],
+    ]
+    end = progress(timeout, prefix='      ')
+    try:
+        result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout + 1)  # index.html
+        end()
+        output = html_appended_url(link)
+        if result.returncode > 0:
+            print('       got wget response code {}:'.format(result.returncode))
+            print('\n'.join('         ' + line for line in result.stderr.decode().rsplit('\n', 10)[-10:] if line.strip()))
+            # raise Exception('Failed to wget download')
+        chmod_file(link['domain'], cwd=out_dir)
+    except Exception as e:
+        end()
+        print('       Run to see full output:', 'cd {}; {}'.format(out_dir, ' '.join(CMD)))
+        print('       {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
+        output = e
+
+    return {
+        'cmd': CMD,
+        'output': output,
+    }
+
+
+@attach_result_to_link('pdf')
+def fetch_pdf(out_dir, link, timeout=TIMEOUT):
+    """print PDF of site to file using chrome --headless"""
+
+    if link['type'] in ('PDF', 'image'):
+        return {'output': html_appended_url(link)}
+    
+    if os.path.exists(os.path.join(out_dir, 'output.pdf')):
+        return {'output': 'output.pdf', 'status': 'skipped'}
+
+    CMD = [
+        CHROME_BINARY,
+        *'--headless --disable-gpu --print-to-pdf'.split(' '),
+        link['url']
+    ]
+    end = progress(timeout, prefix='      ')
+    try:
+        result = run(CMD, stdout=DEVNULL, stderr=PIPE, cwd=out_dir, timeout=timeout + 1)  # output.pdf
+        end()
+        if result.returncode:
+            print('     ', result.stderr.decode())
+            raise Exception('Failed to print PDF')
+        chmod_file('output.pdf', cwd=out_dir)
+        output = 'output.pdf'
+    except Exception as e:
+        end()
+        print('       Run to see full output:', 'cd {}; {}'.format(out_dir, ' '.join(CMD)))
+        print('       {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
+        output = e
+
+    return {
+        'cmd': CMD,
+        'output': output,
+    }
+
+
+@attach_result_to_link('screenshot')
+def fetch_screenshot(out_dir, link, timeout=TIMEOUT, resolution=RESOLUTION):
+    """take screenshot of site using chrome --headless"""
+
+    if link['type'] in ('PDF', 'image'):
+        return {'output': html_appended_url(link)}
+    
+    if os.path.exists(os.path.join(out_dir, 'screenshot.png')):
+        return {'output': 'screenshot.png', 'status': 'skipped'}
+
+    CMD = [
+        CHROME_BINARY,
+        *'--headless --disable-gpu --screenshot'.split(' '),
+        '--window-size={}'.format(resolution),
+        link['url']
+    ]
+    end = progress(timeout, prefix='      ')
+    try:
+        result = run(CMD, stdout=DEVNULL, stderr=PIPE, cwd=out_dir, timeout=timeout + 1)  # screenshot.png
+        end()
+        if result.returncode:
+            print('     ', result.stderr.decode())
+            raise Exception('Failed to take screenshot')
+        chmod_file('screenshot.png', cwd=out_dir)
+        output = 'screenshot.png'
+    except Exception as e:
+        end()
+        print('       Run to see full output:', 'cd {}; {}'.format(out_dir, ' '.join(CMD)))
+        print('       {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
+        output = e
+
+    return {
+        'cmd': CMD,
+        'output': output,
+    }
+    
+
+@attach_result_to_link('archive_org')
+def archive_dot_org(out_dir, link, timeout=TIMEOUT):
+    """submit site to archive.org for archiving via their service, save returned archive url"""
+
+    path = os.path.join(out_dir, 'archive.org.txt')
+    if os.path.exists(path):
+        archive_org_url = open(path, 'r').read().strip()
+        return {'output': archive_org_url, 'status': 'skipped'}
+
+    submit_url = 'https://web.archive.org/save/{}'.format(link['url'].split('?', 1)[0])
+
+    success = False
+    CMD = ['curl', '-I', submit_url]
+    end = progress(timeout, prefix='      ')
+    try:
+        result = run(CMD, stdout=PIPE, stderr=DEVNULL, cwd=out_dir, timeout=timeout + 1)  # archive.org.txt
+        end()
+
+        # Parse archive.org response headers
+        headers = result.stdout.splitlines()
+        content_location = [h for h in headers if b'Content-Location: ' in h]
+        errors = [h for h in headers if b'X-Archive-Wayback-Runtime-Error: ' in h]
+
+        if content_location:
+            archive_path = content_location[0].split(b'Content-Location: ', 1)[-1].decode('utf-8')
+            saved_url = 'https://web.archive.org{}'.format(archive_path)
+            success = True
+
+        elif len(errors) == 1 and b'RobotAccessControlException' in errors[0]:
+            output = submit_url
+            # raise Exception('Archive.org denied by {}/robots.txt'.format(link['domain']))
+        elif errors:
+            raise Exception(', '.join(e.decode() for e in errors))
+        else:
+            raise Exception('Failed to find "Content-Location" URL header in Archive.org response.')
+    except Exception as e:
+        end()
+        print('       Visit url to see output:', ' '.join(CMD))
+        print('       {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
+        output = e
+
+    if success:
+        with open(os.path.join(out_dir, 'archive.org.txt'), 'w', encoding='utf-8') as f:
+            f.write(saved_url)
+        chmod_file('archive.org.txt', cwd=out_dir)
+        output = saved_url
+
+    return {
+        'cmd': CMD,
+        'output': output,
+    }
+
+@attach_result_to_link('favicon')
+def fetch_favicon(out_dir, link, timeout=TIMEOUT):
+    """download site favicon from google's favicon api"""
+
+    if os.path.exists(os.path.join(out_dir, 'favicon.ico')):
+        return {'output': 'favicon.ico', 'status': 'skipped'}
+
+    CMD = ['curl', 'https://www.google.com/s2/favicons?domain={domain}'.format(**link)]
+    fout = open('{}/favicon.ico'.format(out_dir), 'w')
+    end = progress(timeout, prefix='      ')
+    try:
+        run(CMD, stdout=fout, stderr=DEVNULL, cwd=out_dir, timeout=timeout + 1)  # favicon.ico
+        fout.close()
+        end()
+        chmod_file('favicon.ico', cwd=out_dir)
+        output = 'favicon.ico'
+    except Exception as e:
+        fout.close()
+        end()
+        print('       Run to see full output:', ' '.join(CMD))
+        print('       {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
+        output = e
+
+    return {
+        'cmd': CMD,
+        'output': output,
+    }
+
+# @attach_result_to_link('audio')
+# def fetch_audio(out_dir, link, timeout=TIMEOUT):
+#     """Download audio rip using youtube-dl"""
+
+#     if link['type'] not in ('soundcloud',)\
+#        and 'audio' not in link['tags']:
+#         return
+
+#     path = os.path.join(out_dir, 'audio')
+
+#     if not os.path.exists(path) or overwrite:
+#         print('    - Downloading audio')
+#         CMD = [
+#             "youtube-dl -x --audio-format mp3 --audio-quality 0 -o '%(title)s.%(ext)s'",
+#             link['url'],
+#         ]
+#         end = progress(timeout, prefix='      ')
+#         try:
+#             result = run(CMD, stdout=DEVNULL, stderr=DEVNULL, cwd=out_dir, timeout=timeout + 1)  # audio/audio.mp3
+#             end()
+#             if result.returncode:
+#                 print('     ', result.stderr.decode())
+#                 raise Exception('Failed to download audio')
+#             chmod_file('audio.mp3', cwd=out_dir)
+#             return 'audio.mp3'
+#         except Exception as e:
+#             end()
+#             print('       Run to see full output:', 'cd {}; {}'.format(out_dir, ' '.join(CMD)))
+#             print('       {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
+#             raise
+#     else:
+#         print('    √ Skipping audio download')
+
+# @attach_result_to_link('video')
+# def fetch_video(out_dir, link, timeout=TIMEOUT):
+#     """Download video rip using youtube-dl"""
+
+#     if link['type'] not in ('youtube', 'youku', 'vimeo')\
+#        and 'video' not in link['tags']:
+#         return
+
+#     path = os.path.join(out_dir, 'video')
+
+#     if not os.path.exists(path) or overwrite:
+#         print('    - Downloading video')
+#         CMD = [
+#             "youtube-dl -x --video-format mp4 --audio-quality 0 -o '%(title)s.%(ext)s'",
+#             link['url'],
+#         ]
+#         end = progress(timeout, prefix='      ')
+#         try:
+#             result = run(CMD, stdout=DEVNULL, stderr=DEVNULL, cwd=out_dir, timeout=timeout + 1)  # video/movie.mp4
+#             end()
+#             if result.returncode:
+#                 print('     ', result.stderr.decode())
+#                 raise Exception('Failed to download video')
+#             chmod_file('video.mp4', cwd=out_dir)
+#             return 'video.mp4'
+#         except Exception as e:
+#             end()
+#             print('       Run to see full output:', 'cd {}; {}'.format(out_dir, ' '.join(CMD)))
+#             print('       {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
+#             raise
+#     else:
+#         print('    √ Skipping video download')
+
+
+def archive_links(out_dir, links, export_path, resume=None):
+    check_dependencies()
+
+    to_archive = list(links_after_timestamp(links, resume))
+    try:
+        for idx, link in enumerate(to_archive):
+            link_dir = os.path.join(out_dir, link['timestamp'])
+            archive_link(link_dir, link)
+    
+    except (KeyboardInterrupt, SystemExit, Exception) as e:
+        print('{red}[X] Archive update stopped on #{idx} out of {total} links{reset}'.format(
+            **ANSI,
+            idx=idx,
+            total=len(to_archive),
+        ))
+        print('    Continue where you left off by running:')
+        print('       ./archive.py {} {}'.format(
+            export_path,
+            link['timestamp'],
+        ))
+        if not isinstance(e, KeyboardInterrupt):
+            raise e
+        raise SystemExit(1)
+
+
+def archive_link(out_dir, link, overwrite=False, permissions=ARCHIVE_PERMISSIONS):
+    """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp"""
+
+    link = {**parse_json_link_index(out_dir), **link}
+    log_link_archive(out_dir, link)
+
+    if FETCH_WGET:
+        link = fetch_wget(out_dir, link, overwrite=overwrite)
+
+    if FETCH_PDF:
+        link = fetch_pdf(out_dir, link, overwrite=overwrite)
+
+    if FETCH_SCREENSHOT:
+        link = fetch_screenshot(out_dir, link, overwrite=overwrite)
+
+    if SUBMIT_ARCHIVE_DOT_ORG:
+        link = archive_dot_org(out_dir, link, overwrite=overwrite)
+
+    # if FETCH_AUDIO:
+    #     link = fetch_audio(out_dir, link, overwrite=overwrite)
+
+    # if FETCH_VIDEO:
+    #     link = fetch_video(out_dir, link, overwrite=overwrite)
+
+    if FETCH_FAVICON:
+        link = fetch_favicon(out_dir, link, overwrite=overwrite)
+
+    write_link_index(out_dir, link)
+    
+    return link
+
+def log_link_archive(out_dir, link):
+    update_existing = os.path.exists(out_dir)
+    if not update_existing:
+        os.makedirs(out_dir)
+        run(['chmod', ARCHIVE_PERMISSIONS, out_dir], timeout=5)
+    
+    print('[{symbol_color}{symbol}{reset}] [{timestamp}] "{title}": {blue}{base_url}{reset}'.format(
+        symbol='*' if update_existing else '+',
+        symbol_color=ANSI['black' if update_existing else 'green'],
+        **link,
+        **ANSI,
+    ))
+    if link['type']:
+        print('    i Type: {}'.format(link['type']))
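Every fetch method above follows the contract defined by attach_result_to_link: return a dict with 'cmd' and 'output', where an Exception as output marks a failure. A minimal sketch of a hypothetical method plugging into the decorator (the method name and return values are illustrative only, not part of this commit):

    from archive_methods import attach_result_to_link
    from config import TIMEOUT

    @attach_result_to_link('example')
    def fetch_example(out_dir, link, timeout=TIMEOUT):
        # a real method would shell out here; returning a filename signals success
        return {'cmd': ['true'], 'output': 'example.txt'}

    link = fetch_example('/tmp', {'url': 'https://example.com'})
    # link['methods']['example'] == 'example.txt'
    # link['history']['example'][-1]['status'] == 'succeeded'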

+ 18 - 129
config.py

@@ -1,10 +1,8 @@
 import os
 import sys
-import time
 import shutil
 
-from subprocess import run, PIPE, DEVNULL
-from multiprocessing import Process
+from subprocess import run, PIPE
 
 # os.getenv('VARIABLE', 'DEFAULT') gets the value of environment
 # variable "VARIABLE" and if it is not set, sets it to 'DEFAULT'
@@ -12,8 +10,10 @@ from multiprocessing import Process
 # for boolean values, check to see if the string is 'true', and
 # if so, the python variable will be True
 
-IS_TTY = sys.stdout.isatty()
+# *******************************************************************************
+# *** TO SET YOUR PREFERENCES, EDIT THE VALUES HERE, or use the 'env' command ***
 
+IS_TTY = sys.stdout.isatty()
 USE_COLOR =              os.getenv('USE_COLOR',              str(IS_TTY)        ).lower() == 'true'
 SHOW_PROGRESS =          os.getenv('SHOW_PROGRESS',          str(IS_TTY)        ).lower() == 'true'
 FETCH_WGET =             os.getenv('FETCH_WGET',             'True'             ).lower() == 'true'
@@ -31,9 +31,12 @@ CHROME_BINARY =          os.getenv('CHROME_BINARY',          'chromium-browser'
 WGET_BINARY =            os.getenv('WGET_BINARY',            'wget'             )
 WGET_USER_AGENT =        os.getenv('WGET_USER_AGENT',         None)
 TIMEOUT =                int(os.getenv('TIMEOUT',            '60'))
+LINK_INDEX_TEMPLATE =    os.getenv('LINK_INDEX_TEMPLATE',    'templates/link_index.html')
 INDEX_TEMPLATE =         os.getenv('INDEX_TEMPLATE',         'templates/index.html')
 INDEX_ROW_TEMPLATE =     os.getenv('INDEX_ROW_TEMPLATE',     'templates/index_row.html')
 
+# *******************************************************************************
+
 TERM_WIDTH = shutil.get_terminal_size((100, 10)).columns
 ANSI = {
     'reset': '\033[00;00m',
@@ -50,6 +53,17 @@ if not USE_COLOR:
     # dont show colors if USE_COLOR is False
     ANSI = {k: '' for k in ANSI.keys()}
 
+
+ROOT_FOLDER = os.path.dirname(os.path.abspath(__file__))
+HTML_FOLDER = os.path.join(ARCHIVE_DIR, 'html')
+ARCHIVE_FOLDER = os.path.join(HTML_FOLDER, 'archive')
+try:
+    GIT_SHA = run(["git", "rev-list", "-1", "HEAD", "./"], stdout=PIPE, cwd=ROOT_FOLDER).stdout.strip().decode()
+except Exception:
+    GIT_SHA = None
+    print('[!] Warning: you need git installed for some archiving features to save correct version numbers!')
+
+
 if sys.stdout.encoding.upper() != 'UTF-8':
     print('[X] Your system is running python3 scripts with a bad locale setting: {} (it should be UTF-8).'.format(sys.stdout.encoding))
     print('    To fix it, add the line "export PYTHONIOENCODING=utf8" to your ~/.bashrc file (without quotes)')
@@ -59,128 +73,3 @@ if sys.stdout.encoding.upper() != 'UTF-8':
     print('')
     print('    Alternatively, run this script with:')
     print('        env PYTHONIOENCODING=utf8 ./archive.py export.html')
-
-### Util Functions
-
-def check_dependencies():
-    """Check that all necessary dependencies are installed, and have valid versions"""
-
-    print('[*] Checking Dependencies:')
-
-    python_vers = float('{}.{}'.format(sys.version_info.major, sys.version_info.minor))
-    if python_vers < 3.5:
-        print('{}[X] Python version is not new enough: {} (>3.5 is required){}'.format(ANSI['red'], python_vers, ANSI['reset']))
-        print('    See https://github.com/pirate/bookmark-archiver#troubleshooting for help upgrading your Python installation.')
-        raise SystemExit(1)
-
-    if FETCH_PDF or FETCH_SCREENSHOT:
-        if run(['which', CHROME_BINARY]).returncode:
-            print('{}[X] Missing dependency: {}{}'.format(ANSI['red'], CHROME_BINARY, ANSI['reset']))
-            print('    Run ./setup.sh, then confirm it was installed with: {} --version'.format(CHROME_BINARY))
-            print('    See https://github.com/pirate/bookmark-archiver for help.')
-            raise SystemExit(1)
-
-        # parse chrome --version e.g. Google Chrome 61.0.3114.0 canary / Chromium 59.0.3029.110 built on Ubuntu, running on Ubuntu 16.04
-        try:
-            result = run([CHROME_BINARY, '--version'], stdout=PIPE)
-            version = result.stdout.decode('utf-8').replace('Google Chrome ', '').replace('Chromium ', '').split(' ', 1)[0].split('.', 1)[0]  # TODO: regex might be better
-            if int(version) < 59:
-                print('{red}[X] Chrome version must be 59 or greater for headless PDF and screenshot saving{reset}'.format(**ANSI))
-                print('    See https://github.com/pirate/bookmark-archiver for help.')
-                raise SystemExit(1)
-        except (TypeError, OSError):
-            print('{red}[X] Failed to parse Chrome version, is it installed properly?{reset}'.format(**ANSI))
-            print('    Run ./setup.sh, then confirm it was installed with: {} --version'.format(CHROME_BINARY))
-            print('    See https://github.com/pirate/bookmark-archiver for help.')
-            raise SystemExit(1)
-
-    if FETCH_WGET:
-        if run(['which', 'wget']).returncode or run(['wget', '--version'], stdout=DEVNULL).returncode:
-            print('{red}[X] Missing dependency: wget{reset}'.format(**ANSI))
-            print('    Run ./setup.sh, then confirm it was installed with: {} --version'.format('wget'))
-            print('    See https://github.com/pirate/bookmark-archiver for help.')
-            raise SystemExit(1)
-
-    if FETCH_FAVICON or SUBMIT_ARCHIVE_DOT_ORG:
-        if run(['which', 'curl']).returncode or run(['curl', '--version'], stdout=DEVNULL).returncode:
-            print('{red}[X] Missing dependency: curl{reset}'.format(**ANSI))
-            print('    Run ./setup.sh, then confirm it was installed with: {} --version'.format('curl'))
-            print('    See https://github.com/pirate/bookmark-archiver for help.')
-            raise SystemExit(1)
-
-    if FETCH_AUDIO or FETCH_VIDEO:
-        if run(['which', 'youtube-dl']).returncode or run(['youtube-dl', '--version'], stdout=DEVNULL).returncode:
-            print('{red}[X] Missing dependency: youtube-dl{reset}'.format(**ANSI))
-            print('    Run ./setup.sh, then confirm it was installed with: {} --version'.format('youtube-dl'))
-            print('    See https://github.com/pirate/bookmark-archiver for help.')
-            raise SystemExit(1)
-
-
-def chmod_file(path, cwd='.', permissions=ARCHIVE_PERMISSIONS, timeout=30):
-    """chmod -R <permissions> <cwd>/<path>"""
-
-    if not os.path.exists(os.path.join(cwd, path)):
-        raise Exception('Failed to chmod: {} does not exist (did the previous step fail?)'.format(path))
-
-    chmod_result = run(['chmod', '-R', permissions, path], cwd=cwd, stdout=DEVNULL, stderr=PIPE, timeout=timeout)
-    if chmod_result.returncode == 1:
-        print('     ', chmod_result.stderr.decode())
-        raise Exception('Failed to chmod {}/{}'.format(cwd, path))
-
-
-def progress(seconds=TIMEOUT, prefix=''):
-    """Show a (subprocess-controlled) progress bar with a <seconds> timeout,
-       returns end() function to instantly finish the progress
-    """
-
-    if not SHOW_PROGRESS:
-        return lambda: None
-
-    chunk = '█' if sys.stdout.encoding == 'UTF-8' else '#'
-    chunks = TERM_WIDTH - len(prefix) - 20  # number of progress chunks to show (aka max bar width)
-
-    def progress_bar(seconds=seconds, prefix=prefix):
-        """show timer in the form of progress bar, with percentage and seconds remaining"""
-        try:
-            for s in range(seconds * chunks):
-                progress = s / chunks / seconds * 100
-                bar_width = round(progress/(100/chunks))
-
-                # ████████████████████           0.9% (1/60sec)
-                sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)'.format(
-                    prefix,
-                    ANSI['green'],
-                    (chunk * bar_width).ljust(chunks),
-                    ANSI['reset'],
-                    round(progress, 1),
-                    round(s/chunks),
-                    seconds,
-                ))
-                sys.stdout.flush()
-                time.sleep(1 / chunks)
-
-            # ██████████████████████████████████ 100.0% (60/60sec)
-            sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)\n'.format(
-                prefix,
-                ANSI['red'],
-                chunk * chunks,
-                ANSI['reset'],
-                100.0,
-                seconds,
-                seconds,
-            ))
-            sys.stdout.flush()
-        except KeyboardInterrupt:
-            print()
-            pass
-
-    p = Process(target=progress_bar)
-    p.start()
-
-    def end():
-        """immediately finish progress and clear the progressbar line"""
-        p.terminate()
-        sys.stdout.write('\r{}{}\r'.format((' ' * TERM_WIDTH), ANSI['reset']))  # clear whole terminal line
-        sys.stdout.flush()
-
-    return end
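All of the config values above can be overridden per-run with environment variables, as the comment at the top of the file describes. For example (values hypothetical):

    env FETCH_WGET=False TIMEOUT=120 CHROME_BINARY=google-chrome ./archive.py export.html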

+ 0 - 299
fetch.py

@@ -1,299 +0,0 @@
-import os
-import json
-
-from datetime import datetime
-from subprocess import run, PIPE, DEVNULL
-
-from parse import derived_link_info
-from config import (
-    ARCHIVE_PERMISSIONS,
-    ARCHIVE_DIR,
-    CHROME_BINARY,
-    FETCH_WGET,
-    FETCH_WGET_REQUISITES,
-    FETCH_PDF,
-    FETCH_SCREENSHOT,
-    RESOLUTION,
-    SUBMIT_ARCHIVE_DOT_ORG,
-    FETCH_AUDIO,
-    FETCH_VIDEO,
-    FETCH_FAVICON,
-    WGET_USER_AGENT,
-    TIMEOUT,
-    ANSI,
-    progress,
-    chmod_file,
-)
-
-
-def fetch_wget(out_dir, link, overwrite=False, requisites=True, timeout=TIMEOUT):
-    """download full site using wget"""
-
-    if not os.path.exists(os.path.join(out_dir, link['domain'])) or overwrite:
-        print('    - Downloading full site')
-        CMD = [
-            *'wget --timestamping --adjust-extension --no-parent'.split(' '),                # Docs: https://www.gnu.org/software/wget/manual/wget.html
-            *(('--page-requisites', '--convert-links') if requisites else ()),
-            *(('--user-agent="{}"'.format(WGET_USER_AGENT),) if WGET_USER_AGENT else ()),
-            link['url'],
-        ]
-        end = progress(timeout, prefix='      ')
-        try:
-            result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout + 1)  # index.html
-            end()
-            if result.returncode > 0:
-                print('       wget output:')
-                print('\n'.join('         ' + line for line in result.stderr.decode().rsplit('\n', 10)[-10:] if line.strip()))
-                raise Exception('Failed to wget download')
-            chmod_file(link['domain'], cwd=out_dir)
-        except Exception as e:
-            end()
-            print('       Run to see full output:', 'cd {}; {}'.format(out_dir, ' '.join(CMD)))
-            print('       {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
-    else:
-        print('    √ Skipping site download')
-
-def fetch_pdf(out_dir, link, overwrite=False, timeout=TIMEOUT, chrome_binary=CHROME_BINARY):
-    """print PDF of site to file using chrome --headless"""
-
-    path = os.path.join(out_dir, 'output.pdf')
-
-    if (not os.path.exists(path) or overwrite) and link['type'] not in ('PDF', 'image'):
-        print('    - Printing PDF')
-        CMD = [
-            chrome_binary,
-            *'--headless --disable-gpu --print-to-pdf'.split(' '),
-            link['url']
-        ]
-        end = progress(timeout, prefix='      ')
-        try:
-            result = run(CMD, stdout=DEVNULL, stderr=PIPE, cwd=out_dir, timeout=timeout + 1)  # output.pdf
-            end()
-            if result.returncode:
-                print('     ', result.stderr.decode())
-                raise Exception('Failed to print PDF')
-            chmod_file('output.pdf', cwd=out_dir)
-        except Exception as e:
-            end()
-            print('       Run to see full output:', 'cd {}; {}'.format(out_dir, ' '.join(CMD)))
-            print('       {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
-    else:
-        print('    √ Skipping PDF print')
-
-def fetch_screenshot(out_dir, link, overwrite=False, timeout=TIMEOUT, chrome_binary=CHROME_BINARY, resolution=RESOLUTION):
-    """take screenshot of site using chrome --headless"""
-
-    path = os.path.join(out_dir, 'screenshot.png')
-
-    if (not os.path.exists(path) or overwrite) and link['type'] not in ('PDF', 'image'):
-        print('    - Snapping Screenshot')
-        CMD = [
-            chrome_binary,
-            *'--headless --disable-gpu --screenshot'.split(' '),
-            '--window-size={}'.format(resolution),
-            link['url']
-        ]
-        end = progress(timeout, prefix='      ')
-        try:
-            result = run(CMD, stdout=DEVNULL, stderr=DEVNULL, cwd=out_dir, timeout=timeout + 1)  # sreenshot.png
-            end()
-            if result.returncode:
-                print('     ', result.stderr.decode())
-                raise Exception('Failed to take screenshot')
-            chmod_file('screenshot.png', cwd=out_dir)
-        except Exception as e:
-            end()
-            print('       Run to see full output:', 'cd {}; {}'.format(out_dir, ' '.join(CMD)))
-            print('       {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
-    else:
-        print('    √ Skipping screenshot')
-
-def archive_dot_org(out_dir, link, overwrite=False, timeout=TIMEOUT):
-    """submit site to archive.org for archiving via their service, save returned archive url"""
-
-    path = os.path.join(out_dir, 'archive.org.txt')
-
-    if not os.path.exists(path) or overwrite:
-        print('    - Submitting to archive.org')
-        submit_url = 'https://web.archive.org/save/{}'.format(link['url'].split('?', 1)[0])
-
-        success = False
-        CMD = ['curl', '-I', submit_url]
-        end = progress(timeout, prefix='      ')
-        try:
-            result = run(CMD, stdout=PIPE, stderr=DEVNULL, cwd=out_dir, timeout=timeout + 1)  # archive.org.txt
-            end()
-
-            # Parse archive.org response headers
-            headers = result.stdout.splitlines()
-            content_location = [h for h in headers if b'Content-Location: ' in h]
-            errors = [h for h in headers if b'X-Archive-Wayback-Runtime-Error: ' in h]
-
-            if content_location:
-                archive_path = content_location[0].split(b'Content-Location: ', 1)[-1].decode('utf-8')
-                saved_url = 'https://web.archive.org{}'.format(archive_path)
-                success = True
-
-            elif len(errors) == 1 and b'RobotAccessControlException' in errors[0]:
-                raise Exception('Archive.org denied by {}/robots.txt'.format(link['domain']))
-            elif errors:
-                raise Exception(', '.join(e.decode() for e in errors))
-            else:
-                raise Exception('Failed to find "Content-Location" URL header in Archive.org response.')
-        except Exception as e:
-            end()
-            print('       Visit url to see output:', ' '.join(CMD))
-            print('       {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
-
-        if success:
-            with open(os.path.join(out_dir, 'archive.org.txt'), 'w', encoding='utf-8') as f:
-                f.write(saved_url)
-            chmod_file('archive.org.txt', cwd=out_dir)
-
-    else:
-        print('    √ Skipping archive.org')
-
-def fetch_favicon(out_dir, link, overwrite=False, timeout=TIMEOUT):
-    """download site favicon from google's favicon api"""
-
-    path = os.path.join(out_dir, 'favicon.ico')
-
-    if not os.path.exists(path) or overwrite:
-        print('    - Fetching Favicon')
-        CMD = ['curl', 'https://www.google.com/s2/favicons?domain={domain}'.format(**link)]
-        fout = open('{}/favicon.ico'.format(out_dir), 'w')
-        end = progress(timeout, prefix='      ')
-        try:
-            run(CMD, stdout=fout, stderr=DEVNULL, cwd=out_dir, timeout=timeout + 1)  # favicon.ico
-            end()
-            chmod_file('favicon.ico', cwd=out_dir)
-        except Exception as e:
-            end()
-            print('       Run to see full output:', ' '.join(CMD))
-            print('       {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
-        fout.close()
-    else:
-        print('    √ Skipping favicon')
-
-def fetch_audio(out_dir, link, overwrite=False, timeout=TIMEOUT):
-    """Download audio rip using youtube-dl"""
-
-    if link['type'] not in ('soundcloud',):
-        return
-
-    path = os.path.join(out_dir, 'audio')
-
-    if not os.path.exists(path) or overwrite:
-        print('    - Downloading audio')
-        CMD = [
-            "youtube-dl -x --audio-format mp3 --audio-quality 0 -o '%(title)s.%(ext)s'",
-            link['url'],
-        ]
-        end = progress(timeout, prefix='      ')
-        try:
-            result = run(CMD, stdout=DEVNULL, stderr=DEVNULL, cwd=out_dir, timeout=timeout + 1)  # audio/audio.mp3
-            end()
-            if result.returncode:
-                print('     ', result.stderr.decode())
-                raise Exception('Failed to download audio')
-            chmod_file('audio', cwd=out_dir)
-        except Exception as e:
-            end()
-            print('       Run to see full output:', 'cd {}; {}'.format(out_dir, ' '.join(CMD)))
-            print('       {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
-    else:
-        print('    √ Skipping audio download')
-
-def fetch_video(out_dir, link, overwrite=False, timeout=TIMEOUT):
-    """Download video rip using youtube-dl"""
-
-    if link['type'] not in ('youtube', 'youku', 'vimeo'):
-        return
-
-    path = os.path.join(out_dir, 'video')
-
-    if not os.path.exists(path) or overwrite:
-        print('    - Downloading video')
-        CMD = [
-            "youtube-dl -x --video-format mp4 --audio-quality 0 -o '%(title)s.%(ext)s'",
-            link['url'],
-        ]
-        end = progress(timeout, prefix='      ')
-        try:
-            result = run(CMD, stdout=DEVNULL, stderr=DEVNULL, cwd=out_dir, timeout=timeout + 1)  # video/movie.mp4
-            end()
-            if result.returncode:
-                print('     ', result.stderr.decode())
-                raise Exception('Failed to download video')
-            chmod_file('video', cwd=out_dir)
-        except Exception as e:
-            end()
-            print('       Run to see full output:', 'cd {}; {}'.format(out_dir, ' '.join(CMD)))
-            print('       {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
-    else:
-        print('    √ Skipping video download')
-
-def dump_link_info(out_dir, link, overwrite=False):
-    """write a json file with some info about the link"""
-
-    info_file_path = os.path.join(out_dir, 'link.json')
-
-    if (not os.path.exists(info_file_path) or overwrite):
-        print('    - Creating link info file')
-        try:
-            link_json = derived_link_info(link)
-            link_json['archived_timstamp'] = str(datetime.now().timestamp()).split('.')[0]
-
-            with open(info_file_path, 'w', encoding='utf-8') as link_file:
-                link_file.write(json.dumps(
-                    link_json,
-                    indent=4,
-                    default=str) + '\n')
-
-            chmod_file('link.json', cwd=out_dir)
-        except Exception as e:
-            print('       {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
-    else:
-        print('    √ Skipping link info file')
-
-
-def dump_website(link, service, overwrite=False, permissions=ARCHIVE_PERMISSIONS):
-    """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp"""
-
-    print('[{green}+{reset}] [{timestamp} ({time})] "{title}": {blue}{base_url}{reset}'.format(**link, **ANSI))
-
-    out_dir = os.path.join(ARCHIVE_DIR, service, 'archive', link['timestamp'])
-    if not os.path.exists(out_dir):
-        os.makedirs(out_dir)
-
-    run(['chmod', permissions, out_dir], timeout=5)
-
-    if link['type']:
-        print('    i Type: {}'.format(link['type']))
-
-    if not (link['url'].startswith('http') or link['url'].startswith('ftp')):
-        print('    {}X Skipping: invalid link.{}', ANSI['red'], ANSI['yellow'])
-        return
-
-    if FETCH_WGET:
-        fetch_wget(out_dir, link, overwrite=overwrite, requisites=FETCH_WGET_REQUISITES)
-
-    if FETCH_PDF:
-        fetch_pdf(out_dir, link, overwrite=overwrite, chrome_binary=CHROME_BINARY)
-
-    if FETCH_SCREENSHOT:
-        fetch_screenshot(out_dir, link, overwrite=overwrite, chrome_binary=CHROME_BINARY, resolution=RESOLUTION)
-
-    if SUBMIT_ARCHIVE_DOT_ORG:
-        archive_dot_org(out_dir, link, overwrite=overwrite)
-
-    if FETCH_AUDIO:
-        fetch_audio(out_dir, link, overwrite=overwrite)
-
-    if FETCH_VIDEO:
-        fetch_video(out_dir, link, overwrite=overwrite)
-
-    if FETCH_FAVICON:
-        fetch_favicon(out_dir, link, overwrite=overwrite)
-
-    dump_link_info(out_dir, link, overwrite=overwrite)

+ 166 - 17
index.py

@@ -1,47 +1,196 @@
 import os
+import re
+import json
+
 from datetime import datetime
 from string import Template
 
-from parse import derived_link_info
 from config import (
     INDEX_TEMPLATE,
     INDEX_ROW_TEMPLATE,
+    LINK_INDEX_TEMPLATE,
     ARCHIVE_PERMISSIONS,
     ARCHIVE_DIR,
     ANSI,
-    chmod_file,
+    GIT_SHA,
 )
+from util import chmod_file
+
+
+### Homepage index for all the links
+
+def parse_json_links_index(out_dir):
+    """load the index in a given directory and merge it with the given link"""
+    index_path = os.path.join(out_dir, 'index.json')
+    if os.path.exists(index_path):
+        with open(index_path, 'r', encoding='utf-8') as f:
+            return json.load(f)['links']
+
+    return []
+
+def write_links_index(out_dir, links):
+    """create index.html file for a given list of links"""
+
+    if not os.path.exists(out_dir):
+        os.makedirs(out_dir)
+
+    print('[i] [{}] Updating {}{}{} links in archive index...'.format(
+        datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
+        ANSI['green'],
+        len(links),
+        ANSI['reset'],
+    ))
+    
+    write_json_links_index(out_dir, links)
+    write_html_links_index(out_dir, links)
 
+    chmod_file(out_dir, permissions=ARCHIVE_PERMISSIONS)
 
-def dump_index(links, service):
-    """create index.html file for a given list of links and service"""
+def write_json_links_index(out_dir, links):
+    """write the json link index to a given path"""
+
+    path = os.path.join(out_dir, 'index.json')
+
+    index_json = {
+        'info': 'Bookmark Archiver Index',
+        'help': 'https://github.com/pirate/bookmark-archiver',
+        'version': GIT_SHA,
+        'num_links': len(links),
+        'updated': str(datetime.now().timestamp()),
+        'links': links,
+    }
+
+    with open(path, 'w', encoding='utf-8') as f:
+        json.dump(index_json, f, indent=4, default=str)
+
+    chmod_file(path)
+
+def write_html_links_index(out_dir, links):
+    """write the html link index to a given path"""
+
+    path = os.path.join(out_dir, 'index.html')
 
     with open(INDEX_TEMPLATE, 'r', encoding='utf-8') as f:
         index_html = f.read()
 
-    # TODO: refactor this out into index_template.html
     with open(INDEX_ROW_TEMPLATE, 'r', encoding='utf-8') as f:
-        link_html = f.read()
+        link_row_html = f.read()
 
-    article_rows = '\n'.join(
-        Template(link_html).substitute(**derived_link_info(link)) for link in links
+    link_rows = '\n'.join(
+        Template(link_row_html).substitute(**derived_link_info(link))
+        for link in links
     )
 
     template_vars = {
         'num_links': len(links),
         'date_updated': datetime.now().strftime('%Y-%m-%d'),
         'time_updated': datetime.now().strftime('%Y-%m-%d %H:%M'),
-        'rows': article_rows,
+        'rows': link_rows,
     }
 
-    with open(os.path.join(ARCHIVE_DIR, service, 'index.html'), 'w', encoding='utf-8') as f:
+    with open(path, 'w', encoding='utf-8') as f:
         f.write(Template(index_html).substitute(**template_vars))
 
-    chmod_file(os.path.join(ARCHIVE_DIR, service), permissions=ARCHIVE_PERMISSIONS)
 
-    print('[+] [{}] Created archive index with {}{}{} links.'.format(
-        datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
-        ANSI['green'],
-        len(links),
-        ANSI['reset'],
-    ))
+### Individual link index
+
+def parse_json_link_index(out_dir):
+    """load the index in a given directory and merge it with the given link"""
+    existing_index = os.path.join(out_dir, 'index.json')
+    if os.path.exists(existing_index):
+        with open(existing_index, 'r', encoding='utf-8') as f:
+            return json.load(f)
+    return {}
+
+def write_link_index(out_dir, link):
+    link['updated'] = str(datetime.now().timestamp())
+    write_json_link_index(out_dir, link)
+    write_html_link_index(out_dir, link)
+
+def write_json_link_index(out_dir, link):
+    """write a json file with some info about the link"""
+    
+    path = os.path.join(out_dir, 'index.json')
+
+    with open(path, 'w', encoding='utf-8') as f:
+        json.dump(link, f, indent=4, default=str)
+
+    chmod_file(path)
+
+def write_html_link_index(out_dir, link):
+    with open(LINK_INDEX_TEMPLATE, 'r', encoding='utf-8') as f:
+        link_html = f.read()
+
+    path = os.path.join(out_dir, 'index.html')
+
+    with open(path, 'w', encoding='utf-8') as f:
+        f.write(Template(link_html).substitute({
+            **link,
+            **link['methods'],
+            'type': link['type'] or 'website',
+            'tags': link['tags'] or '',
+            'bookmarked': datetime.fromtimestamp(float(link['timestamp'])).strftime('%Y-%m-%d %H:%M'),
+            'updated': datetime.fromtimestamp(float(link['updated'])).strftime('%Y-%m-%d %H:%M'),
+            'archive_org': link['methods']['archive_org'] or 'https://web.archive.org/save/{}'.format(link['url']),
+            'wget': link['methods']['wget'] or link['domain'],
+        }))
+
+    chmod_file(path)
+
+
+
+def html_appended_url(link):
+    """calculate the path to the wgetted .html file, since wget may
+    adjust some paths to be different than the base_url path.
+
+    See docs on wget --adjust-extension."""
+
+    if link['type'] in ('PDF', 'image'):
+        return link['base_url']
+
+    split_url = link['url'].split('#', 1)
+    query = ('%3F' + link['url'].split('?', 1)[-1]) if '?' in link['url'] else ''
+
+    if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", split_url[0], re.I | re.M):
+        # already ends in .html
+        return link['base_url']
+    else:
+        # .html needs to be appended
+        without_scheme = split_url[0].split('://', 1)[-1].split('?', 1)[0]
+        if without_scheme.endswith('/'):
+            if query:
+                return '#'.join([without_scheme + 'index.html' + query + '.html', *split_url[1:]])
+            return '#'.join([without_scheme + 'index.html', *split_url[1:]])
+        else:
+            if query:
+                return '#'.join([without_scheme + '/index.html' + query + '.html', *split_url[1:]])
+            elif '/' in without_scheme:
+                return '#'.join([without_scheme + '.html', *split_url[1:]])
+            return link['base_url'] + '/index.html'
+
+
+def derived_link_info(link):
+    """extend link info with the archive urls and other derived data"""
+
+    link_info = {
+        **link,
+        'date': datetime.fromtimestamp(float(link['timestamp'])).strftime('%Y-%m-%d %H:%M'),
+        'google_favicon_url': 'https://www.google.com/s2/favicons?domain={domain}'.format(**link),
+        'favicon_url': 'archive/{timestamp}/favicon.ico'.format(**link),
+        'files_url': 'archive/{timestamp}/'.format(**link),
+        'archive_url': 'archive/{}/{}'.format(link['timestamp'], html_appended_url(link)),
+        'pdf_link': 'archive/{timestamp}/output.pdf'.format(**link),
+        'screenshot_link': 'archive/{timestamp}/screenshot.png'.format(**link),
+        'archive_org_url': 'https://web.archive.org/web/{base_url}'.format(**link),
+    }
+
+    # PDF and images are handled slightly differently
+    # wget, screenshot, & pdf urls all point to the same file
+    if link['type'] in ('PDF', 'image'):
+        link_info.update({
+            'archive_url': 'archive/{timestamp}/{base_url}'.format(**link),
+            'pdf_link': 'archive/{timestamp}/{base_url}'.format(**link),
+            'screenshot_link': 'archive/{timestamp}/{base_url}'.format(**link),
+            'title': '{title} ({type})'.format(**link),
+        })
+    return link_info
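A few worked examples of the wget output-path mapping implemented by html_appended_url above, traced from the branches of the function (URLs hypothetical):

    from index import html_appended_url

    link = {'type': None, 'base_url': 'example.com/page'}
    print(html_appended_url({**link, 'url': 'https://example.com/page'}))      # example.com/page.html
    print(html_appended_url({**link, 'url': 'https://example.com/page/'}))     # example.com/page/index.html
    print(html_appended_url({**link, 'url': 'https://example.com/page?a=1'}))  # example.com/page/index.html%3Fa=1.html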

+ 112 - 0
links.py

@@ -0,0 +1,112 @@
+from util import (
+    domain,
+    base_url,
+    get_str_between,
+    get_link_type,
+)
+   
+
+def validate_links(links):
+    links = valid_links(links)       # remove chrome://, about:, mailto: etc.
+    links = uniquefied_links(links)  # fix duplicate timestamps, returns sorted list
+    links = sorted_links(links)      # deterministically sort the links
+    
+    if not links:
+        print('[X] No links found :(')
+        raise SystemExit(1)
+
+    return list(links)
+
+def sorted_links(links):
+    return sorted(
+        links,
+        key=lambda link: (link['timestamp'], link['url']),
+        reverse=True,
+    )
+
+def merge_links(link1, link2):
+    longer = lambda a, b, key: a[key] if len(a[key]) > len(b[key]) else b[key]
+    earlier = lambda a, b, key: a[key] if a[key] < b[key] else b[key]
+    
+    url = longer(link1, link2, 'url')
+    earliest_ts = earlier(link1, link2, 'timestamp')
+    longest_title = longer(link1, link2, 'title')
+    cleanest_title = link1['title'] if '://' not in link1['title'] else link2['title']
+    link = {
+        'url': url,
+        'domain': domain(url),
+        'base_url': base_url(url),
+        'timestamp': earliest_ts,
+        'tags': longer(link1, link2, 'tags'),
+        'title': longest_title if '://' not in longest_title else cleanest_title,
+        'sources': list(set(link1['sources'] + link2['sources'])),
+    }
+    link['type'] = get_link_type(link)
+    return link
+
+def uniquefied_links(sorted_links):
+    """
+    deduplicate links by url, merging duplicates, and ensure every link has a unique timestamp
+    """
+
+    seen_urls = {}
+    seen_timestamps = set()
+
+    lower = lambda url: url.lower().strip()
+    without_www = lambda url: url.replace('://www.', '://', 1)
+    without_trailing_slash = lambda url: url[:-1] if url[-1] == '/' else url.replace('/?', '?')
+
+    for link in sorted_links:
+        url = without_www(without_trailing_slash(lower(link['url'])))
+        if url in seen_urls:
+            # merge with any other links that share the same url
+            link = merge_links(seen_urls[url], link)
+        elif link['timestamp'] in seen_timestamps:
+            # add with incremented timestamp if an earlier link exists with the same timestamp
+            link['timestamp'] = next_uniq_timestamp(seen_timestamps, link['timestamp'])
+        
+        seen_urls[url] = link
+        seen_timestamps.add(link['timestamp'])
+    
+    return seen_urls.values()
+
+def valid_links(links):
+    """remove chrome://, about:// or other schemed links that cant be archived"""
+    return (
+        link
+        for link in links
+        if any(link['url'].startswith(s) for s in ('http://', 'https://', 'ftp://'))
+    )
+
+def links_after_timestamp(links, timestamp=None):
+    if not timestamp:
+        yield from links
+        return
+
+    print('[.] [{}] Resuming...'.format(timestamp))
+    for link in links:
+        try:
+            if float(link['timestamp']) <= float(timestamp):
+                yield link
+        except (ValueError, TypeError):
+            print('Resume value and all timestamp values must be valid numbers.')
+
+def next_uniq_timestamp(used_timestamps, timestamp):
+    """resolve duplicate timestamps by appending a decimal 1234, 1234 -> 1234.1, 1234.2"""
+
+    if timestamp not in used_timestamps:
+        return timestamp
+
+    if '.' in timestamp:
+        timestamp, nonce = timestamp.split('.')
+        nonce = int(nonce)
+    else:
+        nonce = 1
+
+    new_timestamp = '{}.{}'.format(timestamp, nonce)
+
+    while new_timestamp in used_timestamps:
+        nonce += 1
+        new_timestamp = '{}.{}'.format(timestamp, nonce)
+
+    return new_timestamp
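The nonce scheme in next_uniq_timestamp keeps appending .1, .2, ... until the timestamp is free. A quick sketch (values hypothetical):

    from links import next_uniq_timestamp

    used = {'1503892020', '1503892020.1'}
    print(next_uniq_timestamp(used, '1503892020'))  # -> '1503892020.2'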

+ 84 - 162
parse.py

@@ -1,56 +1,38 @@
 import re
-import time
 import json
-
 from datetime import datetime
 
+from util import (
+    domain,
+    base_url,
+    get_str_between,
+    get_link_type,
+)
+
 
-def parse_export(file, service=None):
+def parse_export(path):
     """parse a list of links dictionaries from a bookmark export file"""
+    
+    links = []
+    with open(path, 'r', encoding='utf-8') as file:
+        for service, parser_func in get_parsers().items():
+            # try each parser in turn until one of them succeeds
+            try:
+                links += list(parser_func(file))
+                if links:
+                    break
+            except Exception:
+                pass
+
+    return links
 
-    # if specific service was passed via command line
-    if service == "pocket":
-        links = parse_pocket_export(file)
-    elif service == "pinboard":
-        links = parse_json_export(file)
-    elif service == "bookmarks":
-        links = parse_bookmarks_export(file)
-    else:
-        # otherwise try all parsers until one works
-        try:
-            links = list(parse_json_export(file))
-            service = 'pinboard'
-        except Exception:
-            links = list(parse_pocket_export(file))
-            if links:
-                service = 'pocket'
-            else:
-                links = list(parse_bookmarks_export(file))
-                service = 'bookmarks'
-
-    links = valid_links(links)              # remove chrome://, about:, mailto: etc.
-    links = uniquefied_links(links)         # fix duplicate timestamps, returns sorted list
-    return links, service
-
-
-def get_link_type(link):
-    """Certain types of links need to be handled specially, this figures out when that's the case"""
-
-    if link['base_url'].endswith('.pdf'):
-        return 'PDF'
-    elif link['base_url'].rsplit('.', 1) in ('pdf', 'png', 'jpg', 'jpeg', 'svg', 'bmp', 'gif', 'tiff', 'webp'):
-        return 'image'
-    elif 'wikipedia.org' in link['domain']:
-        return 'wiki'
-    elif 'youtube.com' in link['domain']:
-        return 'youtube'
-    elif 'soundcloud.com' in link['domain']:
-        return 'soundcloud'
-    elif 'youku.com' in link['domain']:
-        return 'youku'
-    elif 'vimeo.com' in link['domain']:
-        return 'vimeo'
-    return None
+def get_parsers():
+    return {
+        'pocket': parse_pocket_export,
+        'pinboard': parse_json_export,
+        'bookmarks': parse_bookmarks_export,
+        'rss': parse_rss_export,
+    }
 
 def parse_pocket_export(html_file):
     """Parse Pocket-format bookmarks export files (produced by getpocket.com/export/)"""
@@ -61,15 +43,15 @@ def parse_pocket_export(html_file):
         match = pattern.search(line)
         if match:
             fixed_url = match.group(1).replace('http://www.readability.com/read?url=', '')           # remove old readability prefixes to get original url
-            without_scheme = fixed_url.replace('http://', '').replace('https://', '')
+            time = datetime.fromtimestamp(float(match.group(2)))
             info = {
                 'url': fixed_url,
-                'domain': without_scheme.split('/', 1)[0],    # without pathname
-                'base_url': without_scheme.split('?', 1)[0],  # without query args
-                'time': datetime.fromtimestamp(int(match.group(2))).strftime('%Y-%m-%d %H:%M'),
-                'timestamp': match.group(2),
+                'domain': domain(fixed_url),
+                'base_url': base_url(fixed_url),
+                'timestamp': str(time.timestamp()),
                 'tags': match.group(3),
-                'title': match.group(4).replace(' — Readability', '').replace('http://www.readability.com/read?url=', '') or without_scheme,
+                'title': match.group(4).replace(' — Readability', '').replace('http://www.readability.com/read?url=', '') or base_url(fixed_url),
+                'sources': [html_file.name],
             }
             info['type'] = get_link_type(info)
             yield info
@@ -82,18 +64,59 @@ def parse_json_export(json_file):
     for line in json_content:
         if line:
             erg = line
+            time = datetime.strptime(erg['time'].split(',', 1)[0], '%Y-%m-%dT%H:%M:%SZ')
             info = {
                 'url': erg['href'],
-                'domain': erg['href'].replace('http://', '').replace('https://', '').split('/', 1)[0],
-                'base_url': erg['href'].replace('https://', '').replace('http://', '').split('?', 1)[0],
-                'time': datetime.fromtimestamp(int(time.mktime(time.strptime(erg['time'].split(',', 1)[0], '%Y-%m-%dT%H:%M:%SZ')))),
-                'timestamp': str(int(time.mktime(time.strptime(erg['time'].split(',', 1)[0], '%Y-%m-%dT%H:%M:%SZ')))),
+                'domain': domain(erg['href']),
+                'base_url': base_url(erg['href']),
+                'timestamp': str(time.timestamp()),
                 'tags': erg['tags'],
                 'title': erg['description'].replace(' — Readability', ''),
+                'sources': [json_file.name],
             }
             info['type'] = get_link_type(info)
             yield info
 
+def parse_rss_export(rss_file):
+    """Parse RSS XML-format files into links"""
+
+    rss_file.seek(0)
+    items = rss_file.read().split('</item>\n<item>')
+    for item in items:
+        # example item:
+        # <item>
+        # <title><![CDATA[How JavaScript works: inside the V8 engine]]></title>
+        # <category>Unread</category>
+        # <link>https://blog.sessionstack.com/how-javascript-works-inside</link>
+        # <guid>https://blog.sessionstack.com/how-javascript-works-inside</guid>
+        # <pubDate>Mon, 21 Aug 2017 14:21:58 -0500</pubDate>
+        # </item>
+
+        trailing_removed = item.split('</item>', 1)[0]
+        leading_removed = trailing_removed.split('<item>', 1)[-1]
+        rows = leading_removed.split('\n')
+
+        row = lambda key: [r for r in rows if r.startswith('<{}>'.format(key))][0]
+
+        title = get_str_between(row('title'), '<![CDATA[', ']]')
+        url = get_str_between(row('link'), '<link>', '</link>')
+        ts_str = get_str_between(row('pubDate'), '<pubDate>', '</pubDate>')
+        time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %z")
+
+        info = {
+            'url': url,
+            'domain': domain(url),
+            'base_url': base_url(url),
+            'timestamp': str(time.timestamp()),
+            'tags': '',
+            'title': title,
+            'sources': [rss_file.name],
+        }
+
+        info['type'] = get_link_type(info)
+        yield info
+
 def parse_bookmarks_export(html_file):
     """Parse netscape-format bookmarks export files (produced by all browsers)"""
 
@@ -103,118 +126,17 @@ def parse_bookmarks_export(html_file):
         match = pattern.search(line)
         if match:
             url = match.group(1)
-            secs = match.group(2)
-            dt = datetime.fromtimestamp(int(secs))
+            time = datetime.fromtimestamp(float(match.group(2)))
 
             info = {
                 'url': url,
-                'domain': url.replace('http://', '').replace('https://', '').split('/', 1)[0],
-                'base_url': url.replace('https://', '').replace('http://', '').split('?', 1)[0],
-                'time': dt,
-                'timestamp': secs,
+                'domain': domain(url),
+                'base_url': base_url(url),
+                'timestamp': str(time.timestamp()),
                 'tags': "",
                 'title': match.group(3),
+                'sources': [html_file.name],
             }
 
             info['type'] = get_link_type(info)
             yield info
-
-
-def next_uniq_timestamp(used_timestamps, timestamp):
-    """resolve duplicate timestamps by appending a decimal 1234, 1234 -> 1234.1, 1234.2"""
-
-    if timestamp not in used_timestamps:
-        return timestamp
-
-    if '.' in timestamp:
-        timestamp, nonce = timestamp.split('.')
-        nonce = int(nonce)
-    else:
-        nonce = 1
-
-    new_timestamp = '{}.{}'.format(timestamp, nonce)
-
-    while new_timestamp in used_timestamps:
-        nonce += 1
-        new_timestamp = '{}.{}'.format(timestamp, nonce)
-
-    return new_timestamp
-
-def uniquefied_links(links):
-    """uniqueify link timestamps by de-duping using url, returns links sorted most recent -> oldest
-
-    needed because firefox will produce exports where many links share the same timestamp, this func
-    ensures that all non-duplicate links have monotonically increasing timestamps
-    """
-
-    links = list(reversed(sorted(links, key=lambda l: (l['timestamp'], l['url']))))
-    seen_timestamps = {}
-
-    for link in links:
-        t = link['timestamp']
-        if t in seen_timestamps:
-            if link['url'] == seen_timestamps[t]['url']:
-                # don't create new unique timestamp if link is the same
-                continue
-            else:
-                # resolve duplicate timstamp by appending a decimal
-                link['timestamp'] = next_uniq_timestamp(seen_timestamps, link['timestamp'])
-        seen_timestamps[link['timestamp']] = link
-
-    return links
-
-def valid_links(links):
-    """remove chrome://, about:// or other schemed links that cant be archived"""
-    return (link for link in links if link['url'].startswith('http') or link['url'].startswith('ftp'))
-
-
-def html_appended_url(link):
-    """calculate the path to the wgetted .html file, since wget may
-    adjust some paths to be different than the base_url path.
-
-    See docs on wget --adjust-extension."""
-
-    split_url = link['url'].split('#', 1)
-    query = ('%3F' + link['url'].split('?', 1)[-1]) if '?' in link['url'] else ''
-
-    if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", split_url[0], re.I | re.M):
-        # already ends in .html
-        return link['base_url']
-    else:
-        # .html needs to be appended
-        without_scheme = split_url[0].split('://', 1)[-1].split('?', 1)[0]
-        if without_scheme.endswith('/'):
-            if query:
-                return '#'.join([without_scheme + 'index.html' + query + '.html', *split_url[1:]])
-            return '#'.join([without_scheme + 'index.html', *split_url[1:]])
-        else:
-            if query:
-                return '#'.join([without_scheme + 'index.html' + query + '.html', *split_url[1:]])
-            return '#'.join([without_scheme + '.html', *split_url[1:]])
-
-
-def derived_link_info(link):
-    """extend link info with the archive urls and other derived data"""
-
-    link_info = {
-        **link,
-        'date': str(link['time'])[:-3],
-        'google_favicon_url': 'https://www.google.com/s2/favicons?domain={domain}'.format(**link),
-        'favicon_url': 'archive/{timestamp}/favicon.ico'.format(**link),
-        'files_url': 'archive/{timestamp}/'.format(**link),
-        'archive_url': 'archive/{}/{}'.format(link['timestamp'], html_appended_url(link)),
-        'pdf_link': 'archive/{timestamp}/output.pdf'.format(**link),
-        'screenshot_link': 'archive/{timestamp}/screenshot.png'.format(**link),
-        'archive_org_url': 'https://web.archive.org/web/{base_url}'.format(**link),
-    }
-
-    # PDF and images are handled slightly differently
-    # wget, screenshot, & pdf urls all point to the same file
-    if link['type'] in ('PDF', 'image'):
-        link_info.update({
-            'archive_url': 'archive/{timestamp}/{base_url}'.format(**link),
-            'pdf_link': 'archive/{timestamp}/{base_url}'.format(**link),
-            'screenshot_link': 'archive/{timestamp}/{base_url}'.format(**link),
-            'title': '{title} ({type})'.format(**link),
-        })
-    return link_info

+ 2 - 2
templates/index_row.html

@@ -1,10 +1,10 @@
 <tr>
-    <td title="Date Bookmarked: $time">$date</td>
+    <td title="Bookmarked timestamp: $timestamp">$date</td>
     <td><a href="$archive_url" style="font-size:1.4em;text-decoration:none;color:black;" title="$title">
         <img src="$favicon_url">
         $title <small style="background-color: #eee;border-radius:4px; float:right">$tags</small>
     </td>
-    <td style="text-align:center"><a href="$files_url" title="Files">📂</a></td>
+    <td style="text-align:center"><a href="$files_url/index.html" title="Files">📂</a></td>
     <td style="text-align:center"><a href="$pdf_link" title="PDF">📄</a></td>
     <td style="text-align:center"><a href="$screenshot_link" title="Screenshot">🖼</a></td>
     <td style="text-align:center"><a href="$archive_org_url" title="Archive.org">🏛</a></td>

+ 258 - 0
templates/link_index.html

@@ -0,0 +1,258 @@
+<html>
+    <head>
+        <meta charset="utf-8">
+        <title>$title</title>
+        <style>
+            html, body {
+                width: 100%;
+                height: 100%;
+            }
+            body {
+                background-color: #ddd;
+            }
+            header {
+                width: 100%;
+                background-color: #aa1e55;
+                margin: 0px;
+                text-align: center;
+                color: white;
+            }
+            header h1 {
+                padding-top: 5px;
+                padding-bottom: 5px;
+                margin: 0px;
+                font-weight: 200;
+                font-family: "Gill Sans", Helvetica, sans-serif;
+            }
+            .collapse-icon {
+                float: right;
+                color: black;
+                width: 126px;
+                font-size: 0.7em;
+                margin-top: 20px;
+                margin-right: -30px;
+                margin-left: -150px;
+            }
+            .nav-icon img {
+                display: inline-block;
+                margin-right: -200px;
+                float: left;
+                color: black;
+                height: 53px;
+                margin-top: 7px;
+                margin-left: 10px;
+            }
+            .nav-icon img:hover {
+                opacity: 0.5;
+            }
+            .title-url {
+                color: black;
+            }
+            .archive-page-header {
+                margin-top: 5px;
+                margin-bottom: 5px;
+            }
+            h1 small {
+                opacity: 0.4;
+                font-size: 0.6em;
+            }
+            h1 small:hover {
+                opacity: 0.8;
+            }
+            .card {
+                box-shadow: 2px 3px 14px 0px rgba(0,0,0,0.02);
+            }
+            .card h4 {
+                font-size: 1.4vw;
+            }
+            .card-body {
+                font-size: 1vw;
+                padding-top: 2vw;
+                padding-left: 1vw;
+                padding-right: 1vw;
+                padding-bottom: 5vw;
+                word-wrap: break-word;
+                max-height: 102px;
+                overflow: hidden;
+            }
+            .card-img-top {
+                border: 0px;
+                padding: 0px;
+                margin: 0px;
+                overflow: hidden;
+                opacity: 0.8;
+                border-top: 1px solid gray;
+                border-radius: 3px;
+                border-bottom: 1px solid #ddd;
+                height: 425px;
+                width: 400%;
+                margin-bottom: -330px;
+
+                transform: scale(0.25); 
+                transform-origin: 0 0;
+            }
+            .full-page-iframe {
+                border-top: 1px solid #ddd;
+                width: 100%;
+                height: 69vh;
+                margin: 0px;
+                border: 0px;
+                border-top: 3px solid #aa1e55;
+            }
+            .card.selected-card {
+                border: 2px solid orange;
+                box-shadow: 0px -6px 13px 1px rgba(0,0,0,0.05);
+            }
+            .iframe-large {
+                height: 93%;
+                margin-top: -10px;
+            }
+
+            @media(max-width: 1092px) {
+                iframe {
+                    display: none;
+                }
+            }
+                
+
+            @media(max-width: 728px) {
+                .card h4 {
+                    font-size: 5vw;
+                }
+                .card-body {
+                    font-size: 4vw;
+                }
+                .card {
+                    margin-bottom: 5px;
+                }
+            }
+        </style>
+        <script
+          src="https://code.jquery.com/jquery-3.2.1.slim.min.js"
+          integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g="
+          crossorigin="anonymous"></script>
+        <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0-alpha.6/css/bootstrap.min.css" integrity="sha384-rwoIResjU2yc3z8GV/NPeZWAv56rSmLldC3R/AZzGRnGxQQKnKkoFVhFQhNUwEyJ" crossorigin="anonymous">
+        <script src="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0-alpha.6/js/bootstrap.min.js" integrity="sha384-vBWWzlZJ8ea9aCX4pEW3rVHjgjt7zpkNpZk+02D9phzyeVkE+jo0ieGizqPLForn" crossorigin="anonymous"></script>
+    </head>
+    <body>
+        <header>
+            <h1 class="page-title">
+                <a href="#" class="collapse-icon" title="Collapse Navbar">
+                    [-]
+                </a>
+                <a href="../../../index.html" class="nav-icon" title="Archived Sites">
+                    <img src="https://nicksweeting.com/images/archive.png" alt="Archive Icon">
+                </a>
+                $title<br/>
+                <a href="$url" class="title-url">
+                    <small><img src="$favicon" height="20px"> $base_url</small>
+                </a>
+            </h1>
+        </header>
+        <div class="site-header container-fluid">
+            <div class="row archive-page-header">
+                <div class="col-lg-2">
+                    <div class="alert alert-warning">
+                        Tags:
+                        <span class="badge badge-success">$tags</span> 
+                        <span class="badge badge-default">$type</span>
+                        <div style="height:5px"></div>
+                        Bookmarked:<br/>
+                            <small>$bookmarked<br/></small>
+                        Archived:<br/>
+                            <small>$updated</small>
+                        <hr/>
+                        <a href="index.json">JSON</a> | <a href=".">Files</a>
+                    </div>
+                </div>
+                <div class="col-lg-2">
+                    <div class="card selected-card">
+                      <iframe class="card-img-top" src="$wget" sandbox="allow-same-origin allow-scripts allow-forms"></iframe>
+                      <div class="card-body">
+                        <a href="$wget" style="float:right"><small>➡️</small></a>
+                        <a href="$wget" target="preview"><h4 class="card-title">Local Archive</h4></a>
+                        <p class="card-text">archive/$domain</p>
+                      </div>
+                    </div>
+                </div>
+                <div class="col-lg-2">
+                    <div class="card">
+                      <iframe class="card-img-top" src="$pdf"></iframe>
+                      <div class="card-body">
+                        <a href="$pdf" style="float:right"><small>➡️</small></a>
+                        <a href="$pdf" target="preview" id="pdf-btn"><h4 class="card-title">PDF</h4></a>
+                        <p class="card-text">archive/output.pdf</p>
+                      </div>
+                    </div>
+                </div>
+                <div class="col-lg-2">
+                    <div class="card">
+                      <iframe class="card-img-top" src="$screenshot" sandbox="allow-same-origin allow-scripts allow-forms"></iframe>
+                      <div class="card-body">
+                        <a href="$screenshot" style="float:right"><small>➡️</small></a>
+                        <a href="$screenshot" target="preview"><h4 class="card-title">Screenshot</h4></a>
+                        <p class="card-text">archive/screenshot.png</p>
+                      </div>
+                    </div>
+                </div>
+                <div class="col-lg-2">
+                    <div class="card">
+                      <iframe class="card-img-top" src="$url" sandbox="allow-same-origin allow-scripts allow-forms"></iframe>
+                      <div class="card-body">
+                        <a href="$url" style="float:right"><small>➡️</small></a>
+                        <a href="$url" target="preview"><h4 class="card-title">Original</h4></a>
+                        <p class="card-text">$domain</p>
+                      </div>
+                    </div>
+                </div>
+                <div class="col-lg-2">
+                    <div class="card">
+                      <iframe class="card-img-top" src="$archive_org" sandbox="allow-same-origin allow-scripts allow-forms"></iframe>
+                      <div class="card-body">
+                        <a href="$archive_org" style="float:right"><small>➡️</small></a>
+                        <a href="$archive_org" target="preview"><h4 class="card-title">Archive.Org</h4></a>
+                        <p class="card-text">web.archive.org/web/...</p>
+                      </div>
+                    </div>
+                </div>
+            </div>
+        </div>
+        <iframe sandbox="allow-same-origin allow-scripts allow-forms" class="full-page-iframe" src="$wget" name="preview"></iframe>
+    </body>
+
+    <script>
+        jQuery('.card').on('click', function(e) {
+            jQuery('.selected-card').removeClass('selected-card')
+            jQuery(e.target).closest('.card').addClass('selected-card')
+        })
+        jQuery('.card a[target=preview]').on('click', function(e) {
+            if (e.currentTarget.href.endsWith('.pdf')) {
+                jQuery('.full-page-iframe')[0].removeAttribute('sandbox')
+            } else {
+                jQuery('.full-page-iframe')[0].sandbox = "allow-same-origin allow-scripts allow-forms"
+            }
+            return true
+        })
+        jQuery('iframe').map(function() {
+            if (this.src.endsWith('.pdf')) {
+                this.removeAttribute('sandbox')
+                this.src = this.src  // re-assign src to force a reload now that the sandbox attribute is gone
+            }
+        })
+        jQuery('.collapse-icon').on('click', function() {
+            if (jQuery('.collapse-icon').text().includes('[-]')) {
+                jQuery('.collapse-icon').text('[+]')
+                jQuery('.site-header').hide()
+                jQuery('.full-page-iframe').addClass('iframe-large')
+            } else {
+                jQuery('.collapse-icon').text('[-]')
+                jQuery('.site-header').show()
+                jQuery('.full-page-iframe').removeClass('iframe-large')
+            }
+            return true
+        })
+        if (window.innerWidth < 1091) {
+            jQuery('.card a[target=preview]').attr('target', '_self')
+        }
+    </script>
+</html>

+ 216 - 0
util.py

@@ -0,0 +1,216 @@
+import os
+import sys
+import time
+import requests
+
+from datetime import datetime
+from subprocess import run, PIPE, DEVNULL
+from multiprocessing import Process
+
+from config import (
+    ARCHIVE_PERMISSIONS,
+    ARCHIVE_DIR,
+    TIMEOUT,
+    TERM_WIDTH,
+    SHOW_PROGRESS,
+    ANSI,
+    CHROME_BINARY,
+    FETCH_WGET,
+    FETCH_PDF,
+    FETCH_SCREENSHOT,
+    FETCH_FAVICON,
+    FETCH_AUDIO,
+    FETCH_VIDEO,
+    SUBMIT_ARCHIVE_DOT_ORG,
+)
+
+def check_dependencies():
+    """Check that all necessary dependencies are installed, and have valid versions"""
+
+    print('[*] Checking Dependencies:')
+
+    python_vers = '{}.{}'.format(sys.version_info.major, sys.version_info.minor)
+    if sys.version_info[:2] < (3, 5):  # compare as a tuple so e.g. 3.10 isn't treated as 3.1
+        print('{}[X] Python version is not new enough: {} (>=3.5 is required){}'.format(ANSI['red'], python_vers, ANSI['reset']))
+        print('    See https://github.com/pirate/bookmark-archiver#troubleshooting for help upgrading your Python installation.')
+        raise SystemExit(1)
+
+    if FETCH_PDF or FETCH_SCREENSHOT:
+        if run(['which', CHROME_BINARY]).returncode:
+            print('{}[X] Missing dependency: {}{}'.format(ANSI['red'], CHROME_BINARY, ANSI['reset']))
+            print('    Run ./setup.sh, then confirm it was installed with: {} --version'.format(CHROME_BINARY))
+            print('    See https://github.com/pirate/bookmark-archiver for help.')
+            raise SystemExit(1)
+
+        # parse chrome --version e.g. Google Chrome 61.0.3114.0 canary / Chromium 59.0.3029.110 built on Ubuntu, running on Ubuntu 16.04
+        try:
+            result = run([CHROME_BINARY, '--version'], stdout=PIPE)
+            version = result.stdout.decode('utf-8').replace('Google Chrome ', '').replace('Chromium ', '').split(' ', 1)[0].split('.', 1)[0]  # TODO: regex might be better
+            if int(version) < 59:
+                print('{red}[X] Chrome version must be 59 or greater for headless PDF and screenshot saving{reset}'.format(**ANSI))
+                print('    See https://github.com/pirate/bookmark-archiver for help.')
+                raise SystemExit(1)
+        except (ValueError, TypeError, OSError):  # int() raises ValueError if the version string is unparseable
+            print('{red}[X] Failed to parse Chrome version, is it installed properly?{reset}'.format(**ANSI))
+            print('    Run ./setup.sh, then confirm it was installed with: {} --version'.format(CHROME_BINARY))
+            print('    See https://github.com/pirate/bookmark-archiver for help.')
+            raise SystemExit(1)
+
+    if FETCH_WGET:
+        if run(['which', 'wget']).returncode or run(['wget', '--version'], stdout=DEVNULL).returncode:
+            print('{red}[X] Missing dependency: wget{reset}'.format(**ANSI))
+            print('    Run ./setup.sh, then confirm it was installed with: {} --version'.format('wget'))
+            print('    See https://github.com/pirate/bookmark-archiver for help.')
+            raise SystemExit(1)
+
+    if FETCH_FAVICON or SUBMIT_ARCHIVE_DOT_ORG:
+        if run(['which', 'curl']).returncode or run(['curl', '--version'], stdout=DEVNULL).returncode:
+            print('{red}[X] Missing dependency: curl{reset}'.format(**ANSI))
+            print('    Run ./setup.sh, then confirm it was installed with: {} --version'.format('curl'))
+            print('    See https://github.com/pirate/bookmark-archiver for help.')
+            raise SystemExit(1)
+
+    if FETCH_AUDIO or FETCH_VIDEO:
+        if run(['which', 'youtube-dl']).returncode or run(['youtube-dl', '--version'], stdout=DEVNULL).returncode:
+            print('{red}[X] Missing dependency: youtube-dl{reset}'.format(**ANSI))
+            print('    Run ./setup.sh, then confirm it was installed with: {} --version'.format('youtube-dl'))
+            print('    See https://github.com/pirate/bookmark-archiver for help.')
+            raise SystemExit(1)
+
+
+def chmod_file(path, cwd='.', permissions=ARCHIVE_PERMISSIONS, timeout=30):
+    """chmod -R <permissions> <cwd>/<path>"""
+
+    if not os.path.exists(os.path.join(cwd, path)):
+        raise Exception('Failed to chmod: {} does not exist (did the previous step fail?)'.format(path))
+
+    chmod_result = run(['chmod', '-R', permissions, path], cwd=cwd, stdout=DEVNULL, stderr=PIPE, timeout=timeout)
+    if chmod_result.returncode == 1:
+        print('     ', chmod_result.stderr.decode())
+        raise Exception('Failed to chmod {}/{}'.format(cwd, path))
+
+
+def progress(seconds=TIMEOUT, prefix=''):
+    """Show a (subprocess-controlled) progress bar with a <seconds> timeout,
+       returns end() function to instantly finish the progress
+    """
+
+    if not SHOW_PROGRESS:
+        return lambda: None
+
+    chunk = '█' if (sys.stdout.encoding or '').upper() == 'UTF-8' else '#'
+    chunks = TERM_WIDTH - len(prefix) - 20  # number of progress chunks to show (aka max bar width)
+
+    def progress_bar(seconds=seconds, prefix=prefix):
+        """show timer in the form of progress bar, with percentage and seconds remaining"""
+        try:
+            for s in range(seconds * chunks):
+                progress = s / chunks / seconds * 100
+                bar_width = round(progress/(100/chunks))
+
+                # ████████████████████           0.9% (1/60sec)
+                sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)'.format(
+                    prefix,
+                    ANSI['green'],
+                    (chunk * bar_width).ljust(chunks),
+                    ANSI['reset'],
+                    round(progress, 1),
+                    round(s/chunks),
+                    seconds,
+                ))
+                sys.stdout.flush()
+                time.sleep(1 / chunks)
+
+            # ██████████████████████████████████ 100.0% (60/60sec)
+            sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)\n'.format(
+                prefix,
+                ANSI['red'],
+                chunk * chunks,
+                ANSI['reset'],
+                100.0,
+                seconds,
+                seconds,
+            ))
+            sys.stdout.flush()
+        except KeyboardInterrupt:
+            print()
+
+    p = Process(target=progress_bar)
+    p.start()
+
+    def end():
+        """immediately finish progress and clear the progressbar line"""
+        p.terminate()
+        sys.stdout.write('\r{}{}\r'.format((' ' * TERM_WIDTH), ANSI['reset']))  # clear whole terminal line
+        sys.stdout.flush()
+
+    return end
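+
+# illustrative usage:
+#   end = progress(seconds=60, prefix='      ')  # start the timeout bar in a subprocess
+#   do_some_work()                               # (hypothetical long-running step)
+#   end()                                        # stop the bar early and clear the line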
+
+
+def download_url(url):
+    if not os.path.exists(os.path.join(ARCHIVE_DIR, 'downloads')):
+        os.makedirs(os.path.join(ARCHIVE_DIR, 'downloads'))
+
+    url_domain = url.split('/', 3)[2]
+    output_path = os.path.join(ARCHIVE_DIR, 'downloads', '{}.txt'.format(url_domain))
+    
+    print('[*] [{}] Downloading {} > {}'.format(
+        datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
+        url,
+        output_path,
+    ))
+    end = progress(TIMEOUT, prefix='      ')
+    try:
+        downloaded_xml = requests.get(url).content.decode()
+        end()
+    except Exception as e:
+        end()
+        print('[!] Failed to download {}\n'.format(url))
+        print('    ', e)
+        raise SystemExit(1)
+
+    with open(output_path, 'w', encoding='utf-8') as f:
+        f.write(downloaded_xml)
+    return output_path
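+
+# illustrative usage with a hypothetical feed URL:
+#   path = download_url('https://example.com/rss/export.xml')
+#   -> saves to <ARCHIVE_DIR>/downloads/example.com.txt and returns that path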
+
+
+def get_str_between(string, start, end=None):
+    """(<abc>12345</def>, <abc>, </def>)  ->  12345"""
+
+    content = string.split(start, 1)[-1]
+    if end is not None:
+        content = content.rsplit(end, 1)[0]
+
+    return content
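+
+# illustrative examples:
+#   get_str_between('<link>https://example.com</link>', '<link>', '</link>')  -> 'https://example.com'
+#   get_str_between('pubDate: Mon, 21 Aug 2017', 'pubDate: ')                 -> 'Mon, 21 Aug 2017'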
+
+
+def get_link_type(link):
+    """Certain types of links need to be handled specially, this figures out when that's the case"""
+
+    if link['base_url'].endswith('.pdf'):
+        return 'PDF'
+    elif link['base_url'].rsplit('.', 1)[-1] in ('pdf', 'png', 'jpg', 'jpeg', 'svg', 'bmp', 'gif', 'tiff', 'webp'):
+        return 'image'
+    elif 'wikipedia.org' in link['domain']:
+        return 'wiki'
+    elif 'youtube.com' in link['domain']:
+        return 'youtube'
+    elif 'soundcloud.com' in link['domain']:
+        return 'soundcloud'
+    elif 'youku.com' in link['domain']:
+        return 'youku'
+    elif 'vimeo.com' in link['domain']:
+        return 'vimeo'
+    return None
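+
+# illustrative examples:
+#   get_link_type({'base_url': 'example.com/paper.pdf', 'domain': 'example.com'})  -> 'PDF'
+#   get_link_type({'base_url': 'example.com/photo.jpg', 'domain': 'example.com'})  -> 'image'
+#   get_link_type({'base_url': 'youtube.com/watch', 'domain': 'youtube.com'})      -> 'youtube'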
+
+
+# URL helpers
+without_scheme = lambda url: url.replace('http://', '').replace('https://', '').replace('ftp://', '')
+without_query = lambda url: url.split('?', 1)[0]
+without_hash = lambda url: url.split('#', 1)[0] 
+without_path = lambda url: url.split('/', 1)[0]
+domain = lambda url: without_hash(without_query(without_path(without_scheme(url))))
+base_url = lambda url: without_query(without_scheme(url))
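+
+# illustrative examples:
+#   domain('https://example.com/path/page?q=1#frag')    -> 'example.com'
+#   base_url('https://example.com/path/page?q=1#frag')  -> 'example.com/path/page'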