
better logging during long output

Nick Sweeting · 6 years ago
commit bd9f3e313f
6 changed files with 63 additions and 72 deletions
  1. archivebox/archive.py  +1 -1
  2. archivebox/archive_methods.py  +21 -26
  3. archivebox/index.py  +4 -1
  4. archivebox/links.py  +12 -19
  5. archivebox/logs.py  +14 -3
  6. archivebox/util.py  +11 -22

+ 1 - 1
archivebox/archive.py

@@ -94,7 +94,7 @@ def main(*args):
 
 
 def update_archive_data(import_path=None, resume=None):
-    """The main ArchiveBox entrancepoint.  Everything starts here."""
+    """The main ArchiveBox entrancepoint. Everything starts here."""
     check_dependencies()
 
     # Step 1: Load list of links from the existing index

+ 21 - 26
archivebox/archive_methods.py

@@ -1,6 +1,5 @@
 import os
 
-from functools import wraps
 from collections import defaultdict
 from datetime import datetime
 
@@ -50,10 +49,9 @@ from util import (
     run, PIPE, DEVNULL
 )
 from logs import (
-    _LAST_RUN_STATS,
     log_link_archiving_started,
     log_link_archiving_finished,
-    log_archive_method_starting,
+    log_archive_method_started,
     log_archive_method_finished,
 )
 
@@ -94,6 +92,7 @@ def archive_link(link_dir, link):
                 link['history'][method_name] = []
             if method_name not in link['latest']:
                 link['latest'][method_name] = None
+            
             if not should_run(link_dir, link):
                 continue
 
@@ -101,7 +100,7 @@ def archive_link(link_dir, link):
                 skipped_entirely = False
                 print()
 
-            log_archive_method_starting(method_name)
+            log_archive_method_started(method_name)
             result = method_function(link_dir, link)
             log_archive_method_finished(result)
 
@@ -109,11 +108,6 @@ def archive_link(link_dir, link):
             if result['status'] == 'succeeded':
                 link['latest'][method_name] = result['output']
 
-            if result['status'] != 'skipped':
-                made_changes = True
-
-            _LAST_RUN_STATS[result['status']] += 1
-
         write_link_index(link_dir, link)
         patch_links_index(link)
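
The removed lines show that the per-status counters (_LAST_RUN_STATS) are no longer bumped inside this loop; that bookkeeping now lives in log_link_archiving_finished (see the logs.py hunk further down). A condensed sketch of how the dispatch loop fits together, assuming the method table is a sequence of (name, should_run, run) tuples; the table itself is outside this hunk, so the entries below are illustrative, and details like the skipped_entirely bookkeeping above are omitted:

    # sketch of the dispatch loop around these hunks; entries are illustrative
    ARCHIVE_METHODS = (
        ('title', should_fetch_title, fetch_title),
        ('git', should_fetch_git, fetch_git),
        ('media', should_fetch_media, fetch_media),
        ('archive_org', should_fetch_archive_dot_org, archive_dot_org),
    )

    for method_name, should_run, method_function in ARCHIVE_METHODS:
        link['history'].setdefault(method_name, [])
        link['latest'].setdefault(method_name, None)

        if not should_run(link_dir, link):
            continue    # precondition not met; tallied as skipped later

        log_archive_method_started(method_name)
        result = method_function(link_dir, link)    # {'status', 'output', 'cmd', 'pwd', ...}
        log_archive_method_finished(result)

        link['history'][method_name].append(result)
        if result['status'] == 'succeeded':
            link['latest'][method_name] = result['output']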
 
@@ -126,6 +120,7 @@ def archive_link(link_dir, link):
     return link
 
 
+### Archive Method Functions
 
 def should_fetch_title(link_dir, link):
     # if link already has valid title, skip it
@@ -428,8 +423,8 @@ def should_fetch_git(link_dir, link):
         return False
 
     is_clonable_url = (
-        domain(link['url']) in GIT_DOMAINS
-        or extension(link['url']) == 'git'
+        (domain(link['url']) in GIT_DOMAINS)
+        or (extension(link['url']) == 'git')
     )
     if not is_clonable_url:
         return False
@@ -477,6 +472,7 @@ def fetch_git(link_dir, link, timeout=TIMEOUT):
         **timer.stats,
     }
 
+
 def should_fetch_media(link_dir, link):
     if is_static_file(link['url']):
         return False
@@ -547,21 +543,6 @@ def fetch_media(link_dir, link, timeout=MEDIA_TIMEOUT):
         **timer.stats,
     }
 
-def parse_archive_dot_org_response(response):
-    # Parse archive.org response headers
-    headers = defaultdict(list)
-
-    # lowercase all the header names and store in dict
-    for header in response.splitlines():
-        if b':' not in header or not header.strip():
-            continue
-        name, val = header.decode().split(':', 1)
-        headers[name.lower().strip()].append(val.strip())
-
-    # Get successful archive url in "content-location" header or any errors
-    content_location = headers['content-location']
-    errors = headers['x-archive-wayback-runtime-error']
-    return content_location, errors
 
 def should_fetch_archive_dot_org(link_dir, link):
     if is_static_file(link['url']):
@@ -627,4 +608,18 @@ def archive_dot_org(link_dir, link, timeout=TIMEOUT):
         **timer.stats,
     }
 
+def parse_archive_dot_org_response(response):
+    # Parse archive.org response headers
+    headers = defaultdict(list)
+
+    # lowercase all the header names and store in dict
+    for header in response.splitlines():
+        if b':' not in header or not header.strip():
+            continue
+        name, val = header.decode().split(':', 1)
+        headers[name.lower().strip()].append(val.strip())
 
+    # Get successful archive url in "content-location" header or any errors
+    content_location = headers['content-location']
+    errors = headers['x-archive-wayback-runtime-error']
+    return content_location, errors
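
parse_archive_dot_org_response is only moved below its caller here, not changed: it lowercases the raw header lines into a defaultdict(list) and returns the 'content-location' values alongside any 'x-archive-wayback-runtime-error' values. A minimal usage sketch with invented header bytes (not a real archive.org response):

    raw_headers = (
        b'HTTP/1.1 200 OK\r\n'
        b'Server: nginx\r\n'
        b'Content-Location: /web/20190403000000/https://example.com/\r\n'
    )
    content_location, errors = parse_archive_dot_org_response(raw_headers)
    # content_location == ['/web/20190403000000/https://example.com/']
    # errors == []  (defaultdict(list) yields an empty list when the header is absent)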

+ 4 - 1
archivebox/index.py

@@ -26,6 +26,7 @@ from util import (
 from parse import parse_links
 from links import validate_links
 from logs import (
+    log_indexing_process_started,
     log_indexing_started,
     log_indexing_finished,
     log_parsing_started,
@@ -40,12 +41,14 @@ TITLE_LOADING_MSG = 'Not yet archived...'
 def write_links_index(out_dir, links, finished=False):
     """create index.html file for a given list of links"""
 
-    log_indexing_started()
+    log_indexing_process_started()
     check_links_structure(links)
 
+    log_indexing_started(out_dir, 'index.json')
     write_json_links_index(out_dir, links)
     log_indexing_finished(out_dir, 'index.json')
     
+    log_indexing_started(out_dir, 'index.html')
     write_html_links_index(out_dir, links, finished=finished)
     log_indexing_finished(out_dir, 'index.html')
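
write_links_index now prints one overall header via log_indexing_process_started and then brackets each index file with a started/finished pair, so during a long write the console shows an in-progress '>' line that log_indexing_finished later overwrites in place with a '√' (see the carriage return added in logs.py below). The result looks roughly like this, with an invented timestamp and paths:

    [*] [2019-04-03 01:23:45] Saving main index files...
        √ output/index.json
        √ output/index.html

(while index.json is still being written, its line reads '    > output/index.json' until the \r overwrite lands)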
     

+ 12 - 19
archivebox/links.py

@@ -3,33 +3,26 @@ In ArchiveBox, a Link represents a single entry that we track in the
 json index.  All links pass through all archiver functions and the latest,
 most up-to-date canonical output for each is stored in "latest".
 
-
 Link {
-    timestamp: str,     (how we uniquely id links)        _   _  _ _  ___
-    url: str,                                            | \ / \ |\| ' |
-    base_url: str,                                       |_/ \_/ | |   |
-    domain: str,                                          _   _ _ _ _  _
-    tags: str,                                           |_) /| |\| | / `
-    type: str,                                           |  /"| | | | \_,
-    title: str,                                              ,-'"`-.
-    sources: [str],                                     /// /  @ @  \ \\\\
-    latest: {                                           \ :=| ,._,. |=:  /
-        ...,                                            || ,\ \_../ /. ||
-        pdf: 'output.pdf',                              ||','`-._))'`.`||
-        wget: 'example.com/1234/index.html'             `-'     (/    `-'
+    timestamp: str,     (how we uniquely id links)    
+    url: str,                                         
+    title: str,                                       
+    tags: str,                                        
+    sources: [str],                                   
+    latest: {                                         
+        ...,                                          
+        pdf: 'output.pdf',                            
+        wget: 'example.com/1234/index.html',
+        screenshot: null,        
     },
     history: {
-        ...
         pdf: [
-            {timestamp: 15444234325, status: 'skipped', result='output.pdf'},
+            {start_ts, end_ts, duration, cmd, pwd, status, output},
             ...
         ],
-        wget: [
-            {timestamp: 11534435345, status: 'succeded', result='donuts.com/eat/them.html'}
-        ]
+        ...
     },
 }
-
 """
 
 from html import unescape
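
For concreteness, a link entry matching the trimmed schema above might look like this (all values invented for illustration):

    link = {
        'timestamp': '1554082983.0',
        'url': 'https://example.com/abc?v=zzVa_tX1OiI',
        'title': 'Example Page',
        'tags': 'docs,examples',
        'sources': ['feeds/example.rss'],
        'latest': {
            'pdf': 'output.pdf',
            'wget': 'example.com/abc?v=zzVa_tX1OiI.html',
            'screenshot': None,
        },
        'history': {
            'pdf': [
                {
                    'start_ts': '2019-04-01 00:00:01',
                    'end_ts': '2019-04-01 00:00:04',
                    'duration': 3,
                    'cmd': ['chromium-browser', '--headless', '--print-to-pdf', 'https://example.com/abc?v=zzVa_tX1OiI'],
                    'pwd': 'output/archive/1554082983.0',
                    'status': 'succeeded',
                    'output': 'output.pdf',
                },
            ],
        },
    }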

+ 14 - 3
archivebox/logs.py

@@ -45,13 +45,21 @@ def log_link_archiving_started(link_dir, link, is_new):
     ))
 
 def log_link_archiving_finished(link_dir, link, is_new, skipped_entirely):
+    if all(output == 'succeeded' for output in link['latest']):
+        _LAST_RUN_STATS['succeeded'] += 1
+    elif skipped_entirely or all(output == 'skipped' for output in link['latest']):
+        _LAST_RUN_STATS['skipped'] += 1
+    else:
+        _LAST_RUN_STATS['failed'] += 1
+        # import ipdb; ipdb.set_trace()
+
     if skipped_entirely:
         print('\r    √ {}{}'.format(
             pretty_path(link_dir),
             ' (new)' if is_new else '',
         ))
 
-def log_archive_method_starting(method):
+def log_archive_method_started(method):
     print('      > {}'.format(method))
 
 def log_archive_method_finished(result):
@@ -117,7 +125,7 @@ def log_parsing_finished(num_new_links, parser_name):
         parser_name,
     ))
 
-def log_indexing_started():
+def log_indexing_process_started():
     start_ts = datetime.now()
     _LAST_RUN_STATS['index_start_ts'] = start_ts
     print('{green}[*] [{}] Saving main index files...{reset}'.format(
@@ -125,10 +133,13 @@ def log_indexing_started():
         **ANSI,
     ))
 
+def log_indexing_started(out_dir, out_file):
+    sys.stdout.write('    > {}/{}'.format(pretty_path(out_dir), out_file))
+
 def log_indexing_finished(out_dir, out_file):
     end_ts = datetime.now()
     _LAST_RUN_STATS['index_end_ts'] = end_ts
-    print('    √ {}/{}'.format(pretty_path(out_dir), out_file))
+    print('\r    √ {}/{}'.format(pretty_path(out_dir), out_file))
 
 def log_archiving_started(num_links, resume):
     start_ts = datetime.now()
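
Two things change here: log_link_archiving_finished now owns the per-link tally in _LAST_RUN_STATS, classifying each finished link as succeeded, skipped, or failed instead of counting individual method results inside archive_link; and the index logging gains an in-place progress line, where log_indexing_started writes the '>' marker with sys.stdout.write (no trailing newline) and log_indexing_finished overwrites it with '\r    √ ...' once the file is saved. A self-contained sketch of that overwrite pattern, with illustrative function names (the flush() is added here so the marker shows up immediately under block buffering; the real code relies on later output to flush it):

    import sys

    def log_started(path):
        # no newline, so the next write can replace this line
        sys.stdout.write('    > {}'.format(path))
        sys.stdout.flush()

    def log_finished(path):
        # '\r' moves the cursor back to column 0; the '√' line overwrites the '>' line
        print('\r    √ {}'.format(path))

    log_started('output/index.json')
    # ... write the file ...
    log_finished('output/index.json')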

+ 11 - 22
archivebox/util.py

@@ -314,10 +314,20 @@ def wget_output_path(link):
     # Wget downloads can save in a number of different ways depending on the url:
     #    https://example.com
     #       > output/archive/<timestamp>/example.com/index.html
+    #    https://example.com?v=zzVa_tX1OiI
+    #       > output/archive/<timestamp>/example.com/index.html?v=zzVa_tX1OiI.html
+    #    https://www.example.com/?v=zzVa_tX1OiI
+    #       > output/archive/<timestamp>/example.com/index.html?v=zzVa_tX1OiI.html
+
     #    https://example.com/abc
     #       > output/archive/<timestamp>/example.com/abc.html
     #    https://example.com/abc/
     #       > output/archive/<timestamp>/example.com/abc/index.html
+    #    https://example.com/abc?v=zzVa_tX1OiI.html
+    #       > output/archive/<timestamp>/example.com/abc?v=zzVa_tX1OiI.html
+    #    https://example.com/abc/?v=zzVa_tX1OiI.html
+    #       > output/archive/<timestamp>/example.com/abc/index.html?v=zzVa_tX1OiI.html
+
     #    https://example.com/abc/test.html
     #       > output/archive/<timestamp>/example.com/abc/test.html
     #    https://example.com/abc/test?v=zzVa_tX1OiI
@@ -326,7 +336,7 @@ def wget_output_path(link):
     #       > output/archive/<timestamp>/example.com/abc/test/index.html?v=zzVa_tX1OiI.html
 
     # There's also lots of complexity around how the urlencoding and renaming
-    # is done for pages with query and hash fragments or extensions like shtml / htm
+    # is done for pages with query and hash fragments or extensions like shtml / htm / php / etc
 
     # Since the wget algorithm for -E (appending .html) is incredibly complex
     # and there's no way to get the computed output path from wget
@@ -359,27 +369,6 @@ def wget_output_path(link):
 
     return None
 
-    # If finding the actual output file didn't work, fall back to the buggy
-    # implementation of the wget .html appending algorithm
-    # split_url = link['url'].split('#', 1)
-    # query = ('%3F' + link['url'].split('?', 1)[-1]) if '?' in link['url'] else ''
-
-    # if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", split_url[0], re.I | re.M):
-    #     # already ends in .html
-    #     return urlencode(base_url(link['url']))
-    # else:
-    #     # .html needs to be appended
-    #     without_scheme = split_url[0].split('://', 1)[-1].split('?', 1)[0]
-    #     if without_scheme.endswith('/'):
-    #         if query:
-    #             return urlencode('#'.join([without_scheme + 'index.html' + query + '.html', *split_url[1:]]))
-    #         return urlencode('#'.join([without_scheme + 'index.html', *split_url[1:]]))
-    #     else:
-    #         if query:
-    #             return urlencode('#'.join([without_scheme + '/index.html' + query + '.html', *split_url[1:]]))
-    #         elif '/' in without_scheme:
-    #             return urlencode('#'.join([without_scheme + '.html', *split_url[1:]]))
-    #         return urlencode(base_url(link['url']) + '/index.html')
 
 ### String Manipulation & Logging Helpers
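
The comment block in wget_output_path above documents how wget's --adjust-extension and query-string handling name the files it saves. Because that mapping is hard to reproduce exactly, the function searches the link's archive folder for whatever wget actually wrote and returns None when nothing is found, and this commit deletes the old commented-out attempt to recompute the name. A rough sketch of that search-instead-of-compute idea, with an illustrative helper name and a simplified selection rule (the real function is more careful about which candidate it picks):

    import os

    def find_wget_output(link_dir, domain_name):
        """Locate the file wget actually wrote instead of predicting its name."""
        search_dir = os.path.join(link_dir, domain_name)
        for root, _dirs, files in os.walk(search_dir):
            html_files = [f for f in files if f.endswith(('.html', '.htm'))]
            if html_files:
                # return it relative to the link dir, like the examples in the comments above
                return os.path.relpath(os.path.join(root, html_files[0]), link_dir)
        return None   # caller treats a missing file as "output path unknown"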