Browse source

better logging during long output

Nick Sweeting 6 years ago
parent
commit
bd9f3e313f
6 changed files with 63 additions and 72 deletions
  1. archivebox/archive.py (+1 -1)
  2. archivebox/archive_methods.py (+21 -26)
  3. archivebox/index.py (+4 -1)
  4. archivebox/links.py (+12 -19)
  5. archivebox/logs.py (+14 -3)
  6. archivebox/util.py (+11 -22)

+ 1 - 1
archivebox/archive.py

@@ -94,7 +94,7 @@ def main(*args):
 
 
 def update_archive_data(import_path=None, resume=None):
-    """The main ArchiveBox entrancepoint.  Everything starts here."""
+    """The main ArchiveBox entrancepoint. Everything starts here."""
     check_dependencies()
 
     # Step 1: Load list of links from the existing index

+ 21 - 26
archivebox/archive_methods.py

@@ -1,6 +1,5 @@
 import os
 
-from functools import wraps
 from collections import defaultdict
 from datetime import datetime
 
@@ -50,10 +49,9 @@ from util import (
     run, PIPE, DEVNULL
 )
 from logs import (
-    _LAST_RUN_STATS,
     log_link_archiving_started,
     log_link_archiving_finished,
-    log_archive_method_starting,
+    log_archive_method_started,
     log_archive_method_finished,
 )
 
@@ -94,6 +92,7 @@ def archive_link(link_dir, link):
                 link['history'][method_name] = []
             if method_name not in link['latest']:
                 link['latest'][method_name] = None
+
             if not should_run(link_dir, link):
                 continue
 
@@ -101,7 +100,7 @@ def archive_link(link_dir, link):
                 skipped_entirely = False
                 print()
 
-            log_archive_method_starting(method_name)
+            log_archive_method_started(method_name)
             result = method_function(link_dir, link)
             log_archive_method_finished(result)
 
@@ -109,11 +108,6 @@ def archive_link(link_dir, link):
             if result['status'] == 'succeeded':
                 link['latest'][method_name] = result['output']
 
-            if result['status'] != 'skipped':
-                made_changes = True
-
-            _LAST_RUN_STATS[result['status']] += 1
-
         write_link_index(link_dir, link)
         patch_links_index(link)
 
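Taken together, these archive_link hunks revolve around a per-method dispatch loop. Below is a hedged, self-contained sketch of that pattern with an invented stand-in method (not the real ArchiveBox method registry or archive functions), just to show how 'latest' and 'history' get updated:

# Simplified, illustrative sketch of the per-method loop; the registry and the
# example method below are invented stand-ins, not ArchiveBox's actual methods.
def should_fetch_example(link_dir, link):
    return link['latest'].get('example') is None

def fetch_example(link_dir, link):
    return {'status': 'succeeded', 'output': 'example.txt'}

ARCHIVE_METHODS = [('example', should_fetch_example, fetch_example)]

def archive_link_sketch(link_dir, link):
    for method_name, should_run, method_function in ARCHIVE_METHODS:
        link['history'].setdefault(method_name, [])
        link['latest'].setdefault(method_name, None)

        if not should_run(link_dir, link):
            continue

        print('      > {}'.format(method_name))   # what log_archive_method_started prints
        result = method_function(link_dir, link)

        link['history'][method_name].append(result)  # assumed bookkeeping step
        if result['status'] == 'succeeded':
            link['latest'][method_name] = result['output']
    return link

link = {'url': 'https://example.com', 'latest': {}, 'history': {}}
archive_link_sketch('output/archive/1552272000', link)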
@@ -126,6 +120,7 @@ def archive_link(link_dir, link):
     return link
 
 
+### Archive Method Functions
 
 def should_fetch_title(link_dir, link):
     # if link already has valid title, skip it
@@ -428,8 +423,8 @@ def should_fetch_git(link_dir, link):
         return False
 
     is_clonable_url = (
-        domain(link['url']) in GIT_DOMAINS
-        or extension(link['url']) == 'git'
+        (domain(link['url']) in GIT_DOMAINS)
+        or (extension(link['url']) == 'git')
     )
     if not is_clonable_url:
         return False
@@ -477,6 +472,7 @@ def fetch_git(link_dir, link, timeout=TIMEOUT):
         **timer.stats,
     }
 
+
 def should_fetch_media(link_dir, link):
     if is_static_file(link['url']):
         return False
@@ -547,21 +543,6 @@ def fetch_media(link_dir, link, timeout=MEDIA_TIMEOUT):
         **timer.stats,
     }
 
-def parse_archive_dot_org_response(response):
-    # Parse archive.org response headers
-    headers = defaultdict(list)
-
-    # lowercase all the header names and store in dict
-    for header in response.splitlines():
-        if b':' not in header or not header.strip():
-            continue
-        name, val = header.decode().split(':', 1)
-        headers[name.lower().strip()].append(val.strip())
-
-    # Get successful archive url in "content-location" header or any errors
-    content_location = headers['content-location']
-    errors = headers['x-archive-wayback-runtime-error']
-    return content_location, errors
 
 def should_fetch_archive_dot_org(link_dir, link):
     if is_static_file(link['url']):
@@ -627,4 +608,18 @@ def archive_dot_org(link_dir, link, timeout=TIMEOUT):
         **timer.stats,
     }
 
+def parse_archive_dot_org_response(response):
+    # Parse archive.org response headers
+    headers = defaultdict(list)
+
+    # lowercase all the header names and store in dict
+    for header in response.splitlines():
+        if b':' not in header or not header.strip():
+            continue
+        name, val = header.decode().split(':', 1)
+        headers[name.lower().strip()].append(val.strip())
 
+    # Get successful archive url in "content-location" header or any errors
+    content_location = headers['content-location']
+    errors = headers['x-archive-wayback-runtime-error']
+    return content_location, errors
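For reference, here is a small usage sketch of the relocated parse_archive_dot_org_response helper. The sample header bytes and the import line are illustrative assumptions (the flat module layout is inferred from the other imports in this commit), not real archive.org output:

# Usage sketch (illustrative): feed the helper the raw archive.org response
# headers as bytes; lines without a ':' (like the status line) are skipped.
from archive_methods import parse_archive_dot_org_response

raw_headers = (
    b"HTTP/1.1 200 OK\n"
    b"Content-Location: /web/20190315000000/https://example.com/\n"
    b"Content-Type: text/html; charset=UTF-8\n"
)

content_location, errors = parse_archive_dot_org_response(raw_headers)
print(content_location)  # ['/web/20190315000000/https://example.com/']
print(errors)            # [] -> no x-archive-wayback-runtime-error header present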

+ 4 - 1
archivebox/index.py

@@ -26,6 +26,7 @@ from util import (
 from parse import parse_links
 from links import validate_links
 from logs import (
+    log_indexing_process_started,
     log_indexing_started,
     log_indexing_finished,
     log_parsing_started,
@@ -40,12 +41,14 @@ TITLE_LOADING_MSG = 'Not yet archived...'
 def write_links_index(out_dir, links, finished=False):
     """create index.html file for a given list of links"""
 
-    log_indexing_started()
+    log_indexing_process_started()
     check_links_structure(links)
 
+    log_indexing_started(out_dir, 'index.json')
     write_json_links_index(out_dir, links)
     log_indexing_finished(out_dir, 'index.json')
 
+    log_indexing_started(out_dir, 'index.html')
     write_html_links_index(out_dir, links, finished=finished)
     log_indexing_finished(out_dir, 'index.html')
 

+ 12 - 19
archivebox/links.py

@@ -3,33 +3,26 @@ In ArchiveBox, a Link represents a single entry that we track in the
 json index.  All links pass through all archiver functions and the latest,
 most up-to-date canonical output for each is stored in "latest".
 
-
 Link {
-    timestamp: str,     (how we uniquely id links)        _   _  _ _  ___
-    url: str,                                            | \ / \ |\| ' |
-    base_url: str,                                       |_/ \_/ | |   |
-    domain: str,                                          _   _ _ _ _  _
-    tags: str,                                           |_) /| |\| | / `
-    type: str,                                           |  /"| | | | \_,
-    title: str,                                              ,-'"`-.
-    sources: [str],                                     /// /  @ @  \ \\\\
-    latest: {                                           \ :=| ,._,. |=:  /
-        ...,                                            || ,\ \_../ /. ||
-        pdf: 'output.pdf',                              ||','`-._))'`.`||
-        wget: 'example.com/1234/index.html'             `-'     (/    `-'
+    timestamp: str,     (how we uniquely id links)    
+    url: str,                                         
+    title: str,                                       
+    tags: str,                                        
+    sources: [str],                                   
+    latest: {                                         
+        ...,                                          
+        pdf: 'output.pdf',                            
+        wget: 'example.com/1234/index.html',
+        screenshot: null,        
     },
     history: {
-        ...
         pdf: [
-            {timestamp: 15444234325, status: 'skipped', result='output.pdf'},
+            {start_ts, end_ts, duration, cmd, pwd, status, output},
             ...
         ],
-        wget: [
-            {timestamp: 11534435345, status: 'succeded', result='donuts.com/eat/them.html'}
-        ]
+        ...
     },
 }
-
 """
 
 from html import unescape
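To make the schema concrete, here is a hedged example of what one Link entry might look like once serialized into the JSON index; every value below is invented for illustration:

import json

# Illustrative only: a Link dict shaped like the docstring schema above.
example_link = {
    'timestamp': '1552272000',                 # how we uniquely id links
    'url': 'https://example.com/page',
    'title': 'Example Page',
    'tags': 'docs,example',
    'sources': ['imports/bookmarks.html'],
    'latest': {
        'pdf': 'output.pdf',
        'wget': 'example.com/page/index.html',
        'screenshot': None,                    # not archived yet
    },
    'history': {
        'pdf': [
            # each attempt records timing, the command run, and its outcome
            {
                'start_ts': '2019-03-11 00:00:00',
                'end_ts': '2019-03-11 00:00:05',
                'duration': 5,
                'cmd': ['chromium-browser', '--print-to-pdf', 'https://example.com/page'],
                'pwd': 'output/archive/1552272000',
                'status': 'succeeded',
                'output': 'output.pdf',
            },
        ],
    },
}

print(json.dumps(example_link, indent=4))  # roughly how an entry lands in index.json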

+ 14 - 3
archivebox/logs.py

@@ -45,13 +45,21 @@ def log_link_archiving_started(link_dir, link, is_new):
     ))
 
 def log_link_archiving_finished(link_dir, link, is_new, skipped_entirely):
+    if all(output == 'succeeded' for output in link['latest']):
+        _LAST_RUN_STATS['succeeded'] += 1
+    elif skipped_entirely or all(output == 'skipped' for output in link['latest']):
+        _LAST_RUN_STATS['skipped'] += 1
+    else:
+        _LAST_RUN_STATS['failed'] += 1
+        # import ipdb; ipdb.set_trace()
+
     if skipped_entirely:
         print('\r    √ {}{}'.format(
             pretty_path(link_dir),
             ' (new)' if is_new else '',
         ))
 
-def log_archive_method_starting(method):
+def log_archive_method_started(method):
     print('      > {}'.format(method))
 
 def log_archive_method_finished(result):
@@ -117,7 +125,7 @@ def log_parsing_finished(num_new_links, parser_name):
         parser_name,
     ))
 
-def log_indexing_started():
+def log_indexing_process_started():
     start_ts = datetime.now()
     _LAST_RUN_STATS['index_start_ts'] = start_ts
     print('{green}[*] [{}] Saving main index files...{reset}'.format(
@@ -125,10 +133,13 @@ def log_indexing_started():
         **ANSI,
     ))
 
+def log_indexing_started(out_dir, out_file):
+    sys.stdout.write('    > {}/{}'.format(pretty_path(out_dir), out_file))
+
 def log_indexing_finished(out_dir, out_file):
     end_ts = datetime.now()
     _LAST_RUN_STATS['index_end_ts'] = end_ts
-    print('    √ {}/{}'.format(pretty_path(out_dir), out_file))
+    print('\r    √ {}/{}'.format(pretty_path(out_dir), out_file))
 
 def log_archiving_started(num_links, resume):
     start_ts = datetime.now()

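The logging changes above rely on a carriage-return trick to update a line in place: log_indexing_started writes the pending path without a newline, and log_indexing_finished reprints the same line starting with '\r' so the checkmark overwrites it (the same '\r    √ ...' pattern appears in log_link_archiving_finished). A minimal standalone sketch of that pattern, with a made-up slow step:

import sys
import time

def long_running_step():
    # stand-in for a slow operation, e.g. writing a large index file
    time.sleep(1)

# write the in-progress line without a newline so the cursor stays on it
sys.stdout.write('    > output/index.json')
sys.stdout.flush()

long_running_step()

# '\r' moves the cursor back to the start of the line, so the finished message
# overwrites the pending one (the replacement should be at least as long)
print('\r    √ output/index.json')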
+ 11 - 22
archivebox/util.py

@@ -314,10 +314,20 @@ def wget_output_path(link):
     # Wget downloads can save in a number of different ways depending on the url:
     #    https://example.com
     #       > output/archive/<timestamp>/example.com/index.html
+    #    https://example.com?v=zzVa_tX1OiI
+    #       > output/archive/<timestamp>/example.com/index.html?v=zzVa_tX1OiI.html
+    #    https://www.example.com/?v=zzVa_tX1OiI
+    #       > output/archive/<timestamp>/example.com/index.html?v=zzVa_tX1OiI.html
+
     #    https://example.com/abc
     #       > output/archive/<timestamp>/example.com/abc.html
     #    https://example.com/abc/
     #       > output/archive/<timestamp>/example.com/abc/index.html
+    #    https://example.com/abc?v=zzVa_tX1OiI.html
+    #       > output/archive/<timestamp>/example.com/abc?v=zzVa_tX1OiI.html
+    #    https://example.com/abc/?v=zzVa_tX1OiI.html
+    #       > output/archive/<timestamp>/example.com/abc/index.html?v=zzVa_tX1OiI.html
+
     #    https://example.com/abc/test.html
     #       > output/archive/<timestamp>/example.com/abc/test.html
     #    https://example.com/abc/test?v=zzVa_tX1OiI
@@ -326,7 +336,7 @@ def wget_output_path(link):
     #       > output/archive/<timestamp>/example.com/abc/test/index.html?v=zzVa_tX1OiI.html
 
     # There's also lots of complexity around how the urlencoding and renaming
-    # is done for pages with query and hash fragments or extensions like shtml / htm
+    # is done for pages with query and hash fragments or extensions like shtml / htm / php / etc
 
     # Since the wget algorithm for -E (appending .html) is incredibly complex
     # and there's no way to get the computed output path from wget
@@ -359,27 +369,6 @@
 
     return None
 
-    # If finding the actual output file didn't work, fall back to the buggy
-    # implementation of the wget .html appending algorithm
-    # split_url = link['url'].split('#', 1)
-    # query = ('%3F' + link['url'].split('?', 1)[-1]) if '?' in link['url'] else ''
-
-    # if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", split_url[0], re.I | re.M):
-    #     # already ends in .html
-    #     return urlencode(base_url(link['url']))
-    # else:
-    #     # .html needs to be appended
-    #     without_scheme = split_url[0].split('://', 1)[-1].split('?', 1)[0]
-    #     if without_scheme.endswith('/'):
-    #         if query:
-    #             return urlencode('#'.join([without_scheme + 'index.html' + query + '.html', *split_url[1:]]))
-    #         return urlencode('#'.join([without_scheme + 'index.html', *split_url[1:]]))
-    #     else:
-    #         if query:
-    #             return urlencode('#'.join([without_scheme + '/index.html' + query + '.html', *split_url[1:]]))
-    #         elif '/' in without_scheme:
-    #             return urlencode('#'.join([without_scheme + '.html', *split_url[1:]]))
-    #         return urlencode(base_url(link['url']) + '/index.html')
 
 ### String Manipulation & Logging Helpers
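The comment block above spells out why wget's final filename can't be derived from the URL alone, which is why the function returns None when nothing is found on disk. A rough, hypothetical sketch of a filesystem-search approach along those lines (find_wget_output and its preference order are invented for illustration, not ArchiveBox's actual wget_output_path):

import os

def find_wget_output(link_dir, domain):
    # Hypothetical helper: since wget's -E renaming can't be predicted reliably,
    # look inside the folder wget wrote to and return the most plausible html file.
    search_dir = os.path.join(link_dir, domain)
    if not os.path.isdir(search_dir):
        return None

    candidates = []
    for root, _dirs, files in os.walk(search_dir):
        for name in files:
            if name.endswith('.html') or name.endswith('.htm'):
                candidates.append(os.path.join(root, name))

    # prefer an index.html anywhere in the tree, otherwise the shortest path
    for candidate in candidates:
        if os.path.basename(candidate) == 'index.html':
            return os.path.relpath(candidate, link_dir)
    return os.path.relpath(min(candidates, key=len), link_dir) if candidates else None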