Sfoglia il codice sorgente

move latest to derived data using history

Nick Sweeting 6 anni fa
parent
commit
d06775923b
4 file modificati con 34 aggiunte e 49 eliminazioni
  1. 0 4
      archivebox/archive_methods.py
  2. 3 5
      archivebox/index.py
  3. 1 29
      archivebox/links.py
  4. 30 11
      archivebox/util.py

+ 0 - 4
archivebox/archive_methods.py

@@ -90,8 +90,6 @@ def archive_link(link_dir, link):
         for method_name, should_run, method_function in ARCHIVE_METHODS:
             if method_name not in link['history']:
                 link['history'][method_name] = []
-            if method_name not in link['latest']:
-                link['latest'][method_name] = None
             
             if not should_run(link_dir, link):
                 continue
@@ -105,8 +103,6 @@ def archive_link(link_dir, link):
             log_archive_method_finished(result)
 
             link['history'][method_name].append(result)
-            if result['status'] == 'succeeded':
-                link['latest'][method_name] = result['output']
 
         write_link_index(link_dir, link)
         patch_links_index(link)

+ 3 - 5
archivebox/index.py

@@ -22,6 +22,7 @@ from util import (
     check_link_structure,
     check_links_structure,
     wget_output_path,
+    latest_output,
 )
 from parse import parse_links
 from links import validate_links
@@ -168,8 +169,8 @@ def write_html_links_index(out_dir, links, finished=False):
 def patch_links_index(link, out_dir=OUTPUT_DIR):
     """hack to in-place update one row's info in the generated index html"""
 
-    title = link['latest']['title']
-    successful = len([entry for entry in link['latest'].values() if entry])
+    title = link['title'] or latest_output(link)['title']
+    successful = len(tuple(filter(None, latest_output(link).values())))
 
     # Patch JSON index
     changed = False
@@ -177,7 +178,6 @@ def patch_links_index(link, out_dir=OUTPUT_DIR):
     for saved_link in json_file_links:
         if saved_link['url'] == link['url']:
             saved_link['title'] = title
-            saved_link['latest'] = link['latest']
             saved_link['history'] = link['history']
             changed = True
             break
@@ -235,12 +235,10 @@ def load_json_link_index(out_dir, link):
         **link,
     }
     link.update({
-        'latest': link.get('latest') or {},
         'history': link.get('history') or {},
     })
 
     check_link_structure(link)
-
     return link
 
 def write_html_link_index(out_dir, link):

+ 1 - 29
archivebox/links.py

@@ -9,12 +9,6 @@ Link {
     title: str,                                       
     tags: str,                                        
     sources: [str],                                   
-    latest: {                                         
-        ...,                                          
-        pdf: 'output.pdf',                            
-        wget: 'example.com/1234/index.html',
-        screenshot: null,        
-    },
     history: {
         pdf: [
             {start_ts, end_ts, duration, cmd, pwd, status, output},
@@ -30,7 +24,6 @@ from collections import OrderedDict
 
 from util import (
     merge_links,
-    wget_output_path,
     check_link_structure,
     check_links_structure,
 )
@@ -47,30 +40,9 @@ def validate_links(links):
         raise SystemExit(1)
 
     for link in links:
+        link['title'] = unescape(link['title'].strip()) if link['title'].strip() else None
         check_link_structure(link)
 
-        link['title'] = unescape(link['title']) if link['title'] else None
-        link['latest'] = link.get('latest') or {}
-
-        latest = link['latest']
-        if not link['latest'].get('wget'):
-            link['latest']['wget'] = wget_output_path(link)
-
-        if not link['latest'].get('pdf'):
-            link['latest']['pdf'] = None
-
-        if not link['latest'].get('screenshot'):
-            link['latest']['screenshot'] = None
-
-        if not link['latest'].get('dom'):
-            link['latest']['dom'] = None
-
-        if not latest.get('favicon'):
-            latest['favicon'] = None
-
-        if not link['latest'].get('title'):
-            link['latest']['title'] = link['title']
-
     return list(links)
 
 

+ 30 - 11
archivebox/util.py

@@ -118,12 +118,6 @@ def check_link_structure(link):
             assert isinstance(key, str)
             assert isinstance(val, list), 'history must be a Dict[str, List], got: {}'.format(link['history'])
     
-    if 'latest' in link:
-        assert isinstance(link['latest'], dict), 'latest must be a Dict'
-        for key, val in link['latest'].items():
-            assert isinstance(key, str)
-            assert (val is None) or isinstance(val, (str, Exception)), 'latest must be a Dict[str, Optional[str]], got: {}'.format(link['latest'])
-
 def check_links_structure(links):
     """basic sanity check invariants to make sure the data is valid"""
     assert isinstance(links, list)
@@ -304,10 +298,6 @@ def wget_output_path(link):
     See docs on wget --adjust-extension (-E)
     """
 
-    # if we have it stored, always prefer the actual output path to computed one
-    if link.get('latest', {}).get('wget'):
-        return link['latest']['wget']
-
     if is_static_file(link['url']):
         return without_scheme(without_fragment(link['url']))
 
@@ -433,7 +423,7 @@ def derived_link_info(link):
             link['timestamp'],
             domain(url),
         )),
-        'num_outputs': len([entry for entry in link['latest'].values() if entry]) if 'latest' in link else 0,
+        'num_outputs': len([entry for entry in latest_output(link).values() if entry]),
     }
 
     # Archive Method Output URLs
@@ -465,6 +455,35 @@ def derived_link_info(link):
     return extended_info
 
 
+def latest_output(link, status=None):
+    """get the latest output that each archive method produced for link"""
+    
+    latest = {  # one slot per archive method name; a slot stays None when no usable history entry exists
+        'title': None,
+        'favicon': None,
+        'wget': None,
+        'warc': None,
+        'pdf': None,
+        'screenshot': None,
+        'dom': None,
+        'git': None,
+        'media': None,
+        'archive_org': None,
+    }
+    for archive_method in latest.keys():
+        # pick the last-appended history entry with a truthy output (status is only checked when `status` is given)
+        history = link.get('history', {}).get(archive_method) or []  # missing or empty history -> []
+        history = filter(lambda result: result['output'], reversed(history))  # reversed: last-appended entries come first
+        if status is not None:
+            history = filter(lambda result: result['status'] == status, history)  # optional exact-match narrowing
+
+        history = list(history)  # materialize the lazy filters so the first match can be indexed
+        if history:
+            latest[archive_method] = history[0]['output']
+
+    return latest
+
+
 ### Python / System Helpers
 
 def run(*popenargs, input=None, capture_output=False, timeout=None, check=False, **kwargs):