
better UX before titles have been fetched during the archiving process

Nick Sweeting, 6 years ago
commit eb5cc8078a
5 changed files with 90 additions and 27 deletions
  1. archivebox/archive_methods.py (+7 -4)
  2. archivebox/index.py (+49 -9)
  3. archivebox/templates/index.html (+23 -5)
  4. archivebox/templates/index_row.html (+10 -8)
  5. archivebox/util.py (+1 -1)

archivebox/archive_methods.py (+7 -4)

@@ -7,7 +7,7 @@ from datetime import datetime
 from index import (
     parse_json_link_index,
     write_link_index,
-    patch_index_title_hack,
+    update_main_index,
 )
 from config import (
     CURL_BINARY,
@@ -103,7 +103,9 @@ def archive_link(link_dir, link, overwrite=True):
         for archive_method in active_methods:
             archive_method(link_dir, link, overwrite=overwrite)
 
+
         write_link_index(link_dir, link)
+        update_main_index(link)
 
     except Exception as err:
         print('    ! Failed to archive link: {}: {}'.format(err.__class__.__name__, err))
@@ -218,7 +220,7 @@ def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, warc=FETCH_WARC
     try:
         result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)
         end()
-        output = wget_output_path(link, look_in=domain_dir)
+        output = wget_output_path(link)
 
         output_tail = ['          ' + line for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:] if line.strip()]
 
@@ -391,11 +393,13 @@ def archive_dot_org(link_dir, link, timeout=TIMEOUT):
     output = 'archive.org.txt'
     archive_org_url = None
 
+
     path = os.path.join(link_dir, output)
     if os.path.exists(path):
         archive_org_url = open(path, 'r').read().strip()
         return {'output': archive_org_url, 'status': 'skipped'}
 
+
     submit_url = 'https://web.archive.org/save/{}'.format(link['url'])
     CMD = [
         CURL_BINARY,
@@ -412,7 +416,6 @@ def archive_dot_org(link_dir, link, timeout=TIMEOUT):
         end()
 
         content_location, errors = parse_archive_dot_org_response(result.stdout)
-
         if content_location:
             archive_org_url = 'https://web.archive.org{}'.format(content_location[0])
         elif len(errors) == 1 and 'RobotAccessControlException' in errors[0]:
@@ -427,6 +430,7 @@ def archive_dot_org(link_dir, link, timeout=TIMEOUT):
         output = e
         print_error_hints(cmd=CMD, pwd=link_dir, err=e)
 
+
     if not isinstance(output, Exception):
         # instead of writing None when archive.org rejects the url write the
         # url to resubmit it to archive.org. This is so when the user visits
@@ -499,7 +503,6 @@ def fetch_title(link_dir, link, timeout=TIMEOUT):
     # TODO: figure out how to do this without gnarly string replacement
     if title:
         link['title'] = title
-        patch_index_title_hack(link['url'], title)
 
     return {
         'cmd': 'fetch_page_title("{}")'.format(link['url']),

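Taken together, the changes in this file move main-index updates out of fetch_title's success path and into archive_link itself, so the top-level index row is refreshed after every link's archive pass regardless of which methods ran. A minimal sketch of the resulting flow (simplified from the hunks above; active_methods stands in for the real module state and the helper bodies are elided):

def archive_link(link_dir, link, overwrite=True):
    """Sketch: run every enabled method, then refresh both indexes."""
    try:
        for archive_method in active_methods:   # fetch_wget, fetch_title, archive_dot_org, ...
            archive_method(link_dir, link, overwrite=overwrite)

        write_link_index(link_dir, link)   # regenerate this link's own index files
        update_main_index(link)            # then patch this link's row in the main index
    except Exception as err:
        print('    ! Failed to archive link: {}: {}'.format(err.__class__.__name__, err))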
archivebox/index.py (+49 -9)

@@ -22,8 +22,11 @@ from util import (
     pretty_path,
     check_link_structure,
     check_links_structure,
+    wget_output_path,
 )
 
+TITLE_LOADING_MSG = 'Not yet archived...'
+
 
 ### Homepage index for all the links
 
@@ -96,9 +99,20 @@ def write_html_links_index(out_dir, links, finished=False):
     with open(os.path.join(TEMPLATES_DIR, 'index_row.html'), 'r', encoding='utf-8') as f:
         link_row_html = f.read()
 
+    full_links_info = (derived_link_info(link) for link in links)
+
     link_rows = '\n'.join(
-        Template(link_row_html).substitute(**derived_link_info(link))
-        for link in links
+        Template(link_row_html).substitute(**{
+            **link,
+            'title': (
+                link['title']
+                or (link['base_url'] if link['is_archived'] else TITLE_LOADING_MSG)
+            ),
+            'archive_url': (
+                wget_output_path(link) or 'index.html'
+            ),
+        })
+        for link in full_links_info
     )
 
     template_vars = {
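The title fallback above is the heart of the UX change: rows whose pages haven't been fetched yet render TITLE_LOADING_MSG instead of an empty cell. A runnable sketch of the same substitution, using a hypothetical one-line row template and link dict:

from string import Template

TITLE_LOADING_MSG = 'Not yet archived...'
row = Template('<span data-archived="$is_archived">$title</span>')  # stand-in for index_row.html
link = {'title': None, 'base_url': 'example.com/page', 'is_archived': False}

print(row.substitute(**{
    **link,
    'title': link['title'] or (link['base_url'] if link['is_archived'] else TITLE_LOADING_MSG),
}))
# -> <span data-archived="False">Not yet archived...</span>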
@@ -118,24 +132,41 @@ def write_html_links_index(out_dir, links, finished=False):
     chmod_file(path)
 
 
-def patch_index_title_hack(link_url, new_title):
-    """hack to update just one link's title in the link index json"""
+def update_main_index(link):
+    """hack to in-place update one row's info in the generated index html"""
+
+    title = link['latest']['title']
+    successful = len([entry for entry in link['latest'].values() if entry])
 
+    # Patch JSON index
     json_path = os.path.join(OUTPUT_DIR, 'index.json')
 
     links = parse_json_links_index(OUTPUT_DIR)
 
     changed = False
-    for link in links:
-        if link['url'] == link_url:
-            link['title'] = new_title
+    for json_link in links:
+        if json_link['url'] == link['url']:
+            json_link['title'] = title
+            json_link['latest'] = link['latest']
             changed = True
             break
 
     if changed:
         write_json_links_index(OUTPUT_DIR, links)
 
+    # Patch HTML index
+    html_path = os.path.join(OUTPUT_DIR, 'index.html')
 
+    html = open(html_path, 'r').read().split('\n')
+    for idx, line in enumerate(html):
+        if title and ('<span data-title-for="{}"'.format(link['url']) in line):
+            html[idx] = '<span>{}</span>'.format(title)
+        elif successful and ('<span data-number-for="{}"'.format(link['url']) in line):
+            html[idx] = '<span>{}</span>'.format(successful)
+            break
+
+    with open(html_path, 'w') as f:
+        f.write('\n'.join(html))
 
 ### Individual link index
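update_main_index above patches the rendered HTML by string-matching the data-title-for / data-number-for markers that index_row.html now embeds (see that template's diff below), rewriting only the matching lines instead of regenerating the whole page. The same logic run against a hypothetical two-line index:

html = [
    '<span data-title-for="https://example.com" data-archived="False">Not yet archived...</span>',
    '<span data-number-for="https://example.com" title="Fetching any missing files...">0</span>',
]
url, title, successful = 'https://example.com', 'Example Domain', 3

for idx, line in enumerate(html):
    if title and '<span data-title-for="{}"'.format(url) in line:
        html[idx] = '<span>{}</span>'.format(title)       # swap in the fetched title
    elif successful and '<span data-number-for="{}"'.format(url) in line:
        html[idx] = '<span>{}</span>'.format(successful)  # swap in the success count
        break                                             # the number row comes last

print('\n'.join(html))
# <span>Example Domain</span>
# <span>3</span>

Note that each replacement drops the marker attribute itself, so a given row can only be patched once per render; the next full write_html_links_index pass restores the markers.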
 
@@ -176,10 +207,19 @@ def write_html_link_index(out_dir, link):
 
     print('      √ index.html')
 
+    link = derived_link_info(link)
+
     with open(path, 'w', encoding='utf-8') as f:
         f.write(Template(link_html).substitute({
-            **derived_link_info(link),
-            # **link['latest'],
+            **link,
+            'title': (
+                link['title']
+                or (link['base_url'] if link['is_archived'] else TITLE_LOADING_MSG)
+            ),
+            'archive_url': (
+                wget_output_path(link)
+                or (link['domain'] if link['is_archived'] else 'about:blank')
+            ),
         }))
 
     chmod_file(path)

archivebox/templates/index.html (+23 -5)

@@ -98,6 +98,28 @@
                 overflow-y: scroll;
                 table-layout: fixed;
             }
+            table tr a span[data-archived~=False] {
+                opacity: 0.2;
+            }
+            .files-spinner {
+                height: 15px;
+                width: auto;
+                opacity: 0.5;
+                vertical-align: -2px;
+            }
+            .link-favicon {
+                padding-right: 8px;
+                vertical-align: -4px;
+            }
+            .in-progress {
+                display: none;
+            }
+            body[data-status~=finished] .files-spinner {
+                display: none;
+            }
+            body[data-status~=running] .in-progress {
+                display: inline-block;
+            }
         </style>
     </head>
     <body data-status="$status">
@@ -121,12 +143,8 @@
             <thead>
                 <tr>
                     <th style="width: 80px;">Bookmarked</th>
-                    <th style="width: 26px;">Files</th>
                     <th style="width: 26vw;">Saved Link ($num_links)</th>
-                    <th style="width: 30px;">PNG</th>
-                    <th style="width: 30px">PDF</th>
-                    <th style="width: 30px">HTML</th>
-                    <th style="width: 30px">A.org</th>
+                    <th style="width: 50px">Saved Files</th>
                     <th style="width: 16vw;whitespace:nowrap;overflow-x:hidden;">Original URL</th>
                 </tr>
             </thead>

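All of the new progress UI is attribute-driven rather than scripted: span[data-archived~=False] dims placeholder titles, and body[data-status=...] shows or hides the spinners. The $status placeholder in <body data-status="$status"> is filled by write_html_links_index; the exact mapping from its finished flag isn't shown in this diff, but the selectors imply something like:

from string import Template

page = Template('<body data-status="$status">')
finished = False  # the write_html_links_index(out_dir, links, finished=False) flag

# Assumed mapping, inferred from the [data-status~=running/finished] selectors:
print(page.substitute(status='finished' if finished else 'running'))
# -> <body data-status="running">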
archivebox/templates/index_row.html (+10 -8)

@@ -1,16 +1,18 @@
-<tr>
+<tr data-url="$url">
     <td title="Bookmarked timestamp: $timestamp">$bookmarked_date</td>
-    <td>
+    <td style="text-align: left">
         <a href="$link_dir/$index_url" title="Link Index">
             <img src="$link_dir/$favicon_url" onerror="this.src='static/spinner.gif'" class="link-favicon">
         </a>
+        <a href="$link_dir/$archive_url" style="font-size:1.4em;text-decoration:none;color:black;" title="$title">
+            <span data-title-for="$url" data-archived="$is_archived">$title</span>
+            <small style="background-color: #eee;border-radius:4px; float:right">$tags</small>
+        </a>
     </td>
-    <td style="text-align: left"><a href="$link_dir/$archive_url" style="font-size:1.4em;text-decoration:none;color:black;" title="$title">
-        $title <small style="background-color: #eee;border-radius:4px; float:right">$tags</small>
+    <td>
+        <a href="$link_dir/$index_url">📄 
+            <span data-number-for="$url" title="Fetching any missing files...">$num_outputs <img src="static/spinner.gif" class="files-spinner"/></span>
+        </a>
     </td>
-    <td><a href="$link_dir/$screenshot_url" title="Screenshot">🖼</a></td>
-    <td><a href="$link_dir/$pdf_url" title="PDF">📜</a></td>
-    <td><a href="$link_dir/$dom_url" title="DOM">📄</a></td>
-    <td><a href="$archive_org_url" title="Archive.org">🏛</a></td>
     <td style="text-align: left"><!--🔗 <img src="$google_favicon_url" height="16px">--> <a href="$url">$url</a></td>
 </tr>

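string.Template raises KeyError for any unsupplied $placeholder (including $google_favicon_url inside the HTML comment), so every key the new row references has to be present in the mapping built by write_html_links_index. A hedged sketch that renders this row with hypothetical values:

from string import Template

with open('archivebox/templates/index_row.html', encoding='utf-8') as f:
    row = Template(f.read())

print(row.substitute(
    url='https://example.com',           # all values below are hypothetical
    timestamp='1554000000',
    bookmarked_date='2019-03-31 00:00',
    link_dir='archive/1554000000',
    index_url='index.html',
    favicon_url='favicon.ico',
    archive_url='example.com/index.html',
    title='Not yet archived...',
    is_archived=False,
    tags='',
    num_outputs=0,
    google_favicon_url='',
))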
archivebox/util.py (+1 -1)

@@ -244,7 +244,7 @@ def fetch_page_title(url, timeout=10, progress=SHOW_PROGRESS):
         # ))
         return None
 
-def wget_output_path(link, look_in=None):
+def wget_output_path(link):
     """calculate the path to the wgetted .html file, since wget may
     adjust some paths to be different than the base_url path.