
fix logic for ONLY_NEW accidentally replacing all links

Nick Sweeting · 6 years ago · commit 3571ef24e4
2 changed files with 22 additions and 31 deletions:
  1. archivebox/archive.py (+22 / -22)
  2. archivebox/links.py (+0 / -9)
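
Before this change, merge_links() returned only the newly imported links whenever ONLY_NEW was set, and that reduced list was then used both to write the index and to run the archive methods, so write_links_index() rewrote index.json/index.html with just the new links and every previously archived link vanished from the index. The fix splits the result into (all_links, new_links): the index is always rebuilt from the full merged list, and ONLY_NEW now only controls which subset gets archived.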

archivebox/archive.py (+22 / -22)

@@ -10,10 +10,7 @@ from datetime import datetime
 from subprocess import run

 from parse import parse_links
-from links import (
-    new_links,
-    validate_links
-)
+from links import validate_links
 from archive_methods import archive_links, _RESULTS_TOTALS
 from index import (
     write_links_index,
@@ -56,34 +53,34 @@ def print_help():
     print("    echo 'https://examplecom' | ./bin/archivebox\n")
     print("    echo 'https://examplecom' | ./bin/archivebox\n")
 
 
 
 
-def merge_links(archive_path=OUTPUT_DIR, import_path=None, only_new=False):
+def load_links(archive_path=OUTPUT_DIR, import_path=None):
     """get new links from file and optionally append them to links in existing archive"""
     """get new links from file and optionally append them to links in existing archive"""
-    all_links = []
+    
+    existing_links = []
+    if archive_path:
+        existing_links = parse_json_links_index(archive_path)
+        existing_links = validate_links(existing_links)
+
+    new_links = []
     if import_path:
         # parse and validate the import file
         raw_links, parser_name = parse_links(import_path)
-        all_links = validate_links(raw_links)
+        new_links = validate_links(raw_links)
+        if SHOW_PROGRESS:
+            print()

     # merge existing links in archive_path and new links
-    existing_links = []
-    if archive_path:
-        existing_links = parse_json_links_index(archive_path)
-        all_links = validate_links(existing_links + all_links)
-
+    all_links = validate_links(existing_links + new_links)
     num_new_links = len(all_links) - len(existing_links)
-    if SHOW_PROGRESS:
-        print()
-    if import_path:
+
+    if import_path and parser_name:
         print('    > Adding {} new links to index from {} (parsed as {} format)'.format(
             num_new_links,
             pretty_path(import_path),
             parser_name,
         ))

-    if only_new:
-        return new_links(all_links, existing_links)
-
-    return all_links
+    return all_links, new_links

 def update_archive(archive_path, links, source=None, resume=None, append=True):
     """update or create index.html+json given a path to an export file containing new links"""
@@ -182,13 +179,16 @@ if __name__ == '__main__':
         source = save_source(stdin_raw_text)

     # Step 1: Parse the links and dedupe them with existing archive
-    links = merge_links(archive_path=out_dir, import_path=source, only_new=ONLY_NEW)
+    all_links, new_links = load_links(archive_path=out_dir, import_path=source)

     # Step 2: Write new index
-    write_links_index(out_dir=out_dir, links=links)
+    write_links_index(out_dir=out_dir, links=all_links)

     # Step 3: Verify folder structure is 1:1 with index
     # cleanup_archive(out_dir, links)

     # Step 4: Run the archive methods for each link
-    update_archive(out_dir, links, source=source, resume=resume, append=True)
+    if ONLY_NEW:
+        update_archive(out_dir, new_links, source=source, resume=resume, append=True)
+    else:
+        update_archive(out_dir, all_links, source=source, resume=resume, append=True)
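
Condensed, the main flow in archivebox/archive.py after this patch amounts to the sketch below (out_dir, source, resume, and the ONLY_NEW flag come from the surrounding __main__ block shown above; the real functions also handle progress output and validation):

    all_links, new_links = load_links(archive_path=out_dir, import_path=source)

    # the index is always written from the full merged list, so existing
    # entries can no longer be dropped when ONLY_NEW is enabled
    write_links_index(out_dir=out_dir, links=all_links)

    # ONLY_NEW now only narrows which links get (re)archived, not what is indexed
    links_to_archive = new_links if ONLY_NEW else all_links
    update_archive(out_dir, links_to_archive, source=source, resume=resume, append=True)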

archivebox/links.py (+0 / -9)

@@ -81,15 +81,6 @@ def validate_links(links):

     return list(links)

-def new_links(all_links, existing_links):
-    """
-    Return all links which are in the all_links but not in the existing_links.
-    This is used to determine which links are new and not indexed jet. Set the
-    ONLY_NEW environment variable to activate this filter mechanism.
-    """
-    existing_urls = {link['url'] for link in existing_links}
-    return [link for link in all_links if link['url'] not in existing_urls]
-
 def archivable_links(links):
     """remove chrome://, about:// or other schemed links that cant be archived"""
     return (
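
The standalone new_links() helper is no longer needed because load_links() keeps the freshly parsed links separate from the merged list instead of re-deriving them afterwards. For reference, the removed logic was just a URL-based set difference between two lists of link dicts; something like the sketch below (the function name is hypothetical) could be reintroduced if a caller ever needs that diff directly:

    def links_not_in_index(all_links, existing_links):
        """return the links from all_links whose URL is not already in existing_links"""
        existing_urls = {link['url'] for link in existing_links}
        return [link for link in all_links if link['url'] not in existing_urls]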