
Merge pull request #362 from cdvv7788/add-fix

Add fix
Nick Sweeting committed 5 years ago
commit 175e6fa3d0
2 changed files with 9 additions and 7 deletions:
  1. archivebox/index/__init__.py (+5 -1)
  2. archivebox/main.py (+4 -6)

archivebox/index/__init__.py (+5 -1)

@@ -292,7 +292,6 @@ def dedupe_links(existing_links: List[Link],
                  new_links: List[Link]) -> Tuple[List[Link], List[Link]]:
 
     from ..parsers import parse_links
-
     # merge existing links in out_dir and new links
     all_links = validate_links(existing_links + new_links)
     all_link_urls = {link.url for link in existing_links}
@@ -301,6 +300,11 @@ def dedupe_links(existing_links: List[Link],
         link for link in new_links
         if link.url not in all_link_urls
     ]
+
+    all_links_deduped = {link.url: link for link in all_links}
+    for i in range(len(new_links)):
+        if new_links[i].url in all_links_deduped.keys():
+            new_links[i] = all_links_deduped[new_links[i].url]
     log_deduping_finished(len(new_links))
 
     return all_links, new_links
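For reference, a minimal standalone sketch of what the added remapping block accomplishes, using a simplified stand-in Link dataclass and a toy validate_links (the real ones live in archivebox/index/schema.py and archivebox/index/__init__.py and do much more): after the URL filter, each remaining entry in new_links is swapped for the object that actually made it into all_links, so both lists share the same deduplicated instances.

    from dataclasses import dataclass
    from typing import List

    @dataclass
    class Link:                        # simplified stand-in, not the real schema.Link
        url: str
        title: str = ''

    def validate_links(links: List[Link]) -> List[Link]:
        # toy stand-in: keep the first occurrence of each URL
        seen = {}
        for link in links:
            seen.setdefault(link.url, link)
        return list(seen.values())

    existing_links = [Link('https://example.com', title='already archived')]
    new_links = [Link('https://example.org'), Link('https://example.org', title='dupe')]

    all_links = validate_links(existing_links + new_links)
    all_link_urls = {link.url for link in existing_links}
    new_links = [link for link in new_links if link.url not in all_link_urls]

    # the added block: re-point every surviving new link at the canonical
    # object stored in all_links, keyed by URL
    all_links_deduped = {link.url: link for link in all_links}
    for i in range(len(new_links)):
        if new_links[i].url in all_links_deduped:
            new_links[i] = all_links_deduped[new_links[i].url]

    print(all(any(link is kept for kept in all_links) for link in new_links))  # True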

archivebox/main.py (+4 -6)

@@ -520,18 +520,16 @@ def add(urls: Union[str, List[str]],
         write_ahead_log = save_text_as_source('\n'.join(urls), filename='{ts}-import.txt', out_dir=out_dir)
 
     new_links += parse_links_from_source(write_ahead_log)
-    all_links, new_links = dedupe_links(all_links, new_links)
-    write_main_index(links=all_links, out_dir=out_dir, finished=not new_links)
-
 
     # If we're going one level deeper, download each link and look for more links
+    new_links_depth = []
     if new_links and depth == 1:
         log_crawl_started(new_links)
         for new_link in new_links:
             downloaded_file = save_file_as_source(new_link.url, filename='{ts}-crawl-{basename}.txt', out_dir=out_dir)
-            new_links += parse_links_from_source(downloaded_file)
-            all_links, new_links = dedupe_links(all_links, new_links)
-            write_main_index(links=all_links, out_dir=out_dir, finished=not new_links)
+            new_links_depth += parse_links_from_source(downloaded_file)
+    all_links, new_links = dedupe_links(all_links, new_links + new_links_depth)
+    write_main_index(links=all_links, out_dir=out_dir, finished=not new_links)
 
     if index_only:
         return all_links
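The net effect in add() is that depth-1 crawling no longer appends to new_links while iterating over it, and dedupe_links() / write_main_index() run once after the crawl instead of once per downloaded link. A rough sketch of the reworked control flow, with hypothetical stubs standing in for the real ArchiveBox helpers (signatures simplified):

    # hypothetical stubs; the real helpers live in archivebox/main.py,
    # archivebox/parsers, and archivebox/index, and take more parameters
    def save_file_as_source(url): return '/tmp/crawl.txt'
    def parse_links_from_source(path): return []             # would return List[Link]
    def dedupe_links(all_links, new_links): return all_links + new_links, new_links
    def write_main_index(links, finished): pass

    def add_one_level(all_links, new_links, depth=1):
        # collect links found one level deep in a separate list instead of
        # mutating new_links mid-iteration and rewriting the index each pass
        new_links_depth = []
        if new_links and depth == 1:
            for new_link in new_links:
                downloaded_file = save_file_as_source(new_link.url)
                new_links_depth += parse_links_from_source(downloaded_file)
        all_links, new_links = dedupe_links(all_links, new_links + new_links_depth)
        write_main_index(links=all_links, finished=not new_links)
        return all_links, new_links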