
feat: Refactor add method to use querysets

Cristian 5 years ago
parent
commit
be520d137a
4 changed files with 71 additions and 44 deletions
  1. archivebox/extractors/__init__.py (+21 -10)
  2. archivebox/index/__init__.py (+39 -25)
  3. archivebox/index/sql.py (+4 -2)
  4. archivebox/main.py (+7 -7)

+ 21 - 10
archivebox/extractors/__init__.py

@@ -9,6 +9,7 @@ from ..index.schema import Link
 from ..index import (
     load_link_details,
     write_link_details,
+    write_main_index,
 )
 from ..util import enforce_types
 from ..logging_util import (
@@ -128,24 +129,34 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
 
     return link
 
-
 @enforce_types
-def archive_links(links: List[Link], overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[str]=None) -> List[Link]:
-    if not links:
+def archive_links(all_links: any, overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[str]=None) -> List[Link]:
+
+    if type(all_links) is list:
+        num_links: int = len(all_links)
+        get_link = lambda x: x
+        get_iter = lambda x: x
+    else:
+        num_links: int = all_links.count()
+        get_link = lambda x: x.as_link()
+        get_iter = lambda x: x.iterator()
+
+    if num_links == 0:
         return []
 
-    log_archiving_started(len(links))
+    log_archiving_started(num_links)
     idx: int = 0
-    link: Link = links[0]
     try:
-        for idx, link in enumerate(links):
-            archive_link(link, overwrite=overwrite, methods=methods, out_dir=link.link_dir)
+        for link in get_iter(all_links):
+            idx += 1
+            to_archive = get_link(link)
+            archive_link(to_archive, overwrite=overwrite, methods=methods, out_dir=link.link_dir)
     except KeyboardInterrupt:
-        log_archiving_paused(len(links), idx, link.timestamp)
+        log_archiving_paused(num_links, idx, link.timestamp)
         raise SystemExit(0)
     except BaseException:
         print()
         raise
 
-    log_archiving_finished(len(links))
-    return links
+    log_archiving_finished(num_links)
+    return all_links
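
Illustrative call sites for the refactored archive_links(): it now duck-types its input as either a plain list of Link objects or a Snapshot QuerySet. The sketch below assumes an initialized ArchiveBox data folder and that load_main_index() returns a Snapshot QuerySet at this point in the refactor, as the main.py hunk further down implies.

from archivebox.extractors import archive_links
from archivebox.index import load_main_index

snapshots = load_main_index(out_dir='.')   # assumed: Snapshot QuerySet after this refactor

# QuerySet path: num_links comes from .count(), rows are streamed with
# .iterator(), and each one is converted with .as_link() before archiving.
archive_links(snapshots, overwrite=False, out_dir='.')

# List path: plain len()/iteration, items are already Link objects.
links = [snapshot.as_link() for snapshot in snapshots.iterator()]
archive_links(links, overwrite=True, out_dir='.')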

+ 39 - 25
archivebox/index/__init__.py

@@ -11,6 +11,7 @@ from typing import List, Tuple, Dict, Optional, Iterable
 from collections import OrderedDict
 from contextlib import contextmanager
 from urllib.parse import urlparse
+from django.db.models import QuerySet
 
 from ..util import (
     scheme,
@@ -133,7 +134,6 @@ def validate_links(links: Iterable[Link]) -> List[Link]:
 
     return list(links)
 
-
 @enforce_types
 def archivable_links(links: Iterable[Link]) -> Iterable[Link]:
     """remove chrome://, about:// or other schemed links that cant be archived"""
@@ -165,15 +165,6 @@ def fix_duplicate_links(sorted_links: Iterable[Link]) -> Iterable[Link]:
             link = merge_links(unique_urls[link.url], link)
         unique_urls[link.url] = link
 
-    # unique_timestamps: OrderedDict[str, Link] = OrderedDict()
-    # for link in unique_urls.values():
-    #     closest_non_duplicate_ts = lowest_uniq_timestamp(unique_timestamps, link.timestamp)
-    #     if closest_non_duplicate_ts != link.timestamp:
-    #         link = link.overwrite(timestamp=closest_non_duplicate_ts)
-    #         Snapshot.objects.filter(url=link.url).update(timestamp=link.timestamp)
-    #     unique_timestamps[link.timestamp] = link
-
-    # return unique_timestamps.values()
     return unique_urls.values()
 
 
@@ -245,11 +236,7 @@ def write_main_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool=
             os.chmod(os.path.join(out_dir, SQL_INDEX_FILENAME), int(OUTPUT_PERMISSIONS, base=8)) # set here because we don't write it with atomic writes
 
         if finished:
-            with timed_index_update(os.path.join(out_dir, JSON_INDEX_FILENAME)):
-                write_json_main_index(links, out_dir=out_dir)
-
-            with timed_index_update(os.path.join(out_dir, HTML_INDEX_FILENAME)):
-                write_html_main_index(links, out_dir=out_dir, finished=finished)
+            write_static_index(links, out_dir=out_dir)
     except (KeyboardInterrupt, SystemExit):
         stderr('[!] Warning: Still writing index to disk...', color='lightyellow')
         stderr('    Run archivebox init to fix any inconsisntencies from an ungraceful exit.')
@@ -260,7 +247,14 @@ def write_main_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool=
 
     log_indexing_process_finished()
 
+@enforce_types
+def write_static_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:
+    with timed_index_update(os.path.join(out_dir, JSON_INDEX_FILENAME)):
+        write_json_main_index(links)
+    with timed_index_update(os.path.join(out_dir, HTML_INDEX_FILENAME)):
+        write_html_main_index(links, out_dir=out_dir, finished=True)
 
+@enforce_types
 def get_empty_snapshot_queryset(out_dir: str=OUTPUT_DIR):
     setup_django(out_dir, check_db=True)
     from core.models import Snapshot
@@ -306,27 +300,47 @@ def parse_links_from_source(source_path: str, root_url: Optional[str]=None) -> T
 
     return new_links
 
-
 @enforce_types
-def dedupe_links(existing_links: List[Link],
-                 new_links: List[Link]) -> Tuple[List[Link], List[Link]]:
+def fix_duplicate_links_in_index(snapshots: QuerySet, links: Iterable[Link]) -> Iterable[Link]:
+    """
+    Look for urls in the index, and merge them too
+    """
+    unique_urls: OrderedDict[str, Link] = OrderedDict()
+
+    for link in links:
+        index_link = snapshots.filter(url=link.url)
+        if index_link:
+            link = merge_links(index_link[0].as_link(), link)
 
+        unique_urls[link.url] = link
+
+    return unique_urls.values()
+
+@enforce_types
+def dedupe_links(snapshots: QuerySet,
+                 new_links: List[Link]) -> List[Link]:
+    """
+    The validation of links happened at a different stage. This method will
+    focus on actual deduplication and timestamp fixing.
+    """
+    
     # merge existing links in out_dir and new links
-    all_links = validate_links(existing_links + new_links)
-    all_link_urls = {link.url for link in existing_links}
+    dedup_links = fix_duplicate_links_in_index(snapshots, new_links)
 
     new_links = [
         link for link in new_links
-        if link.url not in all_link_urls
+        if not snapshots.filter(url=link.url).exists()
     ]
 
-    all_links_deduped = {link.url: link for link in all_links}
+    dedup_links_dict = {link.url: link for link in dedup_links}
+
+    # Replace links in new_links with the dedup version
     for i in range(len(new_links)):
-        if new_links[i].url in all_links_deduped.keys():
-            new_links[i] = all_links_deduped[new_links[i].url]
+        if new_links[i].url in dedup_links_dict.keys():
+            new_links[i] = dedup_links_dict[new_links[i].url]
     log_deduping_finished(len(new_links))
 
-    return all_links, new_links
+    return new_links
 
 ### Link Details Index
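
Taken together, these hunks replace the old list-based merge with a QuerySet-backed dedupe: dedupe_links() now takes the Snapshot QuerySet plus freshly parsed links and returns only the links whose URLs are not already indexed (merging any that collide via fix_duplicate_links_in_index), while write_static_index() regenerates the JSON and HTML indexes on demand. A rough sketch of the intended flow, assuming an initialized data folder, a hypothetical sources/bookmarks.html import file, and that parse_links_from_source() returns the parsed Link list as its use in main.py suggests:

from archivebox.index import (
    load_main_index,
    parse_links_from_source,
    dedupe_links,
    write_main_index,
    write_static_index,
)

snapshots = load_main_index(out_dir='.')                        # Snapshot QuerySet
imported = parse_links_from_source('sources/bookmarks.html')    # hypothetical source file

# Only links not already present in the SQL index come back; links that
# collide with an existing index row are merged into it first.
new_links = dedupe_links(snapshots, imported)

write_main_index(links=new_links, out_dir='.', finished=not new_links)
write_static_index([s.as_link() for s in load_main_index(out_dir='.')], out_dir='.')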
 

+ 4 - 2
archivebox/index/sql.py

@@ -40,9 +40,11 @@ def write_sql_main_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:
         for link in links:
             info = {k: v for k, v in link._asdict().items() if k in Snapshot.keys}
             try:
-                info['timestamp'] = Snapshot.objects.get(url=link.url).timestamp
+                info["timestamp"] = Snapshot.objects.get(url=link.url).timestamp
             except Snapshot.DoesNotExist:
-                pass
+                while Snapshot.objects.filter(timestamp=info["timestamp"]).exists():
+                    info["timestamp"] = str(float(info["timestamp"]) + 1.0)
+
             Snapshot.objects.update_or_create(url=link.url, defaults=info)
 
 @enforce_types
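
The sql.py change stops silently reusing a colliding timestamp for links that are not yet in the database: it bumps the timestamp by one second until it is unique. The same logic in isolation, with a plain set standing in for Snapshot.objects.filter(timestamp=...).exists() (illustrative only, not ArchiveBox API):

def uniquify_timestamp(ts: str, taken: set) -> str:
    # Mirror of the while-loop added to write_sql_main_index(): bump by
    # 1.0 second at a time until no existing snapshot uses the timestamp.
    while ts in taken:
        ts = str(float(ts) + 1.0)
    return ts

taken = {'1590000000.0', '1590000001.0'}
print(uniquify_timestamp('1590000000.0', taken))   # -> 1590000002.0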

+ 7 - 7
archivebox/main.py

@@ -30,6 +30,7 @@ from .index import (
     parse_links_from_source,
     dedupe_links,
     write_main_index,
+    write_static_index,
     link_matches_filter,
     get_indexed_folders,
     get_archived_folders,
@@ -520,7 +521,7 @@ def add(urls: Union[str, List[str]],
     check_data_folder(out_dir=out_dir)
     check_dependencies()
     new_links: List[Link] = []
-    all_links = [x.as_link() for x in load_main_index(out_dir=out_dir)]
+    all_links = load_main_index(out_dir=out_dir)
 
     log_importing_started(urls=urls, depth=depth, index_only=index_only)
     if isinstance(urls, str):
@@ -541,8 +542,10 @@ def add(urls: Union[str, List[str]],
             new_links_depth += parse_links_from_source(downloaded_file, root_url=new_link.url)
 
     imported_links = list({link.url: link for link in (new_links + new_links_depth)}.values())
-    all_links, new_links = dedupe_links(all_links, imported_links)
-    write_main_index(links=all_links, out_dir=out_dir, finished=not new_links)
+    new_links = dedupe_links(all_links, imported_links)
+
+    write_main_index(links=new_links, out_dir=out_dir, finished=not new_links)
+    all_links = load_main_index(out_dir=out_dir)
 
     if index_only:
         return all_links
@@ -555,12 +558,9 @@ def add(urls: Union[str, List[str]],
     elif new_links:
         archive_links(new_links, overwrite=False, out_dir=out_dir)
     else:
-        # nothing was updated, don't bother re-saving the index
         return all_links
 
-    # Step 4: Re-write links index with updated titles, icons, and resources
-    all_links = [x.as_link() for x in load_main_index(out_dir=out_dir)]
-    write_main_index(links=list(all_links), out_dir=out_dir, finished=True)
+    write_static_index([link.as_link() for link in all_links], out_dir=out_dir)
     return all_links
 
 @enforce_types
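
With this refactor, add() deduplicates imported links against the Snapshot QuerySet directly and returns the reloaded QuerySet from load_main_index() instead of a list of Link objects. A hedged usage sketch (keyword names are taken from the surrounding diff context; run inside an initialized ArchiveBox data folder):

from archivebox.main import add

snapshots = add('https://example.com', index_only=True, out_dir='.')   # placeholder URL

# The return value is a Snapshot QuerySet, so rows are converted on demand.
for snapshot in snapshots.iterator():
    link = snapshot.as_link()
    print(link.timestamp, link.url)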