
feat: Refactor add method to use querysets

Cristian committed 5 years ago
commit be520d137a
4 changed files with 71 additions and 44 deletions
  1. archivebox/extractors/__init__.py  (+21 -10)
  2. archivebox/index/__init__.py  (+39 -25)
  3. archivebox/index/sql.py  (+4 -2)
  4. archivebox/main.py  (+7 -7)

archivebox/extractors/__init__.py  (+21 -10)

@@ -9,6 +9,7 @@ from ..index.schema import Link
 from ..index import (
     load_link_details,
     write_link_details,
+    write_main_index,
 )
 from ..util import enforce_types
 from ..logging_util import (
@@ -128,24 +129,34 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
 
     return link
 
-
 @enforce_types
-def archive_links(links: List[Link], overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[str]=None) -> List[Link]:
-    if not links:
+def archive_links(all_links: any, overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[str]=None) -> List[Link]:
+
+    if type(all_links) is list:
+        num_links: int = len(all_links)
+        get_link = lambda x: x
+        get_iter = lambda x: x
+    else:
+        num_links: int = all_links.count()
+        get_link = lambda x: x.as_link()
+        get_iter = lambda x: x.iterator()
+
+    if num_links == 0:
         return []
 
-    log_archiving_started(len(links))
+    log_archiving_started(num_links)
     idx: int = 0
-    link: Link = links[0]
     try:
-        for idx, link in enumerate(links):
-            archive_link(link, overwrite=overwrite, methods=methods, out_dir=link.link_dir)
+        for link in get_iter(all_links):
+            idx += 1
+            to_archive = get_link(link)
+            archive_link(to_archive, overwrite=overwrite, methods=methods, out_dir=link.link_dir)
     except KeyboardInterrupt:
-        log_archiving_paused(len(links), idx, link.timestamp)
+        log_archiving_paused(num_links, idx, link.timestamp)
         raise SystemExit(0)
     except BaseException:
         print()
         raise
 
-    log_archiving_finished(len(links))
-    return links
+    log_archiving_finished(num_links)
+    return all_links
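
Usage sketch (not part of the diff): with this change archive_links() accepts either a plain list of Link objects or a Snapshot QuerySet, and picks the matching count/iteration/conversion helpers at runtime. The snippet below is a hedged illustration; `pending_links` and the surrounding Django setup are assumptions, not code from this commit.

    # Assumes setup_django() has already been run for the data dir,
    # so that core.models is importable.
    from archivebox.extractors import archive_links
    from core.models import Snapshot

    # 1) Old-style call with a list of Link objects (hypothetical `pending_links`):
    #    len() sizes the run and each item is archived as-is.
    archive_links(pending_links, overwrite=False)

    # 2) New-style call with a QuerySet: .count() sizes the run, .iterator()
    #    streams rows, and each row is converted via .as_link() before being
    #    handed to archive_link().
    archive_links(Snapshot.objects.all(), overwrite=False)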

archivebox/index/__init__.py  (+39 -25)

@@ -11,6 +11,7 @@ from typing import List, Tuple, Dict, Optional, Iterable
 from collections import OrderedDict
 from contextlib import contextmanager
 from urllib.parse import urlparse
+from django.db.models import QuerySet
 
 from ..util import (
     scheme,
@@ -133,7 +134,6 @@ def validate_links(links: Iterable[Link]) -> List[Link]:
 
     return list(links)
 
-
 @enforce_types
 def archivable_links(links: Iterable[Link]) -> Iterable[Link]:
     """remove chrome://, about:// or other schemed links that cant be archived"""
@@ -165,15 +165,6 @@ def fix_duplicate_links(sorted_links: Iterable[Link]) -> Iterable[Link]:
             link = merge_links(unique_urls[link.url], link)
         unique_urls[link.url] = link
 
-    # unique_timestamps: OrderedDict[str, Link] = OrderedDict()
-    # for link in unique_urls.values():
-    #     closest_non_duplicate_ts = lowest_uniq_timestamp(unique_timestamps, link.timestamp)
-    #     if closest_non_duplicate_ts != link.timestamp:
-    #         link = link.overwrite(timestamp=closest_non_duplicate_ts)
-    #         Snapshot.objects.filter(url=link.url).update(timestamp=link.timestamp)
-    #     unique_timestamps[link.timestamp] = link
-
-    # return unique_timestamps.values()
     return unique_urls.values()
 
 
@@ -245,11 +236,7 @@ def write_main_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool=
             os.chmod(os.path.join(out_dir, SQL_INDEX_FILENAME), int(OUTPUT_PERMISSIONS, base=8)) # set here because we don't write it with atomic writes
 
         if finished:
-            with timed_index_update(os.path.join(out_dir, JSON_INDEX_FILENAME)):
-                write_json_main_index(links, out_dir=out_dir)
-
-            with timed_index_update(os.path.join(out_dir, HTML_INDEX_FILENAME)):
-                write_html_main_index(links, out_dir=out_dir, finished=finished)
+            write_static_index(links, out_dir=out_dir)
     except (KeyboardInterrupt, SystemExit):
         stderr('[!] Warning: Still writing index to disk...', color='lightyellow')
         stderr('    Run archivebox init to fix any inconsisntencies from an ungraceful exit.')
@@ -260,7 +247,14 @@ def write_main_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool=
 
     log_indexing_process_finished()
 
+@enforce_types
+def write_static_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:
+    with timed_index_update(os.path.join(out_dir, JSON_INDEX_FILENAME)):
+        write_json_main_index(links)
+    with timed_index_update(os.path.join(out_dir, HTML_INDEX_FILENAME)):
+        write_html_main_index(links, out_dir=out_dir, finished=True)
 
+@enforce_types
 def get_empty_snapshot_queryset(out_dir: str=OUTPUT_DIR):
     setup_django(out_dir, check_db=True)
     from core.models import Snapshot
@@ -306,27 +300,47 @@ def parse_links_from_source(source_path: str, root_url: Optional[str]=None) -> T
 
     return new_links
 
-
 @enforce_types
-def dedupe_links(existing_links: List[Link],
-                 new_links: List[Link]) -> Tuple[List[Link], List[Link]]:
+def fix_duplicate_links_in_index(snapshots: QuerySet, links: Iterable[Link]) -> Iterable[Link]:
+    """
+    Look for urls in the index, and merge them too
+    """
+    unique_urls: OrderedDict[str, Link] = OrderedDict()
+
+    for link in links:
+        index_link = snapshots.filter(url=link.url)
+        if index_link:
+            link = merge_links(index_link[0].as_link(), link)
 
+        unique_urls[link.url] = link
+
+    return unique_urls.values()
+
+@enforce_types
+def dedupe_links(snapshots: QuerySet,
+                 new_links: List[Link]) -> List[Link]:
+    """
+    The validation of links happened at a different stage. This method will
+    focus on actual deduplication and timestamp fixing.
+    """
+    
     # merge existing links in out_dir and new links
-    all_links = validate_links(existing_links + new_links)
-    all_link_urls = {link.url for link in existing_links}
+    dedup_links = fix_duplicate_links_in_index(snapshots, new_links)
 
     new_links = [
         link for link in new_links
-        if link.url not in all_link_urls
+        if not snapshots.filter(url=link.url).exists()
     ]
 
-    all_links_deduped = {link.url: link for link in all_links}
+    dedup_links_dict = {link.url: link for link in dedup_links}
+
+    # Replace links in new_links with the dedup version
     for i in range(len(new_links)):
-        if new_links[i].url in all_links_deduped.keys():
-            new_links[i] = all_links_deduped[new_links[i].url]
+        if new_links[i].url in dedup_links_dict.keys():
+            new_links[i] = dedup_links_dict[new_links[i].url]
     log_deduping_finished(len(new_links))
 
-    return all_links, new_links
+    return new_links
 
 ### Link Details Index
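
Usage sketch (not part of the diff): dedupe_links() now takes the Snapshot QuerySet of the existing index plus only the newly imported links, and URL lookups go through the database instead of an in-memory set of URLs. The call pattern below mirrors add() in main.py; `imported_links` is a hypothetical List[Link] produced elsewhere (e.g. by parse_links_from_source()).

    # Assumes an initialized ArchiveBox data folder; "." stands in for out_dir.
    from archivebox.index import load_main_index, dedupe_links

    snapshots = load_main_index(out_dir=".")             # QuerySet of existing Snapshot rows
    new_links = dedupe_links(snapshots, imported_links)  # imported_links: List[Link] (hypothetical)
    # URLs already in the index are merged via fix_duplicate_links_in_index() and
    # then dropped with snapshots.filter(url=...).exists(); only new URLs remain.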
 
 

archivebox/index/sql.py  (+4 -2)

@@ -40,9 +40,11 @@ def write_sql_main_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:
         for link in links:
             info = {k: v for k, v in link._asdict().items() if k in Snapshot.keys}
             try:
-                info['timestamp'] = Snapshot.objects.get(url=link.url).timestamp
+                info["timestamp"] = Snapshot.objects.get(url=link.url).timestamp
             except Snapshot.DoesNotExist:
-                pass
+                while Snapshot.objects.filter(timestamp=info["timestamp"]).exists():
+                    info["timestamp"] = str(float(info["timestamp"]) + 1.0)
+
             Snapshot.objects.update_or_create(url=link.url, defaults=info)
 
 @enforce_types
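
Note on the timestamp handling above: when no Snapshot exists for the URL, the writer no longer passes silently; it bumps the candidate timestamp until it no longer collides with an existing row, since timestamps double as snapshot directory names and must stay unique. A standalone sketch of that collision-avoidance loop, with a plain set standing in for the Snapshot table:

    from typing import Callable

    def next_free_timestamp(candidate: str, exists: Callable[[str], bool]) -> str:
        """Keep adding 1.0 to the candidate until no existing snapshot uses it."""
        # `exists` stands in for Snapshot.objects.filter(timestamp=...).exists()
        while exists(candidate):
            candidate = str(float(candidate) + 1.0)
        return candidate

    taken = {"1600000000.0", "1600000001.0"}
    print(next_free_timestamp("1600000000.0", lambda ts: ts in taken))  # -> "1600000002.0"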

archivebox/main.py  (+7 -7)

@@ -30,6 +30,7 @@ from .index import (
     parse_links_from_source,
     dedupe_links,
     write_main_index,
+    write_static_index,
     link_matches_filter,
     get_indexed_folders,
     get_archived_folders,
@@ -520,7 +521,7 @@ def add(urls: Union[str, List[str]],
     check_data_folder(out_dir=out_dir)
     check_dependencies()
     new_links: List[Link] = []
-    all_links = [x.as_link() for x in load_main_index(out_dir=out_dir)]
+    all_links = load_main_index(out_dir=out_dir)
 
     log_importing_started(urls=urls, depth=depth, index_only=index_only)
     if isinstance(urls, str):
@@ -541,8 +542,10 @@ def add(urls: Union[str, List[str]],
             new_links_depth += parse_links_from_source(downloaded_file, root_url=new_link.url)
 
     imported_links = list({link.url: link for link in (new_links + new_links_depth)}.values())
-    all_links, new_links = dedupe_links(all_links, imported_links)
-    write_main_index(links=all_links, out_dir=out_dir, finished=not new_links)
+    new_links = dedupe_links(all_links, imported_links)
+
+    write_main_index(links=new_links, out_dir=out_dir, finished=not new_links)
+    all_links = load_main_index(out_dir=out_dir)
 
     if index_only:
         return all_links
@@ -555,12 +558,9 @@ def add(urls: Union[str, List[str]],
     elif new_links:
         archive_links(new_links, overwrite=False, out_dir=out_dir)
     else:
-        # nothing was updated, don't bother re-saving the index
         return all_links
 
-    # Step 4: Re-write links index with updated titles, icons, and resources
-    all_links = [x.as_link() for x in load_main_index(out_dir=out_dir)]
-    write_main_index(links=list(all_links), out_dir=out_dir, finished=True)
+    write_static_index([link.as_link() for link in all_links], out_dir=out_dir)
     return all_links
 
 @enforce_types
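
Flow sketch (not verbatim project code): after this commit add() keeps the main index in the database as a QuerySet and regenerates the static JSON/HTML index only once at the end. Error handling, depth crawling, and filtering are omitted; `imported_links` is assumed to come from parse_links_from_source().

    from archivebox.index import (
        load_main_index, dedupe_links, write_main_index, write_static_index,
    )
    from archivebox.extractors import archive_links

    out_dir = "."                                           # the archive data folder
    all_links = load_main_index(out_dir=out_dir)            # Snapshot QuerySet
    new_links = dedupe_links(all_links, imported_links)     # keep only URLs not already indexed
    write_main_index(links=new_links, out_dir=out_dir, finished=not new_links)
    all_links = load_main_index(out_dir=out_dir)            # reload, now including the new rows
    archive_links(new_links, overwrite=False, out_dir=out_dir)
    write_static_index([s.as_link() for s in all_links], out_dir=out_dir)  # rebuild JSON + HTML once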