wip attempt to fix timestamp unique constraint errors

Nick Sweeting committed 5 years ago
commit f18d92570e
2 changed files with 27 additions and 18 deletions
  1. archivebox/index/__init__.py  (+23 -18)
  2. archivebox/index/sql.py  (+4 -0)

archivebox/index/__init__.py  (+23 -18)

@@ -129,7 +129,7 @@ def validate_links(links: Iterable[Link]) -> List[Link]:
     try:
         links = archivable_links(links)  # remove chrome://, about:, mailto: etc.
         links = sorted_links(links)      # deterministically sort the links based on timestamp, url
-        links = uniquefied_links(links)  # merge/dedupe duplicate timestamps & urls
+        links = fix_duplicate_links(links)  # merge/dedupe duplicate timestamps & urls
     finally:
         timer.end()

@@ -144,34 +144,39 @@ def archivable_links(links: Iterable[Link]) -> Iterable[Link]:
             urlparse(link.url)
         except ValueError:
             continue
-        scheme_is_valid = scheme(link.url) in ('http', 'https', 'ftp')
-        not_blacklisted = (not URL_BLACKLIST_PTN.match(link.url)) if URL_BLACKLIST_PTN else True
-        if scheme_is_valid and not_blacklisted:
-            yield link
+        if scheme(link.url) not in ('http', 'https', 'ftp'):
+            continue
+        if URL_BLACKLIST_PTN and URL_BLACKLIST_PTN.search(link.url):
+            continue
+
+        yield link

 @enforce_types
-def uniquefied_links(sorted_links: Iterable[Link]) -> Iterable[Link]:
+def fix_duplicate_links(sorted_links: Iterable[Link]) -> Iterable[Link]:
     """
     ensures that all non-duplicate links have monotonically increasing timestamps
     """
+    from core.models import Snapshot
 
     unique_urls: OrderedDict[str, Link] = OrderedDict()
 
     for link in sorted_links:
-        if link.base_url in unique_urls:
+        if link.url in unique_urls:
             # merge with any other links that share the same url
-            link = merge_links(unique_urls[link.base_url], link)
-        unique_urls[link.base_url] = link
-
-    unique_timestamps: OrderedDict[str, Link] = OrderedDict()
-    for link in unique_urls.values():
-        new_link = link.overwrite(
-            timestamp=lowest_uniq_timestamp(unique_timestamps, link.timestamp),
-        )
-        unique_timestamps[new_link.timestamp] = new_link
-
-    return unique_timestamps.values()
+            link = merge_links(unique_urls[link.url], link)
+        unique_urls[link.url] = link
+
+    # unique_timestamps: OrderedDict[str, Link] = OrderedDict()
+    # for link in unique_urls.values():
+    #     closest_non_duplicate_ts = lowest_uniq_timestamp(unique_timestamps, link.timestamp)
+    #     if closest_non_duplicate_ts != link.timestamp:
+    #         link = link.overwrite(timestamp=closest_non_duplicate_ts)
+    #         Snapshot.objects.filter(url=link.url).update(timestamp=link.timestamp)
+    #     unique_timestamps[link.timestamp] = link
+
+    # return unique_timestamps.values()
+    return unique_urls.values()


 @enforce_types
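
Note that the archivable_links rewrite above is not a pure refactor: besides restructuring the checks into early continues, it switches from URL_BLACKLIST_PTN.match() to URL_BLACKLIST_PTN.search(). Python's re.match() only anchors at the start of the string, so a blacklist pattern targeting a file extension or path segment deeper in the URL would never fire. A quick illustration (the pattern below is a hypothetical example, not ArchiveBox's default):

    import re

    # hypothetical blacklist pattern; ArchiveBox builds URL_BLACKLIST_PTN from user config
    URL_BLACKLIST_PTN = re.compile(r'\.(css|js|ico)(\?.*)?$', re.IGNORECASE)

    url = 'https://example.com/static/app.js'

    print(bool(URL_BLACKLIST_PTN.match(url)))   # False: match() anchors at position 0 only
    print(bool(URL_BLACKLIST_PTN.search(url)))  # True:  search() scans the whole URL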

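The dedupe key change in fix_duplicate_links is also behavioral: keying on base_url (roughly, the URL with its scheme stripped, per ArchiveBox's Link helpers) collapsed links that differ only by scheme, while keying on the full url keeps them distinct. A toy sketch with plain dicts standing in for Link objects (base_url() here is a rough stand-in, not the real property):

    from collections import OrderedDict

    def base_url(url: str) -> str:
        # rough stand-in for Link.base_url: the URL without its scheme
        return url.split('://', 1)[-1]

    links = [
        {'url': 'http://example.com/page',  'timestamp': '1600000000.0'},
        {'url': 'https://example.com/page', 'timestamp': '1600000001.0'},
    ]

    by_base_url = OrderedDict((base_url(l['url']), l) for l in links)
    by_url      = OrderedDict((l['url'], l) for l in links)

    print(len(by_base_url))  # 1: the http:// and https:// snapshots get merged
    print(len(by_url))       # 2: keying on the full url keeps them separate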
archivebox/index/sql.py  (+4 -0)

@@ -39,6 +39,10 @@ def write_sql_main_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:
     with transaction.atomic():
         for link in links:
             info = {k: v for k, v in link._asdict().items() if k in Snapshot.keys}
+            try:
+                info['timestamp'] = Snapshot.objects.get(url=link.url).timestamp
+            except Snapshot.DoesNotExist:
+                pass
             Snapshot.objects.update_or_create(url=link.url, defaults=info)

 @enforce_types
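
The sql.py change is the core of the unique-constraint fix: the timestamp column acts as a unique key in the SQL index, so re-importing a URL whose freshly parsed timestamp differs from the stored one could try to write a value that collides with another row (the in-memory timestamp reconciliation in fix_duplicate_links is commented out above for the same reason). Pre-reading the existing row keeps the stored timestamp authoritative. A minimal sketch of the patched write path via a hypothetical write_snapshots() helper, assuming a configured Django project and only the fields visible in the diff (link objects just need .url and .timestamp attributes):

    from django.db import transaction
    from core.models import Snapshot  # ArchiveBox's Django model, as imported in the diff

    def write_snapshots(links):
        with transaction.atomic():
            for link in links:
                info = {'url': link.url, 'timestamp': link.timestamp}
                try:
                    # keep whatever timestamp is already stored for this URL, so a
                    # re-import never overwrites it with a potentially colliding value
                    info['timestamp'] = Snapshot.objects.get(url=link.url).timestamp
                except Snapshot.DoesNotExist:
                    pass  # first import of this URL: use the incoming timestamp
                Snapshot.objects.update_or_create(url=link.url, defaults=info)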