Quellcode durchsuchen

skip invalid urls at all stages

Nick Sweeting vor 5 Jahren
Ursprung
Commit
225b63b732
1 geänderte Datei mit 5 neuen und 0 gelöschten Zeilen
  1. 5 0
      archivebox/index/__init__.py

+ 5 - 0
archivebox/index/__init__.py

@@ -9,6 +9,7 @@ from itertools import chain
 from typing import List, Tuple, Dict, Optional, Iterable
 from collections import OrderedDict
 from contextlib import contextmanager
+from urllib.parse import urlparse
 
 from ..system import atomic_write
 from ..util import (
@@ -139,6 +140,10 @@ def validate_links(links: Iterable[Link]) -> List[Link]:
 def archivable_links(links: Iterable[Link]) -> Iterable[Link]:
     """remove chrome://, about:// or other schemed links that cant be archived"""
     for link in links:
+        try:
+            urlparse(link.url)
+        except ValueError:
+            continue
         scheme_is_valid = scheme(link.url) in ('http', 'https', 'ftp')
         not_blacklisted = (not URL_BLACKLIST_PTN.match(link.url)) if URL_BLACKLIST_PTN else True
         if scheme_is_valid and not_blacklisted: