瀏覽代碼

skip invalid urls at all stages

Nick Sweeting 5 年之前
父節點
當前提交
225b63b732
共有 1 個文件被更改,包括 5 次插入和 0 次刪除
  1. 5 0
      archivebox/index/__init__.py

+ 5 - 0
archivebox/index/__init__.py

@@ -9,6 +9,7 @@ from itertools import chain
 from typing import List, Tuple, Dict, Optional, Iterable
 from collections import OrderedDict
 from contextlib import contextmanager
+from urllib.parse import urlparse
 
 from ..system import atomic_write
 from ..util import (
@@ -139,6 +140,10 @@ def validate_links(links: Iterable[Link]) -> List[Link]:
 def archivable_links(links: Iterable[Link]) -> Iterable[Link]:
     """remove chrome://, about:// or other schemed links that cant be archived"""
     for link in links:
+        try:
+            urlparse(link.url)
+        except ValueError:
+            continue
         scheme_is_valid = scheme(link.url) in ('http', 'https', 'ftp')
         not_blacklisted = (not URL_BLACKLIST_PTN.match(link.url)) if URL_BLACKLIST_PTN else True
         if scheme_is_valid and not_blacklisted: