Browse Source

Merge pull request #448 from pirate/skip-invalid-urls

Skip invalid URLs when archiving
Nick Sweeting 5 years ago
parent
commit
09ad3a5303
1 changed file with 5 additions and 0 deletions
  1. 5 0
      archivebox/index/__init__.py

+ 5 - 0
archivebox/index/__init__.py

@@ -9,6 +9,7 @@ from itertools import chain
 from typing import List, Tuple, Dict, Optional, Iterable
 from typing import List, Tuple, Dict, Optional, Iterable
 from collections import OrderedDict
 from collections import OrderedDict
 from contextlib import contextmanager
 from contextlib import contextmanager
+from urllib.parse import urlparse
 
 
 from ..system import atomic_write
 from ..system import atomic_write
 from ..util import (
 from ..util import (
@@ -139,6 +140,10 @@ def validate_links(links: Iterable[Link]) -> List[Link]:
 def archivable_links(links: Iterable[Link]) -> Iterable[Link]:
 def archivable_links(links: Iterable[Link]) -> Iterable[Link]:
     """remove chrome://, about:// or other schemed links that cant be archived"""
     """remove chrome://, about:// or other schemed links that cant be archived"""
     for link in links:
     for link in links:
+        try:
+            urlparse(link.url)
+        except ValueError:
+            continue
         scheme_is_valid = scheme(link.url) in ('http', 'https', 'ftp')
         scheme_is_valid = scheme(link.url) in ('http', 'https', 'ftp')
         not_blacklisted = (not URL_BLACKLIST_PTN.match(link.url)) if URL_BLACKLIST_PTN else True
         not_blacklisted = (not URL_BLACKLIST_PTN.match(link.url)) if URL_BLACKLIST_PTN else True
         if scheme_is_valid and not_blacklisted:
         if scheme_is_valid and not_blacklisted: