Browse Source

only add url-list lines that are real urls

Nick Sweeting 4 years ago
parent
commit
f59b6d4189
1 changed file with 5 additions and 2 deletions
  1. 5 2
      archivebox/parsers/url_list.py

+ 5 - 2
archivebox/parsers/url_list.py

@@ -1,12 +1,15 @@
 __package__ = 'archivebox.parsers'
 __package__ = 'archivebox.parsers'
 __description__ = 'URL list'
 __description__ = 'URL list'
 
 
+import re
+
 from typing import IO, Iterable
 from typing import IO, Iterable
 from datetime import datetime
 from datetime import datetime
 
 
 from ..index.schema import Link
 from ..index.schema import Link
 from ..util import (
 from ..util import (
-    enforce_types
+    enforce_types,
+    URL_REGEX,
 )
 )
 
 
 
 
@@ -17,7 +20,7 @@ def parse_url_list(text_file: IO[str], **_kwargs) -> Iterable[Link]:
     text_file.seek(0)
     text_file.seek(0)
     for line in text_file.readlines():
     for line in text_file.readlines():
         url = line.strip()
         url = line.strip()
-        if not url:
+        if (not url) or not re.findall(URL_REGEX, url):
             continue
             continue
 
 
         yield Link(
         yield Link(