2
0
Эх сурвалжийг харах

accept local paths as valid link URLs when parsing

Nick Sweeting 5 жил өмнө
parent
commit
96b1e4a8ec

+ 17 - 2
archivebox/parsers/generic_txt.py

@@ -5,6 +5,7 @@ import re
 
 
 from typing import IO, Iterable
 from typing import IO, Iterable
 from datetime import datetime
 from datetime import datetime
+from pathlib import Path
 
 
 from ..index.schema import Link
 from ..index.schema import Link
 from ..util import (
 from ..util import (
@@ -13,14 +14,28 @@ from ..util import (
     URL_REGEX
     URL_REGEX
 )
 )
 
 
+
 @enforce_types
 @enforce_types
 def parse_generic_txt_export(text_file: IO[str]) -> Iterable[Link]:
 def parse_generic_txt_export(text_file: IO[str]) -> Iterable[Link]:
     """Parse raw links from each line in a text file"""
     """Parse raw links from each line in a text file"""
 
 
     text_file.seek(0)
     text_file.seek(0)
     for line in text_file.readlines():
     for line in text_file.readlines():
-        urls = re.findall(URL_REGEX, line) if line.strip() else ()
-        for url in urls:                                                # type: ignore
+        if not line.strip():
+            continue
+
+        # if the line is a local file path that resolves, then we can archive it
+        if Path(line).exists():
+            yield Link(
+                url=line,
+                timestamp=str(datetime.now().timestamp()),
+                title=None,
+                tags=None,
+                sources=[text_file.name],
+            )
+
+        # then look for anything that looks like a URL in the line (this also runs when the line matched a local path above)
+        for url in re.findall(URL_REGEX, line):
             yield Link(
             yield Link(
                 url=htmldecode(url),
                 url=htmldecode(url),
                 timestamp=str(datetime.now().timestamp()),
                 timestamp=str(datetime.now().timestamp()),