
replace uses of URL_REGEX with find_all_urls to handle markdown better

Nick Sweeting · 1 year ago · commit beb3932d80
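
For context, the body of find_all_urls is not shown in this diff (it lives in archivebox/util.py alongside URL_REGEX). Below is a minimal sketch of the shape such a helper plausibly has, assuming it wraps a lookahead-based URL regex and trims markdown leftovers from each match; SIMPLE_URL_REGEX and the function body are illustrative stand-ins, not the actual implementation (the real URL_REGEX is stricter), though fix_url_from_markdown is a real helper asserted in util.py further down:

    import re
    from archivebox.util import fix_url_from_markdown

    # simplified stand-in for URL_REGEX: the (?=(...)) lookahead makes matches
    # overlap, so URLs nested inside other URLs (e.g. wayback links) are found too
    SIMPLE_URL_REGEX = re.compile(r'(?=(https?://[^\s<>"\'\]\[]{2,}))', re.IGNORECASE)

    def find_all_urls(text: str):
        for match in SIMPLE_URL_REGEX.finditer(text):
            yield fix_url_from_markdown(match.group(1))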

archivebox/parsers/__init__.py (+1, -52)

@@ -28,7 +28,7 @@ from ..util import (
     htmldecode,
     download_url,
     enforce_types,
-    URL_REGEX,
+    find_all_urls,
 )
 from ..index.schema import Link
 from ..logging_util import TimedProgress, log_source_saved
@@ -202,54 +202,3 @@ def save_file_as_source(path: str, timeout: int=TIMEOUT, filename: str='{ts}-{ba
     log_source_saved(source_file=source_path)
 
     return source_path
-
-
-# Check that plain text regex URL parsing works as expected
-#   this is last-line-of-defense to make sure the URL_REGEX isn't
-#   misbehaving due to some OS-level or environment level quirks (e.g. bad regex lib)
-#   the consequences of bad URL parsing could be disastrous and lead to many
-#   incorrect/badly parsed links being added to the archive, so this is worth the cost of checking
-_test_url_strs = {
-    'example.com': 0,
-    '/example.com': 0,
-    '//example.com': 0,
-    ':/example.com': 0,
-    '://example.com': 0,
-    'htt://example8.com': 0,
-    '/htt://example.com': 0,
-    'https://example': 1,
-    'https://localhost/2345': 1,
-    'https://localhost:1234/123': 1,
-    '://': 0,
-    'https://': 0,
-    'http://': 0,
-    'ftp://': 0,
-    'ftp://example.com': 0,
-    'https://example.com': 1,
-    'https://example.com/': 1,
-    'https://a.example.com': 1,
-    'https://a.example.com/': 1,
-    'https://a.example.com/what/is/happening.html': 1,
-    'https://a.example.com/what/ís/happening.html': 1,
-    'https://a.example.com/what/is/happening.html?what=1&2%20b#höw-about-this=1a': 1,
-    'https://a.example.com/what/is/happéning/?what=1&2%20b#how-aboüt-this=1a': 1,
-    'HTtpS://a.example.com/what/is/happening/?what=1&2%20b#how-about-this=1af&2f%20b': 1,
-    'https://example.com/?what=1#how-about-this=1&2%20baf': 1,
-    'https://example.com?what=1#how-about-this=1&2%20baf': 1,
-    '<test>http://example7.com</test>': 1,
-    'https://<test>': 0,
-    'https://[test]': 0,
-    'http://"test"': 0,
-    'http://\'test\'': 0,
-    '[https://example8.com/what/is/this.php?what=1]': 1,
-    '[and http://example9.com?what=1&other=3#and-thing=2]': 1,
-    '<what>https://example10.com#and-thing=2 "</about>': 1,
-    'abc<this["https://example11.com/what/is#and-thing=2?whoami=23&where=1"]that>def': 1,
-    'sdflkf[what](https://example12.com/who/what.php?whoami=1#whatami=2)?am=hi': 1,
-    '<or>http://examplehttp://15.badc</that>': 2,
-    'https://a.example.com/one.html?url=http://example.com/inside/of/another?=http://': 2,
-    '[https://a.example.com/one.html?url=http://example.com/inside/of/another?=](http://a.example.com)': 3,
-}
-for url_str, num_urls in _test_url_strs.items():
-    assert len(re.findall(URL_REGEX, url_str)) == num_urls, (
-        f'{url_str} does not contain {num_urls} urls')

archivebox/parsers/generic_html.py (+4, -3)

@@ -10,7 +10,7 @@ from ..index.schema import Link
 from ..util import (
     htmldecode,
     enforce_types,
-    URL_REGEX,
+    find_all_urls,
 )
 from html.parser import HTMLParser
 from urllib.parse import urljoin
@@ -42,8 +42,9 @@ def parse_generic_html_export(html_file: IO[str], root_url: Optional[str]=None,
             if root_url:
                 # resolve relative urls /home.html -> https://example.com/home.html
                 url = urljoin(root_url, url)
-            
-            for archivable_url in re.findall(URL_REGEX, url):
+                # TODO: fix double // getting stripped by urljoin bug https://github.com/python/cpython/issues/96015
+
+            for archivable_url in find_all_urls(url):
                 yield Link(
                     url=htmldecode(archivable_url),
                     timestamp=str(datetime.now(timezone.utc).timestamp()),
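
For reference, the urljoin call above does standard relative-reference resolution; a quick illustration using only the stdlib, nothing archivebox-specific:

    from urllib.parse import urljoin

    urljoin('https://example.com/posts/index.html', '/home.html')
    # -> 'https://example.com/home.html'        (absolute path replaces the old path)
    urljoin('https://example.com/posts/index.html', 'home.html')
    # -> 'https://example.com/posts/home.html'  (relative path resolves against the dir)
    urljoin('https://example.com/', '//other.example.com/x')
    # -> 'https://other.example.com/x'          ('//...' is scheme-relative; the TODO's
    #                                            cpython issue tracks a related '//' quirk)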

archivebox/parsers/generic_txt.py (+2, -13)

@@ -11,7 +11,7 @@ from ..index.schema import Link
 from ..util import (
     htmldecode,
     enforce_types,
-    URL_REGEX
+    find_all_urls,
 )
 
 
@@ -39,7 +39,7 @@ def parse_generic_txt_export(text_file: IO[str], **_kwargs) -> Iterable[Link]:
             pass
 
         # otherwise look for anything that looks like a URL in the line
-        for url in re.findall(URL_REGEX, line):
+        for url in find_all_urls(line):
             yield Link(
                 url=htmldecode(url),
                 timestamp=str(datetime.now(timezone.utc).timestamp()),
@@ -48,17 +48,6 @@ def parse_generic_txt_export(text_file: IO[str], **_kwargs) -> Iterable[Link]:
                 sources=[text_file.name],
             )
 
-            # look inside the URL for any sub-urls, e.g. for archive.org links
-            # https://web.archive.org/web/20200531203453/https://www.reddit.com/r/socialism/comments/gu24ke/nypd_officers_claim_they_are_protecting_the_rule/fsfq0sw/
-            # -> https://www.reddit.com/r/socialism/comments/gu24ke/nypd_officers_claim_they_are_protecting_the_rule/fsfq0sw/
-            for sub_url in re.findall(URL_REGEX, line[1:]):
-                yield Link(
-                    url=htmldecode(sub_url),
-                    timestamp=str(datetime.now(timezone.utc).timestamp()),
-                    title=None,
-                    tags=None,
-                    sources=[text_file.name],
-                )
 
 KEY = 'txt'
 NAME = 'Generic TXT'
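
The sub-url pass deleted above was a workaround for re.findall's non-overlapping matches: the outer wayback URL consumed the nested one, so the parser re-scanned line[1:] to catch it. Assuming find_all_urls matches through a lookahead (as the nested-URL test cases added to util.py below imply), overlapping matches now come for free:

    # hypothetical demo, assuming find_all_urls yields overlapping matches:
    line = 'https://web.archive.org/web/20200531203453/https://www.reddit.com/r/socialism/'
    list(find_all_urls(line))
    # -> ['https://web.archive.org/web/20200531203453/https://www.reddit.com/r/socialism/',
    #     'https://www.reddit.com/r/socialism/']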

archivebox/util.py (+53, -3)

@@ -439,9 +439,12 @@ class ExtendedEncoder(pyjson.JSONEncoder):
 
 
 ### URL PARSING TESTS / ASSERTIONS
-# they run at runtime because I like having them inline in this file,
-# I like the peace of mind knowing it's enforced at runtime across all OS's (in case the regex engine ever has any weird locale-specific quirks),
-# and these assertions are basically instant, so not a big performance cost to do it on startup
+
+# Check that plain text regex URL parsing works as expected
+#   this is last-line-of-defense to make sure the URL_REGEX isn't
+#   misbehaving due to some OS-level or environment level quirks (e.g. regex engine / cpython / locale differences)
+#   the consequences of bad URL parsing could be disastrous and lead to many
+#   incorrect/badly parsed links being added to the archive, so this is worth the cost of checking
 
 assert fix_url_from_markdown('/a(b)c).x(y)z') == '/a(b)c'
 assert fix_url_from_markdown('https://wikipedia.org/en/some_article_(Disambiguation).html?abc=def).link(with)_trailingtext') == 'https://wikipedia.org/en/some_article_(Disambiguation).html?abc=def'
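
fix_url_from_markdown is also not shown in this diff; here is a hypothetical sketch that satisfies the two assertions above (the real implementation may differ):

    def fix_url_from_markdown(url_str: str) -> str:
        # cut at the first ')' with no matching '(' inside the URL, i.e. the
        # closing paren of a markdown [text](url) wrapper; balanced parens
        # that are genuinely part of the URL are kept
        depth = 0
        for i, ch in enumerate(url_str):
            if ch == '(':
                depth += 1
            elif ch == ')':
                if depth == 0:
                    return url_str[:i]
                depth -= 1
        return url_str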
@@ -482,3 +485,50 @@ URL_REGEX_TESTS = [
 for urls_str, expected_url_matches in URL_REGEX_TESTS:
     url_matches = list(find_all_urls(urls_str))
     assert url_matches == expected_url_matches, 'FAILED URL_REGEX CHECK!'
+
+
+# More test cases
+_test_url_strs = {
+    'example.com': 0,
+    '/example.com': 0,
+    '//example.com': 0,
+    ':/example.com': 0,
+    '://example.com': 0,
+    'htt://example8.com': 0,
+    '/htt://example.com': 0,
+    'https://example': 1,
+    'https://localhost/2345': 1,
+    'https://localhost:1234/123': 1,
+    '://': 0,
+    'https://': 0,
+    'http://': 0,
+    'ftp://': 0,
+    'ftp://example.com': 0,
+    'https://example.com': 1,
+    'https://example.com/': 1,
+    'https://a.example.com': 1,
+    'https://a.example.com/': 1,
+    'https://a.example.com/what/is/happening.html': 1,
+    'https://a.example.com/what/ís/happening.html': 1,
+    'https://a.example.com/what/is/happening.html?what=1&2%20b#höw-about-this=1a': 1,
+    'https://a.example.com/what/is/happéning/?what=1&2%20b#how-aboüt-this=1a': 1,
+    'HTtpS://a.example.com/what/is/happening/?what=1&2%20b#how-about-this=1af&2f%20b': 1,
+    'https://example.com/?what=1#how-about-this=1&2%20baf': 1,
+    'https://example.com?what=1#how-about-this=1&2%20baf': 1,
+    '<test>http://example7.com</test>': 1,
+    'https://<test>': 0,
+    'https://[test]': 0,
+    'http://"test"': 0,
+    'http://\'test\'': 0,
+    '[https://example8.com/what/is/this.php?what=1]': 1,
+    '[and http://example9.com?what=1&other=3#and-thing=2]': 1,
+    '<what>https://example10.com#and-thing=2 "</about>': 1,
+    'abc<this["https://example11.com/what/is#and-thing=2?whoami=23&where=1"]that>def': 1,
+    'sdflkf[what](https://example12.com/who/what.php?whoami=1#whatami=2)?am=hi': 1,
+    '<or>http://examplehttp://15.badc</that>': 2,
+    'https://a.example.com/one.html?url=http://example.com/inside/of/another?=http://': 2,
+    '[https://a.example.com/one.html?url=http://example.com/inside/of/another?=](http://a.example.com)': 3,
+}
+for url_str, num_urls in _test_url_strs.items():
+    assert len(list(find_all_urls(url_str))) == num_urls, (
+        f'{url_str} does not contain {num_urls} urls')
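
Taken together, this is what "handle markdown better" in the commit message buys: balanced parens inside a URL survive, while trailing markdown link syntax is trimmed. A hypothetical example, assuming the sketches above:

    line = 'see [docs](https://example.com/docs_(v2)) for details'
    list(find_all_urls(line))
    # -> ['https://example.com/docs_(v2)']
    # the markdown link's closing ')' is trimmed; the balanced '(v2)' is kept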