瀏覽代碼

Merge pull request #1205 from overhacked/fix_url_regex_hyphen

Nick Sweeting 2 年之前
父節點
當前提交
a7d7644dca
共有 2 個文件被更改,包括 5 次插入和 1 次删除
  1. 4 0
      archivebox/parsers/__init__.py
  2. 1 1
      archivebox/util.py

+ 4 - 0
archivebox/parsers/__init__.py

@@ -233,6 +233,10 @@ _test_url_strs = {
     'https://example.com/?what=1#how-about-this=1&2%20baf': 1,
     'https://example.com?what=1#how-about-this=1&2%20baf': 1,
     '<test>http://example7.com</test>': 1,
+    'https://<test>': 0,
+    'https://[test]': 0,
+    'http://"test"': 0,
+    'http://\'test\'': 0,
     '[https://example8.com/what/is/this.php?what=1]': 1,
     '[and http://example9.com?what=1&other=3#and-thing=2]': 1,
     '<what>https://example10.com#and-thing=2 "</about>': 1,

+ 1 - 1
archivebox/util.py

@@ -59,7 +59,7 @@ URL_REGEX = re.compile(
     r'(?=('
     r'http[s]?://'                    # start matching from allowed schemes
     r'(?:[a-zA-Z]|[0-9]'              # followed by allowed alphanum characters
-    r'|[$-_@.&+]|[!*\(\),]'           #    or allowed symbols
+    r'|[-_$@.&+!*\(\),]'           #    or allowed symbols (keep hyphen first to match literal hyphen)
     r'|(?:%[0-9a-fA-F][0-9a-fA-F]))'  #    or allowed unicode bytes
     r'[^\]\[\(\)<>"\'\s]+'          # stop parsing at these symbols
     r'))',