浏览代码

support negation patterns by checking both re.search and re.match

Nick Sweeting 4 年之前
父节点
当前提交
e4974d3536
共有 1 个文件被更改，包括 4 次插入和 1 次删除
  1. 4 1
      archivebox/index/__init__.py

+ 4 - 1
archivebox/index/__init__.py

@@ -141,7 +141,10 @@ def archivable_links(links: Iterable[Link]) -> Iterable[Link]:
             continue
         if scheme(link.url) not in ('http', 'https', 'ftp'):
             continue
-        if URL_BLACKLIST_PTN and URL_BLACKLIST_PTN.search(link.url):
+        if URL_BLACKLIST_PTN and (URL_BLACKLIST_PTN.match(link.url) or URL_BLACKLIST_PTN.search(link.url)):
+            # https://stackoverflow.com/questions/180986/what-is-the-difference-between-re-search-and-re-match
+            # we want both behaviors in order to support multiple patterns in the regex,
+            # and negation regexes like (?!someptnhere) to allow for whitelisting
             continue
 
         yield link