Explorar el Código

exclude links that are in blacklist

mlazana hace 6 años
padre
commit
4d10568477
Se han modificado 2 ficheros con 18 adiciones y 3 borrados
  1. 2 1
      archivebox/config.py
  2. 16 2
      archivebox/links.py

+ 2 - 1
archivebox/config.py

@@ -47,7 +47,7 @@ WGET_BINARY =            os.getenv('WGET_BINARY',            'wget')
 YOUTUBEDL_BINARY =       os.getenv('YOUTUBEDL_BINARY',       'youtube-dl')
 CHROME_BINARY =          os.getenv('CHROME_BINARY',          None)
 
-URL_BLACKLIST =          os.getenv('URL_BLACKLIST',          '.*youtube.com.*,.*facebook.com/.*,.*.exe') 
+URL_BLACKLIST =          os.getenv('URL_BLACKLIST',          '.*youtube.com.*,.*facebook.com/.*,.*.exe')
 
 try:
     OUTPUT_DIR = os.path.abspath(os.getenv('OUTPUT_DIR'))
@@ -270,6 +270,7 @@ except:
 
 URL_BLACKLIST = re.compile(
     r'(.*\.youtube\.com)|'
+    r'(.*\.facebook\.com)|'
     r'(.*\.amazon\.com)|'
     r'(.*\.reddit\.com)',
     re.IGNORECASE,

+ 16 - 2
archivebox/links.py

@@ -28,13 +28,19 @@ from util import (
     check_links_structure,
 )
 
+from config import (
+    URL_BLACKLIST,
+)
 
 def validate_links(links):
     check_links_structure(links)
     links = archivable_links(links)  # remove chrome://, about:, mailto: etc.
     links = uniquefied_links(links)  # merge/dedupe duplicate timestamps & urls
     links = sorted_links(links)      # deterministically sort the links based on timestamp, url
-
+    links = exclude_links(links)     # exclude links that are in blacklist
+    
+    print(links)
+     
     if not links:
         print('[X] No links found :(')
         raise SystemExit(1)
@@ -42,7 +48,8 @@ def validate_links(links):
     for link in links:
         link['title'] = unescape(link['title'].strip()) if link['title'] else None
         check_link_structure(link)
-
+    
+    print("FINAL LIST", list(links))
     return list(links)
 
 
@@ -115,3 +122,10 @@ def lowest_uniq_timestamp(used_timestamps, timestamp):
         new_timestamp = '{}.{}'.format(timestamp, nonce)
 
     return new_timestamp
+
+def exclude_links(links):
+    """ exclude links that are in blacklist"""
+
+    links = [link for link in links if not URL_BLACKLIST.match(link['url'])]
+
+    return links