瀏覽代碼

Merge branch 'master' into dev

Nick Sweeting 6 年之前
父節點
當前提交
5d0185b6dd

+ 1 - 1
.github/ISSUE_TEMPLATE/bug_report.md

@@ -1,5 +1,5 @@
 ---
 ---
-name: Bug report
+name: 🐞 Bug report
 about: Create a report to help us improve
 about: Create a report to help us improve
 title: ''
 title: ''
 labels: ''
 labels: ''

+ 15 - 0
.github/ISSUE_TEMPLATE/documentation_change.md

@@ -0,0 +1,15 @@
+---
+name: 📑 Documentation change
+about: Submit a suggestion for the Wiki documentation
+title: ''
+labels: ''
+assignees: ''
+
+---
+
+## Wiki Page URL
+
+
+## Suggested Edit
+
+...

+ 1 - 1
.github/ISSUE_TEMPLATE/feature_request.md

@@ -1,5 +1,5 @@
 ---
 ---
-name: Feature request
+name: 💡 Feature request
 about: Suggest an idea for this project
 about: Suggest an idea for this project
 title: ''
 title: ''
 labels: ''
 labels: ''

+ 2 - 0
.github/PULL_REQUEST_TEMPLATE.md

@@ -1,3 +1,5 @@
+**IMPORTANT: Do not submit PRs with only formatting / PEP8 / line length changes; I will close them with great prejudice. The PEP8 checks I don't follow are intentional. PRs for minor bugfixes, typos, etc. are fine.**
+
 # Summary
 # Summary
 
 
 e.g. This PR fixes ABC or adds the ability to do XYZ...
 e.g. This PR fixes ABC or adds the ability to do XYZ...

+ 12 - 1
archivebox/config.py

@@ -1,12 +1,14 @@
 import os
 import os
+import re
 import sys
 import sys
 import shutil
 import shutil
 
 
-from typing import Optional
+from typing import Optional, Pattern
 from subprocess import run, PIPE, DEVNULL
 from subprocess import run, PIPE, DEVNULL
 
 
 
 
 OUTPUT_DIR: str
 OUTPUT_DIR: str
+URL_BLACKLIST: Optional[Pattern[str]]
 
 
 # ******************************************************************************
 # ******************************************************************************
 # Documentation: https://github.com/pirate/ArchiveBox/wiki/Configuration
 # Documentation: https://github.com/pirate/ArchiveBox/wiki/Configuration
@@ -24,6 +26,7 @@ TIMEOUT =                int(os.getenv('TIMEOUT',            '60'))
 MEDIA_TIMEOUT =          int(os.getenv('MEDIA_TIMEOUT',      '3600'))
 MEDIA_TIMEOUT =          int(os.getenv('MEDIA_TIMEOUT',      '3600'))
 OUTPUT_PERMISSIONS =     os.getenv('OUTPUT_PERMISSIONS',     '755'              )
 OUTPUT_PERMISSIONS =     os.getenv('OUTPUT_PERMISSIONS',     '755'              )
 FOOTER_INFO =            os.getenv('FOOTER_INFO',            'Content is hosted for personal archiving purposes only.  Contact server owner for any takedown requests.',)
 FOOTER_INFO =            os.getenv('FOOTER_INFO',            'Content is hosted for personal archiving purposes only.  Contact server owner for any takedown requests.',)
+URL_BLACKLIST =          os.getenv('URL_BLACKLIST',          None)
 
 
 FETCH_WGET =             os.getenv('FETCH_WGET',             'True'             ).lower() == 'true'
 FETCH_WGET =             os.getenv('FETCH_WGET',             'True'             ).lower() == 'true'
 FETCH_WGET_REQUISITES =  os.getenv('FETCH_WGET_REQUISITES',  'True'             ).lower() == 'true'
 FETCH_WGET_REQUISITES =  os.getenv('FETCH_WGET_REQUISITES',  'True'             ).lower() == 'true'
@@ -58,6 +61,11 @@ CHROME_BINARY =          os.getenv('CHROME_BINARY',          None)
 
 
 CHROME_SANDBOX =         os.getenv('CHROME_SANDBOX', 'True').lower() == 'true'
 CHROME_SANDBOX =         os.getenv('CHROME_SANDBOX', 'True').lower() == 'true'
 
 
+try:
+    OUTPUT_DIR = os.path.abspath(os.getenv('OUTPUT_DIR'))
+except Exception:
+    OUTPUT_DIR = None
+
 # ******************************************************************************
 # ******************************************************************************
 
 
 ### Terminal Configuration
 ### Terminal Configuration
@@ -95,6 +103,9 @@ TEMPLATES_DIR = os.path.join(PYTHON_DIR, 'templates')
 if COOKIES_FILE:
 if COOKIES_FILE:
     COOKIES_FILE = os.path.abspath(COOKIES_FILE)
     COOKIES_FILE = os.path.abspath(COOKIES_FILE)
 
 
+URL_BLACKLIST = URL_BLACKLIST and re.compile(URL_BLACKLIST, re.IGNORECASE)
+
+########################### Environment & Dependencies #########################
 
 
 VERSION = open(os.path.join(PYTHON_DIR, 'VERSION'), 'r').read().strip()
 VERSION = open(os.path.join(PYTHON_DIR, 'VERSION'), 'r').read().strip()
 GIT_SHA = VERSION.split('+')[1]
 GIT_SHA = VERSION.split('+')[1]

+ 10 - 5
archivebox/links.py

@@ -8,6 +8,9 @@ from .util import (
     merge_links,
     merge_links,
 )
 )
 
 
+from config import (
+    URL_BLACKLIST,
+)
 
 
 def validate_links(links: Iterable[Link]) -> Iterable[Link]:
 def validate_links(links: Iterable[Link]) -> Iterable[Link]:
     links = archivable_links(links)  # remove chrome://, about:, mailto: etc.
     links = archivable_links(links)  # remove chrome://, about:, mailto: etc.
@@ -22,11 +25,11 @@ def validate_links(links: Iterable[Link]) -> Iterable[Link]:
 
 
 def archivable_links(links: Iterable[Link]) -> Iterable[Link]:
 def archivable_links(links: Iterable[Link]) -> Iterable[Link]:
     """remove chrome://, about:// or other schemed links that cant be archived"""
     """remove chrome://, about:// or other schemed links that cant be archived"""
-    return (
-        link
-        for link in links
-        if scheme(link.url) in ('http', 'https', 'ftp')
-    )
+    for link in links:
+        scheme_is_valid = scheme(link.url) in ('http', 'https', 'ftp')
+        not_blacklisted = (not URL_BLACKLIST.match(link.url)) if URL_BLACKLIST else True
+        if scheme_is_valid and not_blacklisted:
+            yield link
 
 
 
 
 def uniquefied_links(sorted_links: Iterable[Link]) -> Iterable[Link]:
 def uniquefied_links(sorted_links: Iterable[Link]) -> Iterable[Link]:
@@ -87,3 +90,5 @@ def lowest_uniq_timestamp(used_timestamps: OrderedDict, timestamp: str) -> str:
         new_timestamp = '{}.{}'.format(timestamp, nonce)
         new_timestamp = '{}.{}'.format(timestamp, nonce)
 
 
     return new_timestamp
     return new_timestamp
+    
+