
Merge pull request #1195 from overhacked/method_allow_deny

Nick Sweeting · 2 years ago
commit 720061185c

+ 13 - 5
archivebox/config.py

@@ -90,10 +90,13 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
         'MEDIA_TIMEOUT':            {'type': int,   'default': 3600},
         'OUTPUT_PERMISSIONS':       {'type': str,   'default': '644'},
         'RESTRICT_FILE_NAMES':      {'type': str,   'default': 'windows'},
-        'URL_BLACKLIST':            {'type': str,   'default': r'\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$'},  # to avoid downloading code assets as their own pages
+
+        'URL_DENYLIST':             {'type': str,   'default': r'\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$', 'aliases': ('URL_BLACKLIST',)},  # to avoid downloading code assets as their own pages
+        'URL_ALLOWLIST':            {'type': str,   'default': None, 'aliases': ('URL_WHITELIST',)},
+
         'ADMIN_USERNAME':           {'type': str,   'default': None},
         'ADMIN_PASSWORD':           {'type': str,   'default': None},
-        'URL_WHITELIST':            {'type': str,   'default': None},
+
         'ENFORCE_ATOMIC_WRITES':    {'type': bool,  'default': True},
         'TAG_SEPARATOR_PATTERN':    {'type': str,   'default': r'[,]'},
     },
@@ -144,6 +147,8 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
         'SAVE_GIT':                 {'type': bool,  'default': True, 'aliases': ('FETCH_GIT',)},
         'SAVE_MEDIA':               {'type': bool,  'default': True, 'aliases': ('FETCH_MEDIA',)},
         'SAVE_ARCHIVE_DOT_ORG':     {'type': bool,  'default': True, 'aliases': ('SUBMIT_ARCHIVE_DOT_ORG',)},
+        'SAVE_ALLOWLIST':           {'type': dict,  'default': {},},
+        'SAVE_DENYLIST':            {'type': dict,  'default': {},},
     },

     'ARCHIVE_METHOD_OPTIONS': {
@@ -373,6 +378,8 @@ def get_commit_hash(config):
 ############################## Derived Config ##################################


+ALLOWDENYLIST_REGEX_FLAGS: int = re.IGNORECASE | re.UNICODE | re.MULTILINE
+
 DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
     'TERM_WIDTH':               {'default': lambda c: lambda: shutil.get_terminal_size((100, 10)).columns},
     'USER':                     {'default': lambda c: SYSTEM_USER},
@@ -389,8 +396,8 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
     'CONFIG_FILE':              {'default': lambda c: Path(c['CONFIG_FILE']).resolve() if c['CONFIG_FILE'] else c['OUTPUT_DIR'] / CONFIG_FILENAME},
     'COOKIES_FILE':             {'default': lambda c: c['COOKIES_FILE'] and Path(c['COOKIES_FILE']).resolve()},
     'CHROME_USER_DATA_DIR':     {'default': lambda c: find_chrome_data_dir() if c['CHROME_USER_DATA_DIR'] is None else (Path(c['CHROME_USER_DATA_DIR']).resolve() if c['CHROME_USER_DATA_DIR'] else None)},   # None means unset, so we autodetect it with find_chrome_data_dir(), but empty string '' means the user manually set it to '', and we should store it as None
-    'URL_BLACKLIST_PTN':        {'default': lambda c: c['URL_BLACKLIST'] and re.compile(c['URL_BLACKLIST'] or '', re.IGNORECASE | re.UNICODE | re.MULTILINE)},
-    'URL_WHITELIST_PTN':        {'default': lambda c: c['URL_WHITELIST'] and re.compile(c['URL_WHITELIST'] or '', re.IGNORECASE | re.UNICODE | re.MULTILINE)},
+    'URL_DENYLIST_PTN':         {'default': lambda c: c['URL_DENYLIST'] and re.compile(c['URL_DENYLIST'] or '', ALLOWDENYLIST_REGEX_FLAGS)},
+    'URL_ALLOWLIST_PTN':        {'default': lambda c: c['URL_ALLOWLIST'] and re.compile(c['URL_ALLOWLIST'] or '', ALLOWDENYLIST_REGEX_FLAGS)},
     'DIR_OUTPUT_PERMISSIONS':   {'default': lambda c: c['OUTPUT_PERMISSIONS'].replace('6', '7').replace('4', '5')},

     'ARCHIVEBOX_BINARY':        {'default': lambda c: sys.argv[0] or bin_path('archivebox')},
@@ -464,10 +471,11 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
     'EXTERNAL_LOCATIONS':       {'default': lambda c: get_external_locations(c)},
     'DATA_LOCATIONS':           {'default': lambda c: get_data_locations(c)},
     'CHROME_OPTIONS':           {'default': lambda c: get_chrome_info(c)},
+    'SAVE_ALLOWLIST_PTN':       {'default': lambda c: c['SAVE_ALLOWLIST'] and {re.compile(k, ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_ALLOWLIST'].items()}},
+    'SAVE_DENYLIST_PTN':        {'default': lambda c: c['SAVE_DENYLIST'] and {re.compile(k, ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_DENYLIST'].items()}},
 }


-
 ################################### Helpers ####################################

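Note: both renamed URL options keep their old names working through the new 'aliases' field, and all four allow/deny options are plain Python regexes compiled with the shared ALLOWDENYLIST_REGEX_FLAGS constant added above. A minimal standalone sketch (not part of the diff) of what the default URL_DENYLIST actually filters:

```python
# Sketch only: reproduces the default URL_DENYLIST and the flag constant
# from this diff to demonstrate which URLs get skipped as code assets.
import re

ALLOWDENYLIST_REGEX_FLAGS = re.IGNORECASE | re.UNICODE | re.MULTILINE
URL_DENYLIST = r'\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$'
URL_DENYLIST_PTN = re.compile(URL_DENYLIST, ALLOWDENYLIST_REGEX_FLAGS)

assert URL_DENYLIST_PTN.search('https://example.com/assets/app.js?v=2')    # code asset: skipped
assert not URL_DENYLIST_PTN.search('https://example.com/article.html')     # regular page: kept
```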

+ 1 - 1
archivebox/config_stubs.py

@@ -41,7 +41,7 @@ class ConfigDict(BaseConfig, total=False):
     MEDIA_TIMEOUT: int
     OUTPUT_PERMISSIONS: str
     RESTRICT_FILE_NAMES: str
-    URL_BLACKLIST: str
+    URL_DENYLIST: str

     SECRET_KEY: Optional[str]
     BIND_ADDR: str

+ 1 - 1
archivebox/core/forms.py

@@ -41,7 +41,7 @@ class AddLinkForm(forms.Form):
     #     label="Exclude patterns",
     #     min_length='1',
     #     required=False,
-    #     initial=URL_BLACKLIST,
+    #     initial=URL_DENYLIST,
     # )
     # timeout = forms.IntegerField(
     #     initial=TIMEOUT,

+ 36 - 11
archivebox/extractors/__init__.py

@@ -4,12 +4,16 @@ import os
 import sys
 from pathlib import Path

-from typing import Optional, List, Iterable, Union
+from typing import Callable, Optional, List, Iterable, Union
 from datetime import datetime, timezone
 from django.db.models import QuerySet

+from ..config import (
+    SAVE_ALLOWLIST_PTN,
+    SAVE_DENYLIST_PTN,
+)
 from ..core.settings import ERROR_LOG
-from ..index.schema import Link
+from ..index.schema import ArchiveResult, Link
 from ..index.sql import write_link_to_sql_index
 from ..index import (
     load_link_details,
@@ -42,7 +46,11 @@ from .archive_org import should_save_archive_dot_org, save_archive_dot_org
 from .headers import should_save_headers, save_headers


-def get_default_archive_methods():
+ShouldSaveFunction = Callable[[Link, Optional[Path], Optional[bool]], bool]
+SaveFunction = Callable[[Link, Optional[Path], int], ArchiveResult]
+ArchiveMethodEntry = tuple[str, ShouldSaveFunction, SaveFunction]
+
+def get_default_archive_methods() -> List[ArchiveMethodEntry]:
     return [
         ('favicon', should_save_favicon, save_favicon),
         ('headers', should_save_headers, save_headers),
@@ -59,14 +67,31 @@ def get_default_archive_methods():
         ('archive_org', should_save_archive_dot_org, save_archive_dot_org),
     ]

+@enforce_types
+def get_archive_methods_for_link(link: Link) -> Iterable[ArchiveMethodEntry]:
+    DEFAULT_METHODS = get_default_archive_methods()
+    allowed_methods = {
+        m for pat, methods in
+        SAVE_ALLOWLIST_PTN.items()
+        if pat.search(link.url)
+        for m in methods
+    } or { m[0] for m in DEFAULT_METHODS }
+    denied_methods = {
+        m for pat, methods in
+        SAVE_DENYLIST_PTN.items()
+        if pat.search(link.url)
+        for m in methods
+    }
+    allowed_methods -= denied_methods
+
+    return (m for m in DEFAULT_METHODS if m[0] in allowed_methods)
+
 ARCHIVE_METHODS_INDEXING_PRECEDENCE = [('readability', 1), ('singlefile', 2), ('dom', 3), ('wget', 4)]

 @enforce_types
-def ignore_methods(to_ignore: List[str]):
+def ignore_methods(to_ignore: List[str]) -> Iterable[str]:
     ARCHIVE_METHODS = get_default_archive_methods()
-    methods = filter(lambda x: x[0] not in to_ignore, ARCHIVE_METHODS)
-    methods = map(lambda x: x[0], methods)
-    return list(methods)
+    return [x[0] for x in ARCHIVE_METHODS if x[0] not in to_ignore]

 @enforce_types
 def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None) -> Link:
@@ -79,11 +104,11 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
     except Snapshot.DoesNotExist:
         snapshot = write_link_to_sql_index(link)

-    ARCHIVE_METHODS = get_default_archive_methods()
+    active_methods = get_archive_methods_for_link(link)

     if methods:
-        ARCHIVE_METHODS = [
-            method for method in ARCHIVE_METHODS
+        active_methods = [
+            method for method in active_methods
             if method[0] in methods
         ]

@@ -100,7 +125,7 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
         stats = {'skipped': 0, 'succeeded': 0, 'failed': 0}
         start_ts = datetime.now(timezone.utc)

-        for method_name, should_run, method_function in ARCHIVE_METHODS:
+        for method_name, should_run, method_function in active_methods:
             try:
                 if method_name not in link.history:
                     link.history[method_name] = []

+ 4 - 4
archivebox/index/__init__.py

@@ -22,8 +22,8 @@ from ..config import (
     JSON_INDEX_FILENAME,
     OUTPUT_DIR,
     TIMEOUT,
-    URL_BLACKLIST_PTN,
-    URL_WHITELIST_PTN,
+    URL_DENYLIST_PTN,
+    URL_ALLOWLIST_PTN,
     stderr,
     OUTPUT_PERMISSIONS
 )
@@ -142,9 +142,9 @@ def archivable_links(links: Iterable[Link]) -> Iterable[Link]:
             continue
         if scheme(link.url) not in ('http', 'https', 'ftp'):
             continue
-        if URL_BLACKLIST_PTN and URL_BLACKLIST_PTN.search(link.url):
+        if URL_DENYLIST_PTN and URL_DENYLIST_PTN.search(link.url):
             continue
-        if URL_WHITELIST_PTN and (not URL_WHITELIST_PTN.search(link.url)):
+        if URL_ALLOWLIST_PTN and (not URL_ALLOWLIST_PTN.search(link.url)):
             continue

         yield link

+ 41 - 2
tests/test_extractors.py

@@ -13,12 +13,51 @@ def test_ignore_methods():
     Takes the passed method out of the default methods list and returns that value
     """
     ignored = ignore_methods(['title'])
-    assert should_save_title not in ignored
+    assert "title" not in ignored
+
+def test_save_allowdenylist_works(tmp_path, process, disable_extractors_dict):
+    allow_list = {
+        r'/static': ["headers", "singlefile"],
+        r'example\.com\.html$': ["headers"],
+    }
+    deny_list = {
+        "/static": ["singlefile"],
+    }
+    disable_extractors_dict.update({
+        "SAVE_HEADERS": "true",
+        "USE_SINGLEFILE": "true",
+        "SAVE_ALLOWLIST": pyjson.dumps(allow_list),
+        "SAVE_DENYLIST": pyjson.dumps(deny_list),
+    })
+    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'],
+                                  capture_output=True, env=disable_extractors_dict) 
+    archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
+    singlefile_file = archived_item_path / "singlefile.html"
+    assert not singlefile_file.exists()
+    headers_file = archived_item_path / "headers.json"
+    assert headers_file.exists()
+
+def test_save_denylist_works(tmp_path, process, disable_extractors_dict):
+    deny_list = {
+        "/static": ["singlefile"],
+    }
+    disable_extractors_dict.update({
+        "SAVE_HEADERS": "true",
+        "USE_SINGLEFILE": "true",
+        "SAVE_DENYLIST": pyjson.dumps(deny_list),
+    })
+    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'],
+                                  capture_output=True, env=disable_extractors_dict) 
+    archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
+    singlefile_file = archived_item_path / "singlefile.html"
+    assert not singlefile_file.exists()
+    headers_file = archived_item_path / "headers.json"
+    assert headers_file.exists()

 def test_singlefile_works(tmp_path, process, disable_extractors_dict):
     disable_extractors_dict.update({"USE_SINGLEFILE": "true"})
     add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'],
-                                  capture_output=True, env=disable_extractors_dict) 
+                                  capture_output=True, env=disable_extractors_dict)
     archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
     output_file = archived_item_path / "singlefile.html"
     assert output_file.exists()
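Note: as the new tests show, SAVE_ALLOWLIST and SAVE_DENYLIST are dict-typed options, so from the environment they are passed as JSON mapping a URL regex to a list of method names. A hedged example of driving archivebox the same way outside the test suite (the URL and patterns are illustrative):

```python
# Sketch: passing the new dict-typed options as JSON env vars,
# mirroring the pyjson.dumps() setup in the tests above.
import json
import os
import subprocess

env = os.environ.copy()
env['SAVE_ALLOWLIST'] = json.dumps({r'/static': ['headers', 'singlefile']})
env['SAVE_DENYLIST'] = json.dumps({r'/static': ['singlefile']})

# Only 'headers' should run for this URL: the allowlist narrows methods to
# {headers, singlefile}, then the denylist removes 'singlefile'.
subprocess.run(['archivebox', 'add', 'https://example.com/static/page.html'],
               capture_output=True, env=env)
```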