Sfoglia il codice sorgente

Add `EXTRA_*_ARGS` for wget, curl, and singlefile

Ben Muthalaly 1 anno fa
parent
commit
4e69d2c9e1

+ 7 - 1
archivebox/config.py

@@ -187,12 +187,15 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
                                                                 '--no-parent',
                                                                 '-e', 'robots=off',
                                                                 ]},
+        'WGET_EXTRA_ARGS':          {'type': list,  'default': None},
         'CURL_ARGS':                {'type': list,  'default': ['--silent',
                                                                 '--location',
                                                                 '--compressed'
                                                                ]},
+        'CURL_EXTRA_ARGS':          {'type': list,  'default': None},
         'GIT_ARGS':                 {'type': list,  'default': ['--recursive']},
-        'SINGLEFILE_ARGS':          {'type': list,  'default' : None},
+        'SINGLEFILE_ARGS':          {'type': list,  'default': None},
+        'SINGLEFILE_EXTRA_ARGS':    {'type': list,  'default': None},
         'FAVICON_PROVIDER':         {'type': str,   'default': 'https://www.google.com/s2/favicons?domain={}'},
     },
 
@@ -530,6 +533,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
     'CURL_VERSION':             {'default': lambda c: bin_version(c['CURL_BINARY']) if c['USE_CURL'] else None},
     'CURL_USER_AGENT':          {'default': lambda c: c['CURL_USER_AGENT'].format(**c)},
     'CURL_ARGS':                {'default': lambda c: c['CURL_ARGS'] or []},
+    'CURL_EXTRA_ARGS':          {'default': lambda c: c['CURL_EXTRA_ARGS'] or []},
     'SAVE_FAVICON':             {'default': lambda c: c['USE_CURL'] and c['SAVE_FAVICON']},
     'SAVE_ARCHIVE_DOT_ORG':     {'default': lambda c: c['USE_CURL'] and c['SAVE_ARCHIVE_DOT_ORG']},
 
@@ -540,12 +544,14 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
     'SAVE_WGET':                {'default': lambda c: c['USE_WGET'] and c['SAVE_WGET']},
     'SAVE_WARC':                {'default': lambda c: c['USE_WGET'] and c['SAVE_WARC']},
     'WGET_ARGS':                {'default': lambda c: c['WGET_ARGS'] or []},
+    'WGET_EXTRA_ARGS':          {'default': lambda c: c['WGET_EXTRA_ARGS'] or []},
 
     'RIPGREP_VERSION':          {'default': lambda c: bin_version(c['RIPGREP_BINARY']) if c['USE_RIPGREP'] else None},
 
     'USE_SINGLEFILE':           {'default': lambda c: c['USE_SINGLEFILE'] and c['SAVE_SINGLEFILE']},
     'SINGLEFILE_VERSION':       {'default': lambda c: bin_version(c['SINGLEFILE_BINARY']) if c['USE_SINGLEFILE'] else None},
     'SINGLEFILE_ARGS':          {'default': lambda c: c['SINGLEFILE_ARGS'] or []},
+    'SINGLEFILE_EXTRA_ARGS':    {'default': lambda c: c['SINGLEFILE_EXTRA_ARGS'] or []},
 
     'USE_READABILITY':          {'default': lambda c: c['USE_READABILITY'] and c['SAVE_READABILITY']},
     'READABILITY_VERSION':      {'default': lambda c: bin_version(c['READABILITY_BINARY']) if c['USE_READABILITY'] else None},

+ 10 - 3
archivebox/extractors/archive_org.py

@@ -10,10 +10,12 @@ from ..system import run, chmod_file
 from ..util import (
     enforce_types,
     is_static_file,
+    dedupe,
 )
 from ..config import (
     TIMEOUT,
     CURL_ARGS,
+    CURL_EXTRA_ARGS,
     CHECK_SSL_VALIDITY,
     SAVE_ARCHIVE_DOT_ORG,
     CURL_BINARY,
@@ -44,13 +46,18 @@ def save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, timeout: int=
     output: ArchiveOutput = 'archive.org.txt'
     archive_org_url = None
     submit_url = 'https://web.archive.org/save/{}'.format(link.url)
-    cmd = [
-        CURL_BINARY,
-        *CURL_ARGS,
+    # earlier options take precedence
+    options = [
         '--head',
         '--max-time', str(timeout),
         *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
         *([] if CHECK_SSL_VALIDITY else ['--insecure']),
+        *CURL_EXTRA_ARGS,
+        *CURL_ARGS,
+    ]
+    cmd = [
+        CURL_BINARY,
+        *dedupe(*options),
         submit_url,
     ]
     status = 'succeeded'

+ 14 - 4
archivebox/extractors/favicon.py

@@ -6,13 +6,18 @@ from typing import Optional
 
 from ..index.schema import Link, ArchiveResult, ArchiveOutput
 from ..system import chmod_file, run
-from ..util import enforce_types, domain
+from ..util import (
+    enforce_types,
+    domain,
+    dedupe,
+)
 from ..config import (
     TIMEOUT,
     SAVE_FAVICON,
     FAVICON_PROVIDER,
     CURL_BINARY,
     CURL_ARGS,
+    CURL_EXTRA_ARGS,
     CURL_VERSION,
     CHECK_SSL_VALIDITY,
     CURL_USER_AGENT,
@@ -34,13 +39,18 @@ def save_favicon(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT)
 
     out_dir = out_dir or link.link_dir
     output: ArchiveOutput = 'favicon.ico'
-    cmd = [
-        CURL_BINARY,
-        *CURL_ARGS,
+    # earlier options take precedence
+    options = [
         '--max-time', str(timeout),
         '--output', str(output),
         *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
         *([] if CHECK_SSL_VALIDITY else ['--insecure']),
+        *CURL_EXTRA_ARGS,
+        *CURL_ARGS,
+    ]
+    cmd = [
+        CURL_BINARY,
+        *dedupe(*options),
         FAVICON_PROVIDER.format(domain(link.url)),
     ]
     status = 'failed'

+ 10 - 4
archivebox/extractors/headers.py

@@ -9,11 +9,13 @@ from ..system import atomic_write
 from ..util import (
     enforce_types,
     get_headers,
+    dedupe,
 )
 from ..config import (
     TIMEOUT,
     CURL_BINARY,
     CURL_ARGS,
+    CURL_EXTRA_ARGS,
     CURL_USER_AGENT,
     CURL_VERSION,
     CHECK_SSL_VALIDITY,
@@ -40,14 +42,18 @@ def save_headers(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT)
 
     status = 'succeeded'
     timer = TimedProgress(timeout, prefix='      ')
-
-    cmd = [
-        CURL_BINARY,
-        *CURL_ARGS,
+    # earlier options take precedence
+    options = [
         '--head',
         '--max-time', str(timeout),
         *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
         *([] if CHECK_SSL_VALIDITY else ['--insecure']),
+        *CURL_EXTRA_ARGS,
+        *CURL_ARGS,
+    ]
+    cmd = [
+        CURL_BINARY,
+        *dedupe(*options),
         link.url,
     ]
     try:

+ 9 - 16
archivebox/extractors/singlefile.py

@@ -11,6 +11,7 @@ from ..util import (
     enforce_types,
     is_static_file,
     chrome_args,
+    dedupe,
 )
 from ..config import (
     TIMEOUT,
@@ -18,6 +19,7 @@ from ..config import (
     DEPENDENCIES,
     SINGLEFILE_VERSION,
     SINGLEFILE_ARGS,
+    SINGLEFILE_EXTRA_ARGS,
     CHROME_BINARY,
 )
 from ..logging_util import TimedProgress
@@ -46,11 +48,6 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
 
     # SingleFile CLI Docs: https://github.com/gildas-lormeau/SingleFile/tree/master/cli
     browser_args = '--browser-args={}'.format(json.dumps(browser_args[1:]))
-    options = [
-        *SINGLEFILE_ARGS,
-        '--browser-executable-path={}'.format(CHROME_BINARY),
-        browser_args,
-    ]
 
     # Deduplicate options (single-file doesn't like when you use the same option two times)
     #
@@ -58,19 +55,15 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
     # My logic is SINGLEFILE_ARGS is the option that affects the singlefile command with most 
     # specificity, therefore the user sets it with a lot intent, therefore it should take precedence 
     # kind of like the ergonomic principle of lexical scope in programming languages.
-    seen_option_names = []
-    def test_seen(argument):
-        option_name = argument.split("=")[0]
-        if option_name in seen_option_names:
-            return False
-        else:
-            seen_option_names.append(option_name)
-            return True
-    deduped_options = list(filter(test_seen, options))
-
+    options = [
+        *SINGLEFILE_ARGS,
+        *SINGLEFILE_EXTRA_ARGS,
+        '--browser-executable-path={}'.format(CHROME_BINARY),
+        browser_args,
+    ]
     cmd = [
         DEPENDENCIES['SINGLEFILE_BINARY']['path'],
-        *deduped_options,
+        *dedupe(*options),
         link.url,
         output,
     ]

+ 10 - 3
archivebox/extractors/title.py

@@ -10,6 +10,7 @@ from ..util import (
     enforce_types,
     download_url,
     htmldecode,
+    dedupe,
 )
 from ..config import (
     TIMEOUT,
@@ -17,6 +18,7 @@ from ..config import (
     SAVE_TITLE,
     CURL_BINARY,
     CURL_ARGS,
+    CURL_EXTRA_ARGS,
     CURL_VERSION,
     CURL_USER_AGENT,
 )
@@ -102,12 +104,17 @@ def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -
     from core.models import Snapshot
 
     output: ArchiveOutput = None
-    cmd = [
-        CURL_BINARY,
-        *CURL_ARGS,
+    # earlier options take precedence
+    options = [
         '--max-time', str(timeout),
         *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
         *([] if CHECK_SSL_VALIDITY else ['--insecure']),
+        *CURL_EXTRA_ARGS,
+        *CURL_ARGS,
+    ]
+    cmd = [
+        CURL_BINARY,
+        *dedupe(*options),
         link.url,
     ]
     status = 'succeeded'

+ 11 - 4
archivebox/extractors/wget.py

@@ -15,9 +15,11 @@ from ..util import (
     path,
     domain,
     urldecode,
+    dedupe,
 )
 from ..config import (
     WGET_ARGS,
+    WGET_EXTRA_ARGS,
     TIMEOUT,
     SAVE_WGET,
     SAVE_WARC,
@@ -55,10 +57,8 @@ def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
 
     # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
     output: ArchiveOutput = None
-    cmd = [
-        WGET_BINARY,
-        # '--server-response',  # print headers for better error parsing
-        *WGET_ARGS,
+    # earlier options take precedence
+    options = [
         '--timeout={}'.format(timeout),
         *(['--restrict-file-names={}'.format(RESTRICT_FILE_NAMES)] if RESTRICT_FILE_NAMES else []),
         *(['--warc-file={}'.format(str(warc_path))] if SAVE_WARC else []),
@@ -68,6 +68,13 @@ def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
         *(['--compression=auto'] if WGET_AUTO_COMPRESSION else []),
         *([] if SAVE_WARC else ['--timestamping']),
         *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate', '--no-hsts']),
+        # '--server-response',  # print headers for better error parsing
+        *WGET_EXTRA_ARGS,
+        *WGET_ARGS,
+    ]
+    cmd = [
+        WGET_BINARY,
+        *dedupe(*options),
         link.url,
     ]
 

+ 17 - 0
archivebox/util.py

@@ -317,6 +317,23 @@ def ansi_to_html(text):
     return COLOR_REGEX.sub(single_sub, text)
 
 
+@enforce_types
+def dedupe(*options: str) -> List[str]:
+    """
+    Deduplicates the given options. Options that come earlier in the list clobber
+    later conflicting options.
+    """
+    seen_option_names = []
+    def test_seen(argument):
+        option_name = argument.split("=")[0]
+        if option_name in seen_option_names:
+            return False
+        else:
+            seen_option_names.append(option_name)
+            return True
+    return list(filter(test_seen, options))
+
+
 class AttributeDict(dict):
     """Helper to allow accessing dict values via Example.key or Example['key']"""