1 жил өмнө · d74ddd42ae
--- a/archivebox/extractors/archive_org.py
+++ b/archivebox/extractors/archive_org.py
@@ -46,14 +46,14 @@ def save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, timeout: int=
 
															     output: ArchiveOutput = 'archive.org.txt'
														
 
															     archive_org_url = None
														
 
															     submit_url = 'https://web.archive.org/save/{}'.format(link.url)
														
 
															-    # earlier options take precedence
														
 
															+    # later options take precedence
														
 
															     options = [
														
 
															+        *CURL_ARGS,
														
 
															+        *CURL_EXTRA_ARGS,
														
 
															         '--head',
														
 
															         '--max-time', str(timeout),
														
 
															         *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
														
 
															         *([] if CHECK_SSL_VALIDITY else ['--insecure']),
														
 
															-        *CURL_EXTRA_ARGS,
														
 
															-        *CURL_ARGS,
														
 
															     ]
														
 
															     cmd = [
														
 
															         CURL_BINARY,
														
--- a/archivebox/extractors/favicon.py
+++ b/archivebox/extractors/favicon.py
@@ -39,14 +39,14 @@ def save_favicon(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT)
 
															     out_dir = out_dir or link.link_dir
														
 
															     output: ArchiveOutput = 'favicon.ico'
														
 
															-    # earlier options take precedence
														
 
															+    # later options take precedence
														
 
															     options = [
														
 
															+        *CURL_ARGS,
														
 
															+        *CURL_EXTRA_ARGS,
														
 
															         '--max-time', str(timeout),
														
 
															         '--output', str(output),
														
 
															         *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
														
 
															         *([] if CHECK_SSL_VALIDITY else ['--insecure']),
														
 
															-        *CURL_EXTRA_ARGS,
														
 
															-        *CURL_ARGS,
														
 
															     ]
														
 
															     cmd = [
														
 
															         CURL_BINARY,
														
--- a/archivebox/extractors/headers.py
+++ b/archivebox/extractors/headers.py
@@ -42,14 +42,14 @@ def save_headers(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT)
 
															     status = 'succeeded'
														
 
															     timer = TimedProgress(timeout, prefix='      ')
														
 
															-    # earlier options take precedence
														
 
															+    # later options take precedence
														
 
															     options = [
														
 
															+        *CURL_ARGS,
														
 
															+        *CURL_EXTRA_ARGS,
														
 
															         '--head',
														
 
															         '--max-time', str(timeout),
														
 
															         *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
														
 
															         *([] if CHECK_SSL_VALIDITY else ['--insecure']),
														
 
															-        *CURL_EXTRA_ARGS,
														
 
															-        *CURL_ARGS,
														
 
															     ]
														
 
															     cmd = [
														
 
															         CURL_BINARY,
														
--- a/archivebox/extractors/media.py
+++ b/archivebox/extractors/media.py
@@ -41,11 +41,12 @@ def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIME
 
															     output: ArchiveOutput = 'media'
														
 
															     output_path = out_dir / output
														
 
															     output_path.mkdir(exist_ok=True)
														
 
															+    # later options take precedence
														
 
															     options = [
														
 
															+        *YOUTUBEDL_ARGS,
														
 
															+        *YOUTUBEDL_EXTRA_ARGS,
														
 
															         *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate']),
														
 
															         # TODO: add --cookies-from-browser={CHROME_USER_DATA_DIR}
														
 
															-        *YOUTUBEDL_EXTRA_ARGS,
														
 
															-        *YOUTUBEDL_ARGS,
														
 
															     ]
														
 
															     cmd = [
														
 
															         YOUTUBEDL_BINARY,
														
--- a/archivebox/extractors/singlefile.py
+++ b/archivebox/extractors/singlefile.py
@@ -48,18 +48,12 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
 
															     # SingleFile CLI Docs: https://github.com/gildas-lormeau/SingleFile/tree/master/cli
														
 
															     browser_args = '--browser-args={}'.format(json.dumps(browser_args[1:]))
														
 
															-
														
 
															-    # Deduplicate options (single-file doesn't like when you use the same option two times)
														
 
															-    #
														
 
															-    # NOTE: Options names that come first clobber conflicting names that come later
														
 
															-    # My logic is SINGLEFILE_ARGS is the option that affects the singlefile command with most 
														
 
															-    # specificity, therefore the user sets it with a lot intent, therefore it should take precedence 
														
 
															-    # kind of like the ergonomic principle of lexical scope in programming languages.
														
 
															+    # later options take precedence
														
 
															     options = [
														
 
															-        '--browser-executable-path={}'.format(CHROME_BINARY),
														
 
															-        browser_args,
														
 
															-        *SINGLEFILE_EXTRA_ARGS,
														
 
															         *SINGLEFILE_ARGS,
														
 
															+        *SINGLEFILE_EXTRA_ARGS,
														
 
															+        browser_args,
														
 
															+        '--browser-executable-path={}'.format(CHROME_BINARY),
														
 
															     ]
														
 
															     cmd = [
														
 
															         DEPENDENCIES['SINGLEFILE_BINARY']['path'],
														
--- a/archivebox/extractors/title.py
+++ b/archivebox/extractors/title.py
@@ -104,13 +104,13 @@ def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -
 
															     from core.models import Snapshot
														
 
															     output: ArchiveOutput = None
														
 
															-    # earlier options take precedence
														
 
															+    # later options take precedence
														
 
															     options = [
														
 
															+        *CURL_ARGS,
														
 
															+        *CURL_EXTRA_ARGS,
														
 
															         '--max-time', str(timeout),
														
 
															         *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
														
 
															         *([] if CHECK_SSL_VALIDITY else ['--insecure']),
														
 
															-        *CURL_EXTRA_ARGS,
														
 
															-        *CURL_ARGS,
														
 
															     ]
														
 
															     cmd = [
														
 
															         CURL_BINARY,
														
--- a/archivebox/extractors/wget.py
+++ b/archivebox/extractors/wget.py
@@ -57,8 +57,10 @@ def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
 
															     # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
														
 
															     output: ArchiveOutput = None
														
 
															-    # earlier options take precedence
														
 
															+    # later options take precedence
														
 
															     options = [
														
 
															+        *WGET_ARGS,
														
 
															+        *WGET_EXTRA_ARGS,
														
 
															         '--timeout={}'.format(timeout),
														
 
															         *(['--restrict-file-names={}'.format(RESTRICT_FILE_NAMES)] if RESTRICT_FILE_NAMES else []),
														
 
															         *(['--warc-file={}'.format(str(warc_path))] if SAVE_WARC else []),
														
@@ -69,8 +71,6 @@ def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
 
															         *([] if SAVE_WARC else ['--timestamping']),
														
 
															         *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate', '--no-hsts']),
														
 
															         # '--server-response',  # print headers for better error parsing
														
 
															-        *WGET_EXTRA_ARGS,
														
 
															-        *WGET_ARGS,
														
 
															     ]
														
 
															     cmd = [
														
 
															         WGET_BINARY,
														
--- a/archivebox/util.py
+++ b/archivebox/util.py
@@ -240,6 +240,8 @@ def chrome_args(**options) -> List[str]:
 
															     cmd_args = [options['CHROME_BINARY']]
														
 
															+    cmd_args += CHROME_EXTRA_ARGS
														
 
															+
														
 
															     if options['CHROME_HEADLESS']:
														
 
															         chrome_major_version = int(re.search(r'\s(\d+)\.\d', CHROME_VERSION)[1])
														
 
															         if chrome_major_version >= 111:
														
@@ -284,7 +286,6 @@ def chrome_args(**options) -> List[str]:
 
															     if options['CHROME_USER_DATA_DIR']:
														
 
															         cmd_args.append('--user-data-dir={}'.format(options['CHROME_USER_DATA_DIR']))
														
 
															-    cmd_args += CHROME_EXTRA_ARGS
														
 
															     return dedupe(*cmd_args)
														
@@ -324,20 +325,17 @@ def ansi_to_html(text):
 
															 @enforce_types
														
 
															-def dedupe(*options: List[str]) -> List[str]:
														
 
															+def dedupe(*options: str) -> List[str]:
														
 
															     """
														
 
															-    Deduplicates the given options. Options that come earlier in the list clobber
														
 
															-    later conflicting options.
														
 
															+    Deduplicates the given options. Options that come later clobber earlier
														
 
															+    conflicting options.
														
 
															     """
														
 
															-    seen_option_names = []
														
 
															-    def test_seen(argument):
														
 
															-        option_name = argument.split("=")[0]
														
 
															-        if option_name in seen_option_names:
														
 
															-            return False
														
 
															-        else:
														
 
															-            seen_option_names.append(option_name)
														
 
															-            return True
														
 
															-    return list(filter(test_seen, options))
														
 
															+    deduped = {}
														
 
															+
														
 
															+    for option in options:
														
 
															+        deduped[option.split('=')[0]] = option
														
 
															+
														
 
															+    return list(deduped.values())
														
 
															 class AttributeDict(dict):