2
0
Эх сурвалжийг харах

Flip dedupe precedence order

Ben Muthalaly 1 жил өмнө
parent
commit
d74ddd42ae

+ 3 - 3
archivebox/extractors/archive_org.py

@@ -46,14 +46,14 @@ def save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, timeout: int=
     output: ArchiveOutput = 'archive.org.txt'
     output: ArchiveOutput = 'archive.org.txt'
     archive_org_url = None
     archive_org_url = None
     submit_url = 'https://web.archive.org/save/{}'.format(link.url)
     submit_url = 'https://web.archive.org/save/{}'.format(link.url)
-    # earlier options take precedence
+    # later options take precedence
     options = [
     options = [
+        *CURL_ARGS,
+        *CURL_EXTRA_ARGS,
         '--head',
         '--head',
         '--max-time', str(timeout),
         '--max-time', str(timeout),
         *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
         *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
         *([] if CHECK_SSL_VALIDITY else ['--insecure']),
         *([] if CHECK_SSL_VALIDITY else ['--insecure']),
-        *CURL_EXTRA_ARGS,
-        *CURL_ARGS,
     ]
     ]
     cmd = [
     cmd = [
         CURL_BINARY,
         CURL_BINARY,

+ 3 - 3
archivebox/extractors/favicon.py

@@ -39,14 +39,14 @@ def save_favicon(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT)
 
 
     out_dir = out_dir or link.link_dir
     out_dir = out_dir or link.link_dir
     output: ArchiveOutput = 'favicon.ico'
     output: ArchiveOutput = 'favicon.ico'
-    # earlier options take precedence
+    # later options take precedence
     options = [
     options = [
+        *CURL_ARGS,
+        *CURL_EXTRA_ARGS,
         '--max-time', str(timeout),
         '--max-time', str(timeout),
         '--output', str(output),
         '--output', str(output),
         *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
         *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
         *([] if CHECK_SSL_VALIDITY else ['--insecure']),
         *([] if CHECK_SSL_VALIDITY else ['--insecure']),
-        *CURL_EXTRA_ARGS,
-        *CURL_ARGS,
     ]
     ]
     cmd = [
     cmd = [
         CURL_BINARY,
         CURL_BINARY,

+ 3 - 3
archivebox/extractors/headers.py

@@ -42,14 +42,14 @@ def save_headers(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT)
 
 
     status = 'succeeded'
     status = 'succeeded'
     timer = TimedProgress(timeout, prefix='      ')
     timer = TimedProgress(timeout, prefix='      ')
-    # earlier options take precedence
+    # later options take precedence
     options = [
     options = [
+        *CURL_ARGS,
+        *CURL_EXTRA_ARGS,
         '--head',
         '--head',
         '--max-time', str(timeout),
         '--max-time', str(timeout),
         *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
         *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
         *([] if CHECK_SSL_VALIDITY else ['--insecure']),
         *([] if CHECK_SSL_VALIDITY else ['--insecure']),
-        *CURL_EXTRA_ARGS,
-        *CURL_ARGS,
     ]
     ]
     cmd = [
     cmd = [
         CURL_BINARY,
         CURL_BINARY,

+ 3 - 2
archivebox/extractors/media.py

@@ -41,11 +41,12 @@ def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIME
     output: ArchiveOutput = 'media'
     output: ArchiveOutput = 'media'
     output_path = out_dir / output
     output_path = out_dir / output
     output_path.mkdir(exist_ok=True)
     output_path.mkdir(exist_ok=True)
+    # later options take precedence
     options = [
     options = [
+        *YOUTUBEDL_ARGS,
+        *YOUTUBEDL_EXTRA_ARGS,
         *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate']),
         *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate']),
         # TODO: add --cookies-from-browser={CHROME_USER_DATA_DIR}
         # TODO: add --cookies-from-browser={CHROME_USER_DATA_DIR}
-        *YOUTUBEDL_EXTRA_ARGS,
-        *YOUTUBEDL_ARGS,
     ]
     ]
     cmd = [
     cmd = [
         YOUTUBEDL_BINARY,
         YOUTUBEDL_BINARY,

+ 4 - 10
archivebox/extractors/singlefile.py

@@ -48,18 +48,12 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
 
 
     # SingleFile CLI Docs: https://github.com/gildas-lormeau/SingleFile/tree/master/cli
     # SingleFile CLI Docs: https://github.com/gildas-lormeau/SingleFile/tree/master/cli
     browser_args = '--browser-args={}'.format(json.dumps(browser_args[1:]))
     browser_args = '--browser-args={}'.format(json.dumps(browser_args[1:]))
-
-    # Deduplicate options (single-file doesn't like when you use the same option two times)
-    #
-    # NOTE: Options names that come first clobber conflicting names that come later
-    # My logic is SINGLEFILE_ARGS is the option that affects the singlefile command with most 
-    # specificity, therefore the user sets it with a lot intent, therefore it should take precedence 
-    # kind of like the ergonomic principle of lexical scope in programming languages.
+    # later options take precedence
     options = [
     options = [
-        '--browser-executable-path={}'.format(CHROME_BINARY),
-        browser_args,
-        *SINGLEFILE_EXTRA_ARGS,
         *SINGLEFILE_ARGS,
         *SINGLEFILE_ARGS,
+        *SINGLEFILE_EXTRA_ARGS,
+        browser_args,
+        '--browser-executable-path={}'.format(CHROME_BINARY),
     ]
     ]
     cmd = [
     cmd = [
         DEPENDENCIES['SINGLEFILE_BINARY']['path'],
         DEPENDENCIES['SINGLEFILE_BINARY']['path'],

+ 3 - 3
archivebox/extractors/title.py

@@ -104,13 +104,13 @@ def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -
     from core.models import Snapshot
     from core.models import Snapshot
 
 
     output: ArchiveOutput = None
     output: ArchiveOutput = None
-    # earlier options take precedence
+    # later options take precedence
     options = [
     options = [
+        *CURL_ARGS,
+        *CURL_EXTRA_ARGS,
         '--max-time', str(timeout),
         '--max-time', str(timeout),
         *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
         *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
         *([] if CHECK_SSL_VALIDITY else ['--insecure']),
         *([] if CHECK_SSL_VALIDITY else ['--insecure']),
-        *CURL_EXTRA_ARGS,
-        *CURL_ARGS,
     ]
     ]
     cmd = [
     cmd = [
         CURL_BINARY,
         CURL_BINARY,

+ 3 - 3
archivebox/extractors/wget.py

@@ -57,8 +57,10 @@ def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
 
 
     # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
     # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
     output: ArchiveOutput = None
     output: ArchiveOutput = None
-    # earlier options take precedence
+    # later options take precedence
     options = [
     options = [
+        *WGET_ARGS,
+        *WGET_EXTRA_ARGS,
         '--timeout={}'.format(timeout),
         '--timeout={}'.format(timeout),
         *(['--restrict-file-names={}'.format(RESTRICT_FILE_NAMES)] if RESTRICT_FILE_NAMES else []),
         *(['--restrict-file-names={}'.format(RESTRICT_FILE_NAMES)] if RESTRICT_FILE_NAMES else []),
         *(['--warc-file={}'.format(str(warc_path))] if SAVE_WARC else []),
         *(['--warc-file={}'.format(str(warc_path))] if SAVE_WARC else []),
@@ -69,8 +71,6 @@ def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
         *([] if SAVE_WARC else ['--timestamping']),
         *([] if SAVE_WARC else ['--timestamping']),
         *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate', '--no-hsts']),
         *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate', '--no-hsts']),
         # '--server-response',  # print headers for better error parsing
         # '--server-response',  # print headers for better error parsing
-        *WGET_EXTRA_ARGS,
-        *WGET_ARGS,
     ]
     ]
     cmd = [
     cmd = [
         WGET_BINARY,
         WGET_BINARY,

+ 11 - 13
archivebox/util.py

@@ -240,6 +240,8 @@ def chrome_args(**options) -> List[str]:
 
 
     cmd_args = [options['CHROME_BINARY']]
     cmd_args = [options['CHROME_BINARY']]
 
 
+    cmd_args += CHROME_EXTRA_ARGS
+
     if options['CHROME_HEADLESS']:
     if options['CHROME_HEADLESS']:
         chrome_major_version = int(re.search(r'\s(\d+)\.\d', CHROME_VERSION)[1])
         chrome_major_version = int(re.search(r'\s(\d+)\.\d', CHROME_VERSION)[1])
         if chrome_major_version >= 111:
         if chrome_major_version >= 111:
@@ -284,7 +286,6 @@ def chrome_args(**options) -> List[str]:
     if options['CHROME_USER_DATA_DIR']:
     if options['CHROME_USER_DATA_DIR']:
         cmd_args.append('--user-data-dir={}'.format(options['CHROME_USER_DATA_DIR']))
         cmd_args.append('--user-data-dir={}'.format(options['CHROME_USER_DATA_DIR']))
 
 
-    cmd_args += CHROME_EXTRA_ARGS
 
 
     return dedupe(*cmd_args)
     return dedupe(*cmd_args)
 
 
@@ -324,20 +325,17 @@ def ansi_to_html(text):
 
 
 
 
 @enforce_types
 @enforce_types
-def dedupe(*options: List[str]) -> List[str]:
+def dedupe(*options: str) -> List[str]:
     """
     """
-    Deduplicates the given options. Options that come earlier in the list clobber
-    later conflicting options.
+    Deduplicates the given options. Options that come later clobber earlier
+    conflicting options.
     """
     """
-    seen_option_names = []
-    def test_seen(argument):
-        option_name = argument.split("=")[0]
-        if option_name in seen_option_names:
-            return False
-        else:
-            seen_option_names.append(option_name)
-            return True
-    return list(filter(test_seen, options))
+    deduped = {}
+
+    for option in options:
+        deduped[option.split('=')[0]] = option
+
+    return list(deduped.values())
 
 
 
 
 class AttributeDict(dict):
 class AttributeDict(dict):