Просмотр исходного кода

feat: Add WGET_ARGS to control wget arguments

Cristian 5 лет назад
Родитель
Коммит
24e7a74855
3 изменённых файла: 15 добавлений и 9 удалений
  1. 12 1
      archivebox/config/__init__.py
  2. 1 0
      archivebox/config/stubs.py
  3. 2 8
      archivebox/extractors/wget.py

+ 12 - 1
archivebox/config/__init__.py

@@ -120,7 +120,17 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
                                                                 '--audio-format', 'mp3',
                                                                 '--audio-quality', '320K',
                                                                 '--embed-thumbnail',
-                                                                '--add-metadata']}
+                                                                '--add-metadata']},
+
+        'WGET_ARGS':                {'type': list,  'default': ['--no-verbose',
+                                                                '--adjust-extension',
+                                                                '--convert-links',
+                                                                '--force-directories',
+                                                                '--backup-converted',
+                                                                '--span-hosts',
+                                                                '--no-parent',
+                                                                '-e', 'robots=off',
+                                                                ]}
     },
 
     'DEPENDENCY_CONFIG': {
@@ -276,6 +286,7 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = {
     'WGET_USER_AGENT':          {'default': lambda c: c['WGET_USER_AGENT'].format(**c)},
     'SAVE_WGET':                {'default': lambda c: c['USE_WGET'] and c['SAVE_WGET']},
     'SAVE_WARC':                {'default': lambda c: c['USE_WGET'] and c['SAVE_WARC']},
+    'WGET_ARGS':                {'default': lambda c: c['WGET_ARGS'] or []},
 
     'USE_SINGLEFILE':           {'default': lambda c: c['USE_SINGLEFILE'] and c['SAVE_SINGLEFILE']},
     'SINGLEFILE_VERSION':       {'default': lambda c: bin_version(c['SINGLEFILE_BINARY']) if c['USE_SINGLEFILE'] else None},

+ 1 - 0
archivebox/config/stubs.py

@@ -95,6 +95,7 @@ class ConfigDict(BaseConfig, total=False):
     CHROME_BINARY: Optional[str]
 
     YOUTUBEDL_ARGS: Optional[str]
+    WGET_ARGS: Optional[str]
 
 
 ConfigDefaultValueGetter = Callable[[ConfigDict], ConfigValue]

+ 2 - 8
archivebox/extractors/wget.py

@@ -19,6 +19,7 @@ from ..util import (
     urldecode,
 )
 from ..config import (
+    WGET_ARGS,
     TIMEOUT,
     SAVE_WGET,
     SAVE_WARC,
@@ -59,14 +60,7 @@ def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
     cmd = [
         WGET_BINARY,
         # '--server-response',  # print headers for better error parsing
-        '--no-verbose',
-        '--adjust-extension',
-        '--convert-links',
-        '--force-directories',
-        '--backup-converted',
-        '--span-hosts',
-        '--no-parent',
-        '-e', 'robots=off',
+        *WGET_ARGS,
         '--timeout={}'.format(timeout),
         *(['--restrict-file-names={}'.format(RESTRICT_FILE_NAMES)] if RESTRICT_FILE_NAMES else []),
         *(['--warc-file={}'.format(str(warc_path))] if SAVE_WARC else []),