Browse Source

Add `ARGS` and `EXTRA_ARGS` for Mercury extractor

Ben Muthalaly 1 year ago
parent
commit
f4deb97f59
2 changed files with 14 additions and 4 deletions
  1. 4 0
      archivebox/config.py
  2. 10 4
      archivebox/extractors/mercury.py

+ 4 - 0
archivebox/config.py

@@ -199,6 +199,8 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
         'GIT_ARGS':                 {'type': list,  'default': ['--recursive']},
         'SINGLEFILE_ARGS':          {'type': list,  'default': None},
         'SINGLEFILE_EXTRA_ARGS':    {'type': list,  'default': None},
+        'MERCURY_ARGS':             {'type': list,  'default': ['--format=text']},
+        'MERCURY_EXTRA_ARGS':       {'type': list,  'default': None},
         'FAVICON_PROVIDER':         {'type': str,   'default': 'https://www.google.com/s2/favicons?domain={}'},
     },
 
@@ -561,6 +563,8 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
 
     'USE_MERCURY':              {'default': lambda c: c['USE_MERCURY'] and c['SAVE_MERCURY']},
     'MERCURY_VERSION':          {'default': lambda c: '1.0.0' if shutil.which(str(bin_path(c['MERCURY_BINARY']))) else None},  # mercury doesnt expose version info until this is merged https://github.com/postlight/parser/pull/750
+    'MERCURY_ARGS':             {'default': lambda c: c['MERCURY_ARGS'] or []},
+    'MERCURY_EXTRA_ARGS':       {'default': lambda c: c['MERCURY_EXTRA_ARGS'] or []},
 
     'USE_GIT':                  {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
     'GIT_VERSION':              {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None},

+ 10 - 4
archivebox/extractors/mercury.py

@@ -11,13 +11,15 @@ from ..system import run, atomic_write
 from ..util import (
     enforce_types,
     is_static_file,
-
+    dedupe,
 )
 from ..config import (
     TIMEOUT,
     SAVE_MERCURY,
     DEPENDENCIES,
     MERCURY_VERSION,
+    MERCURY_ARGS,
+    MERCURY_EXTRA_ARGS,
 )
 from ..logging_util import TimedProgress
 
@@ -60,12 +62,16 @@ def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT)
     timer = TimedProgress(timeout, prefix='      ')
     try:
         output_folder.mkdir(exist_ok=True)
-
-        # Get plain text version of article
+        # later options take precedence
+        options = [
+            *MERCURY_ARGS,
+            *MERCURY_EXTRA_ARGS,
+        ]
+        # By default, get plain text version of article
         cmd = [
             DEPENDENCIES['MERCURY_BINARY']['path'],
             link.url,
-            "--format=text"
+            *dedupe(options)
         ]
         result = run(cmd, cwd=out_dir, timeout=timeout)
         try: