Merge pull request #403 from cdvv7788/single-file

Nick Sweeting · 5 years ago · commit c8e3aed647

+ 24 - 0
.github/workflows/test.yml

@@ -40,6 +40,22 @@ jobs:
         with:
           fetch-depth: 1
 
+      - uses: actions/checkout@v2
+        with:
+          fetch-depth: 1
+          repository: "gildas-lormeau/SingleFile"
+          ref: "master"
+          path: "singlefile"
+
+      - name: Install npm requirements for singlefile
+        run: npm install --prefix singlefile/cli
+
+      - name: Give singlefile execution permissions
+        run: chmod +x singlefile/cli/single-file
+
+      - name: Set SINGLEFILE_BINARY
+        run: echo "::set-env name=SINGLEFILE_BINARY::$GITHUB_WORKSPACE/singlefile/cli/single-file"
+
       - name: Set up Python ${{ matrix.python }}
         uses: actions/setup-python@v1
         with:
@@ -60,6 +76,14 @@ jobs:
           restore-keys: |
             ${{ runner.os }}-${{ matrix.python }}-venv-
 
+      - name: Use nodejs 14.7.0
+        uses: actions/setup-node@v1
+        with:
+          node-version: 14.7.0
+
+      - name: Debug
+        run: ls ./
+
       - name: Install dependencies
         run: |
           python -m pip install .

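Context for the Set SINGLEFILE_BINARY step: ArchiveBox's config layer reads every option from the environment first, so exporting the variable is all this test job needs. A minimal sketch of that resolution (simplified; the real logic lives in archivebox/config/__init__.py, and 'single-file' matches the default added below):

    import os

    # env var wins; otherwise fall back to the config default 'single-file' on $PATH
    SINGLEFILE_BINARY = os.environ.get('SINGLEFILE_BINARY', 'single-file')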
+ 30 - 17
Dockerfile

@@ -10,8 +10,8 @@
 FROM python:3.8-slim-buster
 
 LABEL name="archivebox" \
-      maintainer="Nick Sweeting <[email protected]>" \
-      description="All-in-one personal internet archiving container"
+    maintainer="Nick Sweeting <[email protected]>" \
+    description="All-in-one personal internet archiving container"
 
 ENV TZ=UTC \
     LANGUAGE=en_US:en \
@@ -22,28 +22,40 @@ ENV TZ=UTC \
     APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=1 \
     CODE_PATH=/app \
     VENV_PATH=/venv \
-    DATA_PATH=/data
+    DATA_PATH=/data \
+    EXTRA_PATH=/extra
 
-# First install CLI utils and base deps, then Chrome + Fonts
+# First install CLI utils and base deps, then Chrome + Fonts + nodejs
 RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections \
     && apt-get update -qq \
     && apt-get install -qq -y --no-install-recommends \
-       apt-transport-https ca-certificates apt-utils gnupg gosu gnupg2 libgconf-2-4 zlib1g-dev \
-       dumb-init jq git wget curl youtube-dl ffmpeg \
+    apt-transport-https ca-certificates apt-utils gnupg gosu gnupg2 libgconf-2-4 zlib1g-dev \
+    dumb-init jq git wget curl youtube-dl ffmpeg \
     && curl -sSL "https://dl.google.com/linux/linux_signing_key.pub" | apt-key add - \
     && echo "deb https://dl.google.com/linux/chrome/deb/ stable main" > /etc/apt/sources.list.d/google-chrome.list \
+    && curl -sL https://deb.nodesource.com/setup_14.x | bash - \
     && apt-get update -qq \
     && apt-get install -qq -y --no-install-recommends \
-       google-chrome-stable \
-       fontconfig \
-       fonts-ipafont-gothic \
-       fonts-wqy-zenhei \
-       fonts-thai-tlwg \
-       fonts-kacst \
-       fonts-symbola \
-       fonts-noto \
-       fonts-freefont-ttf \
-    && rm -rf /var/lib/apt/lists/*
+    google-chrome-stable \
+    fontconfig \
+    fonts-ipafont-gothic \
+    fonts-wqy-zenhei \
+    fonts-thai-tlwg \
+    fonts-kacst \
+    fonts-symbola \
+    fonts-noto \
+    fonts-freefont-ttf \
+    nodejs \
+    unzip \
+    && rm -rf /var/lib/apt/lists/*
+
+# Download SingleFile and install its npm deps so archivebox can run it via SINGLEFILE_BINARY
+
+WORKDIR "$EXTRA_PATH"
+RUN wget -qO - https://github.com/gildas-lormeau/SingleFile/archive/master.zip > SingleFile.zip \
+    && unzip -q SingleFile.zip \
+    && npm install --prefix SingleFile-master/cli --production > /dev/null 2>&1 \
+    && chmod +x SingleFile-master/cli/single-file
 
 # Run everything from here on out as non-privileged user
 RUN groupadd --system archivebox \
@@ -60,7 +72,8 @@ VOLUME "$DATA_PATH"
 WORKDIR "$DATA_PATH"
 EXPOSE 8000
 ENV CHROME_BINARY=google-chrome \
-    CHROME_SANDBOX=False
+    CHROME_SANDBOX=False \
+    SINGLEFILE_BINARY="$EXTRA_PATH/SingleFile-master/cli/single-file"
 
 RUN env ALLOW_ROOT=True archivebox version
 

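The archivebox version smoke test above exercises this wiring: it resolves SINGLEFILE_BINARY and asks the binary for a version string. A rough standalone equivalent of that dependency check (simplified from config.bin_version()):

    import os
    import subprocess

    # resolves to /extra/SingleFile-master/cli/single-file inside the image
    binary = os.environ.get('SINGLEFILE_BINARY', 'single-file')
    result = subprocess.run([binary, '--version'], capture_output=True)
    print(result.stdout.decode().strip() or 'single-file missing or broken')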
+ 14 - 0
archivebox/config/__init__.py

@@ -74,6 +74,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
         'SAVE_FAVICON':             {'type': bool,  'default': True, 'aliases': ('FETCH_FAVICON',)},
         'SAVE_WGET':                {'type': bool,  'default': True, 'aliases': ('FETCH_WGET',)},
         'SAVE_WGET_REQUISITES':     {'type': bool,  'default': True, 'aliases': ('FETCH_WGET_REQUISITES',)},
+        'SAVE_SINGLEFILE':          {'type': bool,  'default': True, 'aliases': ('FETCH_SINGLEFILE',)},
         'SAVE_PDF':                 {'type': bool,  'default': True, 'aliases': ('FETCH_PDF',)},
         'SAVE_SCREENSHOT':          {'type': bool,  'default': True, 'aliases': ('FETCH_SCREENSHOT',)},
         'SAVE_DOM':                 {'type': bool,  'default': True, 'aliases': ('FETCH_DOM',)},
@@ -104,6 +105,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
     'DEPENDENCY_CONFIG': {
         'USE_CURL':                 {'type': bool,  'default': True},
         'USE_WGET':                 {'type': bool,  'default': True},
+        'USE_SINGLEFILE':           {'type': bool,  'default': True},
         'USE_GIT':                  {'type': bool,  'default': True},
         'USE_CHROME':               {'type': bool,  'default': True},
         'USE_YOUTUBEDL':            {'type': bool,  'default': True},
@@ -111,6 +113,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
         'CURL_BINARY':              {'type': str,   'default': 'curl'},
         'GIT_BINARY':               {'type': str,   'default': 'git'},
         'WGET_BINARY':              {'type': str,   'default': 'wget'},
+        'SINGLEFILE_BINARY':        {'type': str,   'default': 'single-file'},
         'YOUTUBEDL_BINARY':         {'type': str,   'default': 'youtube-dl'},
         'CHROME_BINARY':            {'type': str,   'default': None},
     },
@@ -249,6 +252,10 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = {
     'SAVE_WGET':                {'default': lambda c: c['USE_WGET'] and c['SAVE_WGET']},
     'SAVE_WARC':                {'default': lambda c: c['USE_WGET'] and c['SAVE_WARC']},
 
+    'USE_SINGLEFILE':           {'default': lambda c: c['USE_SINGLEFILE'] and (c['SAVE_SINGLEFILE'])},
+    'SINGLEFILE_VERSION':       {'default': lambda c: bin_version(c['SINGLEFILE_BINARY']) if c['USE_SINGLEFILE'] else None},
+    'SAVE_SINGLEFILE':          {'default': lambda c: c['USE_SINGLEFILE'] and c['SAVE_SINGLEFILE']},
+
     'USE_GIT':                  {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
     'GIT_VERSION':              {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None},
     'SAVE_GIT':                 {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
@@ -674,6 +681,13 @@ def get_dependency_info(config: ConfigDict) -> ConfigValue:
             'enabled': config['USE_WGET'],
             'is_valid': bool(config['WGET_VERSION']),
         },
+        'SINGLEFILE_BINARY': {
+            'path': bin_path(config['SINGLEFILE_BINARY']),
+            'version': config['SINGLEFILE_VERSION'],
+            'hash': bin_hash(config['SINGLEFILE_BINARY']),
+            'enabled': config['USE_SINGLEFILE'],
+            'is_valid': bool(config['SINGLEFILE_VERSION']),
+        },
         'GIT_BINARY': {
             'path': bin_path(config['GIT_BINARY']),
             'version': config['GIT_VERSION'],

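Since every CONFIG_DEFAULTS key can be overridden from the environment, the new extractor can be toggled per-run without touching the config file, the same way the tests below do it:

    import os
    import subprocess

    # hypothetical one-off run with singlefile disabled; the key name comes
    # from the CONFIG_DEFAULTS entry added above
    env = os.environ.copy()
    env['SAVE_SINGLEFILE'] = 'false'
    subprocess.run(['archivebox', 'add', 'https://example.com'], env=env)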
+ 2 - 0
archivebox/core/admin.py

@@ -106,6 +106,7 @@ class SnapshotAdmin(admin.ModelAdmin):
                 '<a href="/{}/{}" class="exists-{}" title="Screenshot">🖥 </a> '
                 '<a href="/{}/{}" class="exists-{}" title="HTML dump">🅷 </a> '
                 '<a href="/{}/{}/" class="exists-{}" title="WARC">🆆 </a> '
+                '<a href="/{}/{}" class="exists-{}" title="SingleFile">&#128476; </a> '
                 '<a href="/{}/{}/" class="exists-{}" title="Media files">📼 </a> '
                 '<a href="/{}/{}/" class="exists-{}" title="Git repos">📦 </a> '
                 '<a href="{}" class="exists-{}" title="Archive.org snapshot">🏛 </a> '
@@ -115,6 +116,7 @@ class SnapshotAdmin(admin.ModelAdmin):
             *link_tuple(link, 'screenshot_path'),
             *link_tuple(link, 'dom_path'),
             *link_tuple(link, 'warc_path')[:2], any((out_dir / canon['warc_path']).glob('*.warc.gz')),
+            *link_tuple(link, 'singlefile_path'),
             *link_tuple(link, 'media_path')[:2], any((out_dir / canon['media_path']).glob('*')),
             *link_tuple(link, 'git_path')[:2], any((out_dir / canon['git_path']).glob('*')),
             canon['archive_org_path'], (out_dir / 'archive.org.txt').exists(),

+ 2 - 0
archivebox/extractors/__init__.py

@@ -25,6 +25,7 @@ from ..logging_util import (
 from .title import should_save_title, save_title
 from .favicon import should_save_favicon, save_favicon
 from .wget import should_save_wget, save_wget
+from .singlefile import should_save_singlefile, save_singlefile
 from .pdf import should_save_pdf, save_pdf
 from .screenshot import should_save_screenshot, save_screenshot
 from .dom import should_save_dom, save_dom
@@ -37,6 +38,7 @@ def get_default_archive_methods():
             ('title', should_save_title, save_title),
             ('favicon', should_save_favicon, save_favicon),
             ('wget', should_save_wget, save_wget),
+            ('singlefile', should_save_singlefile, save_singlefile),
             ('pdf', should_save_pdf, save_pdf),
             ('screenshot', should_save_screenshot, save_screenshot),
             ('dom', should_save_dom, save_dom),

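Order in get_default_archive_methods() is execution order, so singlefile now runs right after wget for each snapshot. A simplified sketch of the dispatch loop that consumes these (name, should_save, save) tuples (the real runner also records per-method ArchiveResult history and error handling):

    from archivebox.extractors import get_default_archive_methods

    def run_all_methods(link, out_dir):
        # hypothetical helper; mirrors how each extractor is gated, then invoked
        for name, should_save, save in get_default_archive_methods():
            if should_save(link, out_dir=out_dir):
                yield name, save(link, out_dir=out_dir)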
+ 84 - 0
archivebox/extractors/singlefile.py

@@ -0,0 +1,84 @@
+__package__ = 'archivebox.extractors'
+
+from pathlib import Path
+
+from typing import Optional
+import json
+
+from ..index.schema import Link, ArchiveResult, ArchiveError
+from ..system import run, chmod_file
+from ..util import (
+    enforce_types,
+    chrome_args
+)
+from ..config import (
+    TIMEOUT,
+    SAVE_SINGLEFILE,
+    SINGLEFILE_BINARY,
+    SINGLEFILE_VERSION,
+    CHROME_BINARY,
+)
+from ..logging_util import TimedProgress
+
+
+@enforce_types
+def should_save_singlefile(link: Link, out_dir: Optional[str]=None) -> bool:
+    out_dir = out_dir or link.link_dir
+
+    output = Path(out_dir) / 'singlefile.html'
+    return SAVE_SINGLEFILE and (not output.exists())
+
+
+@enforce_types
+def save_singlefile(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+    """download full site using single-file"""
+
+    out_dir = out_dir or link.link_dir
+    output = str(Path(out_dir).absolute() / "singlefile.html")
+
+    browser_args = chrome_args(TIMEOUT=0)  # [chrome_binary, *chrome_flags]
+
+    # SingleFile CLI Docs: https://github.com/gildas-lormeau/SingleFile/tree/master/cli
+    cmd = [
+        SINGLEFILE_BINARY,
+        '--browser-executable-path={}'.format(CHROME_BINARY),
+        '--browser-args="{}"'.format(json.dumps(browser_args[1:])),
+        link.url,
+        output
+    ]
+
+    status = 'succeeded'
+    timer = TimedProgress(timeout, prefix='      ')
+    try:
+        result = run(cmd, cwd=out_dir, timeout=timeout)
+
+        # grab the last few lines of stdout/stderr to surface as hints on failure
+        output_tail = [
+            line.strip()
+            for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:]
+            if line.strip()
+        ]
+        hints = (
+            'Got single-file response code: {}.'.format(result.returncode),
+            *output_tail,
+        )
+
+        # Check for common failure cases
+        if result.returncode > 0:
+            raise ArchiveError('SingleFile was not able to archive the page', hints)
+        chmod_file(output)
+    except Exception as err:
+        status = 'failed'
+        output = err
+    finally:
+        timer.end()
+
+    return ArchiveResult(
+        cmd=cmd,
+        pwd=out_dir,
+        cmd_version=SINGLEFILE_VERSION,
+        output=output,
+        status=status,
+        **timer.stats,
+    )

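For reference, the cmd list above expands to an invocation shaped like single-file --browser-executable-path=... --browser-args=... <url> <output>. A self-contained sketch of the same call (assumes single-file and Chrome are on $PATH; the real code derives the flag list from chrome_args() and strips its leading chrome binary with browser_args[1:]):

    import json
    import subprocess

    flags = json.dumps(['--headless'])   # illustrative; chrome_args() adds more
    subprocess.run([
        'single-file',
        '--browser-executable-path=google-chrome',
        '--browser-args="{}"'.format(flags),
        'https://example.com',
        'singlefile.html',
    ], timeout=60)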
+ 4 - 3
archivebox/index/schema.py

@@ -365,6 +365,7 @@ class Link:
             'screenshot.png',
             'output.html',
             'media',
+            'singlefile.html',
         )
 
         return any(
@@ -376,7 +377,7 @@ class Link:
         """get the latest output that each archive method produced for link"""
         
         ARCHIVE_METHODS = (
-            'title', 'favicon', 'wget', 'warc', 'pdf',
+            'title', 'favicon', 'wget', 'warc', 'singlefile', 'pdf',
             'screenshot', 'dom', 'git', 'media', 'archive_org',
         )
         latest: Dict[str, ArchiveOutput] = {}
@@ -392,7 +393,6 @@ class Link:
                 latest[archive_method] = history[0].output
             else:
                 latest[archive_method] = None
-
         return latest
 
 
@@ -406,6 +406,7 @@ class Link:
             'google_favicon_path': 'https://www.google.com/s2/favicons?domain={}'.format(self.domain),
             'wget_path': wget_output_path(self),
             'warc_path': 'warc',
+            'singlefile_path': 'singlefile.html',
             'pdf_path': 'output.pdf',
             'screenshot_path': 'screenshot.png',
             'dom_path': 'output.html',
@@ -425,7 +426,7 @@ class Link:
                 'pdf_path': static_path,
                 'screenshot_path': static_path,
                 'dom_path': static_path,
+                'singlefile_path': static_path,
             })
         return canonical
 
-

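The 'singlefile.html' entry added to the output tuple feeds the existence check in the enclosing method: a snapshot counts as archived once any known output file is present. A standalone sketch of that check:

    from pathlib import Path

    # illustrative snapshot dir and the (abbreviated) output list from above
    out_dir = Path('archive/1600000000.0')
    output_paths = ('screenshot.png', 'output.html', 'media', 'singlefile.html')
    is_archived = any((out_dir / path).exists() for path in output_paths)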
+ 1 - 0
archivebox/logging_util.py

@@ -518,6 +518,7 @@ def printable_folder_status(name: str, folder: Dict) -> str:
 
 @enforce_types
 def printable_dependency_version(name: str, dependency: Dict) -> str:
+    version = None
     if dependency['enabled']:
         if dependency['is_valid']:
             color, symbol, note, version = 'green', '√', 'valid', ''

+ 13 - 0
archivebox/themes/legacy/link_details.html

@@ -79,6 +79,7 @@
             .card {
                 overflow: hidden;
                 box-shadow: 2px 3px 14px 0px rgba(0,0,0,0.02);
+                margin-top: 10px;
             }
             .card h4 {
                 font-size: 1.4vw;
@@ -335,6 +336,18 @@
                           </div>
                         </div>
                     </div>
+                    <div class="col-lg-2">
+                        <div class="card">
+                          <iframe class="card-img-top" src="$singlefile_path" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
+                          <div class="card-body">
+                            <a href="$singlefile_path" style="float:right" title="Open in new tab..." target="_blank" rel="noopener">
+                                <img src="../../static/external.png" class="external"/>
+                            </a>
+                            <a href="$singlefile_path" target="preview"><h4 class="card-title">SingleFile</h4></a>
+                            <p class="card-text">archive/singlefile.html</p>
+                          </div>
+                        </div>
+                    </div>
                     <div class="col-lg-2">
                         <div class="card">
                           <iframe class="card-img-top pdf-frame" src="$pdf_path" scrolling="no"></iframe>

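The $singlefile_path placeholders are filled by plain string substitution when the detail page renders, using the same $-style variables as the rest of this legacy theme. A minimal sketch of that step (Template-style, with the canonical value from schema.py above):

    from string import Template

    card = Template('<a href="$singlefile_path" target="preview">SingleFile</a>')
    html = card.safe_substitute(singlefile_path='singlefile.html')
    print(html)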
+ 16 - 1
tests/fixtures.py

@@ -7,4 +7,19 @@ import pytest
 def process(tmp_path):
     os.chdir(tmp_path)
     process = subprocess.run(['archivebox', 'init'], capture_output=True)
-    return process
+    return process
+
+@pytest.fixture
+def disable_extractors_dict():
+    env = os.environ.copy()
+    env.update({
+        "USE_WGET": "false",
+        "USE_SINGLEFILE": "false",
+        "SAVE_PDF": "false",
+        "SAVE_SCREENSHOT": "false",
+        "SAVE_DOM": "false",
+        "USE_GIT": "false",
+        "SAVE_MEDIA": "false",
+        "SAVE_ARCHIVE_DOT_ORG": "false"
+    })
+    return env

+ 14 - 9
tests/test_args.py

@@ -3,25 +3,30 @@ import json
 
 from .fixtures import *
 
-def test_depth_flag_is_accepted(process):
-    arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=0"], capture_output=True)
+def test_depth_flag_is_accepted(process, disable_extractors_dict):
+    arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=0"],
+                                  capture_output=True, env=disable_extractors_dict)
     assert 'unrecognized arguments: --depth' not in arg_process.stderr.decode("utf-8")
 
-def test_depth_flag_fails_if_it_is_not_0_or_1(process):
-    arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=5"], capture_output=True)
+def test_depth_flag_fails_if_it_is_not_0_or_1(process, disable_extractors_dict):
+    arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=5"],
+                                  capture_output=True, env=disable_extractors_dict)
     assert 'invalid choice' in arg_process.stderr.decode("utf-8")
-    arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=-1"], capture_output=True)
+    arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=-1"],
+                                  capture_output=True, env=disable_extractors_dict)
     assert 'invalid choice' in arg_process.stderr.decode("utf-8")
 
-def test_depth_flag_0_crawls_only_the_arg_page(tmp_path, process):
-    arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=0"], capture_output=True)
+def test_depth_flag_0_crawls_only_the_arg_page(tmp_path, process, disable_extractors_dict):
+    arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=0"],
+                                  capture_output=True, env=disable_extractors_dict)
     archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
     with open(archived_item_path / "index.json", "r") as f:
         output_json = json.load(f)
     assert output_json["base_url"] == "127.0.0.1:8080/static/example.com.html"
 
-def test_depth_flag_1_crawls_the_page_AND_links(tmp_path, process):
-    arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=1"], capture_output=True)
+def test_depth_flag_1_crawls_the_page_AND_links(tmp_path, process, disable_extractors_dict):
+    arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=1"],
+                                  capture_output=True, env=disable_extractors_dict)
     with open(tmp_path / "index.json", "r") as f:
         archive_file = f.read()
     assert "http://127.0.0.1:8080/static/example.com.html" in archive_file

+ 13 - 3
tests/test_extractors.py

@@ -1,8 +1,10 @@
 from .fixtures import *
 from archivebox.extractors import ignore_methods, get_default_archive_methods, should_save_title
 
-def test_wget_broken_pipe(tmp_path, process):
-    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True)
+def test_wget_broken_pipe(tmp_path, process, disable_extractors_dict):
+    disable_extractors_dict.update({"USE_WGET": "true"})
+    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'],
+                                 capture_output=True, env=disable_extractors_dict)
     assert "TypeError chmod_file(..., path: str) got unexpected NoneType argument path=None" not in add_process.stdout.decode("utf-8")
 
 def test_ignore_methods():
@@ -10,4 +12,12 @@ def test_ignore_methods():
     Takes the passed method out of the default methods list and returns that value
     """
     ignored = ignore_methods(['title'])
-    assert should_save_title not in ignored
+    assert should_save_title not in ignored
+
+def test_singlefile_works(tmp_path, process, disable_extractors_dict):
+    disable_extractors_dict.update({"USE_SINGLEFILE": "true"})
+    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'],
+                                  capture_output=True, env=disable_extractors_dict)
+    archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
+    output_file = archived_item_path / "singlefile.html"
+    assert output_file.exists()

+ 9 - 6
tests/test_init.py

@@ -18,9 +18,10 @@ def test_update(tmp_path, process):
     update_process = subprocess.run(['archivebox', 'init'], capture_output=True)
     assert "Updating existing ArchiveBox collection in this folder" in update_process.stdout.decode("utf-8")
 
-def test_add_link(tmp_path, process):
+def test_add_link(tmp_path, process, disable_extractors_dict):
     os.chdir(tmp_path)
-    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True)
+    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'],
+                                  capture_output=True, env=disable_extractors_dict)
     archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
 
     assert "index.json" in [x.name for x in archived_item_path.iterdir()]
@@ -33,9 +34,10 @@ def test_add_link(tmp_path, process):
         output_html = f.read()
     assert "Example Domain" in output_html
 
-def test_add_link_support_stdin(tmp_path, process):
+def test_add_link_support_stdin(tmp_path, process, disable_extractors_dict):
     os.chdir(tmp_path)
-    stdin_process = subprocess.Popen(["archivebox", "add"], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+    stdin_process = subprocess.Popen(["archivebox", "add"], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
+                                      env=disable_extractors_dict)
     stdin_process.communicate(input="http://127.0.0.1:8080/static/example.com.html".encode())
     archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
 
@@ -51,9 +53,10 @@ def test_correct_permissions_output_folder(tmp_path, process):
         file_path = tmp_path / file
         assert oct(file_path.stat().st_mode)[-3:] == OUTPUT_PERMISSIONS
 
-def test_correct_permissions_add_command_results(tmp_path, process):
+def test_correct_permissions_add_command_results(tmp_path, process, disable_extractors_dict):
     os.chdir(tmp_path)
-    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True)
+    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True,
+                                  env=disable_extractors_dict)
     archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
     for path in archived_item_path.iterdir():
         assert oct(path.stat().st_mode)[-3:] == OUTPUT_PERMISSIONS

+ 5 - 4
tests/test_oneshot.py

@@ -2,13 +2,14 @@ from pathlib import Path
 
 from .fixtures import *
 
-def test_oneshot_command_exists(tmp_path):
+def test_oneshot_command_exists(tmp_path, disable_extractors_dict):
     os.chdir(tmp_path)
-    process = subprocess.run(['archivebox', 'oneshot'], capture_output=True)
+    process = subprocess.run(['archivebox', 'oneshot'], capture_output=True, env=disable_extractors_dict)
     assert not "invalid choice: 'oneshot'" in process.stderr.decode("utf-8")
 
-def test_oneshot_command_saves_page_in_right_folder(tmp_path):
-    process = subprocess.run(["archivebox", "oneshot", f"--out-dir={tmp_path}", "http://127.0.0.1:8080/static/example.com.html"], capture_output=True)
+def test_oneshot_command_saves_page_in_right_folder(tmp_path, disable_extractors_dict):
+    process = subprocess.run(["archivebox", "oneshot", f"--out-dir={tmp_path}", "http://127.0.0.1:8080/static/example.com.html"],
+                              capture_output=True, env=disable_extractors_dict)
     items = ' '.join([str(x) for x in tmp_path.iterdir()])
     current_path = ' '.join([str(x) for x in Path.cwd().iterdir()])
     assert "index.json" in items

+ 2 - 2
tests/test_remove.py

@@ -1,8 +1,8 @@
 from .fixtures import *
 
-def test_remove_leaves_index_in_consistent_state(tmp_path, process):
+def test_remove_leaves_index_in_consistent_state(tmp_path, process, disable_extractors_dict):
     os.chdir(tmp_path)
-    subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True)
+    subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True, env=disable_extractors_dict)
     remove_process = subprocess.run(['archivebox', 'remove', '127.0.0.1:8080/static/example.com.html', '--yes'], capture_output=True)
     list_process = subprocess.run(['archivebox', 'list'], capture_output=True)
     assert "Warning: SQL index does not match JSON index!" not in list_process.stderr.decode("utf-8")

+ 3 - 2
tests/test_title.py

@@ -1,12 +1,13 @@
 from .fixtures import *
 
-def test_title_is_htmlencoded_in_index_html(tmp_path, process):
+def test_title_is_htmlencoded_in_index_html(tmp_path, process, disable_extractors_dict):
     """
     https://github.com/pirate/ArchiveBox/issues/330
     Unencoded content should not be rendered as it facilitates xss injections
     and breaks the layout.
     """
-    add_process = subprocess.run(['archivebox', 'add', 'http://localhost:8080/static/title_with_html.com.html'], capture_output=True)
+    add_process = subprocess.run(['archivebox', 'add', 'http://localhost:8080/static/title_with_html.com.html'],
+                                 capture_output=True, env=disable_extractors_dict)
 
     with open(tmp_path / "index.html", "r") as f:
         output_html = f.read()