Merge pull request #403 from cdvv7788/single-file

Nick Sweeting · 5 years ago · commit c8e3aed647

+ 24 - 0
.github/workflows/test.yml

@@ -40,6 +40,22 @@ jobs:
         with:
           fetch-depth: 1
 
+      - uses: actions/checkout@v2
+        with:
+          fetch-depth: 1
+          repository: "gildas-lormeau/SingleFile"
+          ref: "master"
+          path: "singlefile"
+
+      - name: Install npm requirements for singlefile
+        run: npm install --prefix singlefile/cli
+
+      - name: Give singlefile execution permissions
+        run: chmod +x singlefile/cli/single-file
+
+      - name: Set SINGLEFILE_BINARY
+        run: echo "::set-env name=SINGLEFILE_BINARY::$GITHUB_WORKSPACE/singlefile/cli/single-file"
+
       - name: Set up Python ${{ matrix.python }}
         uses: actions/setup-python@v1
         with:
@@ -60,6 +76,14 @@ jobs:
           restore-keys: |
             ${{ runner.os }}-${{ matrix.python }}-venv-
 
+      - name: Use nodejs 14.7.0
+        uses: actions/setup-node@v1
+        with:
+          node-version: 14.7.0
+
+      - name: Debug
+        run: ls ./
+
       - name: Install dependencies
         run: |
           python -m pip install .

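Context for the Set SINGLEFILE_BINARY step: ArchiveBox's config layer reads every option from the environment first, so exporting the variable is all this test job needs. A minimal sketch of that resolution (simplified; the real logic lives in archivebox/config/__init__.py, and 'single-file' matches the default added below):

    import os

    # env var wins; otherwise fall back to the config default 'single-file' on $PATH
    SINGLEFILE_BINARY = os.environ.get('SINGLEFILE_BINARY', 'single-file')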
+ 30 - 17
Dockerfile

@@ -10,8 +10,8 @@
 FROM python:3.8-slim-buster
 
 LABEL name="archivebox" \
-      maintainer="Nick Sweeting <[email protected]>" \
-      description="All-in-one personal internet archiving container"
+    maintainer="Nick Sweeting <[email protected]>" \
+    description="All-in-one personal internet archiving container"
 
 ENV TZ=UTC \
     LANGUAGE=en_US:en \
@@ -22,28 +22,40 @@ ENV TZ=UTC \
     APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=1 \
     CODE_PATH=/app \
     VENV_PATH=/venv \
-    DATA_PATH=/data
+    DATA_PATH=/data \
+    EXTRA_PATH=/extra
 
-# First install CLI utils and base deps, then Chrome + Fonts
+# First install CLI utils and base deps, then Chrome + Fonts + nodejs
 RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections \
     && apt-get update -qq \
     && apt-get install -qq -y --no-install-recommends \
-       apt-transport-https ca-certificates apt-utils gnupg gosu gnupg2 libgconf-2-4 zlib1g-dev \
-       dumb-init jq git wget curl youtube-dl ffmpeg \
+    apt-transport-https ca-certificates apt-utils gnupg gosu gnupg2 libgconf-2-4 zlib1g-dev \
+    dumb-init jq git wget curl youtube-dl ffmpeg \
     && curl -sSL "https://dl.google.com/linux/linux_signing_key.pub" | apt-key add - \
     && echo "deb https://dl.google.com/linux/chrome/deb/ stable main" > /etc/apt/sources.list.d/google-chrome.list \
+    && curl -sL https://deb.nodesource.com/setup_14.x | bash - \
     && apt-get update -qq \
     && apt-get install -qq -y --no-install-recommends \
-       google-chrome-stable \
-       fontconfig \
-       fonts-ipafont-gothic \
-       fonts-wqy-zenhei \
-       fonts-thai-tlwg \
-       fonts-kacst \
-       fonts-symbola \
-       fonts-noto \
-       fonts-freefont-ttf \
-    && rm -rf /var/lib/apt/lists/*
+    google-chrome-stable \
+    fontconfig \
+    fonts-ipafont-gothic \
+    fonts-wqy-zenhei \
+    fonts-thai-tlwg \
+    fonts-kacst \
+    fonts-symbola \
+    fonts-noto \
+    fonts-freefont-ttf \
+    nodejs \
+    unzip \
+    && rm -rf /var/lib/apt/lists/*
+
+# Download SingleFile and install its npm deps so archivebox can run it via SINGLEFILE_BINARY
+
+WORKDIR "$EXTRA_PATH"
+RUN wget -qO - https://github.com/gildas-lormeau/SingleFile/archive/master.zip > SingleFile.zip \
+    && unzip -q SingleFile.zip \
+    && npm install --prefix SingleFile-master/cli --production > /dev/null 2>&1 \
+    && chmod +x SingleFile-master/cli/single-file
 
 # Run everything from here on out as non-privileged user
 RUN groupadd --system archivebox \
@@ -60,7 +72,8 @@ VOLUME "$DATA_PATH"
 WORKDIR "$DATA_PATH"
 EXPOSE 8000
 ENV CHROME_BINARY=google-chrome \
-    CHROME_SANDBOX=False
+    CHROME_SANDBOX=False \
+    SINGLEFILE_BINARY="$EXTRA_PATH/SingleFile-master/cli/single-file"
 
 RUN env ALLOW_ROOT=True archivebox version
 

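The archivebox version smoke test above exercises this wiring: it resolves SINGLEFILE_BINARY and asks the binary for a version string. A rough standalone equivalent of that dependency check (simplified from config.bin_version()):

    import os
    import subprocess

    # resolves to /extra/SingleFile-master/cli/single-file inside the image
    binary = os.environ.get('SINGLEFILE_BINARY', 'single-file')
    result = subprocess.run([binary, '--version'], capture_output=True)
    print(result.stdout.decode().strip() or 'single-file missing or broken')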
+ 14 - 0
archivebox/config/__init__.py

@@ -74,6 +74,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
         'SAVE_FAVICON':             {'type': bool,  'default': True, 'aliases': ('FETCH_FAVICON',)},
         'SAVE_WGET':                {'type': bool,  'default': True, 'aliases': ('FETCH_WGET',)},
         'SAVE_WGET_REQUISITES':     {'type': bool,  'default': True, 'aliases': ('FETCH_WGET_REQUISITES',)},
+        'SAVE_SINGLEFILE':          {'type': bool,  'default': True, 'aliases': ('FETCH_SINGLEFILE',)},
         'SAVE_PDF':                 {'type': bool,  'default': True, 'aliases': ('FETCH_PDF',)},
         'SAVE_SCREENSHOT':          {'type': bool,  'default': True, 'aliases': ('FETCH_SCREENSHOT',)},
         'SAVE_DOM':                 {'type': bool,  'default': True, 'aliases': ('FETCH_DOM',)},
@@ -104,6 +105,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
     'DEPENDENCY_CONFIG': {
         'USE_CURL':                 {'type': bool,  'default': True},
         'USE_WGET':                 {'type': bool,  'default': True},
+        'USE_SINGLEFILE':           {'type': bool,  'default': True},
         'USE_GIT':                  {'type': bool,  'default': True},
         'USE_CHROME':               {'type': bool,  'default': True},
         'USE_YOUTUBEDL':            {'type': bool,  'default': True},
@@ -111,6 +113,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
         'CURL_BINARY':              {'type': str,   'default': 'curl'},
         'GIT_BINARY':               {'type': str,   'default': 'git'},
         'WGET_BINARY':              {'type': str,   'default': 'wget'},
+        'SINGLEFILE_BINARY':        {'type': str,   'default': 'single-file'},
         'YOUTUBEDL_BINARY':         {'type': str,   'default': 'youtube-dl'},
         'CHROME_BINARY':            {'type': str,   'default': None},
     },
@@ -249,6 +252,10 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = {
     'SAVE_WGET':                {'default': lambda c: c['USE_WGET'] and c['SAVE_WGET']},
     'SAVE_WARC':                {'default': lambda c: c['USE_WGET'] and c['SAVE_WARC']},
 
+    'USE_SINGLEFILE':           {'default': lambda c: c['USE_SINGLEFILE'] and (c['SAVE_SINGLEFILE'])},
+    'SINGLEFILE_VERSION':       {'default': lambda c: bin_version(c['SINGLEFILE_BINARY']) if c['USE_SINGLEFILE'] else None},
+    'SAVE_SINGLEFILE':          {'default': lambda c: c['USE_SINGLEFILE'] and c['SAVE_SINGLEFILE']},
+
     'USE_GIT':                  {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
     'GIT_VERSION':              {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None},
     'SAVE_GIT':                 {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
@@ -674,6 +681,13 @@ def get_dependency_info(config: ConfigDict) -> ConfigValue:
             'enabled': config['USE_WGET'],
             'is_valid': bool(config['WGET_VERSION']),
         },
+        'SINGLEFILE_BINARY': {
+            'path': bin_path(config['SINGLEFILE_BINARY']),
+            'version': config['SINGLEFILE_VERSION'],
+            'hash': bin_hash(config['SINGLEFILE_BINARY']),
+            'enabled': config['USE_SINGLEFILE'],
+            'is_valid': bool(config['SINGLEFILE_VERSION']),
+        },
         'GIT_BINARY': {
             'path': bin_path(config['GIT_BINARY']),
             'version': config['GIT_VERSION'],

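Since every CONFIG_DEFAULTS key can be overridden from the environment, the new extractor can be toggled per-run without touching the config file, the same way the tests below do it:

    import os
    import subprocess

    # hypothetical one-off run with singlefile disabled; the key name comes
    # from the CONFIG_DEFAULTS entry added above
    env = os.environ.copy()
    env['SAVE_SINGLEFILE'] = 'false'
    subprocess.run(['archivebox', 'add', 'https://example.com'], env=env)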
+ 2 - 0
archivebox/core/admin.py

@@ -106,6 +106,7 @@ class SnapshotAdmin(admin.ModelAdmin):
                 '<a href="/{}/{}" class="exists-{}" title="Screenshot">🖥 </a> '
                 '<a href="/{}/{}" class="exists-{}" title="HTML dump">🅷 </a> '
                 '<a href="/{}/{}/" class="exists-{}" title="WARC">🆆 </a> '
+                '<a href="/{}/{}" class="exists-{}" title="SingleFile">&#128476; </a> '
                 '<a href="/{}/{}/" class="exists-{}" title="Media files">📼 </a> '
                 '<a href="/{}/{}/" class="exists-{}" title="Git repos">📦 </a> '
                 '<a href="{}" class="exists-{}" title="Archive.org snapshot">🏛 </a> '
@@ -115,6 +116,7 @@ class SnapshotAdmin(admin.ModelAdmin):
             *link_tuple(link, 'screenshot_path'),
             *link_tuple(link, 'dom_path'),
             *link_tuple(link, 'warc_path')[:2], any((out_dir / canon['warc_path']).glob('*.warc.gz')),
+            *link_tuple(link, 'singlefile_path'),
             *link_tuple(link, 'media_path')[:2], any((out_dir / canon['media_path']).glob('*')),
             *link_tuple(link, 'git_path')[:2], any((out_dir / canon['git_path']).glob('*')),
             canon['archive_org_path'], (out_dir / 'archive.org.txt').exists(),

+ 2 - 0
archivebox/extractors/__init__.py

@@ -25,6 +25,7 @@ from ..logging_util import (
 from .title import should_save_title, save_title
 from .favicon import should_save_favicon, save_favicon
 from .wget import should_save_wget, save_wget
+from .singlefile import should_save_singlefile, save_singlefile
 from .pdf import should_save_pdf, save_pdf
 from .screenshot import should_save_screenshot, save_screenshot
 from .dom import should_save_dom, save_dom
@@ -37,6 +38,7 @@ def get_default_archive_methods():
             ('title', should_save_title, save_title),
             ('favicon', should_save_favicon, save_favicon),
             ('wget', should_save_wget, save_wget),
+            ('singlefile', should_save_singlefile, save_singlefile),
             ('pdf', should_save_pdf, save_pdf),
             ('screenshot', should_save_screenshot, save_screenshot),
             ('dom', should_save_dom, save_dom),

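Order in get_default_archive_methods() is execution order, so singlefile now runs right after wget for each snapshot. A simplified sketch of the dispatch loop that consumes these (name, should_save, save) tuples (the real runner also records per-method ArchiveResult history and error handling):

    from archivebox.extractors import get_default_archive_methods

    def run_all_methods(link, out_dir):
        # hypothetical helper; mirrors how each extractor is gated, then invoked
        for name, should_save, save in get_default_archive_methods():
            if should_save(link, out_dir=out_dir):
                yield name, save(link, out_dir=out_dir)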
+ 84 - 0
archivebox/extractors/singlefile.py

@@ -0,0 +1,84 @@
+__package__ = 'archivebox.extractors'
+
+from pathlib import Path
+
+from typing import Optional
+import json
+
+from ..index.schema import Link, ArchiveResult, ArchiveError
+from ..system import run, chmod_file
+from ..util import (
+    enforce_types,
+    chrome_args
+)
+from ..config import (
+    TIMEOUT,
+    SAVE_SINGLEFILE,
+    SINGLEFILE_BINARY,
+    SINGLEFILE_VERSION,
+    CHROME_BINARY,
+)
+from ..logging_util import TimedProgress
+
+
+@enforce_types
+def should_save_singlefile(link: Link, out_dir: Optional[str]=None) -> bool:
+    out_dir = out_dir or link.link_dir
+
+    output = Path(out_dir) / 'singlefile.html'
+    return SAVE_SINGLEFILE and (not output.exists())
+
+
+@enforce_types
+def save_singlefile(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+    """download full site using single-file"""
+
+    out_dir = out_dir or link.link_dir
+    output = str(Path(out_dir).absolute() / "singlefile.html")
+
+    browser_args = chrome_args(TIMEOUT=0)  # [chrome_binary, *chrome_flags]
+
+    # SingleFile CLI Docs: https://github.com/gildas-lormeau/SingleFile/tree/master/cli
+    cmd = [
+        SINGLEFILE_BINARY,
+        '--browser-executable-path={}'.format(CHROME_BINARY),
+        '--browser-args="{}"'.format(json.dumps(browser_args[1:])),
+        link.url,
+        output
+    ]
+
+    status = 'succeeded'
+    timer = TimedProgress(timeout, prefix='      ')
+    try:
+        result = run(cmd, cwd=out_dir, timeout=timeout)
+
+        # grab the last few lines of stdout/stderr to surface as hints on failure
+        output_tail = [
+            line.strip()
+            for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:]
+            if line.strip()
+        ]
+        hints = (
+            'Got single-file response code: {}.'.format(result.returncode),
+            *output_tail,
+        )
+
+        # Check for common failure cases
+        if result.returncode > 0:
+            raise ArchiveError('SingleFile was not able to archive the page', hints)
+        chmod_file(output)
+    except Exception as err:
+        status = 'failed'
+        output = err
+    finally:
+        timer.end()
+
+    return ArchiveResult(
+        cmd=cmd,
+        pwd=out_dir,
+        cmd_version=SINGLEFILE_VERSION,
+        output=output,
+        status=status,
+        **timer.stats,
+    )

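For reference, the cmd list above expands to an invocation shaped like single-file --browser-executable-path=... --browser-args=... <url> <output>. A self-contained sketch of the same call (assumes single-file and Chrome are on $PATH; the real code derives the flag list from chrome_args() and strips its leading chrome binary with browser_args[1:]):

    import json
    import subprocess

    flags = json.dumps(['--headless'])   # illustrative; chrome_args() adds more
    subprocess.run([
        'single-file',
        '--browser-executable-path=google-chrome',
        '--browser-args="{}"'.format(flags),
        'https://example.com',
        'singlefile.html',
    ], timeout=60)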
+ 4 - 3
archivebox/index/schema.py

@@ -365,6 +365,7 @@ class Link:
             'screenshot.png',
             'output.html',
             'media',
+            'singlefile.html',
         )
 
         return any(
@@ -376,7 +377,7 @@ class Link:
         """get the latest output that each archive method produced for link"""
         
         ARCHIVE_METHODS = (
-            'title', 'favicon', 'wget', 'warc', 'pdf',
+            'title', 'favicon', 'wget', 'warc', 'singlefile', 'pdf',
             'screenshot', 'dom', 'git', 'media', 'archive_org',
         )
         latest: Dict[str, ArchiveOutput] = {}
@@ -392,7 +393,6 @@ class Link:
                 latest[archive_method] = history[0].output
             else:
                 latest[archive_method] = None
-
         return latest
 
 
@@ -406,6 +406,7 @@ class Link:
             'google_favicon_path': 'https://www.google.com/s2/favicons?domain={}'.format(self.domain),
             'wget_path': wget_output_path(self),
             'warc_path': 'warc',
+            'singlefile_path': 'singlefile.html',
             'pdf_path': 'output.pdf',
             'screenshot_path': 'screenshot.png',
             'dom_path': 'output.html',
@@ -425,7 +426,7 @@ class Link:
                 'pdf_path': static_path,
                 'screenshot_path': static_path,
                 'dom_path': static_path,
+                'singlefile_path': static_path,
             })
         return canonical
 
-

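The 'singlefile.html' entry added to the output tuple feeds the existence check in the enclosing method: a snapshot counts as archived once any known output file is present. A standalone sketch of that check:

    from pathlib import Path

    # illustrative snapshot dir and the (abbreviated) output list from above
    out_dir = Path('archive/1600000000.0')
    output_paths = ('screenshot.png', 'output.html', 'media', 'singlefile.html')
    is_archived = any((out_dir / path).exists() for path in output_paths)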
+ 1 - 0
archivebox/logging_util.py

@@ -518,6 +518,7 @@ def printable_folder_status(name: str, folder: Dict) -> str:
 
 @enforce_types
 def printable_dependency_version(name: str, dependency: Dict) -> str:
+    version = None
     if dependency['enabled']:
         if dependency['is_valid']:
             color, symbol, note, version = 'green', '√', 'valid', ''

+ 13 - 0
archivebox/themes/legacy/link_details.html

@@ -79,6 +79,7 @@
             .card {
                 overflow: hidden;
                 box-shadow: 2px 3px 14px 0px rgba(0,0,0,0.02);
+                margin-top: 10px;
             }
             .card h4 {
                 font-size: 1.4vw;
@@ -335,6 +336,18 @@
                           </div>
                         </div>
                     </div>
+                    <div class="col-lg-2">
+                        <div class="card">
+                          <iframe class="card-img-top" src="$singlefile_path" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
+                          <div class="card-body">
+                            <a href="$singlefile_path" style="float:right" title="Open in new tab..." target="_blank" rel="noopener">
+                                <img src="../../static/external.png" class="external"/>
+                            </a>
+                            <a href="$singlefile_path" target="preview"><h4 class="card-title">SingleFile</h4></a>
+                            <p class="card-text">archive/singlefile.html</p>
+                          </div>
+                        </div>
+                    </div>
                     <div class="col-lg-2">
                         <div class="card">
                           <iframe class="card-img-top pdf-frame" src="$pdf_path" scrolling="no"></iframe>

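The $singlefile_path placeholders are filled by plain string substitution when the detail page renders, using the same $-style variables as the rest of this legacy theme. A minimal sketch of that step (Template-style, with the canonical value from schema.py above):

    from string import Template

    card = Template('<a href="$singlefile_path" target="preview">SingleFile</a>')
    html = card.safe_substitute(singlefile_path='singlefile.html')
    print(html)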
+ 16 - 1
tests/fixtures.py

@@ -7,4 +7,19 @@ import pytest
 def process(tmp_path):
     os.chdir(tmp_path)
     process = subprocess.run(['archivebox', 'init'], capture_output=True)
-    return process
+    return process
+
+@pytest.fixture
+def disable_extractors_dict():
+    env = os.environ.copy()
+    env.update({
+        "USE_WGET": "false",
+        "USE_SINGLEFILE": "false",
+        "SAVE_PDF": "false",
+        "SAVE_SCREENSHOT": "false",
+        "SAVE_DOM": "false",
+        "USE_GIT": "false",
+        "SAVE_MEDIA": "false",
+        "SAVE_ARCHIVE_DOT_ORG": "false"
+    })
+    return env

+ 14 - 9
tests/test_args.py

@@ -3,25 +3,30 @@ import json
 
 from .fixtures import *
 
-def test_depth_flag_is_accepted(process):
-    arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=0"], capture_output=True)
+def test_depth_flag_is_accepted(process, disable_extractors_dict):
+    arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=0"],
+                                  capture_output=True, env=disable_extractors_dict)
     assert 'unrecognized arguments: --depth' not in arg_process.stderr.decode("utf-8")
 
-def test_depth_flag_fails_if_it_is_not_0_or_1(process):
-    arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=5"], capture_output=True)
+def test_depth_flag_fails_if_it_is_not_0_or_1(process, disable_extractors_dict):
+    arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=5"],
+                                  capture_output=True, env=disable_extractors_dict)
     assert 'invalid choice' in arg_process.stderr.decode("utf-8")
-    arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=-1"], capture_output=True)
+    arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=-1"],
+                                  capture_output=True, env=disable_extractors_dict)
     assert 'invalid choice' in arg_process.stderr.decode("utf-8")
 
-def test_depth_flag_0_crawls_only_the_arg_page(tmp_path, process):
-    arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=0"], capture_output=True)
+def test_depth_flag_0_crawls_only_the_arg_page(tmp_path, process, disable_extractors_dict):
+    arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=0"],
+                                  capture_output=True, env=disable_extractors_dict)
     archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
     with open(archived_item_path / "index.json", "r") as f:
         output_json = json.load(f)
     assert output_json["base_url"] == "127.0.0.1:8080/static/example.com.html"
 
-def test_depth_flag_1_crawls_the_page_AND_links(tmp_path, process):
-    arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=1"], capture_output=True)
+def test_depth_flag_1_crawls_the_page_AND_links(tmp_path, process, disable_extractors_dict):
+    arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=1"],
+                                  capture_output=True, env=disable_extractors_dict)
     with open(tmp_path / "index.json", "r") as f:
         archive_file = f.read()
     assert "http://127.0.0.1:8080/static/example.com.html" in archive_file

+ 13 - 3
tests/test_extractors.py

@@ -1,8 +1,10 @@
 from .fixtures import *
 from archivebox.extractors import ignore_methods, get_default_archive_methods, should_save_title
 
-def test_wget_broken_pipe(tmp_path, process):
-    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True)
+def test_wget_broken_pipe(tmp_path, process, disable_extractors_dict):
+    disable_extractors_dict.update({"USE_WGET": "true"})
+    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'],
+                                 capture_output=True, env=disable_extractors_dict)
     assert "TypeError chmod_file(..., path: str) got unexpected NoneType argument path=None" not in add_process.stdout.decode("utf-8")
 
 def test_ignore_methods():
@@ -10,4 +12,12 @@ def test_ignore_methods():
     Takes the passed method out of the default methods list and returns that value
     """
     ignored = ignore_methods(['title'])
-    assert should_save_title not in ignored
+    assert should_save_title not in ignored
+
+def test_singlefile_works(tmp_path, process, disable_extractors_dict):
+    disable_extractors_dict.update({"USE_SINGLEFILE": "true"})
+    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'],
+                                  capture_output=True, env=disable_extractors_dict)
+    archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
+    output_file = archived_item_path / "singlefile.html"
+    assert output_file.exists()

+ 9 - 6
tests/test_init.py

@@ -18,9 +18,10 @@ def test_update(tmp_path, process):
     update_process = subprocess.run(['archivebox', 'init'], capture_output=True)
     assert "Updating existing ArchiveBox collection in this folder" in update_process.stdout.decode("utf-8")
 
-def test_add_link(tmp_path, process):
+def test_add_link(tmp_path, process, disable_extractors_dict):
     os.chdir(tmp_path)
-    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True)
+    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'],
+                                  capture_output=True, env=disable_extractors_dict)
     archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
 
     assert "index.json" in [x.name for x in archived_item_path.iterdir()]
@@ -33,9 +34,10 @@ def test_add_link(tmp_path, process):
         output_html = f.read()
     assert "Example Domain" in output_html
 
-def test_add_link_support_stdin(tmp_path, process):
+def test_add_link_support_stdin(tmp_path, process, disable_extractors_dict):
     os.chdir(tmp_path)
-    stdin_process = subprocess.Popen(["archivebox", "add"], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+    stdin_process = subprocess.Popen(["archivebox", "add"], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
+                                      env=disable_extractors_dict)
     stdin_process.communicate(input="http://127.0.0.1:8080/static/example.com.html".encode())
     archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
 
@@ -51,9 +53,10 @@ def test_correct_permissions_output_folder(tmp_path, process):
         file_path = tmp_path / file
         assert oct(file_path.stat().st_mode)[-3:] == OUTPUT_PERMISSIONS
 
-def test_correct_permissions_add_command_results(tmp_path, process):
+def test_correct_permissions_add_command_results(tmp_path, process, disable_extractors_dict):
     os.chdir(tmp_path)
-    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True)
+    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True,
+                                  env=disable_extractors_dict)
     archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
     for path in archived_item_path.iterdir():
         assert oct(path.stat().st_mode)[-3:] == OUTPUT_PERMISSIONS

+ 5 - 4
tests/test_oneshot.py

@@ -2,13 +2,14 @@ from pathlib import Path
 
 from .fixtures import *
 
-def test_oneshot_command_exists(tmp_path):
+def test_oneshot_command_exists(tmp_path, disable_extractors_dict):
     os.chdir(tmp_path)
-    process = subprocess.run(['archivebox', 'oneshot'], capture_output=True)
+    process = subprocess.run(['archivebox', 'oneshot'], capture_output=True, env=disable_extractors_dict)
     assert not "invalid choice: 'oneshot'" in process.stderr.decode("utf-8")
 
-def test_oneshot_command_saves_page_in_right_folder(tmp_path):
-    process = subprocess.run(["archivebox", "oneshot", f"--out-dir={tmp_path}", "http://127.0.0.1:8080/static/example.com.html"], capture_output=True)
+def test_oneshot_command_saves_page_in_right_folder(tmp_path, disable_extractors_dict):
+    process = subprocess.run(["archivebox", "oneshot", f"--out-dir={tmp_path}", "http://127.0.0.1:8080/static/example.com.html"],
+                              capture_output=True, env=disable_extractors_dict)
     items = ' '.join([str(x) for x in tmp_path.iterdir()])
     current_path = ' '.join([str(x) for x in Path.cwd().iterdir()])
     assert "index.json" in items

+ 2 - 2
tests/test_remove.py

@@ -1,8 +1,8 @@
 from .fixtures import *
 
-def test_remove_leaves_index_in_consistent_state(tmp_path, process):
+def test_remove_leaves_index_in_consistent_state(tmp_path, process, disable_extractors_dict):
     os.chdir(tmp_path)
-    subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True)
+    subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True, env=disable_extractors_dict)
     remove_process = subprocess.run(['archivebox', 'remove', '127.0.0.1:8080/static/example.com.html', '--yes'], capture_output=True)
     list_process = subprocess.run(['archivebox', 'list'], capture_output=True)
     assert "Warning: SQL index does not match JSON index!" not in list_process.stderr.decode("utf-8")

+ 3 - 2
tests/test_title.py

@@ -1,12 +1,13 @@
 from .fixtures import *
 
-def test_title_is_htmlencoded_in_index_html(tmp_path, process):
+def test_title_is_htmlencoded_in_index_html(tmp_path, process, disable_extractors_dict):
     """
     https://github.com/pirate/ArchiveBox/issues/330
     Unencoded content should not be rendered as it facilitates xss injections
     and breaks the layout.
     """
-    add_process = subprocess.run(['archivebox', 'add', 'http://localhost:8080/static/title_with_html.com.html'], capture_output=True)
+    add_process = subprocess.run(['archivebox', 'add', 'http://localhost:8080/static/title_with_html.com.html'],
+                                 capture_output=True, env=disable_extractors_dict)
 
     with open(tmp_path / "index.html", "r") as f:
         output_html = f.read()