Переглянути джерело

feat: Add options to ease management of node related extractors

Cristian 5 роки тому
батько
коміт
cc0fa747ce
4 змінених файлів з 31 додано та 11 видалено
  1. 17 8
      README.md
  2. 4 2
      archivebox/config/__init__.py
  3. 2 1
      package.json
  4. 8 0
      tests/test_extractors.py

+ 17 - 8
README.md

@@ -2,12 +2,12 @@
 <img src="https://i.imgur.com/4nkFjdv.png" height="80px">
 <h1>ArchiveBox<br/><sub>The open-source self-hosted web archive.</sub></h1>
 
-▶️ <a href="https://github.com/pirate/ArchiveBox/wiki/Quickstart">Quickstart</a> | 
-<a href="https://archivebox.zervice.io/">Demo</a> | 
-<a href="https://github.com/pirate/ArchiveBox">Github</a> | 
-<a href="https://github.com/pirate/ArchiveBox/wiki">Documentation</a> | 
-<a href="#background--motivation">Info & Motivation</a> | 
-<a href="https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community">Community</a> | 
+▶️ <a href="https://github.com/pirate/ArchiveBox/wiki/Quickstart">Quickstart</a> |
+<a href="https://archivebox.zervice.io/">Demo</a> |
+<a href="https://github.com/pirate/ArchiveBox">Github</a> |
+<a href="https://github.com/pirate/ArchiveBox/wiki">Documentation</a> |
+<a href="#background--motivation">Info & Motivation</a> |
+<a href="https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community">Community</a> |
 <a href="https://github.com/pirate/ArchiveBox/wiki/Roadmap">Roadmap</a>
 
 <pre>
@@ -22,6 +22,7 @@
 <a href="https://test.pypi.org/project/archivebox/"><img src="https://img.shields.io/badge/Python-%3E%3D3.7-yellow.svg?logo=python&logoColor=yellow"/></a>
 <a href="https://github.com/pirate/ArchiveBox/wiki/Install#dependencies"><img src="https://img.shields.io/badge/Chromium-%3E%3D59-orange.svg?logo=Google+Chrome&logoColor=orange"/></a>
 <a href="https://hub.docker.com/r/nikisweeting/archivebox"><img src="https://img.shields.io/badge/Docker-all%20platforms-lightblue.svg?logo=docker&logoColor=lightblue"/></a>
+
 <hr/>
 </div>
 
@@ -56,8 +57,8 @@ ArchiveBox is written in Python 3.7 and uses wget, Chrome headless, youtube-dl,
 
 ## Quickstart
 
-ArchiveBox is written in `python3.7` and has [3 main binary dependencies](https://github.com/pirate/ArchiveBox/wiki/Install#dependencies): `wget`, `chromium`, and `youtube-dl`.
-To get started, you can [install them manually](https://github.com/pirate/ArchiveBox/wiki/Install) using your system's package manager, use the [automated helper script](https://github.com/pirate/ArchiveBox/wiki/Quickstart), or use the official [Docker](https://github.com/pirate/ArchiveBox/wiki/Docker) container. All three dependencies are optional if [disabled](https://github.com/pirate/ArchiveBox/wiki/Configuration#archive-method-toggles) in settings.
+ArchiveBox is written in `python3.7` and has [4 main binary dependencies](https://github.com/pirate/ArchiveBox/wiki/Install#dependencies): `wget`, `chromium`, `youtube-dl`, and `nodejs`.
+To get started, you can [install them manually](https://github.com/pirate/ArchiveBox/wiki/Install) using your system's package manager, use the [automated helper script](https://github.com/pirate/ArchiveBox/wiki/Quickstart), or use the official [Docker](https://github.com/pirate/ArchiveBox/wiki/Docker) container. These dependencies are optional if [disabled](https://github.com/pirate/ArchiveBox/wiki/Configuration#archive-method-toggles) in settings.
 
 ```bash
 # Docker
@@ -82,9 +83,16 @@ open http://127.0.0.1:8000
 ```bash
 # Bare Metal
 # Use apt on Ubuntu/Debian, brew on mac, or pkg on BSD
+# You may need to add a third-party apt repository (e.g. NodeSource, as shown below) to get a more recent version of nodejs
 apt install python3 python3-pip git curl wget youtube-dl chromium-browser
 
+curl -s https://deb.nodesource.com/gpgkey/nodesource.gpg.key | apt-key add - \
+  && echo 'deb https://deb.nodesource.com/node_14.x buster main' >> /etc/apt/sources.list \
+  && apt-get update -qq \
+  && apt-get install -qq -y --no-install-recommends nodejs
+
 pip install archivebox      # install archivebox
+npm run setup
 
 mkdir data && cd data       # (doesn't have to be called data)
 archivebox init
@@ -97,6 +105,7 @@ archivebox add https://getpocket.com/users/USERNAME/feed/all --depth=1
 Once you've added your first links, open `data/index.html` in a browser to view the static archive.
 
 You can also start it as a server with a full web UI to manage your links:
+
 ```bash
 archivebox manage createsuperuser
 archivebox server

+ 4 - 2
archivebox/config/__init__.py

@@ -112,6 +112,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
         'USE_READABILITY':          {'type': bool,  'default': True},
         'USE_GIT':                  {'type': bool,  'default': True},
         'USE_CHROME':               {'type': bool,  'default': True},
+        'USE_NODE':                 {'type': bool,  'default': True},
         'USE_YOUTUBEDL':            {'type': bool,  'default': True},
 
         'CURL_BINARY':              {'type': str,   'default': 'curl'},
@@ -275,11 +276,12 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = {
     'USE_CHROME':               {'default': lambda c: c['USE_CHROME'] and (c['SAVE_PDF'] or c['SAVE_SCREENSHOT'] or c['SAVE_DOM'] or c['SAVE_SINGLEFILE'])},
     'CHROME_BINARY':            {'default': lambda c: c['CHROME_BINARY'] if c['CHROME_BINARY'] else find_chrome_binary()},
     'CHROME_VERSION':           {'default': lambda c: bin_version(c['CHROME_BINARY']) if c['USE_CHROME'] else None},
+    'USE_NODE':                 {'default': lambda c: c['USE_NODE'] and (c['SAVE_READABILITY'] or c['SAVE_SINGLEFILE'])},
     'SAVE_PDF':                 {'default': lambda c: c['USE_CHROME'] and c['SAVE_PDF']},
     'SAVE_SCREENSHOT':          {'default': lambda c: c['USE_CHROME'] and c['SAVE_SCREENSHOT']},
     'SAVE_DOM':                 {'default': lambda c: c['USE_CHROME'] and c['SAVE_DOM']},
-    'SAVE_SINGLEFILE':          {'default': lambda c: c['USE_CHROME'] and c['USE_SINGLEFILE']},
-    'SAVE_READABILITY':         {'default': lambda c: c['USE_READABILITY']},
+    'SAVE_SINGLEFILE':          {'default': lambda c: c['USE_CHROME'] and c['USE_SINGLEFILE'] and c['USE_NODE']},
+    'SAVE_READABILITY':         {'default': lambda c: c['USE_READABILITY'] and c['USE_NODE']},
 
     'DEPENDENCIES':             {'default': lambda c: get_dependency_info(c)},
     'CODE_LOCATIONS':           {'default': lambda c: get_code_locations(c)},

+ 2 - 1
package.json

@@ -5,7 +5,8 @@
 	"author": "Nick Sweeting <[email protected]>",
 	"license": "MIT",
 	"scripts": {
-		"archivebox": "./bin/archive"
+   "setup": "node -e \"const {execSync} = require('child_process'); Object.entries(JSON.parse(fs.readFileSync('package.json')).dependencies).forEach(globaldep => execSync('npm i -g ' + globaldep[1]))\"",
+   "archivebox": "./bin/archive"
 	},
 	"bin": {
 		"archivebox": "./bin/archive"

+ 8 - 0
tests/test_extractors.py

@@ -53,3 +53,11 @@ def test_readability_works_with_dom(tmp_path, process, disable_extractors_dict):
     archived_item_path = list(tmp_path.glob("archive/**/*"))[0]
     output_file = archived_item_path / "readability" / "content.html"
     assert output_file.exists()
+
+def test_use_node_false_disables_readability_and_singlefile(tmp_path, process, disable_extractors_dict):
+    disable_extractors_dict.update({"USE_READABILITY": "true", "SAVE_DOM": "true", "USE_SINGLEFILE": "true", "USE_NODE": "false"}) 
+    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'],
+                                  capture_output=True, env=disable_extractors_dict)
+    output_str = add_process.stdout.decode("utf-8")
+    assert "> singlefile" not in output_str
+    assert "> readability" not in output_str