2
0
Эх сурвалжийг харах

Merge branch 'master' into archive-result

Nick Sweeting 5 жил өмнө
parent
commit
efe3027797

+ 1 - 1
.github/FUNDING.yml

@@ -1,3 +1,3 @@
 github: pirate
 github: pirate
 patreon: theSquashSH
 patreon: theSquashSH
-custom: ["https://paypal.me/NicholasSweeting", "https://www.blockchain.com/eth/address/0x5D4c34D4a121Fe08d1dDB7969F07550f2dB9f471", "https://www.blockchain.com/btc/address/1HuxXriPE2Bbnag3jJrqa3bkNHrs297dYH"]
+custom: ["https://twitter.com/ArchiveBoxApp", "https://paypal.me/NicholasSweeting", "https://www.blockchain.com/eth/address/0x5D4c34D4a121Fe08d1dDB7969F07550f2dB9f471", "https://www.blockchain.com/btc/address/1HuxXriPE2Bbnag3jJrqa3bkNHrs297dYH"]

+ 54 - 0
.github/workflows/docker.yml

@@ -0,0 +1,54 @@
+name: Docker Push
+
+on:
+  workflow_dispatch:
+  push:
+
+jobs:
+  buildx:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Docker Login
+        uses: docker/login-action@v1
+        with:
+           username: ${{ secrets.DOCKER_USERNAME }}
+           password: ${{ secrets.DOCKER_PASSWORD }}
+      - name: Checkout
+        uses: actions/checkout@v2
+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@v1
+      - name: Set up Docker Buildx
+        id: buildx
+        uses: docker/setup-buildx-action@v1
+        with:
+          version: latest
+          install: true
+      - name: Builder instance name
+        run: echo ${{ steps.buildx.outputs.name }}
+      - name: Available platforms
+        run: echo ${{ steps.buildx.outputs.platforms }}
+      - name: Cache Docker layers
+        uses: actions/cache@v2
+        with:
+          path: /tmp/.buildx-cache
+          key: ${{ runner.os }}-buildx-${{ github.sha }}
+          restore-keys: |
+            ${{ runner.os }}-buildx-
+      - name: Build and push
+        id: docker_build
+        uses: docker/build-push-action@v2
+        with:
+          context: ./
+          file: ./Dockerfile
+          builder: ${{ steps.buildx.outputs.name }}
+          push: true
+          tags: |
+            ${{ secrets.DOCKER_USERNAME }}/archivebox:latest
+            ${{ secrets.DOCKER_USERNAME }}/archivebox:${{ github.sha }}
+            archivebox/archivebox:latest
+            archivebox/archivebox:${{ github.sha }}
+          cache-from: type=local,src=/tmp/.buildx-cache
+          cache-to: type=local,dest=/tmp/.buildx-cache
+          platforms: linux/amd64,linux/arm64,linux/arm/v7
+      - name: Image digest
+        run: echo ${{ steps.docker_build.outputs.digest }}

+ 3 - 2
.github/workflows/test.yml

@@ -1,4 +1,4 @@
-name: Test workflow
+name: 'Lint, Test, and Build'
 on: [push]
 on: [push]
 
 
 env:
 env:
@@ -113,7 +113,8 @@ jobs:
         with:
         with:
           fetch-depth: 1
           fetch-depth: 1
 
 
-      - uses: satackey/[email protected]
+      # TODO: as of 2020-11 this helper layer broke, upgrade and re-enable this once it's usable again
+      # - uses: satackey/[email protected]
 
 
       - name: Build image
       - name: Build image
         run: |
         run: |

+ 9 - 1
.gitignore

@@ -4,13 +4,21 @@
 __pycache__/
 __pycache__/
 .mypy_cache/
 .mypy_cache/
 
 
+# Python and Node dependencies
 venv/
 venv/
 .venv/
 .venv/
 .docker-venv/
 .docker-venv/
+node_modules/
 
 
+# Packaging artifacts
+archivebox-*.tar.gz
 build/
 build/
+deb_dist/
 dist/
 dist/
-node_modules/
 
 
+# Data folders
 data/
 data/
+data1/
+data2/
+data3/
 output/
 output/

+ 2 - 2
Dockerfile

@@ -12,8 +12,8 @@ FROM python:3.8-slim-buster
 LABEL name="archivebox" \
 LABEL name="archivebox" \
     maintainer="Nick Sweeting <[email protected]>" \
     maintainer="Nick Sweeting <[email protected]>" \
     description="All-in-one personal internet archiving container" \
     description="All-in-one personal internet archiving container" \
-    homepage="https://github.com/pirate/ArchiveBox" \
-    documentation="https://github.com/pirate/ArchiveBox/wiki/Docker#docker"
+    homepage="https://github.com/ArchiveBox/ArchiveBox" \
+    documentation="https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#docker"
 
 
 # System-level base config
 # System-level base config
 ENV TZ=UTC \
 ENV TZ=UTC \

+ 126 - 107
README.md

@@ -1,14 +1,14 @@
 <div align="center">
 <div align="center">
-<img src="https://i.imgur.com/4nkFjdv.png" height="80px">
+<em><img src="https://i.imgur.com/5B48E3N.png" height="90px"></em>
 <h1>ArchiveBox<br/><sub>The open-source self-hosted web archive.</sub></h1>
 <h1>ArchiveBox<br/><sub>The open-source self-hosted web archive.</sub></h1>
 
 
-▶️ <a href="https://github.com/pirate/ArchiveBox/wiki/Quickstart">Quickstart</a> |
+▶️ <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart">Quickstart</a> |
 <a href="https://archivebox.zervice.io/">Demo</a> |
 <a href="https://archivebox.zervice.io/">Demo</a> |
-<a href="https://github.com/pirate/ArchiveBox">Github</a> |
-<a href="https://github.com/pirate/ArchiveBox/wiki">Documentation</a> |
+<a href="https://github.com/ArchiveBox/ArchiveBox">Github</a> |
+<a href="https://github.com/ArchiveBox/ArchiveBox/wiki">Documentation</a> |
 <a href="#background--motivation">Info & Motivation</a> |
 <a href="#background--motivation">Info & Motivation</a> |
-<a href="https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community">Community</a> |
-<a href="https://github.com/pirate/ArchiveBox/wiki/Roadmap">Roadmap</a>
+<a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community">Community</a> |
+<a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap">Roadmap</a>
 
 
 <pre>
 <pre>
 "Your own personal internet archive" (网站存档 / 爬虫)
 "Your own personal internet archive" (网站存档 / 爬虫)
@@ -16,39 +16,78 @@
 
 
 <!--<a href="http://webchat.freenode.net?channels=ArchiveBox&uio=d4"><img src="https://img.shields.io/badge/Community_chat-IRC-%2328A745.svg"/></a>-->
 <!--<a href="http://webchat.freenode.net?channels=ArchiveBox&uio=d4"><img src="https://img.shields.io/badge/Community_chat-IRC-%2328A745.svg"/></a>-->
 
 
-<a href="https://github.com/pirate/ArchiveBox/blob/master/LICENSE"><img src="https://img.shields.io/badge/Open_source-MIT-green.svg?logo=git&logoColor=green"/></a>
-<a href="https://github.com/pirate/ArchiveBox/commits/dev"><img src="https://img.shields.io/github/last-commit/pirate/ArchiveBox.svg?logo=Sublime+Text&logoColor=green&label=Active"/></a>
-<a href="https://github.com/pirate/ArchiveBox"><img src="https://img.shields.io/github/stars/pirate/ArchiveBox.svg?logo=github&label=Stars&logoColor=blue"/></a>
+<a href="https://github.com/ArchiveBox/ArchiveBox/blob/master/LICENSE"><img src="https://img.shields.io/badge/Open_source-MIT-green.svg?logo=git&logoColor=green"/></a>
+<a href="https://github.com/ArchiveBox/ArchiveBox/commits/dev"><img src="https://img.shields.io/github/last-commit/ArchiveBox/ArchiveBox.svg?logo=Sublime+Text&logoColor=green&label=Active"/></a>
+<a href="https://github.com/ArchiveBox/ArchiveBox"><img src="https://img.shields.io/github/stars/ArchiveBox/ArchiveBox.svg?logo=github&label=Stars&logoColor=blue"/></a>
 <a href="https://test.pypi.org/project/archivebox/"><img src="https://img.shields.io/badge/Python-%3E%3D3.7-yellow.svg?logo=python&logoColor=yellow"/></a>
 <a href="https://test.pypi.org/project/archivebox/"><img src="https://img.shields.io/badge/Python-%3E%3D3.7-yellow.svg?logo=python&logoColor=yellow"/></a>
-<a href="https://github.com/pirate/ArchiveBox/wiki/Install#dependencies"><img src="https://img.shields.io/badge/Chromium-%3E%3D59-orange.svg?logo=Google+Chrome&logoColor=orange"/></a>
-<a href="https://hub.docker.com/r/nikisweeting/archivebox"><img src="https://img.shields.io/badge/Docker-all%20platforms-lightblue.svg?logo=docker&logoColor=lightblue"/></a>
+<a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Install#dependencies"><img src="https://img.shields.io/badge/Chromium-%3E%3D59-orange.svg?logo=Google+Chrome&logoColor=orange"/></a>
+<a href="https://hub.docker.com/r/archivebox/archivebox"><img src="https://img.shields.io/badge/Docker-all%20platforms-lightblue.svg?logo=docker&logoColor=lightblue"/></a>
 
 
 <hr/>
 <hr/>
 </div>
 </div>
 
 
-ArchiveBox is a powerful self-hosted internet archiving solution written in Python 3. You feed it URLs of pages you want to archive, and it saves them to disk in a varitety of formats depending on the configuration and the content it detects. ArchiveBox can be installed via [Docker](https://docs.docker.com/get-docker/) or [`pip3`](https://wiki.python.org/moin/BeginnersGuide/Download).
+ArchiveBox is a powerful self-hosted internet archiving solution written in Python 3. You feed it URLs of pages you want to archive, and it saves them to disk in a variety of formats depending on the configuration and the content it detects. ArchiveBox can be installed via [Docker](https://docs.docker.com/get-docker/) (recommended), [`apt`](https://launchpad.net/~archivebox/+archive/ubuntu/archivebox/+packages), [`brew`](https://github.com/ArchiveBox/homebrew-archivebox), or [`pip`](https://www.python.org/downloads/). It works on macOS, Windows, and Linux/BSD (both armv7 and amd64).
 
 
 Once installed, URLs can be added via the command line `archivebox add` or the built-in Web UI `archivebox server`. It can ingest bookmarks from a service like Pocket/Pinboard, your entire browsing history, RSS feeds, or URLs one at a time.
 Once installed, URLs can be added via the command line `archivebox add` or the built-in Web UI `archivebox server`. It can ingest bookmarks from a service like Pocket/Pinboard, your entire browsing history, RSS feeds, or URLs one at a time.
 
 
 The main index is a self-contained `data/index.sqlite3` file, and each snapshot is stored as a folder `data/archive/<timestamp>/`, with an easy-to-read `index.html` and `index.json` within. For each page, ArchiveBox auto-extracts many types of assets/media and saves them in standard formats, with out-of-the-box support for: 3 types of HTML snapshots (wget, Chrome headless, singlefile), a PDF snapshot, a screenshot, a WARC archive, git repositories, images, audio, video, subtitles, article text, and more. The snapshots are browseable and manageable offline through the filesystem, the built-in webserver, or the Python API.
 The main index is a self-contained `data/index.sqlite3` file, and each snapshot is stored as a folder `data/archive/<timestamp>/`, with an easy-to-read `index.html` and `index.json` within. For each page, ArchiveBox auto-extracts many types of assets/media and saves them in standard formats, with out-of-the-box support for: 3 types of HTML snapshots (wget, Chrome headless, singlefile), a PDF snapshot, a screenshot, a WARC archive, git repositories, images, audio, video, subtitles, article text, and more. The snapshots are browseable and manageable offline through the filesystem, the built-in webserver, or the Python API.
 
 
-
 #### Quickstart
 #### Quickstart
 
 
+**First, get ArchiveBox using your system package manager, Docker, or pip:**
+```bash
+# You can run it with Docker or Docker Compose (recommended)
+docker pull archivebox/archivebox
+# https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/master/docker-compose.yml
+
+# or Ubuntu/Debian
+sudo add-apt-repository -u ppa:archivebox/archivebox
+apt install archivebox
+
+# or macOS
+brew install archivebox/archivebox/archivebox
+
+# or for the Python version only, without wget/git/chrome/etc. included
+pip3 install archivebox
+
+# If you're using an apt/brew/pip install you can run archivebox commands normally
+#   archivebox [subcommand] [...args]
+# If you're using Docker you'll have to run the commands like this
+#   docker run -v $PWD:/data -it archivebox/archivebox [subcommand] [...args]
+# And the equivalent in Docker Compose:  
+#   docker-compose run archivebox [subcommand] [...args]
+```
+
+<small>Check that everything installed correctly with `archivebox --version`</small>
+
+**To start using archivebox, you have to create a data folder and `cd` into it:**
+
 ```bash
 ```bash
-docker run -d -it -v ~/archivebox:/data -p 8000:8000 nikisweeting/archivebox server --init 0.0.0.0:8000
-docker run -v ~/archivebox:/data -it nikisweeting/archivebox manage createsuperuser
-docker run -v ~/archivebox:/data -it nikisweeting/archivebox add 'https://example.com'
+mkdir ~/archivebox && cd ~/archivebox    # you can put the collection dir anywhere
+archivebox init
+```
+
+**Then add some URLs to your archive collection:**
+```bash
+archivebox add https://github.com/ArchiveBox/ArchiveBox
+archivebox add --depth=1 https://example.com
+```
 
 
-open http://127.0.0.1:8000/admin/login/  # then click "Add" in the navbar
+**View the snapshots of the URLs you added via the self-hosted web UI:**
+```bash
+archivebox manage createsuperuser         # create an admin acct
+archivebox server 0.0.0.0:8000            # start the web server
+open http://127.0.0.1:8000/               # open the interactive admin panel
+ls ~/archivebox/archive/*/index.html      # or browse the snapshots on disk
 ```
 ```
 
 
+
 <div align="center">
 <div align="center">
 <img src="https://i.imgur.com/lUuicew.png" width="400px">
 <img src="https://i.imgur.com/lUuicew.png" width="400px">
 <br/>
 <br/>
 
 
-[DEMO: archivebox.zervice.io/](https://archivebox.zervice.io)  
-For more information, see the [full Quickstart guide](https://github.com/pirate/ArchiveBox/wiki/Quickstart), [Usage](https://github.com/pirate/ArchiveBox/wiki/Usage), and [Configuration](https://github.com/pirate/ArchiveBox/wiki/Configuration) docs.
+<a href="https://archivebox.zervice.io">DEMO: archivebox.zervice.io/</a>  
+For more information, see the <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart">full Quickstart guide</a>, <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Usage">Usage</a>, and <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration">Configuration</a> docs.
 </div>
 </div>
 
 
 ---
 ---
@@ -56,18 +95,11 @@ For more information, see the [full Quickstart guide](https://github.com/pirate/
 
 
 # Overview
 # Overview
 
 
-ArchiveBox is a command line tool, self-hostable web-archiving server, and Python library all-in-one. It's available as a Python3 package or a Docker image, both methods provide the same CLI, Web UI, and on-disk data format.
-
-It works on Docker, macOS, and Linux/BSD. Windows is not officially supported, but users have reported getting it working using the WSL2 + Docker.
+ArchiveBox is a command line tool, self-hostable web-archiving server, and Python library all-in-one. It can be installed on Docker, macOS, Linux/BSD, and Windows. You can download and install it as a Debian/Ubuntu package, Homebrew package, Python3 package, or a Docker image. No matter which install method you choose, they all provide the same CLI, Web UI, and on-disk data format.
 
 
-To use ArchiveBox you start by creating a folder for your data to live in (it can be anywhere on your system), and running `archivebox init` inside of it. That will create a sqlite3 index and an `ArchiveBox.conf` file. After that, you can continue to add/remove/search/import/export/manage/config/etc using the CLI `archivebox help`, or you can run the Web UI (recommended):
-```bash
-archivebox manage createsuperuser
-archivebox server 0.0.0.0:8000
-open http://127.0.0.1:8000
-```
+To use ArchiveBox you start by creating a folder for your data to live in (it can be anywhere on your system), and running `archivebox init` inside of it. That will create a sqlite3 index and an `ArchiveBox.conf` file. After that, you can continue to add/export/manage/etc using the CLI `archivebox help`, or you can run the Web UI (recommended).
 
 
-The CLI is considered "stable", and the ArchiveBox Python API and REST APIs are in "beta".
+The CLI is considered "stable", the ArchiveBox Python API and REST APIs are in "beta", and the [desktop app](https://github.com/ArchiveBox/desktop) is in "alpha" stage.
 
 
 At the end of the day, the goal is to sleep soundly knowing that the part of the internet you care about will be automatically preserved in multiple, durable long-term formats that will be accessible for decades (or longer). You can also self-host your archivebox server on a public domain to provide archive.org-style public access to your site snapshots.
 At the end of the day, the goal is to sleep soundly knowing that the part of the internet you care about will be automatically preserved in multiple, durable long-term formats that will be accessible for decades (or longer). You can also self-host your archivebox server on a public domain to provide archive.org-style public access to your site snapshots.
 
 
@@ -76,7 +108,7 @@ At the end of the day, the goal is to sleep soundly knowing that the part of the
 <img src="https://i.imgur.com/viklZNG.png" width="22%" alt="Desktop index screenshot" align="top">
 <img src="https://i.imgur.com/viklZNG.png" width="22%" alt="Desktop index screenshot" align="top">
 <img src="https://i.imgur.com/RefWsXB.jpg" width="22%" alt="Desktop details page Screenshot"/>
 <img src="https://i.imgur.com/RefWsXB.jpg" width="22%" alt="Desktop details page Screenshot"/>
 <img src="https://i.imgur.com/M6HhzVx.png" width="22%" alt="Desktop details page Screenshot"/><br/>
 <img src="https://i.imgur.com/M6HhzVx.png" width="22%" alt="Desktop details page Screenshot"/><br/>
-<sup><a href="https://archive.sweeting.me/">Demo</a> | <a href="https://github.com/pirate/ArchiveBox/wiki/Usage">Usage</a> | <a href="#screenshots">Screenshots</a></sup>
+<sup><a href="https://archive.sweeting.me/">Demo</a> | <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Usage">Usage</a> | <a href="#screenshots">Screenshots</a></sup>
 <br/>
 <br/>
 <sub>. . . . . . . . . . . . . . . . . . . . . . . . . . . .</sub>
 <sub>. . . . . . . . . . . . . . . . . . . . . . . . . . . .</sub>
 </div><br/>
 </div><br/>
@@ -84,16 +116,16 @@ At the end of the day, the goal is to sleep soundly knowing that the part of the
 
 
 ## Key Features
 ## Key Features
 
 
-- [**Free & open source**](https://github.com/pirate/ArchiveBox/blob/master/LICENSE), doesn't require signing up for anything, stores all data locally
-- [**Few dependencies**](https://github.com/pirate/ArchiveBox/wiki/Install#dependencies) and [simple command line interface](https://github.com/pirate/ArchiveBox/wiki/Usage#CLI-Usage)
-- [**Comprehensive documentation**](https://github.com/pirate/ArchiveBox/wiki), [active development](https://github.com/pirate/ArchiveBox/wiki/Roadmap), and [rich community](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community)
-- Easy to set up **[scheduled importing](https://github.com/pirate/ArchiveBox/wiki/Scheduled-Archiving) from multiple sources**
+- [**Free & open source**](https://github.com/ArchiveBox/ArchiveBox/blob/master/LICENSE), doesn't require signing up for anything, stores all data locally
+- [**Few dependencies**](https://github.com/ArchiveBox/ArchiveBox/wiki/Install#dependencies) and [simple command line interface](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#CLI-Usage)
+- [**Comprehensive documentation**](https://github.com/ArchiveBox/ArchiveBox/wiki), [active development](https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap), and [rich community](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community)
+- Easy to set up **[scheduled importing](https://github.com/ArchiveBox/ArchiveBox/wiki/Scheduled-Archiving) from multiple sources**
 - Uses common, **durable, [long-term formats](#saves-lots-of-useful-stuff-for-each-imported-link)** like HTML, JSON, PDF, PNG, and WARC
 - Uses common, **durable, [long-term formats](#saves-lots-of-useful-stuff-for-each-imported-link)** like HTML, JSON, PDF, PNG, and WARC
-- ~~**Suitable for paywalled / [authenticated content](https://github.com/pirate/ArchiveBox/wiki/Configuration#chrome_user_data_dir)** (can use your cookies)~~ (do not do this until v0.5 is released with some security fixes)
+- ~~**Suitable for paywalled / [authenticated content](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#chrome_user_data_dir)** (can use your cookies)~~ (do not do this until v0.5 is released with some security fixes)
 - **Doesn't require a constantly-running daemon**, proxy, or native app
 - **Doesn't require a constantly-running daemon**, proxy, or native app
 - Provides a CLI, Python API, self-hosted web UI, and REST API (WIP)
 - Provides a CLI, Python API, self-hosted web UI, and REST API (WIP)
-- Architected to be able to run [**many varieties of scripts during archiving**](https://github.com/pirate/ArchiveBox/issues/51), e.g. to extract media, summarize articles, [scroll pages](https://github.com/pirate/ArchiveBox/issues/80), [close modals](https://github.com/pirate/ArchiveBox/issues/175), expand comment threads, etc.
-- Can also [**mirror content to 3rd-party archiving services**](https://github.com/pirate/ArchiveBox/wiki/Configuration#submit_archive_dot_org) automatically for redundancy
+- Architected to be able to run [**many varieties of scripts during archiving**](https://github.com/ArchiveBox/ArchiveBox/issues/51), e.g. to extract media, summarize articles, [scroll pages](https://github.com/ArchiveBox/ArchiveBox/issues/80), [close modals](https://github.com/ArchiveBox/ArchiveBox/issues/175), expand comment threads, etc.
+- Can also [**mirror content to 3rd-party archiving services**](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#submit_archive_dot_org) automatically for redundancy
 
 
 ## Input formats
 ## Input formats
 
 
@@ -112,7 +144,7 @@ archivebox add --depth=1 'https://news.ycombinator.com#2020-12-12'
 - <img src="https://nicksweeting.com/images/rss.svg" height="22px"/> RSS, XML, JSON, CSV, SQL, HTML, Markdown, TXT, or any other text-based format
 - <img src="https://nicksweeting.com/images/rss.svg" height="22px"/> RSS, XML, JSON, CSV, SQL, HTML, Markdown, TXT, or any other text-based format
 - <img src="https://getpocket.com/favicon.ico" height="22px"/> Pocket, Pinboard, Instapaper, Shaarli, Delicious, Reddit Saved Posts, Wallabag, Unmark.it, OneTab, and more
 - <img src="https://getpocket.com/favicon.ico" height="22px"/> Pocket, Pinboard, Instapaper, Shaarli, Delicious, Reddit Saved Posts, Wallabag, Unmark.it, OneTab, and more
 
 
-See the [Usage: CLI](https://github.com/pirate/ArchiveBox/wiki/Usage#CLI-Usage) page for documentation and examples.
+See the [Usage: CLI](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#CLI-Usage) page for documentation and examples.
 
 
 It also includes a built-in scheduled import feature and browser bookmarklet, so you can ingest URLs from RSS feeds, websites, or the filesystem regularly.
 It also includes a built-in scheduled import feature and browser bookmarklet, so you can ingest URLs from RSS feeds, websites, or the filesystem regularly.
 
 
@@ -137,15 +169,15 @@ The on-disk layout is optimized to be easy to browse by hand and durable long-te
 - **URL to Archive.org:** `archive.org.txt` A link to the saved site on archive.org
 - **URL to Archive.org:** `archive.org.txt` A link to the saved site on archive.org
 - **Audio & Video:** `media/` all audio/video files + playlists, including subtitles & metadata with youtube-dl
 - **Audio & Video:** `media/` all audio/video files + playlists, including subtitles & metadata with youtube-dl
 - **Source Code:** `git/` clone of any repository found on github, bitbucket, or gitlab links
 - **Source Code:** `git/` clone of any repository found on github, bitbucket, or gitlab links
-- _More coming soon! See the [Roadmap](https://github.com/pirate/ArchiveBox/wiki/Roadmap)..._
+- _More coming soon! See the [Roadmap](https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap)..._
 
 
-It does everything out-of-the-box by default, but you can disable or tweak [individual archive methods](https://github.com/pirate/ArchiveBox/wiki/Configuration) via environment variables or config file.
+It does everything out-of-the-box by default, but you can disable or tweak [individual archive methods](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration) via environment variables or config file.
 
 
 ## Dependencies
 ## Dependencies
 
 
-You don't need to install all the dependencies, ArchiveBox will automatically enable the relevant modules based on whatever you have available, but it's recommended to use the official [Docker image](https://github.com/pirate/ArchiveBox/wiki/Docker) with everything preinstalled.
+You don't need to install all the dependencies, ArchiveBox will automatically enable the relevant modules based on whatever you have available, but it's recommended to use the official [Docker image](https://github.com/ArchiveBox/ArchiveBox/wiki/Docker) with everything preinstalled.
 
 
-If you so choose, you can also install ArchiveBox and its dependencies directly on any Linux or macOS systems using the [automated setup script](https://github.com/pirate/ArchiveBox/wiki/Quickstart) or the [system package manager](https://github.com/pirate/ArchiveBox/wiki/Install).
+If you so choose, you can also install ArchiveBox and its dependencies directly on any Linux or macOS systems using the [automated setup script](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart) or the [system package manager](https://github.com/ArchiveBox/ArchiveBox/wiki/Install).
 
 
 ArchiveBox is written in Python 3 so it requires `python3` and `pip3` available on your system. It also uses a set of optional, but highly recommended external dependencies for archiving sites: `wget` (for plain HTML, static files, and WARC saving), `chromium` (for screenshots, PDFs, JS execution, and more), `youtube-dl` (for audio and video), `git` (for cloning git repos), and `nodejs` (for readability and singlefile), and more.
 ArchiveBox is written in Python 3 so it requires `python3` and `pip3` available on your system. It also uses a set of optional, but highly recommended external dependencies for archiving sites: `wget` (for plain HTML, static files, and WARC saving), `chromium` (for screenshots, PDFs, JS execution, and more), `youtube-dl` (for audio and video), `git` (for cloning git repos), and `nodejs` (for readability and singlefile), and more.
 
 
@@ -163,7 +195,7 @@ archivebox config --set SAVE_FAVICON=False  # optional: only the domain is leake
 archivebox config --get CHROME_VERSION      # optional: set this to chromium instead of chrome if you don't like Google
 archivebox config --get CHROME_VERSION      # optional: set this to chromium instead of chrome if you don't like Google
 ```
 ```
 
 
-Be aware that malicious archived JS can also read the contents of other pages in your archive due to snapshot CSRF and XSS protections being imperfect. See the [Security Overview](https://github.com/pirate/ArchiveBox/wiki/Security-Overview#stealth-mode) page for more details.
+Be aware that malicious archived JS can also read the contents of other pages in your archive due to snapshot CSRF and XSS protections being imperfect. See the [Security Overview](https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#stealth-mode) page for more details.
 ```bash
 ```bash
 # visiting an archived page with malicious JS:
 # visiting an archived page with malicious JS:
 https://127.0.0.1:8000/archive/1602401954/example.com/index.html
 https://127.0.0.1:8000/archive/1602401954/example.com/index.html
@@ -174,7 +206,7 @@ https://127.0.0.1:8000/archive/*
 # then example.com/index.js can send it off to some evil server
 # then example.com/index.js can send it off to some evil server
 ```
 ```
 
 
-Support for saving multiple snapshots of each site over time will be [added soon](https://github.com/pirate/ArchiveBox/issues/179) (along with the ability to view diffs of the changes between runs). For now ArchiveBox is designed to only archive each URL with each extractor type once. A workaround to take multiple snapshots of the same URL is to make them slightly different by adding a hash:
+Support for saving multiple snapshots of each site over time will be [added soon](https://github.com/ArchiveBox/ArchiveBox/issues/179) (along with the ability to view diffs of the changes between runs). For now ArchiveBox is designed to only archive each URL with each extractor type once. A workaround to take multiple snapshots of the same URL is to make them slightly different by adding a hash:
 ```bash
 ```bash
 archivebox add 'https://example.com#2020-10-24'
 archivebox add 'https://example.com#2020-10-24'
 ...
 ...
@@ -196,7 +228,7 @@ a headless browser runtime, a full webserver, and CLI interface.
 # docker-compose run archivebox <command> [args]
 # docker-compose run archivebox <command> [args]
 
 
 mkdir archivebox && cd archivebox
 mkdir archivebox && cd archivebox
-wget 'https://github.com/pirate/ArchiveBox/blob/master/docker-compose.yml'
+wget 'https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/master/docker-compose.yml'
 docker-compose run archivebox init
 docker-compose run archivebox init
 docker-compose run archivebox add 'https://example.com'
 docker-compose run archivebox add 'https://example.com'
 docker-compose run archivebox manage createsuperuser
 docker-compose run archivebox manage createsuperuser
@@ -207,20 +239,20 @@ open http://127.0.0.1:8000
 ## Docker
 ## Docker
 
 
 ```bash
 ```bash
-# docker run -v $PWD:/data -it nikisweeting/archivebox <command> [args]
+# docker run -v $PWD:/data -it archivebox/archivebox <command> [args]
 
 
 mkdir archivebox && cd archivebox
 mkdir archivebox && cd archivebox
-docker run -v $PWD:/data -it nikisweeting/archivebox init
-docker run -v $PWD:/data -it nikisweeting/archivebox add 'https://example.com'
-docker run -v $PWD:/data -it nikisweeting/archivebox manage createsuperuser
+docker run -v $PWD:/data -it archivebox/archivebox init
+docker run -v $PWD:/data -it archivebox/archivebox add 'https://example.com'
+docker run -v $PWD:/data -it archivebox/archivebox manage createsuperuser
 
 
 # run the webserver to access the web UI
 # run the webserver to access the web UI
-docker run -v $PWD:/data -it -p 8000:8000 nikisweeting/archivebox server 0.0.0.0:8000
+docker run -v $PWD:/data -it -p 8000:8000 archivebox/archivebox server 0.0.0.0:8000
 open http://127.0.0.1:8000
 open http://127.0.0.1:8000
 
 
 # or export a static version of the index if you dont want to run a server
 # or export a static version of the index if you dont want to run a server
-docker run -v $PWD:/data -it nikisweeting/archivebox list --html --with-headers > index.html
-docker run -v $PWD:/data -it nikisweeting/archivebox list --json --with-headers > index.json
+docker run -v $PWD:/data -it archivebox/archivebox list --html --with-headers > index.html
+docker run -v $PWD:/data -it archivebox/archivebox list --json --with-headers > index.json
 open ./index.html
 open ./index.html
 ```
 ```
 
 
@@ -229,32 +261,19 @@ open ./index.html
 
 
 ```bash
 ```bash
 # archivebox <command> [args]
 # archivebox <command> [args]
-```
-
-First install the system, pip, and npm dependencies:
-```bash
-# Install main dependendencies using apt on Ubuntu/Debian, brew on mac, or pkg on BSD
-apt install python3 python3-pip python3-dev git curl wget chromium-browser youtube-dl
-
-# Install Node runtime (used for headless browser scripts like Readability, Singlefile, Mercury, etc.)
-curl -s https://deb.nodesource.com/gpgkey/nodesource.gpg.key | apt-key add - \
-  && echo 'deb https://deb.nodesource.com/node_14.x $(lsb_release -cs) main' >> /etc/apt/sources.list \
-  && apt-get update \
-  && apt-get install --no-install-recommends nodejs
-
-# Make a directory to hold your collection
-mkdir archivebox && cd archivebox    # (can be anywhere, doesn't have to be called archivebox)
 
 
-# Install the archivebox python package in ./.venv
-python3 -m venv .venv && source .venv/bin/activate
-pip install --upgrade archivebox
+# on Debian/Ubuntu
+sudo add-apt-repository -u ppa:archivebox/archivebox
+apt install archivebox
 
 
-# Install node packages in ./node_modules (used for SingleFile, Readability, and Puppeteer)
-npm install --prefix . 'git+https://github.com/pirate/ArchiveBox.git' 
+# on macOS
+brew install archivebox/archivebox/archivebox
 ```
 ```
 
 
-Initialize your archive and add some links:
+Initialize your archive in a directory somewhere and add some links:
 ```bash
 ```bash
+mkdir ~/archivebox && cd ~/archivebox
+npm install --prefix . 'git+https://github.com/ArchiveBox/ArchiveBox.git' 
 archivebox init
 archivebox init
 archivebox add 'https://example.com'  # add URLs as args or pipe them in via stdin
 archivebox add 'https://example.com'  # add URLs as args or pipe them in via stdin
 archivebox add --depth=1 https://example.com/table-of-contents.html
 archivebox add --depth=1 https://example.com/table-of-contents.html
@@ -314,13 +333,13 @@ All the archived links are stored by date bookmarked in `./archive/<timestamp>`,
 
 
 ## Comparison to Other Projects
 ## Comparison to Other Projects
 
 
-▶ **Check out our [community page](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community) for an index of web archiving initiatives and projects.**
+▶ **Check out our [community page](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community) for an index of web archiving initiatives and projects.**
 
 
-<img src="https://i.imgur.com/4nkFjdv.png" width="10%" align="left"/> The aim of ArchiveBox is to go beyond what the Wayback Machine and other public archiving services can do, by adding a headless browser to replay sessions accurately, and by automatically extracting all the content in multiple redundant formats that will survive being passed down to historians and archivists through many generations.
+<img src="https://i.imgur.com/4nkFjdv.png" width="10%" align="left" alt="comparison"/> The aim of ArchiveBox is to go beyond what the Wayback Machine and other public archiving services can do, by adding a headless browser to replay sessions accurately, and by automatically extracting all the content in multiple redundant formats that will survive being passed down to historians and archivists through many generations.
 
 
 #### User Interface & Intended Purpose
 #### User Interface & Intended Purpose
 
 
-ArchiveBox differentiates itself from [similar projects](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community#Web-Archiving-Projects) by being a simple, one-shot CLI interface for users to ingest bulk feeds of URLs over extended periods, as opposed to being a backend service that ingests individual, manually-submitted URLs from a web UI. However, we also have the option to add urls via a web interface through our Django frontend.
+ArchiveBox differentiates itself from [similar projects](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community#Web-Archiving-Projects) by being a simple, one-shot CLI interface for users to ingest bulk feeds of URLs over extended periods, as opposed to being a backend service that ingests individual, manually-submitted URLs from a web UI. However, we also have the option to add urls via a web interface through our Django frontend.
 
 
 #### Private Local Archives vs Centralized Public Archives
 #### Private Local Archives vs Centralized Public Archives
 
 
@@ -336,18 +355,18 @@ Whether you want to learn which organizations are the big players in the web arc
 
 
 <img src="https://i.imgur.com/0ZOmOvN.png" width="14%" align="right"/>
 <img src="https://i.imgur.com/0ZOmOvN.png" width="14%" align="right"/>
 
 
-- [Community Wiki](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community)
-  - [The Master Lists](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community#The-Master-Lists)  
+- [Community Wiki](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community)
+  - [The Master Lists](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community#The-Master-Lists)  
     _Community-maintained indexes of archiving tools and institutions._
     _Community-maintained indexes of archiving tools and institutions._
-  - [Web Archiving Software](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community#Web-Archiving-Projects)  
+  - [Web Archiving Software](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community#Web-Archiving-Projects)  
     _Open source tools and projects in the internet archiving space._
     _Open source tools and projects in the internet archiving space._
-  - [Reading List](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community#Reading-List)  
+  - [Reading List](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community#Reading-List)  
     _Articles, posts, and blogs relevant to ArchiveBox and web archiving in general._
     _Articles, posts, and blogs relevant to ArchiveBox and web archiving in general._
-  - [Communities](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community#Communities)  
+  - [Communities](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community#Communities)  
     _A collection of the most active internet archiving communities and initiatives._
     _A collection of the most active internet archiving communities and initiatives._
-- Check out the ArchiveBox [Roadmap](https://github.com/pirate/ArchiveBox/wiki/Roadmap) and [Changelog](https://github.com/pirate/ArchiveBox/wiki/Changelog)
+- Check out the ArchiveBox [Roadmap](https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap) and [Changelog](https://github.com/ArchiveBox/ArchiveBox/wiki/Changelog)
 - Learn why archiving the internet is important by reading the "[On the Importance of Web Archiving](https://parameters.ssrc.org/2018/09/on-the-importance-of-web-archiving/)" blog post.
 - Learn why archiving the internet is important by reading the "[On the Importance of Web Archiving](https://parameters.ssrc.org/2018/09/on-the-importance-of-web-archiving/)" blog post.
-- Or reach out to me for questions and comments via [@theSquashSH](https://twitter.com/thesquashSH) on Twitter.
+- Or reach out to me for questions and comments via [@ArchiveBoxApp](https://twitter.com/ArchiveBoxApp) or [@theSquashSH](https://twitter.com/thesquashSH) on Twitter.
 
 
 ---
 ---
 
 
@@ -355,51 +374,51 @@ Whether you want to learn which organizations are the big players in the web arc
 
 
 <img src="https://read-the-docs-guidelines.readthedocs-hosted.com/_images/logo-dark.png" width="13%" align="right"/>
 <img src="https://read-the-docs-guidelines.readthedocs-hosted.com/_images/logo-dark.png" width="13%" align="right"/>
 
 
-We use the [Github wiki system](https://github.com/pirate/ArchiveBox/wiki) and [Read the Docs](https://archivebox.readthedocs.io/en/latest/) (WIP) for documentation.
+We use the [Github wiki system](https://github.com/ArchiveBox/ArchiveBox/wiki) and [Read the Docs](https://archivebox.readthedocs.io/en/latest/) (WIP) for documentation.
 
 
-You can also access the docs locally by looking in the [`ArchiveBox/docs/`](https://github.com/pirate/ArchiveBox/wiki/Home) folder.
+You can also access the docs locally by looking in the [`ArchiveBox/docs/`](https://github.com/ArchiveBox/ArchiveBox/wiki/Home) folder.
 
 
 ## Getting Started
 ## Getting Started
 
 
-- [Quickstart](https://github.com/pirate/ArchiveBox/wiki/Quickstart)
-- [Install](https://github.com/pirate/ArchiveBox/wiki/Install)
-- [Docker](https://github.com/pirate/ArchiveBox/wiki/Docker)
+- [Quickstart](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart)
+- [Install](https://github.com/ArchiveBox/ArchiveBox/wiki/Install)
+- [Docker](https://github.com/ArchiveBox/ArchiveBox/wiki/Docker)
 
 
 ## Reference
 ## Reference
 
 
-- [Usage](https://github.com/pirate/ArchiveBox/wiki/Usage)
-- [Configuration](https://github.com/pirate/ArchiveBox/wiki/Configuration)
-- [Supported Sources](https://github.com/pirate/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive)
-- [Supported Outputs](https://github.com/pirate/ArchiveBox/wiki#can-save-these-things-for-each-site)
-- [Scheduled Archiving](https://github.com/pirate/ArchiveBox/wiki/Scheduled-Archiving)
-- [Publishing Your Archive](https://github.com/pirate/ArchiveBox/wiki/Publishing-Your-Archive)
-- [Chromium Install](https://github.com/pirate/ArchiveBox/wiki/Install-Chromium)
-- [Security Overview](https://github.com/pirate/ArchiveBox/wiki/Security-Overview)
-- [Troubleshooting](https://github.com/pirate/ArchiveBox/wiki/Troubleshooting)
+- [Usage](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage)
+- [Configuration](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration)
+- [Supported Sources](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive)
+- [Supported Outputs](https://github.com/ArchiveBox/ArchiveBox/wiki#can-save-these-things-for-each-site)
+- [Scheduled Archiving](https://github.com/ArchiveBox/ArchiveBox/wiki/Scheduled-Archiving)
+- [Publishing Your Archive](https://github.com/ArchiveBox/ArchiveBox/wiki/Publishing-Your-Archive)
+- [Chromium Install](https://github.com/ArchiveBox/ArchiveBox/wiki/Chromium-Install)
+- [Security Overview](https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview)
+- [Troubleshooting](https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting)
 - [Python API](https://docs.archivebox.io/en/latest/modules.html)
 - [Python API](https://docs.archivebox.io/en/latest/modules.html)
 - REST API (coming soon...)
 - REST API (coming soon...)
 
 
 ## More Info
 ## More Info
 
 
-- [Tickets](https://github.com/pirate/ArchiveBox/issues)
-- [Roadmap](https://github.com/pirate/ArchiveBox/wiki/Roadmap)
-- [Changelog](https://github.com/pirate/ArchiveBox/wiki/Changelog)
-- [Donations](https://github.com/pirate/ArchiveBox/wiki/Donations)
-- [Background & Motivation](https://github.com/pirate/ArchiveBox#background--motivation)
-- [Web Archiving Community](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community)
+- [Tickets](https://github.com/ArchiveBox/ArchiveBox/issues)
+- [Roadmap](https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap)
+- [Changelog](https://github.com/ArchiveBox/ArchiveBox/wiki/Changelog)
+- [Donations](https://github.com/ArchiveBox/ArchiveBox/wiki/Donations)
+- [Background & Motivation](https://github.com/ArchiveBox/ArchiveBox#background--motivation)
+- [Web Archiving Community](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community)
 
 
 ---
 ---
 
 
 # ArchiveBox Development
 # ArchiveBox Development
 
 
-All contributions to ArchiveBox are welcomed! Check our [issues](https://github.com/pirate/ArchiveBox/issues) and [Roadmap](https://github.com/pirate/ArchiveBox/wiki/Roadmap) for things to work on, and please open an issue to discuss your proposed implementation before working on things! Otherwise we may have to close your PR if it doesn't align with our roadmap.
+All contributions to ArchiveBox are welcomed! Check our [issues](https://github.com/ArchiveBox/ArchiveBox/issues) and [Roadmap](https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap) for things to work on, and please open an issue to discuss your proposed implementation before working on things! Otherwise we may have to close your PR if it doesn't align with our roadmap.
 
 
 ### Setup the dev environment
 ### Setup the dev environment
 
 
 First, install the system dependencies from the "Bare Metal" section above.
 First, install the system dependencies from the "Bare Metal" section above.
 Then you can clone the ArchiveBox repo and install
 Then you can clone the ArchiveBox repo and install
 ```bash
 ```bash
-git clone https://github.com/pirate/ArchiveBox
+git clone https://github.com/ArchiveBox/ArchiveBox
 cd ArchiveBox
 cd ArchiveBox
 git checkout master  # or the branch you want to test
 git checkout master  # or the branch you want to test
 git pull
 git pull
@@ -479,8 +498,8 @@ You can also run all these in Docker. For more examples see the Github Actions C
 <a href="https://www.patreon.com/theSquashSH"><img src="https://img.shields.io/badge/Donate_to_support_development-via_Patreon-%23DD5D76.svg?style=flat"/></a>
 <a href="https://www.patreon.com/theSquashSH"><img src="https://img.shields.io/badge/Donate_to_support_development-via_Patreon-%23DD5D76.svg?style=flat"/></a>
 <br/>
 <br/>
 
 
-<a href="https://twitter.com/thesquashSH"><img src="https://img.shields.io/badge/Tweet-%40theSquashSH-blue.svg?style=flat"/></a>
-<a href="https://github.com/pirate/ArchiveBox"><img src="https://img.shields.io/github/stars/pirate/ArchiveBox.svg?style=flat&label=Star+on+Github"/></a>
+<a href="https://twitter.com/ArchiveBoxApp"><img src="https://img.shields.io/badge/Tweet-%40ArchiveBoxApp-blue.svg?style=flat"/></a>
+<a href="https://github.com/ArchiveBox/ArchiveBox"><img src="https://img.shields.io/github/stars/ArchiveBox/ArchiveBox.svg?style=flat&label=Star+on+Github"/></a>
 
 
 <br/><br/>
 <br/><br/>
 
 

+ 1 - 1
_config.yml

@@ -1 +1 @@
-theme: jekyll-theme-minimal
+theme: jekyll-theme-merlot

+ 95 - 87
archivebox.egg-info/PKG-INFO

@@ -1,29 +1,29 @@
 Metadata-Version: 2.1
 Metadata-Version: 2.1
 Name: archivebox
 Name: archivebox
-Version: 0.4.21
+Version: 0.4.24
 Summary: The self-hosted internet archive.
 Summary: The self-hosted internet archive.
-Home-page: https://github.com/pirate/ArchiveBox
+Home-page: https://github.com/ArchiveBox/ArchiveBox
 Author: Nick Sweeting
 Author: Nick Sweeting
 Author-email: [email protected]
 Author-email: [email protected]
 License: MIT
 License: MIT
-Project-URL: Source, https://github.com/pirate/ArchiveBox
-Project-URL: Documentation, https://github.com/pirate/ArchiveBox/wiki
-Project-URL: Bug Tracker, https://github.com/pirate/ArchiveBox/issues
-Project-URL: Changelog, https://github.com/pirate/ArchiveBox/wiki/Changelog
-Project-URL: Roadmap, https://github.com/pirate/ArchiveBox/wiki/Roadmap
-Project-URL: Community, https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community
-Project-URL: Donate, https://github.com/pirate/ArchiveBox/wiki/Donations
+Project-URL: Source, https://github.com/ArchiveBox/ArchiveBox
+Project-URL: Documentation, https://github.com/ArchiveBox/ArchiveBox/wiki
+Project-URL: Bug Tracker, https://github.com/ArchiveBox/ArchiveBox/issues
+Project-URL: Changelog, https://github.com/ArchiveBox/ArchiveBox/wiki/Changelog
+Project-URL: Roadmap, https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap
+Project-URL: Community, https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community
+Project-URL: Donate, https://github.com/ArchiveBox/ArchiveBox/wiki/Donations
 Description: <div align="center">
 Description: <div align="center">
-        <img src="https://i.imgur.com/4nkFjdv.png" height="80px">
+        <em><img src="https://i.imgur.com/5B48E3N.png" height="90px"></em>
         <h1>ArchiveBox<br/><sub>The open-source self-hosted web archive.</sub></h1>
         <h1>ArchiveBox<br/><sub>The open-source self-hosted web archive.</sub></h1>
         
         
-        ▶️ <a href="https://github.com/pirate/ArchiveBox/wiki/Quickstart">Quickstart</a> |
+        ▶️ <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart">Quickstart</a> |
         <a href="https://archivebox.zervice.io/">Demo</a> |
         <a href="https://archivebox.zervice.io/">Demo</a> |
-        <a href="https://github.com/pirate/ArchiveBox">Github</a> |
-        <a href="https://github.com/pirate/ArchiveBox/wiki">Documentation</a> |
+        <a href="https://github.com/ArchiveBox/ArchiveBox">Github</a> |
+        <a href="https://github.com/ArchiveBox/ArchiveBox/wiki">Documentation</a> |
         <a href="#background--motivation">Info & Motivation</a> |
         <a href="#background--motivation">Info & Motivation</a> |
-        <a href="https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community">Community</a> |
-        <a href="https://github.com/pirate/ArchiveBox/wiki/Roadmap">Roadmap</a>
+        <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community">Community</a> |
+        <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap">Roadmap</a>
         
         
         <pre>
         <pre>
         "Your own personal internet archive" (网站存档 / 爬虫)
         "Your own personal internet archive" (网站存档 / 爬虫)
@@ -31,17 +31,17 @@ Description: <div align="center">
         
         
         <!--<a href="http://webchat.freenode.net?channels=ArchiveBox&uio=d4"><img src="https://img.shields.io/badge/Community_chat-IRC-%2328A745.svg"/></a>-->
         <!--<a href="http://webchat.freenode.net?channels=ArchiveBox&uio=d4"><img src="https://img.shields.io/badge/Community_chat-IRC-%2328A745.svg"/></a>-->
         
         
-        <a href="https://github.com/pirate/ArchiveBox/blob/master/LICENSE"><img src="https://img.shields.io/badge/Open_source-MIT-green.svg?logo=git&logoColor=green"/></a>
-        <a href="https://github.com/pirate/ArchiveBox/commits/dev"><img src="https://img.shields.io/github/last-commit/pirate/ArchiveBox.svg?logo=Sublime+Text&logoColor=green&label=Active"/></a>
-        <a href="https://github.com/pirate/ArchiveBox"><img src="https://img.shields.io/github/stars/pirate/ArchiveBox.svg?logo=github&label=Stars&logoColor=blue"/></a>
+        <a href="https://github.com/ArchiveBox/ArchiveBox/blob/master/LICENSE"><img src="https://img.shields.io/badge/Open_source-MIT-green.svg?logo=git&logoColor=green"/></a>
+        <a href="https://github.com/ArchiveBox/ArchiveBox/commits/dev"><img src="https://img.shields.io/github/last-commit/ArchiveBox/ArchiveBox.svg?logo=Sublime+Text&logoColor=green&label=Active"/></a>
+        <a href="https://github.com/ArchiveBox/ArchiveBox"><img src="https://img.shields.io/github/stars/ArchiveBox/ArchiveBox.svg?logo=github&label=Stars&logoColor=blue"/></a>
         <a href="https://test.pypi.org/project/archivebox/"><img src="https://img.shields.io/badge/Python-%3E%3D3.7-yellow.svg?logo=python&logoColor=yellow"/></a>
         <a href="https://test.pypi.org/project/archivebox/"><img src="https://img.shields.io/badge/Python-%3E%3D3.7-yellow.svg?logo=python&logoColor=yellow"/></a>
-        <a href="https://github.com/pirate/ArchiveBox/wiki/Install#dependencies"><img src="https://img.shields.io/badge/Chromium-%3E%3D59-orange.svg?logo=Google+Chrome&logoColor=orange"/></a>
-        <a href="https://hub.docker.com/r/nikisweeting/archivebox"><img src="https://img.shields.io/badge/Docker-all%20platforms-lightblue.svg?logo=docker&logoColor=lightblue"/></a>
+        <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Install#dependencies"><img src="https://img.shields.io/badge/Chromium-%3E%3D59-orange.svg?logo=Google+Chrome&logoColor=orange"/></a>
+        <a href="https://hub.docker.com/r/archivebox/archivebox"><img src="https://img.shields.io/badge/Docker-all%20platforms-lightblue.svg?logo=docker&logoColor=lightblue"/></a>
         
         
         <hr/>
         <hr/>
         </div>
         </div>
         
         
-        ArchiveBox is a powerful self-hosted internet archiving solution written in Python 3. You feed it URLs of pages you want to archive, and it saves them to disk in a varitety of formats depending on the configuration and the content it detects. ArchiveBox can be installed via [Docker](https://docs.docker.com/get-docker/) or [`pip3`](https://wiki.python.org/moin/BeginnersGuide/Download).
+        ArchiveBox is a powerful self-hosted internet archiving solution written in Python 3. You feed it URLs of pages you want to archive, and it saves them to disk in a variety of formats depending on the configuration and the content it detects. ArchiveBox can be installed via [Docker](https://docs.docker.com/get-docker/) (recommended) or [`pip`](https://www.python.org/downloads/). It works on macOS, Windows, and Linux/BSD (both armv7 and amd64).
         
         
         Once installed, URLs can be added via the command line `archivebox add` or the built-in Web UI `archivebox server`. It can ingest bookmarks from a service like Pocket/Pinboard, your entire browsing history, RSS feeds, or URLs one at a time.
         Once installed, URLs can be added via the command line `archivebox add` or the built-in Web UI `archivebox server`. It can ingest bookmarks from a service like Pocket/Pinboard, your entire browsing history, RSS feeds, or URLs one at a time.
         
         
@@ -51,19 +51,27 @@ Description: <div align="center">
         #### Quickstart
         #### Quickstart
         
         
         ```bash
         ```bash
-        docker run -d -it -v ~/archivebox:/data -p 8000:8000 nikisweeting/archivebox server --init 0.0.0.0:8000
-        docker run -v ~/archivebox:/data -it nikisweeting/archivebox manage createsuperuser
-        docker run -v ~/archivebox:/data -it nikisweeting/archivebox add 'https://example.com'
-        
-        open http://127.0.0.1:8000/admin/login/  # then click "Add" in the navbar
+        # 1. Create a folder somewhere to hold your ArchiveBox data
+        mkdir ~/archivebox && cd ~/archivebox
+        docker run -v $PWD:/data -it archivebox/archivebox init
+        
+        # 2. Archive some URLs to get started
+        docker run -v $PWD:/data -t archivebox/archivebox add https://github.com/ArchiveBox/ArchiveBox
+        docker run -v $PWD:/data -t archivebox/archivebox add --depth=1 https://example.com
+        
+        # 3. Then view the snapshots of the URLs you added via the self-hosted web UI
+        docker run -v $PWD:/data -it archivebox/archivebox manage createsuperuser  # create an admin acct
+        docker run -v $PWD:/data -p 8000:8000 archivebox/archivebox                # start the web server
+        open http://127.0.0.1:8000/                                    # open the interactive admin panel
+        ls archive/*/index.html                                        # or just browse snapshots on disk
         ```
         ```
         
         
         <div align="center">
         <div align="center">
         <img src="https://i.imgur.com/lUuicew.png" width="400px">
         <img src="https://i.imgur.com/lUuicew.png" width="400px">
         <br/>
         <br/>
         
         
-        [DEMO: archivebox.zervice.io/](https://archivebox.zervice.io)  
-        For more information, see the [full Quickstart guide](https://github.com/pirate/ArchiveBox/wiki/Quickstart), [Usage](https://github.com/pirate/ArchiveBox/wiki/Usage), and [Configuration](https://github.com/pirate/ArchiveBox/wiki/Configuration) docs.
+        <a href="https://archivebox.zervice.io">DEMO: archivebox.zervice.io/</a>  
+        For more information, see the <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart">full Quickstart guide</a>, <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Usage">Usage</a>, and <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration">Configuration</a> docs.
         </div>
         </div>
         
         
         ---
         ---
@@ -82,7 +90,7 @@ Description: <div align="center">
         open http://127.0.0.1:8000
         open http://127.0.0.1:8000
         ```
         ```
         
         
-        The CLI is considered "stable", and the ArchiveBox Python API and REST APIs are in "beta".
+        The CLI is considered "stable", the ArchiveBox Python API and REST APIs are in "beta", and the [desktop app](https://github.com/ArchiveBox/desktop) is in "alpha" stage.
         
         
         At the end of the day, the goal is to sleep soundly knowing that the part of the internet you care about will be automatically preserved in multiple, durable long-term formats that will be accessible for decades (or longer). You can also self-host your archivebox server on a public domain to provide archive.org-style public access to your site snapshots.
         At the end of the day, the goal is to sleep soundly knowing that the part of the internet you care about will be automatically preserved in multiple, durable long-term formats that will be accessible for decades (or longer). You can also self-host your archivebox server on a public domain to provide archive.org-style public access to your site snapshots.
         
         
@@ -91,7 +99,7 @@ Description: <div align="center">
         <img src="https://i.imgur.com/viklZNG.png" width="22%" alt="Desktop index screenshot" align="top">
         <img src="https://i.imgur.com/viklZNG.png" width="22%" alt="Desktop index screenshot" align="top">
         <img src="https://i.imgur.com/RefWsXB.jpg" width="22%" alt="Desktop details page Screenshot"/>
         <img src="https://i.imgur.com/RefWsXB.jpg" width="22%" alt="Desktop details page Screenshot"/>
         <img src="https://i.imgur.com/M6HhzVx.png" width="22%" alt="Desktop details page Screenshot"/><br/>
         <img src="https://i.imgur.com/M6HhzVx.png" width="22%" alt="Desktop details page Screenshot"/><br/>
-        <sup><a href="https://archive.sweeting.me/">Demo</a> | <a href="https://github.com/pirate/ArchiveBox/wiki/Usage">Usage</a> | <a href="#screenshots">Screenshots</a></sup>
+        <sup><a href="https://archive.sweeting.me/">Demo</a> | <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Usage">Usage</a> | <a href="#screenshots">Screenshots</a></sup>
         <br/>
         <br/>
         <sub>. . . . . . . . . . . . . . . . . . . . . . . . . . . .</sub>
         <sub>. . . . . . . . . . . . . . . . . . . . . . . . . . . .</sub>
         </div><br/>
         </div><br/>
@@ -99,16 +107,16 @@ Description: <div align="center">
         
         
         ## Key Features
         ## Key Features
         
         
-        - [**Free & open source**](https://github.com/pirate/ArchiveBox/blob/master/LICENSE), doesn't require signing up for anything, stores all data locally
-        - [**Few dependencies**](https://github.com/pirate/ArchiveBox/wiki/Install#dependencies) and [simple command line interface](https://github.com/pirate/ArchiveBox/wiki/Usage#CLI-Usage)
-        - [**Comprehensive documentation**](https://github.com/pirate/ArchiveBox/wiki), [active development](https://github.com/pirate/ArchiveBox/wiki/Roadmap), and [rich community](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community)
-        - Easy to set up **[scheduled importing](https://github.com/pirate/ArchiveBox/wiki/Scheduled-Archiving) from multiple sources**
+        - [**Free & open source**](https://github.com/ArchiveBox/ArchiveBox/blob/master/LICENSE), doesn't require signing up for anything, stores all data locally
+        - [**Few dependencies**](https://github.com/ArchiveBox/ArchiveBox/wiki/Install#dependencies) and [simple command line interface](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#CLI-Usage)
+        - [**Comprehensive documentation**](https://github.com/ArchiveBox/ArchiveBox/wiki), [active development](https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap), and [rich community](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community)
+        - Easy to set up **[scheduled importing](https://github.com/ArchiveBox/ArchiveBox/wiki/Scheduled-Archiving) from multiple sources**
         - Uses common, **durable, [long-term formats](#saves-lots-of-useful-stuff-for-each-imported-link)** like HTML, JSON, PDF, PNG, and WARC
         - Uses common, **durable, [long-term formats](#saves-lots-of-useful-stuff-for-each-imported-link)** like HTML, JSON, PDF, PNG, and WARC
-        - ~~**Suitable for paywalled / [authenticated content](https://github.com/pirate/ArchiveBox/wiki/Configuration#chrome_user_data_dir)** (can use your cookies)~~ (do not do this until v0.5 is released with some security fixes)
+        - ~~**Suitable for paywalled / [authenticated content](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#chrome_user_data_dir)** (can use your cookies)~~ (do not do this until v0.5 is released with some security fixes)
         - **Doesn't require a constantly-running daemon**, proxy, or native app
         - **Doesn't require a constantly-running daemon**, proxy, or native app
         - Provides a CLI, Python API, self-hosted web UI, and REST API (WIP)
         - Provides a CLI, Python API, self-hosted web UI, and REST API (WIP)
-        - Architected to be able to run [**many varieties of scripts during archiving**](https://github.com/pirate/ArchiveBox/issues/51), e.g. to extract media, summarize articles, [scroll pages](https://github.com/pirate/ArchiveBox/issues/80), [close modals](https://github.com/pirate/ArchiveBox/issues/175), expand comment threads, etc.
-        - Can also [**mirror content to 3rd-party archiving services**](https://github.com/pirate/ArchiveBox/wiki/Configuration#submit_archive_dot_org) automatically for redundancy
+        - Architected to be able to run [**many varieties of scripts during archiving**](https://github.com/ArchiveBox/ArchiveBox/issues/51), e.g. to extract media, summarize articles, [scroll pages](https://github.com/ArchiveBox/ArchiveBox/issues/80), [close modals](https://github.com/ArchiveBox/ArchiveBox/issues/175), expand comment threads, etc.
+        - Can also [**mirror content to 3rd-party archiving services**](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#submit_archive_dot_org) automatically for redundancy
         
         
         ## Input formats
         ## Input formats
         
         
@@ -127,7 +135,7 @@ Description: <div align="center">
         - <img src="https://nicksweeting.com/images/rss.svg" height="22px"/> RSS, XML, JSON, CSV, SQL, HTML, Markdown, TXT, or any other text-based format
         - <img src="https://nicksweeting.com/images/rss.svg" height="22px"/> RSS, XML, JSON, CSV, SQL, HTML, Markdown, TXT, or any other text-based format
         - <img src="https://getpocket.com/favicon.ico" height="22px"/> Pocket, Pinboard, Instapaper, Shaarli, Delicious, Reddit Saved Posts, Wallabag, Unmark.it, OneTab, and more
         - <img src="https://getpocket.com/favicon.ico" height="22px"/> Pocket, Pinboard, Instapaper, Shaarli, Delicious, Reddit Saved Posts, Wallabag, Unmark.it, OneTab, and more
         
         
-        See the [Usage: CLI](https://github.com/pirate/ArchiveBox/wiki/Usage#CLI-Usage) page for documentation and examples.
+        See the [Usage: CLI](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#CLI-Usage) page for documentation and examples.
         
         
         It also includes a built-in scheduled import feature and browser bookmarklet, so you can ingest URLs from RSS feeds, websites, or the filesystem regularly.
         It also includes a built-in scheduled import feature and browser bookmarklet, so you can ingest URLs from RSS feeds, websites, or the filesystem regularly.
         
         
@@ -152,15 +160,15 @@ Description: <div align="center">
         - **URL to Archive.org:** `archive.org.txt` A link to the saved site on archive.org
         - **URL to Archive.org:** `archive.org.txt` A link to the saved site on archive.org
         - **Audio & Video:** `media/` all audio/video files + playlists, including subtitles & metadata with youtube-dl
         - **Audio & Video:** `media/` all audio/video files + playlists, including subtitles & metadata with youtube-dl
         - **Source Code:** `git/` clone of any repository found on github, bitbucket, or gitlab links
         - **Source Code:** `git/` clone of any repository found on github, bitbucket, or gitlab links
-        - _More coming soon! See the [Roadmap](https://github.com/pirate/ArchiveBox/wiki/Roadmap)..._
+        - _More coming soon! See the [Roadmap](https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap)..._
         
         
-        It does everything out-of-the-box by default, but you can disable or tweak [individual archive methods](https://github.com/pirate/ArchiveBox/wiki/Configuration) via environment variables or config file.
+        It does everything out-of-the-box by default, but you can disable or tweak [individual archive methods](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration) via environment variables or config file.
         
         
         ## Dependencies
         ## Dependencies
         
         
-        You don't need to install all the dependencies, ArchiveBox will automatically enable the relevant modules based on whatever you have available, but it's recommended to use the official [Docker image](https://github.com/pirate/ArchiveBox/wiki/Docker) with everything preinstalled.
+        You don't need to install all the dependencies, ArchiveBox will automatically enable the relevant modules based on whatever you have available, but it's recommended to use the official [Docker image](https://github.com/ArchiveBox/ArchiveBox/wiki/Docker) with everything preinstalled.
         
         
-        If you so choose, you can also install ArchiveBox and its dependencies directly on any Linux or macOS systems using the [automated setup script](https://github.com/pirate/ArchiveBox/wiki/Quickstart) or the [system package manager](https://github.com/pirate/ArchiveBox/wiki/Install).
+        If you so choose, you can also install ArchiveBox and its dependencies directly on any Linux or macOS systems using the [automated setup script](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart) or the [system package manager](https://github.com/ArchiveBox/ArchiveBox/wiki/Install).
         
         
         ArchiveBox is written in Python 3 so it requires `python3` and `pip3` available on your system. It also uses a set of optional, but highly recommended external dependencies for archiving sites: `wget` (for plain HTML, static files, and WARC saving), `chromium` (for screenshots, PDFs, JS execution, and more), `youtube-dl` (for audio and video), `git` (for cloning git repos), and `nodejs` (for readability and singlefile), and more.
         ArchiveBox is written in Python 3 so it requires `python3` and `pip3` available on your system. It also uses a set of optional, but highly recommended external dependencies for archiving sites: `wget` (for plain HTML, static files, and WARC saving), `chromium` (for screenshots, PDFs, JS execution, and more), `youtube-dl` (for audio and video), `git` (for cloning git repos), and `nodejs` (for readability and singlefile), and more.
         
         
@@ -178,7 +186,7 @@ Description: <div align="center">
         archivebox config --get CHROME_VERSION      # optional: set this to chromium instead of chrome if you don't like Google
         archivebox config --get CHROME_VERSION      # optional: set this to chromium instead of chrome if you don't like Google
         ```
         ```
         
         
-        Be aware that malicious archived JS can also read the contents of other pages in your archive due to snapshot CSRF and XSS protections being imperfect. See the [Security Overview](https://github.com/pirate/ArchiveBox/wiki/Security-Overview#stealth-mode) page for more details.
+        Be aware that malicious archived JS can also read the contents of other pages in your archive due to snapshot CSRF and XSS protections being imperfect. See the [Security Overview](https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#stealth-mode) page for more details.
         ```bash
         ```bash
         # visiting an archived page with malicious JS:
         # visiting an archived page with malicious JS:
         https://127.0.0.1:8000/archive/1602401954/example.com/index.html
         https://127.0.0.1:8000/archive/1602401954/example.com/index.html
@@ -189,7 +197,7 @@ Description: <div align="center">
         # then example.com/index.js can send it off to some evil server
         # then example.com/index.js can send it off to some evil server
         ```
         ```
         
         
-        Support for saving multiple snapshots of each site over time will be [added soon](https://github.com/pirate/ArchiveBox/issues/179) (along with the ability to view diffs of the changes between runs). For now ArchiveBox is designed to only archive each URL with each extractor type once. A workaround to take multiple snapshots of the same URL is to make them slightly different by adding a hash:
+        Support for saving multiple snapshots of each site over time will be [added soon](https://github.com/ArchiveBox/ArchiveBox/issues/179) (along with the ability to view diffs of the changes between runs). For now ArchiveBox is designed to only archive each URL with each extractor type once. A workaround to take multiple snapshots of the same URL is to make them slightly different by adding a hash:
         ```bash
         ```bash
         archivebox add 'https://example.com#2020-10-24'
         archivebox add 'https://example.com#2020-10-24'
         ...
         ...
@@ -211,7 +219,7 @@ Description: <div align="center">
         # docker-compose run archivebox <command> [args]
         # docker-compose run archivebox <command> [args]
         
         
         mkdir archivebox && cd archivebox
         mkdir archivebox && cd archivebox
-        wget 'https://github.com/pirate/ArchiveBox/blob/master/docker-compose.yml'
+        wget 'https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/master/docker-compose.yml'
         docker-compose run archivebox init
         docker-compose run archivebox init
         docker-compose run archivebox add 'https://example.com'
         docker-compose run archivebox add 'https://example.com'
         docker-compose run archivebox manage createsuperuser
         docker-compose run archivebox manage createsuperuser
@@ -222,20 +230,20 @@ Description: <div align="center">
         ## Docker
         ## Docker
         
         
         ```bash
         ```bash
-        # docker run -v $PWD:/data -it nikisweeting/archivebox <command> [args]
+        # docker run -v $PWD:/data -it archivebox/archivebox <command> [args]
         
         
         mkdir archivebox && cd archivebox
         mkdir archivebox && cd archivebox
-        docker run -v $PWD:/data -it nikisweeting/archivebox init
-        docker run -v $PWD:/data -it nikisweeting/archivebox add 'https://example.com'
-        docker run -v $PWD:/data -it nikisweeting/archivebox manage createsuperuser
+        docker run -v $PWD:/data -it archivebox/archivebox init
+        docker run -v $PWD:/data -it archivebox/archivebox add 'https://example.com'
+        docker run -v $PWD:/data -it archivebox/archivebox manage createsuperuser
         
         
         # run the webserver to access the web UI
         # run the webserver to access the web UI
-        docker run -v $PWD:/data -it -p 8000:8000 nikisweeting/archivebox server 0.0.0.0:8000
+        docker run -v $PWD:/data -it -p 8000:8000 archivebox/archivebox server 0.0.0.0:8000
         open http://127.0.0.1:8000
         open http://127.0.0.1:8000
         
         
         # or export a static version of the index if you don't want to run a server
         # or export a static version of the index if you don't want to run a server
-        docker run -v $PWD:/data -it nikisweeting/archivebox list --html --with-headers > index.html
-        docker run -v $PWD:/data -it nikisweeting/archivebox list --json --with-headers > index.json
+        docker run -v $PWD:/data -it archivebox/archivebox list --html --with-headers > index.html
+        docker run -v $PWD:/data -it archivebox/archivebox list --json --with-headers > index.json
         open ./index.html
         open ./index.html
         ```
         ```
         
         
@@ -265,7 +273,7 @@ Description: <div align="center">
         pip install --upgrade archivebox
         pip install --upgrade archivebox
         
         
         # Install node packages in ./node_modules (used for SingleFile, Readability, and Puppeteer)
         # Install node packages in ./node_modules (used for SingleFile, Readability, and Puppeteer)
-        npm install --prefix . 'git+https://github.com/pirate/ArchiveBox.git' 
+        npm install --prefix . 'git+https://github.com/ArchiveBox/ArchiveBox.git' 
         ```
         ```
         
         
         Initialize your archive and add some links:
         Initialize your archive and add some links:
@@ -329,13 +337,13 @@ Description: <div align="center">
         
         
         ## Comparison to Other Projects
         ## Comparison to Other Projects
         
         
-        ▶ **Check out our [community page](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community) for an index of web archiving initiatives and projects.**
+        ▶ **Check out our [community page](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community) for an index of web archiving initiatives and projects.**
         
         
-        <img src="https://i.imgur.com/4nkFjdv.png" width="10%" align="left"/> The aim of ArchiveBox is to go beyond what the Wayback Machine and other public archiving services can do, by adding a headless browser to replay sessions accurately, and by automatically extracting all the content in multiple redundant formats that will survive being passed down to historians and archivists through many generations.
+        <img src="https://i.imgur.com/4nkFjdv.png" width="10%" align="left" alt="comparison"/> The aim of ArchiveBox is to go beyond what the Wayback Machine and other public archiving services can do, by adding a headless browser to replay sessions accurately, and by automatically extracting all the content in multiple redundant formats that will survive being passed down to historians and archivists through many generations.
         
         
         #### User Interface & Intended Purpose
         #### User Interface & Intended Purpose
         
         
-        ArchiveBox differentiates itself from [similar projects](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community#Web-Archiving-Projects) by being a simple, one-shot CLI interface for users to ingest bulk feeds of URLs over extended periods, as opposed to being a backend service that ingests individual, manually-submitted URLs from a web UI. However, we also have the option to add urls via a web interface through our Django frontend.
+        ArchiveBox differentiates itself from [similar projects](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community#Web-Archiving-Projects) by being a simple, one-shot CLI interface for users to ingest bulk feeds of URLs over extended periods, as opposed to being a backend service that ingests individual, manually-submitted URLs from a web UI. However, we also have the option to add urls via a web interface through our Django frontend.
         
         
         #### Private Local Archives vs Centralized Public Archives
         #### Private Local Archives vs Centralized Public Archives
         
         
@@ -351,18 +359,18 @@ Description: <div align="center">
         
         
         <img src="https://i.imgur.com/0ZOmOvN.png" width="14%" align="right"/>
         <img src="https://i.imgur.com/0ZOmOvN.png" width="14%" align="right"/>
         
         
-        - [Community Wiki](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community)
-          - [The Master Lists](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community#The-Master-Lists)  
+        - [Community Wiki](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community)
+          - [The Master Lists](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community#The-Master-Lists)  
             _Community-maintained indexes of archiving tools and institutions._
             _Community-maintained indexes of archiving tools and institutions._
-          - [Web Archiving Software](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community#Web-Archiving-Projects)  
+          - [Web Archiving Software](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community#Web-Archiving-Projects)  
             _Open source tools and projects in the internet archiving space._
             _Open source tools and projects in the internet archiving space._
-          - [Reading List](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community#Reading-List)  
+          - [Reading List](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community#Reading-List)  
             _Articles, posts, and blogs relevant to ArchiveBox and web archiving in general._
             _Articles, posts, and blogs relevant to ArchiveBox and web archiving in general._
-          - [Communities](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community#Communities)  
+          - [Communities](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community#Communities)  
             _A collection of the most active internet archiving communities and initiatives._
             _A collection of the most active internet archiving communities and initiatives._
-        - Check out the ArchiveBox [Roadmap](https://github.com/pirate/ArchiveBox/wiki/Roadmap) and [Changelog](https://github.com/pirate/ArchiveBox/wiki/Changelog)
+        - Check out the ArchiveBox [Roadmap](https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap) and [Changelog](https://github.com/ArchiveBox/ArchiveBox/wiki/Changelog)
         - Learn why archiving the internet is important by reading the "[On the Importance of Web Archiving](https://parameters.ssrc.org/2018/09/on-the-importance-of-web-archiving/)" blog post.
         - Learn why archiving the internet is important by reading the "[On the Importance of Web Archiving](https://parameters.ssrc.org/2018/09/on-the-importance-of-web-archiving/)" blog post.
-        - Or reach out to me for questions and comments via [@theSquashSH](https://twitter.com/thesquashSH) on Twitter.
+        - Or reach out to me for questions and comments via [@ArchiveBoxApp](https://twitter.com/ArchiveBoxApp) or [@theSquashSH](https://twitter.com/thesquashSH) on Twitter.
         
         
         ---
         ---
         
         
@@ -370,51 +378,51 @@ Description: <div align="center">
         
         
         <img src="https://read-the-docs-guidelines.readthedocs-hosted.com/_images/logo-dark.png" width="13%" align="right"/>
         <img src="https://read-the-docs-guidelines.readthedocs-hosted.com/_images/logo-dark.png" width="13%" align="right"/>
         
         
-        We use the [Github wiki system](https://github.com/pirate/ArchiveBox/wiki) and [Read the Docs](https://archivebox.readthedocs.io/en/latest/) (WIP) for documentation.
+        We use the [Github wiki system](https://github.com/ArchiveBox/ArchiveBox/wiki) and [Read the Docs](https://archivebox.readthedocs.io/en/latest/) (WIP) for documentation.
         
         
-        You can also access the docs locally by looking in the [`ArchiveBox/docs/`](https://github.com/pirate/ArchiveBox/wiki/Home) folder.
+        You can also access the docs locally by looking in the [`ArchiveBox/docs/`](https://github.com/ArchiveBox/ArchiveBox/wiki/Home) folder.
         
         
         ## Getting Started
         ## Getting Started
         
         
-        - [Quickstart](https://github.com/pirate/ArchiveBox/wiki/Quickstart)
-        - [Install](https://github.com/pirate/ArchiveBox/wiki/Install)
-        - [Docker](https://github.com/pirate/ArchiveBox/wiki/Docker)
+        - [Quickstart](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart)
+        - [Install](https://github.com/ArchiveBox/ArchiveBox/wiki/Install)
+        - [Docker](https://github.com/ArchiveBox/ArchiveBox/wiki/Docker)
         
         
         ## Reference
         ## Reference
         
         
-        - [Usage](https://github.com/pirate/ArchiveBox/wiki/Usage)
-        - [Configuration](https://github.com/pirate/ArchiveBox/wiki/Configuration)
-        - [Supported Sources](https://github.com/pirate/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive)
-        - [Supported Outputs](https://github.com/pirate/ArchiveBox/wiki#can-save-these-things-for-each-site)
-        - [Scheduled Archiving](https://github.com/pirate/ArchiveBox/wiki/Scheduled-Archiving)
-        - [Publishing Your Archive](https://github.com/pirate/ArchiveBox/wiki/Publishing-Your-Archive)
-        - [Chromium Install](https://github.com/pirate/ArchiveBox/wiki/Install-Chromium)
-        - [Security Overview](https://github.com/pirate/ArchiveBox/wiki/Security-Overview)
-        - [Troubleshooting](https://github.com/pirate/ArchiveBox/wiki/Troubleshooting)
+        - [Usage](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage)
+        - [Configuration](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration)
+        - [Supported Sources](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive)
+        - [Supported Outputs](https://github.com/ArchiveBox/ArchiveBox/wiki#can-save-these-things-for-each-site)
+        - [Scheduled Archiving](https://github.com/ArchiveBox/ArchiveBox/wiki/Scheduled-Archiving)
+        - [Publishing Your Archive](https://github.com/ArchiveBox/ArchiveBox/wiki/Publishing-Your-Archive)
+        - [Chromium Install](https://github.com/ArchiveBox/ArchiveBox/wiki/Install-Chromium)
+        - [Security Overview](https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview)
+        - [Troubleshooting](https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting)
         - [Python API](https://docs.archivebox.io/en/latest/modules.html)
         - [Python API](https://docs.archivebox.io/en/latest/modules.html)
         - REST API (coming soon...)
         - REST API (coming soon...)
         
         
         ## More Info
         ## More Info
         
         
-        - [Tickets](https://github.com/pirate/ArchiveBox/issues)
-        - [Roadmap](https://github.com/pirate/ArchiveBox/wiki/Roadmap)
-        - [Changelog](https://github.com/pirate/ArchiveBox/wiki/Changelog)
-        - [Donations](https://github.com/pirate/ArchiveBox/wiki/Donations)
-        - [Background & Motivation](https://github.com/pirate/ArchiveBox#background--motivation)
-        - [Web Archiving Community](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community)
+        - [Tickets](https://github.com/ArchiveBox/ArchiveBox/issues)
+        - [Roadmap](https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap)
+        - [Changelog](https://github.com/ArchiveBox/ArchiveBox/wiki/Changelog)
+        - [Donations](https://github.com/ArchiveBox/ArchiveBox/wiki/Donations)
+        - [Background & Motivation](https://github.com/ArchiveBox/ArchiveBox#background--motivation)
+        - [Web Archiving Community](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community)
         
         
         ---
         ---
         
         
         # ArchiveBox Development
         # ArchiveBox Development
         
         
-        All contributions to ArchiveBox are welcomed! Check our [issues](https://github.com/pirate/ArchiveBox/issues) and [Roadmap](https://github.com/pirate/ArchiveBox/wiki/Roadmap) for things to work on, and please open an issue to discuss your proposed implementation before working on things! Otherwise we may have to close your PR if it doesn't align with our roadmap.
+        All contributions to ArchiveBox are welcomed! Check our [issues](https://github.com/ArchiveBox/ArchiveBox/issues) and [Roadmap](https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap) for things to work on, and please open an issue to discuss your proposed implementation before working on things! Otherwise we may have to close your PR if it doesn't align with our roadmap.
         
         
         ### Set up the dev environment
         ### Set up the dev environment
         
         
         First, install the system dependencies from the "Bare Metal" section above.
         First, install the system dependencies from the "Bare Metal" section above.
         Then you can clone the ArchiveBox repo and install
         Then you can clone the ArchiveBox repo and install
         ```python3
         ```python3
-        git clone https://github.com/pirate/ArchiveBox
+        git clone https://github.com/ArchiveBox/ArchiveBox
         cd ArchiveBox
         cd ArchiveBox
         git checkout master  # or the branch you want to test
         git checkout master  # or the branch you want to test
         git pull
         git pull
@@ -494,8 +502,8 @@ Description: <div align="center">
         <a href="https://www.patreon.com/theSquashSH"><img src="https://img.shields.io/badge/Donate_to_support_development-via_Patreon-%23DD5D76.svg?style=flat"/></a>
         <a href="https://www.patreon.com/theSquashSH"><img src="https://img.shields.io/badge/Donate_to_support_development-via_Patreon-%23DD5D76.svg?style=flat"/></a>
         <br/>
         <br/>
         
         
-        <a href="https://twitter.com/thesquashSH"><img src="https://img.shields.io/badge/Tweet-%40theSquashSH-blue.svg?style=flat"/></a>
-        <a href="https://github.com/pirate/ArchiveBox"><img src="https://img.shields.io/github/stars/pirate/ArchiveBox.svg?style=flat&label=Star+on+Github"/></a>
+        <a href="https://twitter.com/ArchiveBoxApp"><img src="https://img.shields.io/badge/Tweet-%40ArchiveBoxApp-blue.svg?style=flat"/></a>
+        <a href="https://github.com/ArchiveBox/ArchiveBox"><img src="https://img.shields.io/github/stars/ArchiveBox/ArchiveBox.svg?style=flat&label=Star+on+Github"/></a>
         
         
         <br/><br/>
         <br/><br/>
         
         

+ 1 - 0
archivebox.egg-info/SOURCES.txt

@@ -6,6 +6,7 @@ archivebox/LICENSE
 archivebox/README.md
 archivebox/README.md
 archivebox/__init__.py
 archivebox/__init__.py
 archivebox/__main__.py
 archivebox/__main__.py
+archivebox/base32_crockford.py
 archivebox/config.py
 archivebox/config.py
 archivebox/config_stubs.py
 archivebox/config_stubs.py
 archivebox/logging_util.py
 archivebox/logging_util.py

+ 1 - 1
archivebox.egg-info/requires.txt

@@ -1,7 +1,6 @@
 requests==2.24.0
 requests==2.24.0
 atomicwrites==1.4.0
 atomicwrites==1.4.0
 mypy-extensions==0.4.3
 mypy-extensions==0.4.3
-base32-crockford==0.3.0
 django==3.1.3
 django==3.1.3
 django-extensions==3.0.3
 django-extensions==3.0.3
 dateparser
 dateparser
@@ -23,3 +22,4 @@ sphinx-rtd-theme
 recommonmark
 recommonmark
 pytest
 pytest
 bottle
 bottle
+stdeb

+ 172 - 0
archivebox/base32_crockford.py

@@ -0,0 +1,172 @@
+"""
+base32-crockford
+================
+
+A Python module implementing the alternate base32 encoding as described
+by Douglas Crockford at: http://www.crockford.com/wrmg/base32.html.
+
+He designed the encoding to:
+
+   * Be human and machine readable
+   * Be compact
+   * Be error resistant
+   * Be pronounceable
+
+It uses a symbol set of 10 digits and 22 letters, excluding I, L, O and
+U. Decoding is not case sensitive, and 'i' and 'l' are converted to '1'
+and 'o' is converted to '0'. Encoding uses only upper-case characters.
+
+Hyphens may be present in symbol strings to improve readability, and
+are removed when decoding.
+
+A check symbol can be appended to a symbol string to detect errors
+within the string.
+
+"""
+
+import re
+import sys
+
+PY3 = sys.version_info[0] == 3
+
+if not PY3:
+    import string as str
+
+
+__all__ = ["encode", "decode", "normalize"]
+
+
+if PY3:
+    string_types = (str,)
+else:
+    string_types = (basestring,)  # noqa
+
+# The encoded symbol space does not include I, L, O or U
+symbols = '0123456789ABCDEFGHJKMNPQRSTVWXYZ'
+# These five symbols are exclusively for checksum values
+check_symbols = '*~$=U'
+
+encode_symbols = dict((i, ch) for (i, ch) in enumerate(symbols + check_symbols))
+decode_symbols = dict((ch, i) for (i, ch) in enumerate(symbols + check_symbols))
+normalize_symbols = str.maketrans('IiLlOo', '111100')
+valid_symbols = re.compile('^[%s]+[%s]?$' % (symbols,
+                                             re.escape(check_symbols)))
+
+base = len(symbols)
+check_base = len(symbols + check_symbols)
+
+
+def encode(number, checksum=False, split=0):
+    """Encode an integer into a symbol string.
+
+    A ValueError is raised on invalid input.
+
+    If checksum is set to True, a check symbol will be
+    calculated and appended to the string.
+
+    If split is specified, the string will be divided into
+    clusters of that size separated by hyphens.
+
+    The encoded string is returned.
+    """
+    number = int(number)
+    if number < 0:
+        raise ValueError("number '%d' is not a positive integer" % number)
+
+    split = int(split)
+    if split < 0:
+        raise ValueError("split '%d' is not a positive integer" % split)
+
+    check_symbol = ''
+    if checksum:
+        check_symbol = encode_symbols[number % check_base]
+
+    if number == 0:
+        return '0' + check_symbol
+
+    symbol_string = ''
+    while number > 0:
+        remainder = number % base
+        number //= base
+        symbol_string = encode_symbols[remainder] + symbol_string
+    symbol_string = symbol_string + check_symbol
+
+    if split:
+        chunks = []
+        for pos in range(0, len(symbol_string), split):
+            chunks.append(symbol_string[pos:pos + split])
+        symbol_string = '-'.join(chunks)
+
+    return symbol_string
+
+
+def decode(symbol_string, checksum=False, strict=False):
+    """Decode an encoded symbol string.
+
+    If checksum is set to True, the string is assumed to have a
+    trailing check symbol which will be validated. If the
+    checksum validation fails, a ValueError is raised.
+
+    If strict is set to True, a ValueError is raised if the
+    normalization step requires changes to the string.
+
+    The decoded integer is returned.
+    """
+    symbol_string = normalize(symbol_string, strict=strict)
+    if checksum:
+        symbol_string, check_symbol = symbol_string[:-1], symbol_string[-1]
+
+    number = 0
+    for symbol in symbol_string:
+        number = number * base + decode_symbols[symbol]
+
+    if checksum:
+        check_value = decode_symbols[check_symbol]
+        modulo = number % check_base
+        if check_value != modulo:
+            raise ValueError("invalid check symbol '%s' for string '%s'" %
+                             (check_symbol, symbol_string))
+
+    return number
+
+
+def normalize(symbol_string, strict=False):
+    """Normalize an encoded symbol string.
+
+    Normalization provides error correction and prepares the
+    string for decoding. These transformations are applied:
+
+       1. Hyphens are removed
+       2. 'I', 'i', 'L' or 'l' are converted to '1'
+       3. 'O' or 'o' are converted to '0'
+       4. All characters are converted to uppercase
+
+    A TypeError is raised if an invalid string type is provided.
+
+    A ValueError is raised if the normalized string contains
+    invalid characters.
+
+    If the strict parameter is set to True, a ValueError is raised
+    if any of the above transformations are applied.
+
+    The normalized string is returned.
+    """
+    if isinstance(symbol_string, string_types):
+        if not PY3:
+            try:
+                symbol_string = symbol_string.encode('ascii')
+            except UnicodeEncodeError:
+                raise ValueError("string should only contain ASCII characters")
+    else:
+        raise TypeError("string is of invalid type %s" %
+                        symbol_string.__class__.__name__)
+
+    norm_string = symbol_string.replace('-', '').translate(normalize_symbols).upper()
+
+    if not valid_symbols.match(norm_string):
+        raise ValueError("string '%s' contains invalid characters" % norm_string)
+
+    if strict and norm_string != symbol_string:
+        raise ValueError("string '%s' requires normalization" % symbol_string)
+
+    return norm_string

+ 9 - 1
archivebox/cli/archivebox_add.py

@@ -62,10 +62,17 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
         help="Re-archive URLs from scratch, overwriting any existing files"
         help="Re-archive URLs from scratch, overwriting any existing files"
     )
     )
     parser.add_argument(
     parser.add_argument(
-        '--init', #'-i',
+        "--init", #'-i',
         action='store_true',
         action='store_true',
         help="Init/upgrade the curent data directory before adding",
         help="Init/upgrade the curent data directory before adding",
     )
     )
+    parser.add_argument(
+        "--extract",
+        type=str,
+        help="Pass a list of the extractors to be used. If the method name is not correct, it will be ignored. \
+              This does not take precedence over the configuration",
+        default=""
+    )
     command = parser.parse_args(args or ())
     command = parser.parse_args(args or ())
     urls = command.urls
     urls = command.urls
     stdin_urls = accept_stdin(stdin)
     stdin_urls = accept_stdin(stdin)
@@ -83,6 +90,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
         overwrite=command.overwrite,
         overwrite=command.overwrite,
         init=command.init,
         init=command.init,
         out_dir=pwd or OUTPUT_DIR,
         out_dir=pwd or OUTPUT_DIR,
+        extractors=command.extract,
     )
     )
 
 
 
 

+ 1 - 1
archivebox/cli/archivebox_list.py

@@ -98,7 +98,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
     parser.add_argument(
     parser.add_argument(
         '--filter-type',
         '--filter-type',
         type=str,
         type=str,
-        choices=('exact', 'substring', 'domain', 'regex'),
+        choices=('exact', 'substring', 'domain', 'regex','tag'),
         default='exact',
         default='exact',
         help='Type of pattern matching to use when filtering URLs',
         help='Type of pattern matching to use when filtering URLs',
     )
     )

+ 1 - 1
archivebox/cli/archivebox_remove.py

@@ -50,7 +50,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
     parser.add_argument(
     parser.add_argument(
         '--filter-type',
         '--filter-type',
         type=str,
         type=str,
-        choices=('exact', 'substring', 'domain', 'regex'),
+        choices=('exact', 'substring', 'domain', 'regex','tag'),
         default='exact',
         default='exact',
         help='Type of pattern matching to use when filtering URLs',
         help='Type of pattern matching to use when filtering URLs',
     )
     )

+ 45 - 19
archivebox/config.py

@@ -36,7 +36,7 @@ from .config_stubs import (
 # 
 # 
 
 
 # ******************************************************************************
 # ******************************************************************************
-# Documentation: https://github.com/pirate/ArchiveBox/wiki/Configuration
+# Documentation: https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration
 # Use the 'env' command to pass config options to ArchiveBox.  e.g.:
 # Use the 'env' command to pass config options to ArchiveBox.  e.g.:
 #     env USE_COLOR=True CHROME_BINARY=chromium archivebox add < example.html
 #     env USE_COLOR=True CHROME_BINARY=chromium archivebox add < example.html
 # ******************************************************************************
 # ******************************************************************************
@@ -98,8 +98,8 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
         'GIT_DOMAINS':              {'type': str,   'default': 'github.com,bitbucket.org,gitlab.com'},
         'GIT_DOMAINS':              {'type': str,   'default': 'github.com,bitbucket.org,gitlab.com'},
         'CHECK_SSL_VALIDITY':       {'type': bool,  'default': True},
         'CHECK_SSL_VALIDITY':       {'type': bool,  'default': True},
 
 
-        'CURL_USER_AGENT':          {'type': str,   'default': 'ArchiveBox/{VERSION} (+https://github.com/pirate/ArchiveBox/) curl/{CURL_VERSION}'},
-        'WGET_USER_AGENT':          {'type': str,   'default': 'ArchiveBox/{VERSION} (+https://github.com/pirate/ArchiveBox/) wget/{WGET_VERSION}'},
+        'CURL_USER_AGENT':          {'type': str,   'default': 'ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) curl/{CURL_VERSION}'},
+        'WGET_USER_AGENT':          {'type': str,   'default': 'ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) wget/{WGET_VERSION}'},
         'CHROME_USER_AGENT':        {'type': str,   'default': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'},
         'CHROME_USER_AGENT':        {'type': str,   'default': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'},
 
 
         'COOKIES_FILE':             {'type': str,   'default': None},
         'COOKIES_FILE':             {'type': str,   'default': None},
@@ -157,6 +157,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
         'READABILITY_BINARY':       {'type': str,   'default': 'readability-extractor'},
         'READABILITY_BINARY':       {'type': str,   'default': 'readability-extractor'},
         'MERCURY_BINARY':           {'type': str,   'default': 'mercury-parser'},
         'MERCURY_BINARY':           {'type': str,   'default': 'mercury-parser'},
         'YOUTUBEDL_BINARY':         {'type': str,   'default': 'youtube-dl'},
         'YOUTUBEDL_BINARY':         {'type': str,   'default': 'youtube-dl'},
+        'NODE_BINARY':              {'type': str,   'default': 'node'},
         'CHROME_BINARY':            {'type': str,   'default': None},
         'CHROME_BINARY':            {'type': str,   'default': None},
     },
     },
 }
 }
@@ -248,7 +249,7 @@ CONFIG_HEADER = (
 #    archivebox init
 #    archivebox init
 #
 #
 # A list of all possible config with documentation and examples can be found here:
 # A list of all possible config with documentation and examples can be found here:
-#    https://github.com/pirate/ArchiveBox/wiki/Configuration
+#    https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration
 
 
 """)
 """)
 
 
@@ -296,6 +297,7 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = {
     'SAVE_WARC':                {'default': lambda c: c['USE_WGET'] and c['SAVE_WARC']},
     'SAVE_WARC':                {'default': lambda c: c['USE_WGET'] and c['SAVE_WARC']},
     'WGET_ARGS':                {'default': lambda c: c['WGET_ARGS'] or []},
     'WGET_ARGS':                {'default': lambda c: c['WGET_ARGS'] or []},
 
 
+
     'USE_SINGLEFILE':           {'default': lambda c: c['USE_SINGLEFILE'] and c['SAVE_SINGLEFILE']},
     'USE_SINGLEFILE':           {'default': lambda c: c['USE_SINGLEFILE'] and c['SAVE_SINGLEFILE']},
     'SINGLEFILE_VERSION':       {'default': lambda c: bin_version(c['SINGLEFILE_BINARY']) if c['USE_SINGLEFILE'] else None},
     'SINGLEFILE_VERSION':       {'default': lambda c: bin_version(c['SINGLEFILE_BINARY']) if c['USE_SINGLEFILE'] else None},
 
 
@@ -318,6 +320,8 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = {
     'CHROME_BINARY':            {'default': lambda c: c['CHROME_BINARY'] if c['CHROME_BINARY'] else find_chrome_binary()},
     'CHROME_BINARY':            {'default': lambda c: c['CHROME_BINARY'] if c['CHROME_BINARY'] else find_chrome_binary()},
     'CHROME_VERSION':           {'default': lambda c: bin_version(c['CHROME_BINARY']) if c['USE_CHROME'] else None},
     'CHROME_VERSION':           {'default': lambda c: bin_version(c['CHROME_BINARY']) if c['USE_CHROME'] else None},
     'USE_NODE':                 {'default': lambda c: c['USE_NODE'] and (c['SAVE_READABILITY'] or c['SAVE_SINGLEFILE'])},
     'USE_NODE':                 {'default': lambda c: c['USE_NODE'] and (c['SAVE_READABILITY'] or c['SAVE_SINGLEFILE'])},
+    'NODE_VERSION':             {'default': lambda c: bin_version(c['NODE_BINARY']) if c['USE_NODE'] else None},
+    
     'SAVE_PDF':                 {'default': lambda c: c['USE_CHROME'] and c['SAVE_PDF']},
     'SAVE_PDF':                 {'default': lambda c: c['USE_CHROME'] and c['SAVE_PDF']},
     'SAVE_SCREENSHOT':          {'default': lambda c: c['USE_CHROME'] and c['SAVE_SCREENSHOT']},
     'SAVE_SCREENSHOT':          {'default': lambda c: c['USE_CHROME'] and c['SAVE_SCREENSHOT']},
     'SAVE_DOM':                 {'default': lambda c: c['USE_CHROME'] and c['SAVE_DOM']},
     'SAVE_DOM':                 {'default': lambda c: c['USE_CHROME'] and c['SAVE_DOM']},
@@ -505,7 +509,7 @@ def load_config(defaults: ConfigDefaultDict,
             stderr('    Check your config for mistakes and try again (your archive data is unaffected).')
             stderr('    Check your config for mistakes and try again (your archive data is unaffected).')
             stderr()
             stderr()
             stderr('    For config documentation and examples see:')
             stderr('    For config documentation and examples see:')
-            stderr('        https://github.com/pirate/ArchiveBox/wiki/Configuration')
+            stderr('        https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration')
             stderr()
             stderr()
             raise
             raise
             raise SystemExit(2)
             raise SystemExit(2)
@@ -565,7 +569,7 @@ def bin_version(binary: Optional[str]) -> Optional[str]:
         # stderr(f'        {binary} --version')
         # stderr(f'        {binary} --version')
         # stderr()
         # stderr()
         # stderr('    If you don\'t want to install it, you can disable it via config. See here for more info:')
         # stderr('    If you don\'t want to install it, you can disable it via config. See here for more info:')
-        # stderr('        https://github.com/pirate/ArchiveBox/wiki/Install')
+        # stderr('        https://github.com/ArchiveBox/ArchiveBox/wiki/Install')
     return None
     return None
 
 
 def bin_path(binary: Optional[str]) -> Optional[str]:
 def bin_path(binary: Optional[str]) -> Optional[str]:
@@ -643,12 +647,15 @@ def find_chrome_data_dir() -> Optional[str]:
     return None
     return None
 
 
 def wget_supports_compression(config):
 def wget_supports_compression(config):
-    cmd = [
-        config['WGET_BINARY'],
-        "--compression=auto",
-        "--help",
-    ]
-    return not run(cmd, stdout=DEVNULL, stderr=DEVNULL).returncode
+    try:
+        cmd = [
+            config['WGET_BINARY'],
+            "--compression=auto",
+            "--help",
+        ]
+        return not run(cmd, stdout=DEVNULL, stderr=DEVNULL).returncode
+    except (FileNotFoundError, OSError):
+        return False
 
 
 def get_code_locations(config: ConfigDict) -> SimpleConfigValueDict:
 def get_code_locations(config: ConfigDict) -> SimpleConfigValueDict:
     return {
     return {
@@ -662,6 +669,11 @@ def get_code_locations(config: ConfigDict) -> SimpleConfigValueDict:
             'enabled': True,
             'enabled': True,
             'is_valid': (config['TEMPLATES_DIR'] / 'static').exists(),
             'is_valid': (config['TEMPLATES_DIR'] / 'static').exists(),
         },
         },
+        # 'NODE_MODULES_DIR': {
+        #     'path': ,
+        #     'enabled': ,
+        #     'is_valid': (...).exists(),
+        # },
     }
     }
 
 
 def get_external_locations(config: ConfigDict) -> ConfigValue:
 def get_external_locations(config: ConfigDict) -> ConfigValue:
@@ -715,6 +727,13 @@ def get_data_locations(config: ConfigDict) -> ConfigValue:
 
 
 def get_dependency_info(config: ConfigDict) -> ConfigValue:
 def get_dependency_info(config: ConfigDict) -> ConfigValue:
     return {
     return {
+        'ARCHIVEBOX_BINARY': {
+            'path': bin_path(config['ARCHIVEBOX_BINARY']),
+            'version': config['VERSION'],
+            'hash': bin_hash(config['ARCHIVEBOX_BINARY']),
+            'enabled': True,
+            'is_valid': True,
+        },
         'PYTHON_BINARY': {
         'PYTHON_BINARY': {
             'path': bin_path(config['PYTHON_BINARY']),
             'path': bin_path(config['PYTHON_BINARY']),
             'version': config['PYTHON_VERSION'],
             'version': config['PYTHON_VERSION'],
@@ -743,6 +762,13 @@ def get_dependency_info(config: ConfigDict) -> ConfigValue:
             'enabled': config['USE_WGET'],
             'enabled': config['USE_WGET'],
             'is_valid': bool(config['WGET_VERSION']),
             'is_valid': bool(config['WGET_VERSION']),
         },
         },
+        'NODE_BINARY': {
+            'path': bin_path(config['NODE_BINARY']),
+            'version': config['NODE_VERSION'],
+            'hash': bin_hash(config['NODE_BINARY']),
+            'enabled': config['USE_NODE'],
+            'is_valid': bool(config['SINGLEFILE_VERSION']),
+        },
         'SINGLEFILE_BINARY': {
         'SINGLEFILE_BINARY': {
             'path': bin_path(config['SINGLEFILE_BINARY']),
             'path': bin_path(config['SINGLEFILE_BINARY']),
             'version': config['SINGLEFILE_VERSION'],
             'version': config['SINGLEFILE_VERSION'],
@@ -828,13 +854,13 @@ def check_system_config(config: ConfigDict=CONFIG) -> None:
     if config['USER'] == 'root':
     if config['USER'] == 'root':
         stderr('[!] ArchiveBox should never be run as root!', color='red')
         stderr('[!] ArchiveBox should never be run as root!', color='red')
         stderr('    For more information, see the security overview documentation:')
         stderr('    For more information, see the security overview documentation:')
-        stderr('        https://github.com/pirate/ArchiveBox/wiki/Security-Overview#do-not-run-as-root')
+        stderr('        https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#do-not-run-as-root')
         raise SystemExit(2)
         raise SystemExit(2)
 
 
     ### Check Python environment
     ### Check Python environment
     if sys.version_info[:3] < (3, 6, 0):
     if sys.version_info[:3] < (3, 6, 0):
         stderr(f'[X] Python version is not new enough: {config["PYTHON_VERSION"]} (>3.6 is required)', color='red')
         stderr(f'[X] Python version is not new enough: {config["PYTHON_VERSION"]} (>3.6 is required)', color='red')
-        stderr('    See https://github.com/pirate/ArchiveBox/wiki/Troubleshooting#python for help upgrading your Python installation.')
+        stderr('    See https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting#python for help upgrading your Python installation.')
         raise SystemExit(2)
         raise SystemExit(2)
 
 
     if config['PYTHON_ENCODING'] not in ('UTF-8', 'UTF8'):
     if config['PYTHON_ENCODING'] not in ('UTF-8', 'UTF8'):
@@ -854,7 +880,7 @@ def check_system_config(config: ConfigDict=CONFIG) -> None:
             stderr(f'    {config["CHROME_USER_DATA_DIR"]}')
             stderr(f'    {config["CHROME_USER_DATA_DIR"]}')
             stderr('    Make sure you set it to a Chrome user data directory containing a Default profile folder.')
             stderr('    Make sure you set it to a Chrome user data directory containing a Default profile folder.')
             stderr('    For more info see:')
             stderr('    For more info see:')
-            stderr('        https://github.com/pirate/ArchiveBox/wiki/Configuration#CHROME_USER_DATA_DIR')
+            stderr('        https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#CHROME_USER_DATA_DIR')
             if '/Default' in str(config['CHROME_USER_DATA_DIR']):
             if '/Default' in str(config['CHROME_USER_DATA_DIR']):
                 stderr()
                 stderr()
                 stderr('    Try removing /Default from the end e.g.:')
                 stderr('    Try removing /Default from the end e.g.:')
@@ -878,7 +904,7 @@ def check_dependencies(config: ConfigDict=CONFIG, show_help: bool=True) -> None:
                 )
                 )
             )
             )
             if dependency in ('SINGLEFILE_BINARY', 'READABILITY_BINARY', 'MERCURY_BINARY'):
             if dependency in ('SINGLEFILE_BINARY', 'READABILITY_BINARY', 'MERCURY_BINARY'):
-                hint(('npm install --prefix . "git+https://github.com/pirate/ArchiveBox.git"',
+                hint(('npm install --prefix . "git+https://github.com/ArchiveBox/ArchiveBox.git"',
                     f'or archivebox config --set SAVE_{dependency.rsplit("_", 1)[0]}=False to silence this warning',
                     f'or archivebox config --set SAVE_{dependency.rsplit("_", 1)[0]}=False to silence this warning',
                     ''), prefix='      ')
                     ''), prefix='      ')
         stderr('')
         stderr('')
@@ -889,7 +915,7 @@ def check_dependencies(config: ConfigDict=CONFIG, show_help: bool=True) -> None:
         stderr('    (Setting it to somewhere between 30 and 3000 seconds is recommended)')
         stderr('    (Setting it to somewhere between 30 and 3000 seconds is recommended)')
         stderr()
         stderr()
         stderr('    If you want to make ArchiveBox run faster, disable specific archive methods instead:')
         stderr('    If you want to make ArchiveBox run faster, disable specific archive methods instead:')
-        stderr('        https://github.com/pirate/ArchiveBox/wiki/Configuration#archive-method-toggles')
+        stderr('        https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles')
         stderr()
         stderr()
 
 
     elif config['USE_CHROME'] and config['TIMEOUT'] < 15:
     elif config['USE_CHROME'] and config['TIMEOUT'] < 15:
@@ -898,7 +924,7 @@ def check_dependencies(config: ConfigDict=CONFIG, show_help: bool=True) -> None:
         stderr('    (Setting it to somewhere between 30 and 300 seconds is recommended)')
         stderr('    (Setting it to somewhere between 30 and 300 seconds is recommended)')
         stderr()
         stderr()
         stderr('    If you want to make ArchiveBox run faster, disable specific archive methods instead:')
         stderr('    If you want to make ArchiveBox run faster, disable specific archive methods instead:')
-        stderr('        https://github.com/pirate/ArchiveBox/wiki/Configuration#archive-method-toggles')
+        stderr('        https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles')
         stderr()
         stderr()
 
 
     if config['USE_YOUTUBEDL'] and config['MEDIA_TIMEOUT'] < 20:
     if config['USE_YOUTUBEDL'] and config['MEDIA_TIMEOUT'] < 20:
@@ -907,7 +933,7 @@ def check_dependencies(config: ConfigDict=CONFIG, show_help: bool=True) -> None:
         stderr('    (Setting it somewhere over 60 seconds is recommended)')
         stderr('    (Setting it somewhere over 60 seconds is recommended)')
         stderr()
         stderr()
         stderr('    If you want to disable media archiving entirely, set SAVE_MEDIA=False instead:')
         stderr('    If you want to disable media archiving entirely, set SAVE_MEDIA=False instead:')
-        stderr('        https://github.com/pirate/ArchiveBox/wiki/Configuration#save_media')
+        stderr('        https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#save_media')
         stderr()
         stderr()
         
         
 def check_data_folder(out_dir: Optional[str]=None, config: ConfigDict=CONFIG) -> None:
 def check_data_folder(out_dir: Optional[str]=None, config: ConfigDict=CONFIG) -> None:

+ 1 - 1
archivebox/core/admin.py

@@ -86,7 +86,7 @@ class SnapshotAdmin(admin.ModelAdmin):
     list_display = ('added', 'title_str', 'url_str', 'files', 'size')
     list_display = ('added', 'title_str', 'url_str', 'files', 'size')
     sort_fields = ('title_str', 'url_str', 'added')
     sort_fields = ('title_str', 'url_str', 'added')
     readonly_fields = ('id', 'url', 'timestamp', 'num_outputs', 'is_archived', 'url_hash', 'added', 'updated')
     readonly_fields = ('id', 'url', 'timestamp', 'num_outputs', 'is_archived', 'url_hash', 'added', 'updated')
-    search_fields = ('url', 'timestamp', 'title', 'tags')
+    search_fields = ['url', 'timestamp', 'title', 'tags__name']
     fields = (*readonly_fields, 'title', 'tags')
     fields = (*readonly_fields, 'title', 'tags')
     list_filter = ('added', 'updated', 'tags')
     list_filter = ('added', 'updated', 'tags')
     ordering = ['-added']
     ordering = ['-added']

+ 1 - 1
archivebox/core/urls.py

@@ -14,7 +14,7 @@ urlpatterns = [
     path('robots.txt', static.serve, {'document_root': settings.OUTPUT_DIR, 'path': 'robots.txt'}),
     path('robots.txt', static.serve, {'document_root': settings.OUTPUT_DIR, 'path': 'robots.txt'}),
     path('favicon.ico', static.serve, {'document_root': settings.OUTPUT_DIR, 'path': 'favicon.ico'}),
     path('favicon.ico', static.serve, {'document_root': settings.OUTPUT_DIR, 'path': 'favicon.ico'}),
 
 
-    path('docs/', RedirectView.as_view(url='https://github.com/pirate/ArchiveBox/wiki'), name='Docs'),
+    path('docs/', RedirectView.as_view(url='https://github.com/ArchiveBox/ArchiveBox/wiki'), name='Docs'),
 
 
     path('archive/', RedirectView.as_view(url='/')),
     path('archive/', RedirectView.as_view(url='/')),
     path('archive/<path:path>', LinkDetails.as_view(), name='LinkAssets'),
     path('archive/<path:path>', LinkDetails.as_view(), name='LinkAssets'),

+ 1 - 0
archivebox/index/__init__.py

@@ -361,6 +361,7 @@ LINK_FILTERS = {
     'substring': lambda pattern: Q(url__icontains=pattern),
     'substring': lambda pattern: Q(url__icontains=pattern),
     'regex': lambda pattern: Q(url__iregex=pattern),
     'regex': lambda pattern: Q(url__iregex=pattern),
     'domain': lambda pattern: Q(url__istartswith=f"http://{pattern}") | Q(url__istartswith=f"https://{pattern}") | Q(url__istartswith=f"ftp://{pattern}"),
     'domain': lambda pattern: Q(url__istartswith=f"http://{pattern}") | Q(url__istartswith=f"https://{pattern}") | Q(url__istartswith=f"ftp://{pattern}"),
+    'tag': lambda pattern: Q(tags__name=pattern),
 }
 }
 
 
 @enforce_types
 @enforce_types

+ 3 - 3
archivebox/index/json.py

@@ -32,9 +32,9 @@ MAIN_INDEX_HEADER = {
         'version': VERSION,
         'version': VERSION,
         'git_sha': GIT_SHA,
         'git_sha': GIT_SHA,
         'website': 'https://ArchiveBox.io',
         'website': 'https://ArchiveBox.io',
-        'docs': 'https://github.com/pirate/ArchiveBox/wiki',
-        'source': 'https://github.com/pirate/ArchiveBox',
-        'issues': 'https://github.com/pirate/ArchiveBox/issues',
+        'docs': 'https://github.com/ArchiveBox/ArchiveBox/wiki',
+        'source': 'https://github.com/ArchiveBox/ArchiveBox',
+        'issues': 'https://github.com/ArchiveBox/ArchiveBox/issues',
         'dependencies': DEPENDENCIES,
         'dependencies': DEPENDENCIES,
     },
     },
 }
 }

+ 1 - 1
archivebox/logging_util.py

@@ -447,7 +447,7 @@ def log_shell_welcome_msg():
     print('{green}from archivebox import *\n    {}{reset}'.format("\n    ".join(list_subcommands().keys()), **ANSI))
     print('{green}from archivebox import *\n    {}{reset}'.format("\n    ".join(list_subcommands().keys()), **ANSI))
     print()
     print()
     print('[i] Welcome to the ArchiveBox Shell!')
     print('[i] Welcome to the ArchiveBox Shell!')
-    print('    https://github.com/pirate/ArchiveBox/wiki/Usage#Shell-Usage')
+    print('    https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#Shell-Usage')
     print()
     print()
     print('    {lightred}Hint:{reset} Example use:'.format(**ANSI))
     print('    {lightred}Hint:{reset} Example use:'.format(**ANSI))
     print('        print(Snapshot.objects.filter(is_archived=True).count())')
     print('        print(Snapshot.objects.filter(is_archived=True).count())')

+ 20 - 8
archivebox/main.py

@@ -3,6 +3,7 @@ __package__ = 'archivebox'
 import os
 import os
 import sys
 import sys
 import shutil
 import shutil
+import platform
 from pathlib import Path
 from pathlib import Path
 from datetime import date
 from datetime import date
 
 
@@ -111,6 +112,7 @@ from .logging_util import (
 
 
 
 
 ALLOWED_IN_OUTPUT_DIR = {
 ALLOWED_IN_OUTPUT_DIR = {
+    'lost+found',
     '.DS_Store',
     '.DS_Store',
     '.venv',
     '.venv',
     'venv',
     'venv',
@@ -178,7 +180,7 @@ def help(out_dir: Path=OUTPUT_DIR) -> None:
     archivebox update --resume=15109948213.123
     archivebox update --resume=15109948213.123
 
 
 {lightred}Documentation:{reset}
 {lightred}Documentation:{reset}
-    https://github.com/pirate/ArchiveBox/wiki
+    https://github.com/ArchiveBox/ArchiveBox/wiki
 '''.format(VERSION, out_dir, COMMANDS_HELP_TEXT, **ANSI))
 '''.format(VERSION, out_dir, COMMANDS_HELP_TEXT, **ANSI))
     
     
     else:
     else:
@@ -197,7 +199,7 @@ def help(out_dir: Path=OUTPUT_DIR) -> None:
         print('    2. archivebox init')
         print('    2. archivebox init')
         print()
         print()
         print('For more information, see the documentation here:')
         print('For more information, see the documentation here:')
-        print('    https://github.com/pirate/ArchiveBox/wiki')
+        print('    https://github.com/ArchiveBox/ArchiveBox/wiki')
 
 
 
 
 @enforce_types
 @enforce_types
@@ -209,6 +211,8 @@ def version(quiet: bool=False,
         print(VERSION)
         print(VERSION)
     else:
     else:
         print('ArchiveBox v{}'.format(VERSION))
         print('ArchiveBox v{}'.format(VERSION))
+        p = platform.uname()
+        print(p.system, platform.platform(), p.machine)
         print()
         print()
 
 
         print('{white}[i] Dependency versions:{reset}'.format(**ANSI))
         print('{white}[i] Dependency versions:{reset}'.format(**ANSI))
@@ -525,11 +529,14 @@ def add(urls: Union[str, List[str]],
         index_only: bool=False,
         index_only: bool=False,
         overwrite: bool=False,
         overwrite: bool=False,
         init: bool=False,
         init: bool=False,
-        out_dir: Path=OUTPUT_DIR) -> List[Link]:
+        out_dir: Path=OUTPUT_DIR,
+        extractors: str="") -> List[Link]:
     """Add a new URL or list of URLs to your archive"""
     """Add a new URL or list of URLs to your archive"""
 
 
     assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)'
     assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)'
 
 
+    extractors = extractors.split(",") if extractors else []
+
     if init:
     if init:
         run_subcommand('init', stdin=None, pwd=out_dir)
         run_subcommand('init', stdin=None, pwd=out_dir)
 
 
@@ -567,12 +574,17 @@ def add(urls: Union[str, List[str]],
         return all_links
         return all_links
 
 
     # Run the archive methods for each link
     # Run the archive methods for each link
+    archive_kwargs = {
+        "out_dir": out_dir,
+    }
+    if extractors:
+        archive_kwargs["methods"] = extractors
     if update_all:
     if update_all:
-        archive_links(all_links, overwrite=overwrite, out_dir=out_dir)
+        archive_links(all_links, overwrite=overwrite, **archive_kwargs)
     elif overwrite:
     elif overwrite:
-        archive_links(imported_links, overwrite=True, out_dir=out_dir)
+        archive_links(imported_links, overwrite=True, **archive_kwargs)
     elif new_links:
     elif new_links:
-        archive_links(new_links, overwrite=False, out_dir=out_dir)
+        archive_links(new_links, overwrite=False, **archive_kwargs)
     
     
     return all_links
     return all_links
 
 
@@ -857,7 +869,7 @@ def config(config_options_str: Optional[str]=None,
                 stderr(f'    {line}')
                 stderr(f'    {line}')
                 raise SystemExit(2)
                 raise SystemExit(2)
 
 
-            raw_key, val = line.split('=')
+            raw_key, val = line.split('=', 1)
             raw_key = raw_key.upper().strip()
             raw_key = raw_key.upper().strip()
             key = get_real_name(raw_key)
             key = get_real_name(raw_key)
             if key != raw_key:
             if key != raw_key:
@@ -930,7 +942,7 @@ def schedule(add: bool=False,
 
 
     if every or add:
     if every or add:
         every = every or 'day'
         every = every or 'day'
-        quoted = lambda s: f'"{s}"' if s and ' ' in s else s
+        quoted = lambda s: f'"{s}"' if s and ' ' in str(s) else str(s)
         cmd = [
         cmd = [
             'cd',
             'cd',
             quoted(out_dir),
             quoted(out_dir),

+ 11 - 5
archivebox/system.py

@@ -39,11 +39,17 @@ def atomic_write(path: Union[Path, str], contents: Union[dict, str, bytes], over
     mode = 'wb+' if isinstance(contents, bytes) else 'w'
     mode = 'wb+' if isinstance(contents, bytes) else 'w'
 
 
     # print('\n> Atomic Write:', mode, path, len(contents), f'overwrite={overwrite}')
     # print('\n> Atomic Write:', mode, path, len(contents), f'overwrite={overwrite}')
-    with lib_atomic_write(path, mode=mode, overwrite=overwrite) as f:
-        if isinstance(contents, dict):
-            dump(contents, f, indent=4, sort_keys=True, cls=ExtendedEncoder)
-        elif isinstance(contents, (bytes, str)):
-            f.write(contents)
+    try:
+        with lib_atomic_write(path, mode=mode, overwrite=overwrite) as f:
+            if isinstance(contents, dict):
+                dump(contents, f, indent=4, sort_keys=True, cls=ExtendedEncoder)
+            elif isinstance(contents, (bytes, str)):
+                f.write(contents)
+    except OSError as e:
+        print(f"[X] OSError: Failed to write {path} with fcntl.F_FULLFSYNC. ({e})")
+        print("    For data integrity, ArchiveBox requires a filesystem that supports atomic writes.")
+        print("    Filesystems and network drives that don't implement FSYNC are incompatible and require workarounds.")
+        raise SystemExit(1)
     os.chmod(path, int(OUTPUT_PERMISSIONS, base=8))
     os.chmod(path, int(OUTPUT_PERMISSIONS, base=8))
 
 
 @enforce_types
 @enforce_types

+ 3 - 2
archivebox/themes/default/base.html

@@ -226,6 +226,7 @@
 
 
         .exists-False {
         .exists-False {
           opacity: 0.1;
           opacity: 0.1;
+          filter: grayscale(100%);
           pointer-events: none;
           pointer-events: none;
         }
         }
     </style>
     </style>
@@ -265,7 +266,7 @@
                     <div class="col-sm-10" style="text-align: right">
                     <div class="col-sm-10" style="text-align: right">
                         <a href="/add/">Add Links</a> &nbsp; | &nbsp;
                         <a href="/add/">Add Links</a> &nbsp; | &nbsp;
                         <a href="/admin/core/snapshot/">Admin</a> &nbsp; | &nbsp;
                         <a href="/admin/core/snapshot/">Admin</a> &nbsp; | &nbsp;
-                        <a href="https://github.com/pirate/ArchiveBox/wiki">Docs</a>
+                        <a href="https://github.com/ArchiveBox/ArchiveBox/wiki">Docs</a>
                     </div>
                     </div>
                 </div>
                 </div>
             </div>
             </div>
@@ -277,7 +278,7 @@
             <br />
             <br />
             <center>
             <center>
                 <small>
                 <small>
-                    Archive created using <a href="https://github.com/pirate/ArchiveBox" title="Github">ArchiveBox</a> &nbsp; |
+                    Archive created using <a href="https://github.com/ArchiveBox/ArchiveBox" title="Github">ArchiveBox</a> &nbsp; |
                     &nbsp;
                     &nbsp;
                     Download index as <a href="index.json" title="JSON summary of archived links.">JSON</a>
                     Download index as <a href="index.json" title="JSON summary of archived links.">JSON</a>
                     <br /><br />
                     <br /><br />

+ 3 - 3
archivebox/themes/default/main_index.html

@@ -223,7 +223,7 @@
                     <div class="col-sm-10" style="text-align: right">
                     <div class="col-sm-10" style="text-align: right">
                         <a href="/add/">Add Links</a> &nbsp; | &nbsp; 
                         <a href="/add/">Add Links</a> &nbsp; | &nbsp; 
                         <a href="/admin/core/snapshot/">Admin</a> &nbsp; | &nbsp; 
                         <a href="/admin/core/snapshot/">Admin</a> &nbsp; | &nbsp; 
-                        <a href="https://github.com/pirate/ArchiveBox/wiki">Docs</a>
+                        <a href="https://github.com/ArchiveBox/ArchiveBox/wiki">Docs</a>
                     </div>
                     </div>
                 </div>
                 </div>
             </div>
             </div>
@@ -266,8 +266,8 @@
             <br/>
             <br/>
             <center>
             <center>
                 <small>
                 <small>
-                    Archive created using <a href="https://github.com/pirate/ArchiveBox" title="Github">ArchiveBox</a>
-                    version <a href="https://github.com/pirate/ArchiveBox/tree/v{{VERSION}}" title="Git commit">v{{VERSION}}</a> &nbsp; | &nbsp; 
+                    Archive created using <a href="https://github.com/ArchiveBox/ArchiveBox" title="Github">ArchiveBox</a>
+                    version <a href="https://github.com/ArchiveBox/ArchiveBox/tree/v{{VERSION}}" title="Git commit">v{{VERSION}}</a> &nbsp; | &nbsp; 
                     Download index as <a href="index.json" title="JSON summary of archived links.">JSON</a>
                     Download index as <a href="index.json" title="JSON summary of archived links.">JSON</a>
                     <br/><br/>
                     <br/><br/>
                     {{FOOTER_INFO}}
                     {{FOOTER_INFO}}

+ 4 - 4
archivebox/themes/legacy/main_index.html

@@ -187,8 +187,8 @@
                         </a>
                         </a>
                     </div>
                     </div>
                     <div class="col-sm-10" style="text-align: right">
                     <div class="col-sm-10" style="text-align: right">
-                        <a href="https://github.com/pirate/ArchiveBox/wiki">Documentation</a> &nbsp; | &nbsp; 
-                        <a href="https://github.com/pirate/ArchiveBox">Source</a> &nbsp; | &nbsp; 
+                        <a href="https://github.com/ArchiveBox/ArchiveBox/wiki">Documentation</a> &nbsp; | &nbsp; 
+                        <a href="https://github.com/ArchiveBox/ArchiveBox">Source</a> &nbsp; | &nbsp; 
                         <a href="https://archivebox.io">Website</a>
                         <a href="https://archivebox.io">Website</a>
                     </div>
                     </div>
                 </div>
                 </div>
@@ -209,8 +209,8 @@
             <br/>
             <br/>
             <center>
             <center>
                 <small>
                 <small>
-                    Archive created using <a href="https://github.com/pirate/ArchiveBox" title="Github">ArchiveBox</a>
-                    version <a href="https://github.com/pirate/ArchiveBox/tree/v$version" title="Git commit">v$version</a> &nbsp; | &nbsp; 
+                    Archive created using <a href="https://github.com/ArchiveBox/ArchiveBox" title="Github">ArchiveBox</a>
+                    version <a href="https://github.com/ArchiveBox/ArchiveBox/tree/v$version" title="Git commit">v$version</a> &nbsp; | &nbsp; 
                     Download index as <a href="index.json" title="JSON summary of archived links.">JSON</a>
                     Download index as <a href="index.json" title="JSON summary of archived links.">JSON</a>
                     <br/><br/>
                     <br/><br/>
                     $footer_info
                     $footer_info

+ 1 - 1
archivebox/util.py

@@ -16,7 +16,7 @@ from dateparser import parse as dateparser
 
 
 import requests
 import requests
 from requests.exceptions import RequestException, ReadTimeout
 from requests.exceptions import RequestException, ReadTimeout
-from base32_crockford import encode as base32_encode                            # type: ignore
+from .base32_crockford import encode as base32_encode                            # type: ignore
 from w3lib.encoding import html_body_declared_encoding, http_content_type_encoding
 from w3lib.encoding import html_body_declared_encoding, http_content_type_encoding
 
 
 try:
 try:

+ 50 - 0
assets/css/style.scss

@@ -0,0 +1,50 @@
+---
+---
+
+@import "{{ site.theme }}";
+
+div.shell {
+    width: 80%;
+    max-width: 1300px;
+    min-width: 300px;
+}
+
+span.banner-fix {
+    width: 80%;
+    max-width: 1300px;
+    min-width: 300px;
+}
+
+header h1 {
+    background-color: #aa1f55;
+    padding-bottom: 15px;
+    font-weight: 200;
+}
+header h2 {
+    background-color: #aa1f55;
+    font-family: 'Open Sans';
+}
+
+#main_content div[align=center] h1 {
+    display: none;
+}
+#main_content img {
+    box-shadow: 4px 4px 4px rgba(0,0,0,0.1);
+    border-radius: 8px;
+    border: 0px;
+    vertical-align: top;
+}
+#main_content em img {
+    display: block;
+    margin-top: -83px;
+    padding: 0px;
+    margin-bottom: 20px;
+}
+
+#main_content img[alt=comparison] {
+    margin: 25px;
+}
+
+#forkme_banner {
+    opacity: 0.1;
+}

+ 3 - 0
bin/build.sh

@@ -14,8 +14,11 @@ REPO_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. && p
 
 
 cd "$REPO_DIR"
 cd "$REPO_DIR"
 
 
+# pipenv install --dev
+
 ./bin/build_docs.sh
 ./bin/build_docs.sh
 ./bin/build_pip.sh
 ./bin/build_pip.sh
+./bin/build_deb.sh
 ./bin/build_docker.sh
 ./bin/build_docker.sh
 
 
 echo "[√] Done. Install the built package by running:"
 echo "[√] Done. Install the built package by running:"

+ 42 - 0
bin/build_deb.sh

@@ -0,0 +1,42 @@
+#!/usr/bin/env bash
+#
+# Build the Debian source and binary packages for archivebox with stdeb, then
+# sign the generated source .changes file with the PGP key configured below.
+# The upload to the PPA is left commented out at the bottom of this script.
+
+### Bash Environment Setup
+# http://redsymbol.net/articles/unofficial-bash-strict-mode/
+# https://www.gnu.org/software/bash/manual/html_node/The-Set-Builtin.html
+# set -o xtrace
+set -o errexit
+set -o errtrace
+set -o nounset
+set -o pipefail
+# split words on newlines only
+IFS=$'\n'
+
+# repository root = parent of the directory containing this script
+REPO_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. && pwd )"
+
+source "$REPO_DIR/.venv/bin/activate"
+cd "$REPO_DIR"
+
+# upstream version is read from package.json; DEBIAN_VERSION is the packaging
+# revision that follows it in the generated file names
+VERSION="$(jq -r '.version' < "$REPO_DIR/package.json")"
+DEBIAN_VERSION="1"
+PGP_KEY_ID="7D5695D3B618872647861D51C38137A7C1675988"
+# make sure you have this in ~/.dput.cf:
+#     [archivebox-ppa]
+#     fqdn: ppa.launchpad.net
+#     method: ftp
+#     incoming: ~archivebox/ubuntu/archivebox/
+#     login: anonymous
+#     allow_unsigned_uploads: 0
+# NOTE(review): the stanza above is named `archivebox-ppa`, but the dput call
+# at the bottom of this script targets `archivebox` — confirm which name is
+# intended, since dput selects the upload host by stanza name.
+
+
+# cleanup build artifacts
+rm -Rf build deb_dist dist archivebox-*.tar.gz
+
+# build source and binary packages
+python3 setup.py --command-packages=stdeb.command \
+    sdist_dsc --debian-version="$DEBIAN_VERSION" \
+    bdist_deb
+
+# sign the build with your PGP key ID
+debsign -k "$PGP_KEY_ID" "deb_dist/archivebox_${VERSION}-${DEBIAN_VERSION}_source.changes"
+
+# push the build to launchpad ppa
+# dput archivebox "deb_dist/archivebox_${VERSION}-${DEBIAN_VERSION}_source.changes"

+ 2 - 0
bin/build_docker.sh

@@ -21,5 +21,7 @@ docker build . -t archivebox \
                -t archivebox:$VERSION \
                -t archivebox:$VERSION \
                -t docker.io/nikisweeting/archivebox:latest \
                -t docker.io/nikisweeting/archivebox:latest \
                -t docker.io/nikisweeting/archivebox:$VERSION \
                -t docker.io/nikisweeting/archivebox:$VERSION \
+               -t docker.io/archivebox/archivebox:latest \
+               -t docker.io/archivebox/archivebox:$VERSION \
                -t docker.pkg.github.com/pirate/archivebox/archivebox:latest \
                -t docker.pkg.github.com/pirate/archivebox/archivebox:latest \
                -t docker.pkg.github.com/pirate/archivebox/archivebox:$VERSION
                -t docker.pkg.github.com/pirate/archivebox/archivebox:$VERSION

+ 2 - 2
bin/docker_entrypoint.sh

@@ -9,8 +9,8 @@ GRID=$(stat --format="%g" "$DATA_DIR")
 
 
 # If user is not root, modify the archivebox user+files to have the same uid,gid
 # If user is not root, modify the archivebox user+files to have the same uid,gid
 if [[ "$USID" != 0 && "$GRID" != 0 ]]; then
 if [[ "$USID" != 0 && "$GRID" != 0 ]]; then
-    usermod -u "$USID" "$ARCHIVEBOX_USER"
-    groupmod -g "$GRID" "$ARCHIVEBOX_USER"
+    usermod -u "$USID" "$ARCHIVEBOX_USER" > /dev/null 2>&1
+    groupmod -g "$GRID" "$ARCHIVEBOX_USER" > /dev/null 2>&1
     chown -R "$USID":"$GRID" "/home/$ARCHIVEBOX_USER"
     chown -R "$USID":"$GRID" "/home/$ARCHIVEBOX_USER"
     chown "$USID":"$GRID" "$DATA_DIR"
     chown "$USID":"$GRID" "$DATA_DIR"
     chown "$USID":"$GRID" "$DATA_DIR/*" > /dev/null 2>&1 || true
     chown "$USID":"$GRID" "$DATA_DIR/*" > /dev/null 2>&1 || true

+ 7 - 3
bin/release.sh

@@ -12,9 +12,8 @@ IFS=$'\n'
 
 
 REPO_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. && pwd )"
 REPO_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. && pwd )"
 
 
-source "$REPO_DIR/.venv/bin/activate"
 cd "$REPO_DIR"
 cd "$REPO_DIR"
-
+source "./.venv/bin/activate"
 
 
 
 
 # Make sure git is clean
 # Make sure git is clean
@@ -42,6 +41,7 @@ echo "${contents}" > package.json
 # Build docs, python package, and docker image
 # Build docs, python package, and docker image
 ./bin/build_docs.sh
 ./bin/build_docs.sh
 ./bin/build_pip.sh
 ./bin/build_pip.sh
+./bin/build_deb.sh
 ./bin/build_docker.sh
 ./bin/build_docker.sh
 
 
 
 
@@ -64,10 +64,14 @@ python3 -m twine upload --repository testpypi dist/*
 echo "[^] Uploading to pypi.org"
 echo "[^] Uploading to pypi.org"
 python3 -m twine upload --repository pypi dist/*
 python3 -m twine upload --repository pypi dist/*
 
 
+echo "[^] Uploading to launchpad.net"
+dput archivebox "deb_dist/archivebox_${NEW_VERSION}-1_source.changes"
+
 echo "[^] Uploading docker image"
 echo "[^] Uploading docker image"
 # docker login --username=nikisweeting
 # docker login --username=nikisweeting
 # docker login docker.pkg.github.com --username=pirate
 # docker login docker.pkg.github.com --username=pirate
 docker push docker.io/nikisweeting/archivebox
 docker push docker.io/nikisweeting/archivebox
-docker push docker.pkg.github.com/pirate/archivebox/archivebox
+docker push docker.io/archivebox/archivebox
+docker push docker.pkg.github.com/archivebox/archivebox/archivebox
 
 
 echo "[√] Done. Published version v$NEW_VERSION"
 echo "[√] Done. Published version v$NEW_VERSION"

+ 13 - 8
bin/setup.sh

@@ -1,7 +1,7 @@
 #!/bin/bash
 #!/bin/bash
 # ArchiveBox Setup Script
 # ArchiveBox Setup Script
 # Nick Sweeting 2017 | MIT License
 # Nick Sweeting 2017 | MIT License
-# https://github.com/pirate/ArchiveBox
+# https://github.com/ArchiveBox/ArchiveBox
 
 
 echo "[i] ArchiveBox Setup Script 📦"
 echo "[i] ArchiveBox Setup Script 📦"
 echo ""
 echo ""
@@ -16,7 +16,7 @@ echo "        - youtube-dl"
 echo "        - chromium-browser  (skip this if Chrome/Chromium is already installed)"
 echo "        - chromium-browser  (skip this if Chrome/Chromium is already installed)"
 echo ""
 echo ""
 echo "    If you'd rather install these manually, you can find documentation here:"
 echo "    If you'd rather install these manually, you can find documentation here:"
-echo "        https://github.com/pirate/ArchiveBox/wiki/Install"
+echo "        https://github.com/ArchiveBox/ArchiveBox/wiki/Install"
 echo ""
 echo ""
 echo "Press enter to continue with the automatic install, or Ctrl+C to cancel..."
 echo "Press enter to continue with the automatic install, or Ctrl+C to cancel..."
 read
 read
@@ -31,14 +31,17 @@ if which apt-get > /dev/null; then
     apt install git python3 python3-pip python3-distutils wget curl youtube-dl
     apt install git python3 python3-pip python3-distutils wget curl youtube-dl
 
 
     if which google-chrome; then
     if which google-chrome; then
-        echo "[i] You already have google-chrome installed, if you would like to download chromium-browser instead (they work pretty much the same), follow the Manual Setup instructions"
+        echo "[i] You already have google-chrome installed, if you would like to download chromium instead (they work pretty much the same), follow the Manual Setup instructions"
         google-chrome --version
         google-chrome --version
     elif which chromium-browser; then
     elif which chromium-browser; then
         echo "[i] chromium-browser already installed, using existing installation."
         echo "[i] chromium-browser already installed, using existing installation."
         chromium-browser --version
         chromium-browser --version
+    elif which chromium; then
+        echo "[i] chromium already installed, using existing installation."
+        chromium --version
     else
     else
-        echo "[+] Installing chromium-browser..."
-        apt install chromium-browser
+        echo "[+] Installing chromium..."
+        apt install chromium
     fi
     fi
 
 
 # On Mac:
 # On Mac:
@@ -63,8 +66,10 @@ elif which brew > /dev/null; then   # 🐍 eye of newt
         echo "[√] Using existing /Applications/Chromium.app"
         echo "[√] Using existing /Applications/Chromium.app"
     elif which chromium-browser; then
     elif which chromium-browser; then
         echo "[√] Using existing $(which chromium-browser)"
         echo "[√] Using existing $(which chromium-browser)"
+    elif which chromium; then
+        echo "[√] Using existing $(which chromium)"
     else
     else
-        echo "[+] Installing chromium-browser..."
+        echo "[+] Installing chromium..."
         brew cask install chromium
         brew cask install chromium
     fi
     fi
 else
 else
@@ -78,7 +83,7 @@ else
     exit 1
     exit 1
 fi
 fi
 
 
-pip3 install --upgrade archivebox
+python3 -m pip install --upgrade archivebox
 
 
 # Check:
 # Check:
 echo ""
 echo ""
@@ -107,5 +112,5 @@ echo "---------------------------------------------------"
 echo "[X] Failed to install some dependencies! ‼️"
 echo "[X] Failed to install some dependencies! ‼️"
 echo "    - Try the Manual Setup instructions in the README.md"
 echo "    - Try the Manual Setup instructions in the README.md"
 echo "    - Try the Troubleshooting: Dependencies instructions in the README.md"
 echo "    - Try the Troubleshooting: Dependencies instructions in the README.md"
-echo "    - Open an issue on github to get help: https://github.com/pirate/ArchiveBox/issues"
+echo "    - Open an issue on github to get help: https://github.com/ArchiveBox/ArchiveBox/issues"
 exit 1
 exit 1

+ 3 - 3
docker-compose.yml

@@ -5,14 +5,14 @@
 #     docker-compose run archivebox add --depth=1 https://example.com/some/feed.rss
 #     docker-compose run archivebox add --depth=1 https://example.com/some/feed.rss
 #     docker-compose run archivebox config --set PUBLIC_INDEX=True
 #     docker-compose run archivebox config --set PUBLIC_INDEX=True
 # Documentation:
 # Documentation:
-#     https://github.com/pirate/ArchiveBox/wiki/Docker#docker-compose
+#     https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#docker-compose
 
 
 version: '3.7'
 version: '3.7'
 
 
 services:
 services:
     archivebox:
     archivebox:
         # build: .
         # build: .
-        image: ${DOCKER_IMAGE:-nikisweeting/archivebox:latest} 
+        image: ${DOCKER_IMAGE:-archivebox/archivebox:latest} 
         command: server 0.0.0.0:8000
         command: server 0.0.0.0:8000
         stdin_open: true
         stdin_open: true
         tty: true
         tty: true
@@ -30,7 +30,7 @@ services:
     # Example: Run scheduled imports in a docker instead of using cron on the
     # Example: Run scheduled imports in a docker instead of using cron on the
     # host machine, add tasks and see more info with archivebox schedule --help
     # host machine, add tasks and see more info with archivebox schedule --help
     # scheduler:
     # scheduler:
-    #    image: nikisweeting/archivebox:latest
+    #    image: archivebox/archivebox:latest
     #    command: schedule --foreground --every=day --depth=1 'https://getpocket.com/users/USERNAME/feed/all'
     #    command: schedule --foreground --every=day --depth=1 'https://getpocket.com/users/USERNAME/feed/all'
     #    environment:
     #    environment:
     #        - USE_COLOR=True
     #        - USE_COLOR=True

+ 1 - 1
docs

@@ -1 +1 @@
-Subproject commit c90af04d27c1d4b77a97f700beb7676ef3703ef0
+Subproject commit d5071d92367a91bb585abb5da7c65ebc61d0d7b0

+ 1 - 1
etc/ArchiveBox.conf.default

@@ -4,7 +4,7 @@
 # DO NOT EDIT THIS FILE DIRECTLY!
 # DO NOT EDIT THIS FILE DIRECTLY!
 #
 #
 # See the list of all the possible options, documentation, and examples here:
 # See the list of all the possible options, documentation, and examples here:
-#    https://github.com/pirate/ArchiveBox/wiki/Configuration
+#    https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration
 
 
 [GENERAL_CONFIG]
 [GENERAL_CONFIG]
 # OUTPUT_PERMISSIONS = 755
 # OUTPUT_PERMISSIONS = 755

BIN
icon.png


+ 1 - 1
package.json

@@ -1,6 +1,6 @@
 {
 {
   "name": "archivebox",
   "name": "archivebox",
-  "version": "0.4.21",
+  "version": "0.4.24",
   "description": "ArchiveBox: The self-hosted internet archive",
   "description": "ArchiveBox: The self-hosted internet archive",
   "author": "Nick Sweeting <[email protected]>",
   "author": "Nick Sweeting <[email protected]>",
   "license": "MIT",
   "license": "MIT",

+ 2 - 3
setup.py

@@ -9,7 +9,7 @@ DESCRIPTION = "The self-hosted internet archive."
 LICENSE = "MIT"
 LICENSE = "MIT"
 AUTHOR = "Nick Sweeting"
 AUTHOR = "Nick Sweeting"
 AUTHOR_EMAIL="[email protected]"
 AUTHOR_EMAIL="[email protected]"
-REPO_URL = "https://github.com/pirate/ArchiveBox"
+REPO_URL = "https://github.com/ArchiveBox/ArchiveBox"
 PROJECT_URLS = {
 PROJECT_URLS = {
     "Source":           f"{REPO_URL}",
     "Source":           f"{REPO_URL}",
     "Documentation":    f"{REPO_URL}/wiki",
     "Documentation":    f"{REPO_URL}/wiki",
@@ -51,10 +51,8 @@ setuptools.setup(
         "requests==2.24.0",
         "requests==2.24.0",
         "atomicwrites==1.4.0",
         "atomicwrites==1.4.0",
         "mypy-extensions==0.4.3",
         "mypy-extensions==0.4.3",
-        "base32-crockford==0.3.0",
         "django==3.1.3",
         "django==3.1.3",
         "django-extensions==3.0.3",
         "django-extensions==3.0.3",
-
         "dateparser",
         "dateparser",
         "ipython",
         "ipython",
         "youtube-dl",
         "youtube-dl",
@@ -80,6 +78,7 @@ setuptools.setup(
             "recommonmark",
             "recommonmark",
             "pytest",
             "pytest",
             "bottle",
             "bottle",
+            "stdeb",
         ],
         ],
         # 'redis': ['redis', 'django-redis'],
         # 'redis': ['redis', 'django-redis'],
         # 'pywb': ['pywb', 'redis'],
         # 'pywb': ['pywb', 'redis'],

+ 9 - 0
stdeb.cfg

@@ -0,0 +1,9 @@
+# Debian packaging metadata for the archivebox package.
+# NOTE(review): presumably read by the stdeb commands that bin/build_deb.sh
+# invokes through setup.py — confirm against the stdeb documentation.
+# NOTE(review): the `3`-suffixed keys look like the Python 3 package variants
+# of the unsuffixed ones — confirm against the stdeb documentation.
+[DEFAULT]
+Source: archivebox
+Package: archivebox
+Package3: archivebox
+Suite: focal
+Suite3: focal
+Build-Depends: dh-python, python3-pip, python3-setuptools, python3-wheel, python3-stdeb
+Depends3: nodejs, chromium-browser, wget, curl, git, ffmpeg, youtube-dl, python3-atomicwrites, python3-croniter, python3-crontab, python3-dateparser, python3-django, python3-django-extensions, python3-mypy-extensions, python3-requests, python3-w3lib
+XS-Python-Version: >= 3.7

+ 10 - 1
tests/test_add.py

@@ -81,4 +81,13 @@ def test_add_updates_history_json_index(tmp_path, process, disable_extractors_di
 
 
     with open(archived_item_path / "index.json", "r") as f:
     with open(archived_item_path / "index.json", "r") as f:
         output_json = json.load(f)
         output_json = json.load(f)
-    assert output_json["history"] != {}
+    assert output_json["history"] != {}
+
+def test_extract_input_uses_only_passed_extractors(tmp_path, process):
+    """With `--extract wget`, the snapshot has a `warc` entry and no `singlefile.html`."""
+    add_command = [
+        "archivebox", "add",
+        "http://127.0.0.1:8080/static/example.com.html",
+        "--extract", "wget",
+    ]
+    subprocess.run(add_command, capture_output=True)
+
+    # first path yielded by the recursive glob under archive/
+    archive_entries = list(tmp_path.glob('archive/**/*'))
+    snapshot_dir = archive_entries[0]
+
+    warc_output = snapshot_dir / "warc"
+    singlefile_output = snapshot_dir / "singlefile.html"
+    assert warc_output.exists()
+    assert not singlefile_output.exists()

+ 23 - 0
tests/test_remove.py

@@ -70,6 +70,29 @@ def test_remove_domain(tmp_path, process, disable_extractors_dict):
 
 
     assert count == 0
     assert count == 0
 
 
+
+def test_remove_tag(tmp_path, process, disable_extractors_dict):
+    """Removing by tag with `--delete` empties the archive dir and the snapshot table.
+
+    Two URLs are added, every snapshot is tagged `test-tag` directly in the
+    sqlite index, and `archivebox remove --filter-type=tag` is then expected
+    to leave no archive folders and no rows in `core_snapshot`.
+    """
+    subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True, env=disable_extractors_dict)
+    subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/iana.org.html'], capture_output=True, env=disable_extractors_dict)
+    assert list((tmp_path / "archive").iterdir()) != []
+
+    conn = sqlite3.connect("index.sqlite3")
+    try:
+        c = conn.cursor()
+        c.execute("INSERT INTO core_tag (id, name, slug) VALUES (2, 'test-tag', 'test-tag')")
+        # Materialize the ids first: the same cursor is reused for the inserts below.
+        snapshot_ids = c.execute("SELECT id from core_snapshot").fetchall()
+        c.executemany('INSERT INTO core_snapshot_tags (snapshot_id, tag_id) VALUES (?, 2)', snapshot_ids)
+        conn.commit()
+
+        subprocess.run(['archivebox', 'remove', '--filter-type=tag', 'test-tag', '--yes', '--delete'], capture_output=True)
+
+        assert len(list((tmp_path / "archive").iterdir())) == 0
+
+        count = c.execute("SELECT COUNT() from core_snapshot").fetchone()[0]
+        conn.commit()
+    finally:
+        # close the connection even when one of the assertions above fails
+        conn.close()
+
+    assert count == 0
+
 def test_remove_before(tmp_path, process, disable_extractors_dict):
 def test_remove_before(tmp_path, process, disable_extractors_dict):
     subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True, env=disable_extractors_dict)
     subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True, env=disable_extractors_dict)
     subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/iana.org.html'], capture_output=True, env=disable_extractors_dict)
     subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/iana.org.html'], capture_output=True, env=disable_extractors_dict)

+ 1 - 1
tests/test_title.py

@@ -5,7 +5,7 @@ from .fixtures import *
 
 
 def test_title_is_htmlencoded_in_index_html(tmp_path, process, disable_extractors_dict):
 def test_title_is_htmlencoded_in_index_html(tmp_path, process, disable_extractors_dict):
     """
     """
-    https://github.com/pirate/ArchiveBox/issues/330
+    https://github.com/ArchiveBox/ArchiveBox/issues/330
     Unencoded content should not be rendered as it facilitates xss injections
     Unencoded content should not be rendered as it facilitates xss injections
     and breaks the layout.
     and breaks the layout.
     """
     """