5 years ago · 83693a5c03
--- a/.gitignore
+++ b/.gitignore
@@ -4,13 +4,21 @@
 
				 __pycache__/
			
 
				 .mypy_cache/
			
 
				 
			
 
				+# Python and Node dependencies
			
 
				 venv/
			
 
				 .venv/
			
 
				 .docker-venv/
			
 
				+node_modules/
			
 
				 
			
 
				+# Packaging artifacts
			
 
				+archivebox-*.tar.gz
			
 
				 build/
			
 
				+deb_dist/
			
 
				 dist/
			
 
				-node_modules/
			
 
				 
			
 
				+# Data folders
			
 
				 data/
			
 
				+data1/
			
 
				+data2/
			
 
				+data3/
			
 
				 output/
			
--- a/archivebox.egg-info/PKG-INFO
+++ b/archivebox.egg-info/PKG-INFO
@@ -14,7 +14,7 @@ Project-URL: Roadmap, https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap
 
				 Project-URL: Community, https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community
			
 
				 Project-URL: Donate, https://github.com/ArchiveBox/ArchiveBox/wiki/Donations
			
 
				 Description: <div align="center">
			
 
				-        <img src="https://i.imgur.com/4nkFjdv.png" height="80px">
			
 
				+        <em><img src="https://i.imgur.com/5B48E3N.png" height="90px"></em>
			
 
				         <h1>ArchiveBox<br/><sub>The open-source self-hosted web archive.</sub></h1>
			
 
				         
			
 
				         ▶️ <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart">Quickstart</a> |
			
@@ -41,7 +41,7 @@ Description: <div align="center">
 
				         <hr/>
			
 
				         </div>
			
 
				         
			
 
				-        ArchiveBox is a powerful self-hosted internet archiving solution written in Python 3. You feed it URLs of pages you want to archive, and it saves them to disk in a varitety of formats depending on the configuration and the content it detects. ArchiveBox can be installed via [Docker](https://docs.docker.com/get-docker/) or [`pip3`](https://wiki.python.org/moin/BeginnersGuide/Download).
			
 
				+        ArchiveBox is a powerful self-hosted internet archiving solution written in Python 3. You feed it URLs of pages you want to archive, and it saves them to disk in a variety of formats depending on the configuration and the content it detects. ArchiveBox can be installed via [Docker](https://docs.docker.com/get-docker/) (recommended) or [`pip`](https://www.python.org/downloads/). It works on macOS, Windows, and Linux/BSD (both armv7 and amd64).
			
 
				         
			
 
				         Once installed, URLs can be added via the command line `archivebox add` or the built-in Web UI `archivebox server`. It can ingest bookmarks from a service like Pocket/Pinboard, your entire browsing history, RSS feeds, or URLs one at a time.
			
 
				         
			
@@ -51,19 +51,27 @@ Description: <div align="center">
 
				         #### Quickstart
			
 
				         
			
 
				         ```bash
			
 
				-        docker run -d -it -v ~/archivebox:/data -p 8000:8000 archivebox/archivebox server --init 0.0.0.0:8000
			
 
				-        docker run -v ~/archivebox:/data -it archivebox/archivebox manage createsuperuser
			
 
				-        docker run -v ~/archivebox:/data -it archivebox/archivebox add 'https://example.com'
			
 
				+        # 1. Create a folder somewhere to hold your ArchiveBox data
			
 
				+        mkdir ~/archivebox && cd ~/archivebox
			
 
				+        docker run -v $PWD:/data -it archivebox/archivebox init
			
 
				+        
			
 
				+        # 2. Archive some URLs to get started
			
 
				+        docker run -v $PWD:/data -t archivebox/archivebox add https://github.com/ArchiveBox/ArchiveBox
			
 
				+        docker run -v $PWD:/data -t archivebox/archivebox add --depth=1 https://example.com
			
 
				         
			
 
				-        open http://127.0.0.1:8000/admin/login/  # then click "Add" in the navbar
			
 
				+        # 3. Then view the snapshots of the URLs you added via the self-hosted web UI
			
 
				+        docker run -v $PWD:/data -it archivebox/archivebox manage createsuperuser  # create an admin acct
			
 
				+        docker run -v $PWD:/data -p 8000:8000 archivebox/archivebox                # start the web server
			
 
				+        open http://127.0.0.1:8000/                                    # open the interactive admin panel
			
 
				+        ls archive/*/index.html                                        # or just browse snapshots on disk
			
 
				         ```
			
 
				         
			
 
				         <div align="center">
			
 
				         <img src="https://i.imgur.com/lUuicew.png" width="400px">
			
 
				         <br/>
			
 
				         
			
 
				-        [DEMO: archivebox.zervice.io/](https://archivebox.zervice.io)  
			
 
				-        For more information, see the [full Quickstart guide](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart), [Usage](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage), and [Configuration](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration) docs.
			
 
				+        <a href="https://archivebox.zervice.io">DEMO: archivebox.zervice.io/</a>  
			
 
				+        For more information, see the <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart">full Quickstart guide</a>, <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Usage">Usage</a>, and <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration">Configuration</a> docs.
			
 
				         </div>
			
 
				         
			
 
				         ---
			
@@ -82,7 +90,7 @@ Description: <div align="center">
 
				         open http://127.0.0.1:8000
			
 
				         ```
			
 
				         
			
 
				-        The CLI is considered "stable", and the ArchiveBox Python API and REST APIs are in "beta".
			
 
				+        The CLI is considered "stable", the ArchiveBox Python API and REST APIs are in "beta", and the [desktop app](https://github.com/ArchiveBox/desktop) is in "alpha" stage.
			
 
				         
			
 
				         At the end of the day, the goal is to sleep soundly knowing that the part of the internet you care about will be automatically preserved in multiple, durable long-term formats that will be accessible for decades (or longer). You can also self-host your archivebox server on a public domain to provide archive.org-style public access to your site snapshots.
			
 
				         
			
@@ -331,7 +339,7 @@ Description: <div align="center">
 
				         
			
 
				         ▶ **Check out our [community page](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community) for an index of web archiving initiatives and projects.**
			
 
				         
			
 
				-        <img src="https://i.imgur.com/4nkFjdv.png" width="10%" align="left"/> The aim of ArchiveBox is to go beyond what the Wayback Machine and other public archiving services can do, by adding a headless browser to replay sessions accurately, and by automatically extracting all the content in multiple redundant formats that will survive being passed down to historians and archivists through many generations.
			
 
				+        <img src="https://i.imgur.com/4nkFjdv.png" width="10%" align="left" alt="comparison"/> The aim of ArchiveBox is to go beyond what the Wayback Machine and other public archiving services can do, by adding a headless browser to replay sessions accurately, and by automatically extracting all the content in multiple redundant formats that will survive being passed down to historians and archivists through many generations.
			
 
				         
			
 
				         #### User Interface & Intended Purpose
			
 
				         
			
@@ -362,7 +370,7 @@ Description: <div align="center">
 
				             _A collection of the most active internet archiving communities and initiatives._
			
 
				         - Check out the ArchiveBox [Roadmap](https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap) and [Changelog](https://github.com/ArchiveBox/ArchiveBox/wiki/Changelog)
			
 
				         - Learn why archiving the internet is important by reading the "[On the Importance of Web Archiving](https://parameters.ssrc.org/2018/09/on-the-importance-of-web-archiving/)" blog post.
			
 
				-        - Or reach out to me for questions and comments via [@theSquashSH](https://twitter.com/thesquashSH) on Twitter.
			
 
				+        - Or reach out to me for questions and comments via [@ArchiveBoxApp](https://twitter.com/ArchiveBoxApp) or [@theSquashSH](https://twitter.com/thesquashSH) on Twitter.
			
 
				         
			
 
				         ---
			
 
				         
			
@@ -494,7 +502,7 @@ Description: <div align="center">
 
				         <a href="https://www.patreon.com/theSquashSH"><img src="https://img.shields.io/badge/Donate_to_support_development-via_Patreon-%23DD5D76.svg?style=flat"/></a>
			
 
				         <br/>
			
 
				         
			
 
				-        <a href="https://twitter.com/thesquashSH"><img src="https://img.shields.io/badge/Tweet-%40theSquashSH-blue.svg?style=flat"/></a>
			
 
				+        <a href="https://twitter.com/ArchiveBoxApp"><img src="https://img.shields.io/badge/Tweet-%40ArchiveBoxApp-blue.svg?style=flat"/></a>
			
 
				         <a href="https://github.com/ArchiveBox/ArchiveBox"><img src="https://img.shields.io/github/stars/ArchiveBox/ArchiveBox.svg?style=flat&label=Star+on+Github"/></a>
			
 
				         
			
 
				         <br/><br/>
			
--- a/archivebox.egg-info/SOURCES.txt
+++ b/archivebox.egg-info/SOURCES.txt
@@ -6,6 +6,7 @@ archivebox/LICENSE
 
				 archivebox/README.md
			
 
				 archivebox/__init__.py
			
 
				 archivebox/__main__.py
			
 
				+archivebox/base32_crockford.py
			
 
				 archivebox/config.py
			
 
				 archivebox/config_stubs.py
			
 
				 archivebox/logging_util.py
			
--- a/archivebox/base32_crockford.py
+++ b/archivebox/base32_crockford.py
@@ -0,0 +1,172 @@
 
				+"""
			
 
				+base32-crockford
			
 
				+================
			
 
				+
			
 
				+A Python module implementing the alternate base32 encoding as described
			
 
				+by Douglas Crockford at: http://www.crockford.com/wrmg/base32.html.
			
 
				+
			
 
				+He designed the encoding to:
			
 
				+
			
 
				+   * Be human and machine readable
			
 
				+   * Be compact
			
 
				+   * Be error resistant
			
 
				+   * Be pronounceable
			
 
				+
			
 
				+It uses a symbol set of 10 digits and 22 letters, excluding I, L, O and
			
 
				+U. Decoding is not case sensitive, and 'i' and 'l' are converted to '1'
			
 
				+and 'o' is converted to '0'. Encoding uses only upper-case characters.
			
 
				+
			
 
				+Hyphens may be present in symbol strings to improve readability, and
			
 
				+are removed when decoding.
			
 
				+
			
 
				+A check symbol can be appended to a symbol string to detect errors
			
 
				+within the string.
			
 
				+
			
 
				+"""
			
 
				+
			
 
				+import re
			
 
				+import sys
			
 
				+
			
 
				+PY3 = sys.version_info[0] == 3
			
 
				+
			
 
				+if not PY3:
			
 
				+    import string as str
			
 
				+
			
 
				+
			
 
				+__all__ = ["encode", "decode", "normalize"]
			
 
				+
			
 
				+
			
 
				+if PY3:
			
 
				+    string_types = str,
			
 
				+else:
			
 
				+    string_types = basestring,
			
 
				+
			
 
				+# The encoded symbol space does not include I, L, O or U
			
 
				+symbols = '0123456789ABCDEFGHJKMNPQRSTVWXYZ'
			
 
				+# These five symbols are exclusively for checksum values
			
 
				+check_symbols = '*~$=U'
			
 
				+
			
 
				+encode_symbols = dict((i, ch) for (i, ch) in enumerate(symbols + check_symbols))
			
 
				+decode_symbols = dict((ch, i) for (i, ch) in enumerate(symbols + check_symbols))
			
 
				+normalize_symbols = str.maketrans('IiLlOo', '111100')
			
 
				+valid_symbols = re.compile('^[%s]+[%s]?$' % (symbols,
			
 
				+                                             re.escape(check_symbols)))
			
 
				+
			
 
				+base = len(symbols)
			
 
				+check_base = len(symbols + check_symbols)
			
 
				+
			
 
				+
			
 
				+def encode(number, checksum=False, split=0):
			
 
				+    """Encode an integer into a symbol string.
			
 
				+
			
 
				+    A ValueError is raised on invalid input.
			
 
				+
			
 
				+    If checksum is set to True, a check symbol will be
			
 
				+    calculated and appended to the string.
			
 
				+
			
 
				+    If split is specified, the string will be divided into
			
 
				+    clusters of that size separated by hyphens.
			
 
				+
			
 
				+    The encoded string is returned.
			
 
				+    """
			
 
				+    number = int(number)
			
 
				+    if number < 0:
			
 
				+        raise ValueError("number '%d' is not a positive integer" % number)
			
 
				+
			
 
				+    split = int(split)
			
 
				+    if split < 0:
			
 
				+        raise ValueError("split '%d' is not a positive integer" % split)
			
 
				+
			
 
				+    check_symbol = ''
			
 
				+    if checksum:
			
 
				+        check_symbol = encode_symbols[number % check_base]
			
 
				+
			
 
				+    if number == 0:
			
 
				+        return '0' + check_symbol
			
 
				+
			
 
				+    symbol_string = ''
			
 
				+    while number > 0:
			
 
				+        remainder = number % base
			
 
				+        number //= base
			
 
				+        symbol_string = encode_symbols[remainder] + symbol_string
			
 
				+    symbol_string = symbol_string + check_symbol
			
 
				+
			
 
				+    if split:
			
 
				+        chunks = []
			
 
				+        for pos in range(0, len(symbol_string), split):
			
 
				+            chunks.append(symbol_string[pos:pos + split])
			
 
				+        symbol_string = '-'.join(chunks)
			
 
				+
			
 
				+    return symbol_string
			
 
				+
			
 
				+
			
 
				+def decode(symbol_string, checksum=False, strict=False):
			
 
				+    """Decode an encoded symbol string.
			
 
				+
			
 
				+    If checksum is set to True, the string is assumed to have a
			
 
				+    trailing check symbol which will be validated. If the
			
 
				+    checksum validation fails, a ValueError is raised.
			
 
				+
			
 
				+    If strict is set to True, a ValueError is raised if the
			
 
				+    normalization step requires changes to the string.
			
 
				+
			
 
				+    The decoded string is returned.
			
 
				+    """
			
 
				+    symbol_string = normalize(symbol_string, strict=strict)
			
 
				+    if checksum:
			
 
				+        symbol_string, check_symbol = symbol_string[:-1], symbol_string[-1]
			
 
				+
			
 
				+    number = 0
			
 
				+    for symbol in symbol_string:
			
 
				+        number = number * base + decode_symbols[symbol]
			
 
				+
			
 
				+    if checksum:
			
 
				+        check_value = decode_symbols[check_symbol]
			
 
				+        modulo = number % check_base
			
 
				+        if check_value != modulo:
			
 
				+            raise ValueError("invalid check symbol '%s' for string '%s'" %
			
 
				+                             (check_symbol, symbol_string))
			
 
				+
			
 
				+    return number
			
 
				+
			
 
				+
			
 
				+def normalize(symbol_string, strict=False):
			
 
				+    """Normalize an encoded symbol string.
			
 
				+
			
 
				+    Normalization provides error correction and prepares the
			
 
				+    string for decoding. These transformations are applied:
			
 
				+
			
 
				+       1. Hyphens are removed
			
 
				+       2. 'I', 'i', 'L' or 'l' are converted to '1'
			
 
				+       3. 'O' or 'o' are converted to '0'
			
 
				+       4. All characters are converted to uppercase
			
 
				+
			
 
				+    A TypeError is raised if an invalid string type is provided.
			
 
				+
			
 
				+    A ValueError is raised if the normalized string contains
			
 
				+    invalid characters.
			
 
				+
			
 
				+    If the strict parameter is set to True, a ValueError is raised
			
 
				+    if any of the above transformations are applied.
			
 
				+
			
 
				+    The normalized string is returned.
			
 
				+    """
			
 
				+    if isinstance(symbol_string, string_types):
			
 
				+        if not PY3:
			
 
				+            try:
			
 
				+                symbol_string = symbol_string.encode('ascii')
			
 
				+            except UnicodeEncodeError:
			
 
				+                raise ValueError("string should only contain ASCII characters")
			
 
				+    else:
			
 
				+        raise TypeError("string is of invalid type %s" %
			
 
				+                        symbol_string.__class__.__name__)
			
 
				+
			
 
				+    norm_string = symbol_string.replace('-', '').translate(normalize_symbols).upper()
			
 
				+
			
 
				+    if not valid_symbols.match(norm_string):
			
 
				+        raise ValueError("string '%s' contains invalid characters" % norm_string)
			
 
				+
			
 
				+    if strict and norm_string != symbol_string:
			
 
				+        raise ValueError("string '%s' requires normalization" % symbol_string)
			
 
				+
			
 
				+    return norm_string
			
--- a/archivebox/util.py
+++ b/archivebox/util.py
@@ -16,7 +16,7 @@ from dateparser import parse as dateparser
 
				 
			
 
				 import requests
			
 
				 from requests.exceptions import RequestException, ReadTimeout
			
 
				-from base32_crockford import encode as base32_encode                            # type: ignore
			
 
				+from .base32_crockford import encode as base32_encode                            # type: ignore
			
 
				 from w3lib.encoding import html_body_declared_encoding, http_content_type_encoding
			
 
				 
			
 
				 try:
			
--- a/assets/css/style.scss
+++ b/assets/css/style.scss
@@ -6,11 +6,13 @@
 
				 div.shell {
			
 
				     width: 80%;
			
 
				     max-width: 1300px;
			
 
				+    min-width: 300px;
			
 
				 }
			
 
				 
			
 
				 span.banner-fix {
			
 
				     width: 80%;
			
 
				     max-width: 1300px;
			
 
				+    min-width: 300px;
			
 
				 }
			
 
				 
			
 
				 header h1 {
			
--- a/bin/build.sh
+++ b/bin/build.sh
@@ -16,6 +16,7 @@ cd "$REPO_DIR"
 
				 
			
 
				 ./bin/build_docs.sh
			
 
				 ./bin/build_pip.sh
			
 
				+./bin/build_deb.sh
			
 
				 ./bin/build_docker.sh
			
 
				 
			
 
				 echo "[√] Done. Install the built package by running:"
			
--- a/bin/build_deb.sh
+++ b/bin/build_deb.sh
@@ -0,0 +1,42 @@
 
				+#!/usr/bin/env bash
			
 
				+
			
 
				+### Bash Environment Setup
			
 
				+# http://redsymbol.net/articles/unofficial-bash-strict-mode/
			
 
				+# https://www.gnu.org/software/bash/manual/html_node/The-Set-Builtin.html
			
 
				+# set -o xtrace
			
 
				+set -o errexit
			
 
				+set -o errtrace
			
 
				+set -o nounset
			
 
				+set -o pipefail
			
 
				+IFS=$'\n'
			
 
				+
			
 
				+REPO_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. && pwd )"
			
 
				+
			
 
				+source "$REPO_DIR/.venv/bin/activate"
			
 
				+cd "$REPO_DIR"
			
 
				+
			
 
				+VERSION="$(jq -r '.version' < "$REPO_DIR/package.json")"
			
 
				+DEBIAN_VERSION="1"
			
 
				+PGP_KEY_ID="7D5695D3B618872647861D51C38137A7C1675988"
			
 
				+# make sure you have this in ~/.dput.cf:
			
 
				+#     [archivebox-ppa]
			
 
				+#     fqdn: ppa.launchpad.net
			
 
				+#     method: ftp
			
 
				+#     incoming: ~archivebox/ubuntu/archivebox/
			
 
				+#     login: anonymous
			
 
				+#     allow_unsigned_uploads: 0
			
 
				+
			
 
				+
			
 
				+# cleanup build artifacts
			
 
				+rm -Rf build deb_dist dist archivebox-*.tar.gz
			
 
				+
			
 
				+# build source and binary packages
			
 
				+python3 setup.py --command-packages=stdeb.command \
			
 
				+    sdist_dsc --debian-version=$DEBIAN_VERSION \
			
 
				+    bdist_deb
			
 
				+
			
 
				+# sign the build with your PGP key ID
			
 
				+debsign -k "$PGP_KEY_ID" "deb_dist/archivebox_${VERSION}-${DEBIAN_VERSION}_source.changes"
			
 
				+
			
 
				+# push the build to launchpad ppa
			
 
				+# dput archivebox "deb_dist/archivebox_${VERSION}-${DEBIAN_VERSION}_source.changes"
			
--- a/bin/release.sh
+++ b/bin/release.sh
@@ -42,6 +42,7 @@ echo "${contents}" > package.json
 
				 # Build docs, python package, and docker image
			
 
				 ./bin/build_docs.sh
			
 
				 ./bin/build_pip.sh
			
 
				+./bin/build_deb.sh
			
 
				 ./bin/build_docker.sh
			
 
				 
			
 
				 
			
@@ -64,11 +65,14 @@ python3 -m twine upload --repository testpypi dist/*
 
				 echo "[^] Uploading to pypi.org"
			
 
				 python3 -m twine upload --repository pypi dist/*
			
 
				 
			
 
				+echo "[^] Uploading to launchpad.net"
			
 
				+python3 -m dput archivebox "deb_dist/archivebox_${NEW_VERSION}-1_source.changes"
			
 
				+
			
 
				 echo "[^] Uploading docker image"
			
 
				 # docker login --username=nikisweeting
			
 
				 # docker login docker.pkg.github.com --username=pirate
			
 
				 docker push docker.io/nikisweeting/archivebox
			
 
				 docker push docker.io/archivebox/archivebox
			
 
				-docker push docker.pkg.github.com/pirate/archivebox/archivebox
			
 
				+docker push docker.pkg.github.com/archivebox/archivebox/archivebox
			
 
				 
			
 
				 echo "[√] Done. Published version v$NEW_VERSION"
			
--- a/icon.png
+++ b/icon.png
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 
				 {
			
 
				   "name": "archivebox",
			
 
				-  "version": "0.4.21",
			
 
				+  "version": "0.4.22",
			
 
				   "description": "ArchiveBox: The self-hosted internet archive",
			
 
				   "author": "Nick Sweeting <[email protected]>",
			
 
				   "license": "MIT",
			
--- a/setup.py
+++ b/setup.py
@@ -51,7 +51,6 @@ setuptools.setup(
 
				         "requests==2.24.0",
			
 
				         "atomicwrites==1.4.0",
			
 
				         "mypy-extensions==0.4.3",
			
 
				-        "base32-crockford==0.3.0",
			
 
				         "django==3.0.8",
			
 
				         "django-extensions==3.0.3",
			
 
				 
			
@@ -80,6 +79,7 @@ setuptools.setup(
 
				             "recommonmark",
			
 
				             "pytest",
			
 
				             "bottle",
			
 
				+            "stdeb",
			
 
				         ],
			
 
				         # 'redis': ['redis', 'django-redis'],
			
 
				         # 'pywb': ['pywb', 'redis'],
			
--- a/stdeb.cfg
+++ b/stdeb.cfg
@@ -0,0 +1,6 @@
 
				+[DEFAULT]
			
 
				+Package: archivebox
			
 
				+Suite: focal
			
 
				+Build-Depends: dh-python
			
 
				+Depends: nodejs, chromium-browser, wget, curl, git, ffmpeg, youtube-dl, python3-atomicwrites, python3-croniter, python3-crontab, python3-dateparser, python3-django, python3-django-extensions, python3-mypy-extensions, python3-requests, python3-w3lib
			
 
				+XS-Python-Version: >= 3.7