Browse Source

Merge pull request #652 from ArchiveBox/dev

Nick Sweeting 4 years ago
parent
commit
9766ea21a7

+ 1 - 1
.github/workflows/docker.yml

@@ -75,7 +75,7 @@ jobs:
           tags: ${{ steps.docker_meta.outputs.tags }}
           tags: ${{ steps.docker_meta.outputs.tags }}
           cache-from: type=local,src=/tmp/.buildx-cache
           cache-from: type=local,src=/tmp/.buildx-cache
           cache-to: type=local,dest=/tmp/.buildx-cache
           cache-to: type=local,dest=/tmp/.buildx-cache
-          platforms: linux/amd64,linux/386,linux/arm64,linux/arm/v7
+          platforms: linux/amd64,linux/arm64,linux/arm/v7
 
 
       - name: Image digest
       - name: Image digest
         run: echo ${{ steps.docker_build.outputs.digest }}
         run: echo ${{ steps.docker_build.outputs.digest }}

+ 2 - 0
Dockerfile

@@ -63,6 +63,7 @@ RUN curl -s https://deb.nodesource.com/gpgkey/nodesource.gpg.key | apt-key add -
     && apt-get update -qq \
     && apt-get update -qq \
     && apt-get install -qq -y --no-install-recommends \
     && apt-get install -qq -y --no-install-recommends \
         nodejs \
         nodejs \
+    # && npm install -g npm \
     && rm -rf /var/lib/apt/lists/*
     && rm -rf /var/lib/apt/lists/*
 
 
 # Install Node dependencies
 # Install Node dependencies
@@ -82,6 +83,7 @@ ADD ./pip_dist/archivebox.egg-info/requires.txt "$CODE_DIR/pip_dist/archivebox.e
 RUN apt-get update -qq \
 RUN apt-get update -qq \
     && apt-get install -qq -y --no-install-recommends \
     && apt-get install -qq -y --no-install-recommends \
         build-essential python-dev python3-dev \
         build-essential python-dev python3-dev \
+    # && pip install --upgrade pip \
     && grep -B 1000 -E '^$' "$CODE_DIR/pip_dist/archivebox.egg-info/requires.txt" | pip install --quiet -r /dev/stdin \
     && grep -B 1000 -E '^$' "$CODE_DIR/pip_dist/archivebox.egg-info/requires.txt" | pip install --quiet -r /dev/stdin \
     && pip install --quiet "sonic-client==0.0.5" \
     && pip install --quiet "sonic-client==0.0.5" \
     && apt-get purge -y build-essential python-dev python3-dev \
     && apt-get purge -y build-essential python-dev python3-dev \

+ 2 - 0
MANIFEST.in

@@ -2,3 +2,5 @@ graft archivebox
 global-exclude .DS_Store
 global-exclude .DS_Store
 global-exclude __pycache__
 global-exclude __pycache__
 global-exclude *.pyc
 global-exclude *.pyc
+
+prune tests/

+ 19 - 7
README.md

@@ -82,8 +82,9 @@ archivebox help
 <sub>. . . . . . . . . . . . . . . . . . . . . . . . . . . .</sub>
 <sub>. . . . . . . . . . . . . . . . . . . . . . . . . . . .</sub>
 <br/><br/>
 <br/><br/>
 <img src="https://i.imgur.com/njxgSbl.png" width="22%" alt="cli init screenshot" align="top">
 <img src="https://i.imgur.com/njxgSbl.png" width="22%" alt="cli init screenshot" align="top">
+<img src="https://i.imgur.com/lUuicew.png" width="22%" alt="cli init screenshot" align="top">
 <img src="https://i.imgur.com/p6wK6KM.png" width="22%" alt="server snapshot admin screenshot" align="top">
 <img src="https://i.imgur.com/p6wK6KM.png" width="22%" alt="server snapshot admin screenshot" align="top">
-<img src="https://i.imgur.com/RefWsXB.jpg" width="28.6%" alt="server snapshot details page screenshot" align="top"/>
+<img src="https://i.imgur.com/xHvQfon.png" width="28.6%" alt="server snapshot details page screenshot" align="top"/>
 <br/>
 <br/>
 <br/>
 <br/>
 <img src="https://i.imgur.com/T2UAGUD.png" width="49%" alt="grass"/><img src="https://i.imgur.com/T2UAGUD.png" width="49%" alt="grass"/>
 <img src="https://i.imgur.com/T2UAGUD.png" width="49%" alt="grass"/><img src="https://i.imgur.com/T2UAGUD.png" width="49%" alt="grass"/>
@@ -266,10 +267,7 @@ No matter which install method you choose, they all roughly follow this 3-step p
 <br/>
 <br/>
 
 
 <div align="center">
 <div align="center">
-<img src="https://i.imgur.com/lUuicew.png" width="22.4%" align="top">
-<img src="https://i.imgur.com/p6wK6KM.png" width="35.9%" align="top">
-<img src="https://i.imgur.com/pzq4uXq.png" width="29.7%" align="top">
-<br/><br/>
+<br/>
 <sub>. . . . . . . . . . . . . . . . . . . . . . . . . . . .</sub>
 <sub>. . . . . . . . . . . . . . . . . . . . . . . . . . . .</sub>
 <br/><br/>
 <br/><br/>
 <a href="https://archivebox.zervice.io">DEMO: <code>https://archivebox.zervice.io</code></a><br/>
 <a href="https://archivebox.zervice.io">DEMO: <code>https://archivebox.zervice.io</code></a><br/>
@@ -327,7 +325,15 @@ All of ArchiveBox's state (including the index, snapshot data, and config file)
 The on-disk layout is optimized to be easy to browse by hand and durable long-term. The main index is a standard sqlite3 database (it can also be exported as static JSON/HTML), and the archive snapshots are organized by date-added timestamp in the `archive/` subfolder. Each snapshot subfolder includes a static JSON and HTML index describing its contents, and the snapshot extractor outputs are plain files within the folder (e.g. `media/example.mp4`, `git/somerepo.git`, `static/someimage.png`, etc.)
 The on-disk layout is optimized to be easy to browse by hand and durable long-term. The main index is a standard sqlite3 database (it can also be exported as static JSON/HTML), and the archive snapshots are organized by date-added timestamp in the `archive/` subfolder. Each snapshot subfolder includes a static JSON and HTML index describing its contents, and the snapshot extractor outputs are plain files within the folder (e.g. `media/example.mp4`, `git/somerepo.git`, `static/someimage.png`, etc.)
 
 
 ```bash
 ```bash
- ls ./archive/<timestamp>/
+# to browse your index statically without running the archivebox server, run:
+archivebox list --html --with-headers > index.html
+archivebox list --json --with-headers > index.json
+
+# then open the static index in a browser
+open index.html
+
+# or browse the snapshots via filesystem directly
+ls ./archive/<timestamp>/
 ```
 ```
 
 
 - **Index:** `index.html` & `index.json` HTML and JSON index files containing metadata and details
 - **Index:** `index.html` & `index.json` HTML and JSON index files containing metadata and details
@@ -346,6 +352,12 @@ The on-disk layout is optimized to be easy to browse by hand and durable long-te
 
 
 It does everything out-of-the-box by default, but you can disable or tweak [individual archive methods](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration) via environment variables or config file.
 It does everything out-of-the-box by default, but you can disable or tweak [individual archive methods](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration) via environment variables or config file.
 
 
+```bash
+archivebox config --set SAVE_ARCHIVE_DOT_ORG=False
+archivebox config --set YOUTUBEDL_ARGS='--max-filesize=500m'
+archivebox config --help
+```
+
 <div align="center">
 <div align="center">
 <img src="https://i.imgur.com/ucyimDX.png" width="96%" alt="lego graphic">
 <img src="https://i.imgur.com/ucyimDX.png" width="96%" alt="lego graphic">
 </div>
 </div>
@@ -445,7 +457,7 @@ archivebox add 'https://example.com#2020-10-25'
 <img src="https://i.imgur.com/p6wK6KM.png" alt="archivebox server list">
 <img src="https://i.imgur.com/p6wK6KM.png" alt="archivebox server list">
 </td>
 </td>
 <td>
 <td>
-<img src="https://i.imgur.com/pzq4uXq.png" alt="archivebox server detail">
+<img src="https://i.imgur.com/xHvQfon.png" alt="archivebox server detail">
 </td>
 </td>
 </tr>
 </tr>
 </tbody>
 </tbody>

+ 1 - 1
archivebox/config.py

@@ -1079,6 +1079,6 @@ def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG,
         if check_db:
         if check_db:
             sql_index_path = Path(output_dir) / SQL_INDEX_FILENAME
             sql_index_path = Path(output_dir) / SQL_INDEX_FILENAME
             assert sql_index_path.exists(), (
             assert sql_index_path.exists(), (
-                f'No database file {SQL_INDEX_FILENAME} found in OUTPUT_DIR: {config["OUTPUT_DIR"]}')
+                f'No database file {SQL_INDEX_FILENAME} found in: {config["OUTPUT_DIR"]} (Are you in an ArchiveBox collection directory?)')
     except KeyboardInterrupt:
     except KeyboardInterrupt:
         raise SystemExit(2)
         raise SystemExit(2)

+ 19 - 2
archivebox/core/migrations/0007_archiveresult.py

@@ -36,8 +36,25 @@ def forwards_func(apps, schema_editor):
 
 
         for extractor in history:
         for extractor in history:
             for result in history[extractor]:
             for result in history[extractor]:
-                ArchiveResult.objects.create(extractor=extractor, snapshot=snapshot, cmd=result["cmd"], cmd_version=result["cmd_version"] or 'unknown', 
-                start_ts=result["start_ts"], end_ts=result["end_ts"], status=result["status"], pwd=result["pwd"], output=result["output"])
+                try:
+                    ArchiveResult.objects.create(
+                        extractor=extractor,
+                        snapshot=snapshot,
+                        pwd=result["pwd"],
+                        cmd=result.get("cmd") or [],
+                        cmd_version=result.get("cmd_version") or 'unknown',
+                        start_ts=result["start_ts"],
+                        end_ts=result["end_ts"],
+                        status=result["status"],
+                        output=result.get("output") or 'null',
+                    )
+                except Exception as e:
+                    print(
+                        '    ! Skipping import due to missing/invalid index.json:',
+                        out_dir,
+                        e,
+                        '(open an issue with this index.json for help)',
+                    )
 
 
 
 
 def verify_json_index_integrity(snapshot):
 def verify_json_index_integrity(snapshot):

+ 4 - 4
archivebox/index/schema.py

@@ -55,11 +55,11 @@ class ArchiveResult:
         assert isinstance(self.end_ts, datetime)
         assert isinstance(self.end_ts, datetime)
         assert isinstance(self.cmd, list)
         assert isinstance(self.cmd, list)
         assert all(isinstance(arg, str) and arg for arg in self.cmd)
         assert all(isinstance(arg, str) and arg for arg in self.cmd)
-        assert self.pwd is None or isinstance(self.pwd, str) and self.pwd
-        assert self.cmd_version is None or isinstance(self.cmd_version, str) and self.cmd_version
+
+        # TODO: replace empty strings in these three fields (pwd, cmd_version, output) with None / remove them from the DB
+        assert self.pwd is None or isinstance(self.pwd, str)
+        assert self.cmd_version is None or isinstance(self.cmd_version, str)
         assert self.output is None or isinstance(self.output, (str, Exception))
         assert self.output is None or isinstance(self.output, (str, Exception))
-        if isinstance(self.output, str):
-            assert self.output
 
 
     @classmethod
     @classmethod
     def guess_ts(_cls, dict_info):
     def guess_ts(_cls, dict_info):

+ 12 - 23
bin/build_deb.sh

@@ -10,14 +10,6 @@ set -o nounset
 set -o pipefail
 set -o pipefail
 IFS=$'\n'
 IFS=$'\n'
 
 
-REPO_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. && pwd )"
-
-if [[ -f "$REPO_DIR/.venv/bin/activate" ]]; then
-    source "$REPO_DIR/.venv/bin/activate"
-else
-    echo "[!] Warning: No virtualenv presesnt in $REPO_DIR.venv"
-fi
-cd "$REPO_DIR"
 
 
 CURRENT_PLAFORM="$(uname)"
 CURRENT_PLAFORM="$(uname)"
 REQUIRED_PLATFORM="Linux"
 REQUIRED_PLATFORM="Linux"
@@ -26,30 +18,27 @@ if [[ "$CURRENT_PLAFORM" != "$REQUIRED_PLATFORM" ]]; then
    exit 0
    exit 0
 fi
 fi
 
 
+
+REPO_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. && pwd )"
 VERSION="$(jq -r '.version' < "$REPO_DIR/package.json")"
 VERSION="$(jq -r '.version' < "$REPO_DIR/package.json")"
-DEBIAN_VERSION="1"
-PGP_KEY_ID="7D5695D3B618872647861D51C38137A7C1675988"
-# make sure you have this in ~/.dput.cf:
-#     [archivebox-ppa]
-#     fqdn: ppa.launchpad.net
-#     method: ftp
-#     incoming: ~archivebox/ubuntu/archivebox/
-#     login: anonymous
-#     allow_unsigned_uploads: 0
+DEBIAN_VERSION="${DEBIAN_VERSION:-1}"
+cd "$REPO_DIR"
+
 
 
+if [[ -f "$REPO_DIR/.venv/bin/activate" ]]; then
+    source "$REPO_DIR/.venv/bin/activate"
+else
+    echo "[!] Warning: No virtualenv presesnt in $REPO_DIR.venv"
+fi
 
 
 # cleanup build artifacts
 # cleanup build artifacts
 rm -Rf build deb_dist dist archivebox-*.tar.gz
 rm -Rf build deb_dist dist archivebox-*.tar.gz
 
 
-# make sure the stdeb.cfg file is up-to-date with all the dependencies
 
 
 # build source and binary packages
 # build source and binary packages
+# make sure the stdeb.cfg file is up-to-date with all the dependencies
 python3 setup.py --command-packages=stdeb.command \
 python3 setup.py --command-packages=stdeb.command \
     sdist_dsc --debian-version=$DEBIAN_VERSION \
     sdist_dsc --debian-version=$DEBIAN_VERSION \
     bdist_deb
     bdist_deb
 
 
-# sign the build with your PGP key ID
-debsign -k "$PGP_KEY_ID" "deb_dist/archivebox_${VERSION}-${DEBIAN_VERSION}_source.changes"
-
-# push the build to launchpad ppa
-# dput archivebox "deb_dist/archivebox_${VERSION}-${DEBIAN_VERSION}_source.changes"
+# should output deb_dist/archivebox_${VERSION}-${DEBIAN_VERSION}.{deb,changes,buildinfo,tar.gz} (e.g. archivebox_0.5.6-1.*)

+ 32 - 2
bin/release_deb.sh

@@ -10,11 +10,41 @@ set -o nounset
 set -o pipefail
 set -o pipefail
 IFS=$'\n'
 IFS=$'\n'
 
 
+
+CURRENT_PLAFORM="$(uname)"
+REQUIRED_PLATFORM="Linux"
+if [[ "$CURRENT_PLAFORM" != "$REQUIRED_PLATFORM" ]]; then
+   echo "[!] Skipping the Debian package build on $CURRENT_PLAFORM (it can only be run on $REQUIRED_PLATFORM)."
+   exit 0
+fi
+
+
 REPO_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. && pwd )"
 REPO_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. && pwd )"
 VERSION="$(jq -r '.version' < "$REPO_DIR/package.json")"
 VERSION="$(jq -r '.version' < "$REPO_DIR/package.json")"
-SHORT_VERSION="$(echo "$VERSION" | perl -pe 's/(\d+)\.(\d+)\.(\d+)/$1.$2/g')"
+DEBIAN_VERSION="${DEBIAN_VERSION:-1}"
 cd "$REPO_DIR"
 cd "$REPO_DIR"
 
 
 
 
+echo "[+] Loading PGP keys from env vars and filesystem..."
+# https://github.com/ArchiveBox/debian-archivebox/settings/secrets/actions
+PGP_KEY_ID="${PGP_KEY_ID:-BC2D21B0D84E16C437300B8652423FBED1586F45}"
+[[ "${PGP_PUBLIC_KEY:-}" ]] && echo "$PGP_PUBLIC_KEY" > /tmp/archivebox_gpg.key.pub
+[[ "${PGP_PRIVATE_KEY:-}" ]] && echo "$PGP_PRIVATE_KEY" > /tmp/archivebox_gpg.key
+gpg --import /tmp/archivebox_gpg.key.pub || true
+gpg --import --allow-secret-key-import /tmp/archivebox_gpg.key || true
+echo "$PGP_KEY_ID:6:" | gpg --import-ownertrust || true
+
+echo "[*] Signing build and changelog with PGP..."
+debsign  --re-sign -k "$PGP_KEY_ID" "deb_dist/archivebox_${VERSION}-${DEBIAN_VERSION}_source.changes"
+
+# make sure you have this in ~/.dput.cf:
+#     [archivebox-ppa]
+#     fqdn: ppa.launchpad.net
+#     method: ftp
+#     incoming: ~archivebox/ubuntu/archivebox/
+#     login: anonymous
+#     allow_unsigned_uploads: 0
+
+
 echo "[^] Uploading to launchpad.net"
 echo "[^] Uploading to launchpad.net"
-dput archivebox "deb_dist/archivebox_${VERSION}-1_source.changes"
+dput -f archivebox "deb_dist/archivebox_${VERSION}-${DEBIAN_VERSION}_source.changes"

+ 1 - 0
bin/release_docker.sh

@@ -19,6 +19,7 @@ cd "$REPO_DIR"
 echo "[^] Uploading docker image"
 echo "[^] Uploading docker image"
 # docker login --username=nikisweeting
 # docker login --username=nikisweeting
 # docker login docker.pkg.github.com --username=pirate
 # docker login docker.pkg.github.com --username=pirate
+docker push archivebox/archivebox:$VERSION archivebox/archivebox:$SHORT_VERSION archivebox/archivebox:latest
 docker push docker.io/nikisweeting/archivebox
 docker push docker.io/nikisweeting/archivebox
 docker push docker.io/archivebox/archivebox
 docker push docker.io/archivebox/archivebox
 docker push docker.pkg.github.com/archivebox/archivebox/archivebox
 docker push docker.pkg.github.com/archivebox/archivebox/archivebox

+ 1 - 1
package.json

@@ -1,6 +1,6 @@
 {
 {
   "name": "archivebox",
   "name": "archivebox",
-  "version": "0.5.4",
+  "version": "0.5.6",
   "description": "ArchiveBox: The self-hosted internet archive",
   "description": "ArchiveBox: The self-hosted internet archive",
   "author": "Nick Sweeting <[email protected]>",
   "author": "Nick Sweeting <[email protected]>",
   "license": "MIT",
   "license": "MIT",

+ 3 - 4
setup.py

@@ -33,11 +33,10 @@ VERSION = json.loads((PACKAGE_DIR / "package.json").read_text().strip())['versio
 # print('>', sys.executable, *sys.argv)
 # print('>', sys.executable, *sys.argv)
 
 
 
 
-class CustomTest(test):
+class DisabledTestCommand(test):
     def run(self):
     def run(self):
         # setup.py test is deprecated, disable it here by force so stdeb doesnt run it
         # setup.py test is deprecated, disable it here by force so stdeb doesnt run it
-        #super().run()
-        pass
+        print('Use the ./bin/test.sh script to run tests, not setup.py test.')
 
 
 
 
 setuptools.setup(
 setuptools.setup(
@@ -129,6 +128,6 @@ setuptools.setup(
         "Typing :: Typed",
         "Typing :: Typed",
     ],
     ],
     cmdclass={
     cmdclass={
-        "test": CustomTest,
+        "test": DisabledTestCommand,
     },
     },
 )
 )

+ 1 - 0
stdeb.cfg

@@ -7,3 +7,4 @@ Suite3: focal
 Build-Depends: dh-python, python3-pip, python3-setuptools, python3-wheel, python3-stdeb
 Build-Depends: dh-python, python3-pip, python3-setuptools, python3-wheel, python3-stdeb
 Depends3: nodejs, chromium-browser, wget, curl, git, ffmpeg, youtube-dl, python3-atomicwrites, python3-croniter, python3-crontab, python3-dateparser, python3-django, python3-django-extensions, python3-django-jsonfield, python3-mypy-extensions, python3-requests, python3-w3lib, ripgrep
 Depends3: nodejs, chromium-browser, wget, curl, git, ffmpeg, youtube-dl, python3-atomicwrites, python3-croniter, python3-crontab, python3-dateparser, python3-django, python3-django-extensions, python3-django-jsonfield, python3-mypy-extensions, python3-requests, python3-w3lib, ripgrep
 XS-Python-Version: >= 3.7
 XS-Python-Version: >= 3.7
+Setup-Env-Vars: DEB_BUILD_OPTIONS=nocheck