2
0
Эх сурвалжийг харах

Merge branch 'dev' into plugins-browsertrix

Nick Sweeting 1 жил өмнө
parent
commit
fbb3c84be2

+ 1 - 1
.github/FUNDING.yml

@@ -1,3 +1,3 @@
 github: pirate
 github: pirate
 patreon: theSquashSH
 patreon: theSquashSH
-custom: ["https://twitter.com/ArchiveBoxApp", "https://paypal.me/NicholasSweeting", "https://www.blockchain.com/eth/address/0x5D4c34D4a121Fe08d1dDB7969F07550f2dB9f471", "https://www.blockchain.com/btc/address/1HuxXriPE2Bbnag3jJrqa3bkNHrs297dYH"]
+custom: ["https://hcb.hackclub.com/donations/start/archivebox", "https://paypal.me/NicholasSweeting"]

+ 7 - 0
.github/workflows/docker.yml

@@ -81,6 +81,13 @@ jobs:
 
 
       - name: Image digest
       - name: Image digest
         run: echo ${{ steps.docker_build.outputs.digest }}
         run: echo ${{ steps.docker_build.outputs.digest }}
+
+      - name: Update README
+        uses: peter-evans/dockerhub-description@v4
+        with:
+          username: ${{ secrets.DOCKER_USERNAME }}
+          password: ${{ secrets.DOCKER_PASSWORD }}
+          repository: archivebox/archivebox
        
        
       # This ugly bit is necessary if you don't want your cache to grow forever
       # This ugly bit is necessary if you don't want your cache to grow forever
       # until it hits GitHub's limit of 5GB.
       # until it hits GitHub's limit of 5GB.

+ 52 - 42
README.md

@@ -13,7 +13,7 @@
 <a href="https://github.com/ArchiveBox/ArchiveBox/blob/dev/LICENSE"><img src="https://img.shields.io/badge/Open_source-MIT-green.svg?logo=git&logoColor=green"/></a> <a href="https://github.com/ArchiveBox/ArchiveBox/commits/dev"><img src="https://img.shields.io/github/last-commit/ArchiveBox/ArchiveBox.svg?logo=Sublime+Text&logoColor=green&label=Active"/></a> &nbsp; <a href="https://github.com/ArchiveBox/ArchiveBox"><img src="https://img.shields.io/github/stars/ArchiveBox/ArchiveBox.svg?logo=github&label=Stars&logoColor=blue"/></a> &nbsp; <a href="https://pypi.org/project/archivebox/"><img src="https://img.shields.io/pypi/dm/archivebox?label=PyPI%20Installs&color=%235f7dae"/></a> <a href="https://chromewebstore.google.com/detail/archivebox-exporter/habonpimjphpdnmcfkaockjnffodikoj"><img src="https://img.shields.io/chrome-web-store/users/habonpimjphpdnmcfkaockjnffodikoj?label=Chrome%20Web%20Store&color=%231973e8"/></a> <a href="https://hub.docker.com/r/archivebox/archivebox"><img src="https://img.shields.io/docker/pulls/archivebox/archivebox.svg?label=Docker+Pulls"/></a>
 <a href="https://github.com/ArchiveBox/ArchiveBox/blob/dev/LICENSE"><img src="https://img.shields.io/badge/Open_source-MIT-green.svg?logo=git&logoColor=green"/></a> <a href="https://github.com/ArchiveBox/ArchiveBox/commits/dev"><img src="https://img.shields.io/github/last-commit/ArchiveBox/ArchiveBox.svg?logo=Sublime+Text&logoColor=green&label=Active"/></a> &nbsp; <a href="https://github.com/ArchiveBox/ArchiveBox"><img src="https://img.shields.io/github/stars/ArchiveBox/ArchiveBox.svg?logo=github&label=Stars&logoColor=blue"/></a> &nbsp; <a href="https://pypi.org/project/archivebox/"><img src="https://img.shields.io/pypi/dm/archivebox?label=PyPI%20Installs&color=%235f7dae"/></a> <a href="https://chromewebstore.google.com/detail/archivebox-exporter/habonpimjphpdnmcfkaockjnffodikoj"><img src="https://img.shields.io/chrome-web-store/users/habonpimjphpdnmcfkaockjnffodikoj?label=Chrome%20Web%20Store&color=%231973e8"/></a> <a href="https://hub.docker.com/r/archivebox/archivebox"><img src="https://img.shields.io/docker/pulls/archivebox/archivebox.svg?label=Docker+Pulls"/></a>
 
 
 <!--<pre lang="bash" align="left"><code style="white-space: pre-line; text-align: left" align="left">
 <!--<pre lang="bash" align="left"><code style="white-space: pre-line; text-align: left" align="left">
-curl -sSL 'https://get.archivebox.io' | sh    # (or see pip/brew/Docker instructions below)
+curl -fsSL 'https://get.archivebox.io' | sh    # (or see pip/brew/Docker instructions below)
 </code></pre>-->
 </code></pre>-->
 
 
 </div>
 </div>
@@ -25,7 +25,6 @@ curl -sSL 'https://get.archivebox.io' | sh    # (or see pip/brew/Docker instruct
 Without active preservation effort, everything on the internet eventually disappears or degrades. Archive.org does a great job as a centralized service, but saved URLs have to be public, and they can't save every type of content.
 Without active preservation effort, everything on the internet eventually disappears or degrades. Archive.org does a great job as a centralized service, but saved URLs have to be public, and they can't save every type of content.
 
 
 *ArchiveBox is an open source tool that lets organizations & individuals archive both public & private web content while retaining control over their data. It can be used to save copies of bookmarks, preserve evidence for legal cases, backup photos from FB/Insta/Flickr or media from YT/Soundcloud/etc., save research papers, and more...*
 *ArchiveBox is an open source tool that lets organizations & individuals archive both public & private web content while retaining control over their data. It can be used to save copies of bookmarks, preserve evidence for legal cases, backup photos from FB/Insta/Flickr or media from YT/Soundcloud/etc., save research papers, and more...*
-
 <br/>
 <br/>
 
 
 > ➡️ Get ArchiveBox with `pip install archivebox` on [Linux](#quickstart), [macOS](#quickstart), and [Windows](#quickstart) (WSL2), or via **[Docker](#quickstart)** ⭐️.  
 > ➡️ Get ArchiveBox with `pip install archivebox` on [Linux](#quickstart), [macOS](#quickstart), and [Windows](#quickstart) (WSL2), or via **[Docker](#quickstart)** ⭐️.  
@@ -51,10 +50,13 @@ It also detects any content featured *inside* pages & extracts it out into a fol
 - 💾 **Github**/**Gitlab**/etc. links ➡️ `clone of GIT source code`, `README`, `images`, ...
 - 💾 **Github**/**Gitlab**/etc. links ➡️ `clone of GIT source code`, `README`, `images`, ...
 - ✨ *and more, see [Output Formats](#output-formats) below...*
 - ✨ *and more, see [Output Formats](#output-formats) below...*
 
 
+You can run ArchiveBox as a Docker web app to manage these snapshots, or continue accessing the same collection using the `pip`-installed CLI, Python API, and SQLite3 APIs. 
+All the ways of using it are equivalent, and provide matching features like adding tags, scheduling regular crawls, viewing logs, and more...
+
 <br/>
 <br/>
 <hr/>
 <hr/>
 
 
-🛠️ ArchiveBox uses [standard tools](#dependencies) like Chrome, `wget`, & `yt-dlp`, and stores data in [ordinary files & folders](#archive-layout).  
+🛠️ ArchiveBox uses [standard tools](#dependencies) like Chrome, [`wget`](https://www.gnu.org/software/wget/), & [`yt-dlp`](https://github.com/yt-dlp/yt-dlp), and stores data in [ordinary files & folders](#archive-layout).  
 *(no complex proprietary formats, all data is readable without needing to run ArchiveBox)*
 *(no complex proprietary formats, all data is readable without needing to run ArchiveBox)*
 
 
 The goal is to sleep soundly knowing the part of the internet you care about will be automatically preserved in durable, easily accessible formats [for decades](#background--motivation) after it goes down.
 The goal is to sleep soundly knowing the part of the internet you care about will be automatically preserved in durable, easily accessible formats [for decades](#background--motivation) after it goes down.
@@ -70,10 +72,9 @@ The goal is to sleep soundly knowing the part of the internet you care about wil
 <details>
 <details>
 &nbsp; <summary><i>Expand for quick copy-pastable install commands...</i> &nbsp; ⤵️</summary>
 &nbsp; <summary><i>Expand for quick copy-pastable install commands...</i> &nbsp; ⤵️</summary>
 <br/>
 <br/>
-<pre lang="bash"><code style="white-space: pre-line">mkdir ~/archivebox; cd ~/archivebox    # create a dir somewhere for your archivebox data
-<br/>
-# Option A: Get ArchiveBox with Docker Compose (recommended):
-curl -sSL 'https://docker-compose.archivebox.io' > docker-compose.yml   # edit options in this file as-needed
+<pre lang="bash"><code style="white-space: pre-line"># Option A: Get ArchiveBox with Docker Compose (recommended):
+mkdir -p ~/archivebox/data && cd ~/archivebox
+curl -fsSL 'https://docker-compose.archivebox.io' > docker-compose.yml   # edit options in this file as-needed
 docker compose run archivebox init --setup
 docker compose run archivebox init --setup
 # docker compose run archivebox add 'https://example.com'
 # docker compose run archivebox add 'https://example.com'
 # docker compose run archivebox help
 # docker compose run archivebox help
@@ -81,6 +82,7 @@ docker compose run archivebox init --setup
 <br/>
 <br/>
 <br/>
 <br/>
 # Option B: Or use it as a plain Docker container:
 # Option B: Or use it as a plain Docker container:
+mkdir -p ~/archivebox/data && cd ~/archivebox/data
 docker run -it -v $PWD:/data archivebox/archivebox init --setup
 docker run -it -v $PWD:/data archivebox/archivebox init --setup
 # docker run -it -v $PWD:/data archivebox/archivebox add 'https://example.com'
 # docker run -it -v $PWD:/data archivebox/archivebox add 'https://example.com'
 # docker run -it -v $PWD:/data archivebox/archivebox help
 # docker run -it -v $PWD:/data archivebox/archivebox help
@@ -89,6 +91,7 @@ docker run -it -v $PWD:/data archivebox/archivebox init --setup
 <br/>
 <br/>
 # Option C: Or install it with your preferred pkg manager (see Quickstart below for apt, brew, and more)
 # Option C: Or install it with your preferred pkg manager (see Quickstart below for apt, brew, and more)
 pip install archivebox
 pip install archivebox
+mkdir -p ~/archivebox/data && cd ~/archivebox/data
 archivebox init --setup
 archivebox init --setup
 # archivebox add 'https://example.com'
 # archivebox add 'https://example.com'
 # archivebox help
 # archivebox help
@@ -96,7 +99,7 @@ archivebox init --setup
 <br/>
 <br/>
 <br/>
 <br/>
 # Option D: Or use the optional auto setup script to install it
 # Option D: Or use the optional auto setup script to install it
-curl -sSL 'https://get.archivebox.io' | sh
+curl -fsSL 'https://get.archivebox.io' | sh
 </code></pre>
 </code></pre>
 <br/>
 <br/>
 <sub>Open <a href="http://localhost:8000"><code>http://localhost:8000</code></a> to see your server's Web UI ➡️</sub>
 <sub>Open <a href="http://localhost:8000"><code>http://localhost:8000</code></a> to see your server's Web UI ➡️</sub>
@@ -180,9 +183,9 @@ ArchiveBox is free for everyone to self-host, but we also provide support, secur
 <ol>
 <ol>
 <li>Install <a href="https://docs.docker.com/get-docker/">Docker</a> on your system (if not already installed).</li>
 <li>Install <a href="https://docs.docker.com/get-docker/">Docker</a> on your system (if not already installed).</li>
 <li>Download the <a href="https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/dev/docker-compose.yml" download><code>docker-compose.yml</code></a> file into a new empty directory (can be anywhere).
 <li>Download the <a href="https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/dev/docker-compose.yml" download><code>docker-compose.yml</code></a> file into a new empty directory (can be anywhere).
-<pre lang="bash"><code style="white-space: pre-line">mkdir ~/archivebox && cd ~/archivebox
+<pre lang="bash"><code style="white-space: pre-line">mkdir -p ~/archivebox/data && cd ~/archivebox
 # Read and edit docker-compose.yml options as-needed after downloading
 # Read and edit docker-compose.yml options as-needed after downloading
-curl -sSL 'https://docker-compose.archivebox.io' > docker-compose.yml
+curl -fsSL 'https://docker-compose.archivebox.io' > docker-compose.yml
 </code></pre></li>
 </code></pre></li>
 <li>Run the initial setup to create an admin user (or set ADMIN_USER/PASS in docker-compose.yml)
 <li>Run the initial setup to create an admin user (or set ADMIN_USER/PASS in docker-compose.yml)
 <pre lang="bash"><code style="white-space: pre-line">docker compose run archivebox init --setup
 <pre lang="bash"><code style="white-space: pre-line">docker compose run archivebox init --setup
@@ -190,7 +193,7 @@ curl -sSL 'https://docker-compose.archivebox.io' > docker-compose.yml
 <li>Next steps: Start the server then login to the Web UI <a href="http://127.0.0.1:8000">http://127.0.0.1:8000</a> ⇢ Admin.
 <li>Next steps: Start the server then login to the Web UI <a href="http://127.0.0.1:8000">http://127.0.0.1:8000</a> ⇢ Admin.
 <pre lang="bash"><code style="white-space: pre-line">docker compose up
 <pre lang="bash"><code style="white-space: pre-line">docker compose up
 # completely optional, CLI can always be used without running a server
 # completely optional, CLI can always be used without running a server
-# docker compose run [-T] archivebox [subcommand] [--args]
+# docker compose run [-T] archivebox [subcommand] [--help]
 docker compose run archivebox add 'https://example.com'
 docker compose run archivebox add 'https://example.com'
 docker compose run archivebox help
 docker compose run archivebox help
 </code></pre></li>
 </code></pre></li>
@@ -206,14 +209,14 @@ See <a href="#%EF%B8%8F-cli-usage">below</a> for more usage examples using the C
 <ol>
 <ol>
 <li>Install <a href="https://docs.docker.com/get-docker/">Docker</a> on your system (if not already installed).</li>
 <li>Install <a href="https://docs.docker.com/get-docker/">Docker</a> on your system (if not already installed).</li>
 <li>Create a new empty directory and initialize your collection (can be anywhere).
 <li>Create a new empty directory and initialize your collection (can be anywhere).
-<pre lang="bash"><code style="white-space: pre-line">mkdir ~/archivebox && cd ~/archivebox
+<pre lang="bash"><code style="white-space: pre-line">mkdir -p ~/archivebox/data && cd ~/archivebox/data
 docker run -v $PWD:/data -it archivebox/archivebox init --setup
 docker run -v $PWD:/data -it archivebox/archivebox init --setup
 </code></pre>
 </code></pre>
 </li>
 </li>
 <li>Optional: Start the server then login to the Web UI <a href="http://127.0.0.1:8000">http://127.0.0.1:8000</a> ⇢ Admin.
 <li>Optional: Start the server then login to the Web UI <a href="http://127.0.0.1:8000">http://127.0.0.1:8000</a> ⇢ Admin.
 <pre lang="bash"><code style="white-space: pre-line">docker run -v $PWD:/data -p 8000:8000 archivebox/archivebox
 <pre lang="bash"><code style="white-space: pre-line">docker run -v $PWD:/data -p 8000:8000 archivebox/archivebox
 # completely optional, CLI can always be used without running a server
 # completely optional, CLI can always be used without running a server
-# docker run -v $PWD:/data -it [subcommand] [--args]
+# docker run -v $PWD:/data -it archivebox/archivebox [subcommand] [--help]
 docker run -v $PWD:/data -it archivebox/archivebox help
 docker run -v $PWD:/data -it archivebox/archivebox help
 </code></pre>
 </code></pre>
 </li>
 </li>
@@ -229,7 +232,7 @@ See <a href="#%EF%B8%8F-cli-usage">below</a> for more usage examples using the C
 <ol>
 <ol>
 <li>Install <a href="https://docs.docker.com/get-docker/">Docker</a> on your system (optional, highly recommended but not required).</li>
 <li>Install <a href="https://docs.docker.com/get-docker/">Docker</a> on your system (optional, highly recommended but not required).</li>
 <li>Run the automatic setup script.
 <li>Run the automatic setup script.
-<pre lang="bash"><code style="white-space: pre-line">curl -sSL 'https://get.archivebox.io' | sh</code></pre>
+<pre lang="bash"><code style="white-space: pre-line">curl -fsSL 'https://get.archivebox.io' | sh</code></pre>
 </li>
 </li>
 </ol>
 </ol>
 
 
@@ -254,25 +257,30 @@ See <a href="https://docs.sweeting.me/s/against-curl-sh">"Against curl | sh as a
 <li>Install <a href="https://realpython.com/installing-python/">Python >= v3.10</a> and <a href="https://nodejs.org/en/download/package-manager/">Node >= v18</a> on your system (if not already installed).</li>
 <li>Install <a href="https://realpython.com/installing-python/">Python >= v3.10</a> and <a href="https://nodejs.org/en/download/package-manager/">Node >= v18</a> on your system (if not already installed).</li>
 <li>Install the ArchiveBox package using <code>pip3</code> (or <a href="https://pipx.pypa.io"><code>pipx</code></a>).
 <li>Install the ArchiveBox package using <code>pip3</code> (or <a href="https://pipx.pypa.io"><code>pipx</code></a>).
 <pre lang="bash"><code style="white-space: pre-line">pip3 install archivebox
 <pre lang="bash"><code style="white-space: pre-line">pip3 install archivebox
+archivebox version
+# install any missing extras shown using apt/brew/pkg/etc.
+#    python@3.10 node curl wget git ripgrep ...
 </code></pre>
 </code></pre>
+<i>See the <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Install">Install: Bare Metal</a> Wiki for full install instructions for each OS...</i>
 </li>
 </li>
 <li>Create a new empty directory and initialize your collection (can be anywhere).
 <li>Create a new empty directory and initialize your collection (can be anywhere).
-<pre lang="bash"><code style="white-space: pre-line">mkdir ~/archivebox && cd ~/archivebox
-archivebox init --setup
-# install any missing extras like wget/git/ripgrep/etc. manually as needed
+<pre lang="bash"><code style="white-space: pre-line">mkdir -p ~/archivebox/data && cd ~/archivebox/data   # for example
+archivebox init --setup   # initialize a new collection
+# (--setup auto-installs and links JS dependencies: singlefile, readability, etc.)
 </code></pre>
 </code></pre>
 </li>
 </li>
 <li>Optional: Start the server then login to the Web UI <a href="http://127.0.0.1:8000">http://127.0.0.1:8000</a> ⇢ Admin.
 <li>Optional: Start the server then login to the Web UI <a href="http://127.0.0.1:8000">http://127.0.0.1:8000</a> ⇢ Admin.
 <pre lang="bash"><code style="white-space: pre-line">archivebox server 0.0.0.0:8000
 <pre lang="bash"><code style="white-space: pre-line">archivebox server 0.0.0.0:8000
 # completely optional, CLI can always be used without running a server
 # completely optional, CLI can always be used without running a server
-# archivebox [subcommand] [--args]
+# archivebox [subcommand] [--help]
 archivebox help
 archivebox help
 </code></pre>
 </code></pre>
 </li>
 </li>
 </ol>
 </ol>
 
 
 See <a href="#%EF%B8%8F-cli-usage">below</a> for more usage examples using the CLI, Web UI, or filesystem/SQL/Python to manage your archive.<br/>
 See <a href="#%EF%B8%8F-cli-usage">below</a> for more usage examples using the CLI, Web UI, or filesystem/SQL/Python to manage your archive.<br/>
-See the <a href="https://github.com/ArchiveBox/pip-archivebox"><code>pip-archivebox</code></a> repo for more details about this distribution.
+<br/>
+<small>See the <a href="https://github.com/ArchiveBox/pip-archivebox"><code>pip-archivebox</code></a> repo for more details about this distribution.</small>
 <br/><br/>
 <br/><br/>
 </details>
 </details>
 
 
@@ -293,15 +301,15 @@ sudo python3 -m pip install --upgrade --ignore-installed archivebox   # pip need
 </code></pre>
 </code></pre>
 </li>
 </li>
 <li>Create a new empty directory and initialize your collection (can be anywhere).
 <li>Create a new empty directory and initialize your collection (can be anywhere).
-<pre lang="bash"><code style="white-space: pre-line">mkdir ~/archivebox && cd ~/archivebox
+<pre lang="bash"><code style="white-space: pre-line">mkdir -p ~/archivebox/data && cd ~/archivebox/data
 archivebox init --setup           # if any problems, install with pip instead
 archivebox init --setup           # if any problems, install with pip instead
 </code></pre>
 </code></pre>
-<i>Note: If you encounter issues with NPM/NodeJS, <a href="https://github.com/nodesource/distributions#debinstall">install a more recent version</a>.</i><br/><br/>
+<i>Note: If you encounter issues or want more granular instructions, see the <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Install#option-c-bare-metal-setup">Install: Bare Metal</a> Wiki.</i><br/><br/>
 </li>
 </li>
 <li>Optional: Start the server then login to the Web UI <a href="http://127.0.0.1:8000">http://127.0.0.1:8000</a> ⇢ Admin.
 <li>Optional: Start the server then login to the Web UI <a href="http://127.0.0.1:8000">http://127.0.0.1:8000</a> ⇢ Admin.
 <pre lang="bash"><code style="white-space: pre-line">archivebox server 0.0.0.0:8000
 <pre lang="bash"><code style="white-space: pre-line">archivebox server 0.0.0.0:8000
 # completely optional, CLI can always be used without running a server
 # completely optional, CLI can always be used without running a server
-# archivebox [subcommand] [--args]
+# archivebox [subcommand] [--help]
 archivebox help
 archivebox help
 </code></pre>
 </code></pre>
 </li>
 </li>
@@ -321,18 +329,19 @@ See the <a href="https://github.com/ArchiveBox/debian-archivebox"><code>debian-a
 <pre lang="bash"><code style="white-space: pre-line">brew tap archivebox/archivebox
 <pre lang="bash"><code style="white-space: pre-line">brew tap archivebox/archivebox
 brew install archivebox
 brew install archivebox
 </code></pre>
 </code></pre>
+<i>See the <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Install#option-c-bare-metal-setup">Install: Bare Metal</a> Wiki for more granular instructions for macOS... ➡️</i>
 </li>
 </li>
 <li>Create a new empty directory and initialize your collection (can be anywhere).
 <li>Create a new empty directory and initialize your collection (can be anywhere).
-<pre lang="bash"><code style="white-space: pre-line">mkdir ~/archivebox && cd ~/archivebox
+<pre lang="bash"><code style="white-space: pre-line">mkdir -p ~/archivebox/data && cd ~/archivebox/data
 archivebox init --setup         # if any problems, install with pip instead
 archivebox init --setup         # if any problems, install with pip instead
 </code></pre>
 </code></pre>
 </li>
 </li>
 <li>Optional: Start the server then login to the Web UI <a href="http://127.0.0.1:8000">http://127.0.0.1:8000</a> ⇢ Admin.
 <li>Optional: Start the server then login to the Web UI <a href="http://127.0.0.1:8000">http://127.0.0.1:8000</a> ⇢ Admin.
 <pre lang="bash"><code style="white-space: pre-line">archivebox server 0.0.0.0:8000
 <pre lang="bash"><code style="white-space: pre-line">archivebox server 0.0.0.0:8000
 # completely optional, CLI can always be used without running a server
 # completely optional, CLI can always be used without running a server
-# archivebox [subcommand] [--args]
+# archivebox [subcommand] [--help]
 archivebox help
 archivebox help
-</code></pre>
+</code></pre><br/>
 </li>
 </li>
 </ol>
 </ol>
 
 
@@ -349,7 +358,7 @@ See the <a href="https://github.com/ArchiveBox/homebrew-archivebox"><code>homebr
 
 
 <ul>
 <ul>
 <li>Arch: <a href="https://aur.archlinux.org/packages/archivebox/"><code>yay -S archivebox</code></a> (contributed by <a href="https://github.com/imlonghao"><code>@imlonghao</code></a>)</li>
 <li>Arch: <a href="https://aur.archlinux.org/packages/archivebox/"><code>yay -S archivebox</code></a> (contributed by <a href="https://github.com/imlonghao"><code>@imlonghao</code></a>)</li>
-<li>FreeBSD: <a href="https://github.com/ArchiveBox/ArchiveBox#%EF%B8%8F-easy-setup"><code>curl -sSL 'https://get.archivebox.io' | sh</code></a> (uses <code>pkg</code> + <code>pip3</code> under-the-hood)</li>
+<li>FreeBSD: <a href="https://github.com/ArchiveBox/ArchiveBox#%EF%B8%8F-easy-setup"><code>curl -fsSL 'https://get.archivebox.io' | sh</code></a> (uses <code>pkg</code> + <code>pip3</code> under-the-hood)</li>
 <li>Nix: <a href="https://github.com/NixOS/nixpkgs/blob/master/pkgs/applications/misc/archivebox/default.nix"><code>nix-env --install archivebox</code></a> (contributed by <a href="https://github.com/siraben"><code>@siraben</code></a>)</li>
 <li>Nix: <a href="https://github.com/NixOS/nixpkgs/blob/master/pkgs/applications/misc/archivebox/default.nix"><code>nix-env --install archivebox</code></a> (contributed by <a href="https://github.com/siraben"><code>@siraben</code></a>)</li>
 <li>Guix: <a href="https://packages.guix.gnu.org/packages/archivebox/"><code>guix install archivebox</code></a> (contributed by <a href="https://github.com/rakino"><code>@rakino</code></a>)</li>
 <li>Guix: <a href="https://packages.guix.gnu.org/packages/archivebox/"><code>guix install archivebox</code></a> (contributed by <a href="https://github.com/rakino"><code>@rakino</code></a>)</li>
 <li>More: <a href="https://github.com/ArchiveBox/ArchiveBox/issues/new"><i>contribute another distribution...!</i></a></li>
 <li>More: <a href="https://github.com/ArchiveBox/ArchiveBox/issues/new"><i>contribute another distribution...!</i></a></li>
@@ -458,14 +467,15 @@ ArchiveBox commands can be run in a terminal directly on your host, or via Docke
 mkdir -p ~/archivebox/data   # create a new data dir anywhere
 mkdir -p ~/archivebox/data   # create a new data dir anywhere
 cd ~/archivebox/data         # IMPORTANT: cd into the directory
 cd ~/archivebox/data         # IMPORTANT: cd into the directory
 
 
-# archivebox [subcommand] [--args]
+# archivebox [subcommand] [--help]
+archivebox version
 archivebox help
 archivebox help
 
 
-# equivalent: docker compose run archivebox [subcommand [--args]
+# equivalent: docker compose run archivebox [subcommand] [--help]
 docker compose run archivebox help
 docker compose run archivebox help
 
 
-# equivalent: docker run -it -v $PWD:/data archivebox/archivebox [subcommand [--args]
- docker run -it -v $PWD:/data archivebox/archivebox help
+# equivalent: docker run -it -v $PWD:/data archivebox/archivebox [subcommand] [--help]
+docker run -it -v $PWD:/data archivebox/archivebox help
 ```
 ```
 
 
 #### ArchiveBox Subcommands
 #### ArchiveBox Subcommands
@@ -482,7 +492,7 @@ docker compose run archivebox help
 <pre lang="bash"><code style="white-space: pre-line">
 <pre lang="bash"><code style="white-space: pre-line">
 # make sure you have pip-installed ArchiveBox and it's available in your $PATH first  
 # make sure you have pip-installed ArchiveBox and it's available in your $PATH first  
 <br/>
 <br/>
-# archivebox [subcommand] [--args]
+# archivebox [subcommand] [--help]
 archivebox init --setup      # safe to run init multiple times (also how you update versions)
 archivebox init --setup      # safe to run init multiple times (also how you update versions)
 archivebox version           # get archivebox version info + check dependencies
 archivebox version           # get archivebox version info + check dependencies
 archivebox help              # get list of archivebox subcommands that can be run
 archivebox help              # get list of archivebox subcommands that can be run
@@ -498,7 +508,7 @@ archivebox add --depth=1 'https://news.ycombinator.com'
 <pre lang="bash"><code style="white-space: pre-line">
 <pre lang="bash"><code style="white-space: pre-line">
 # make sure you have `docker-compose.yml` from the Quickstart instructions first
 # make sure you have `docker-compose.yml` from the Quickstart instructions first
 <br/>
 <br/>
-# docker compose run archivebox [subcommand [--args]
+# docker compose run archivebox [subcommand] [--help]
 docker compose run archivebox init --setup
 docker compose run archivebox init --setup
 docker compose run archivebox version
 docker compose run archivebox version
 docker compose run archivebox help
 docker compose run archivebox help
@@ -515,7 +525,7 @@ docker compose run archivebox add --depth=1 'https://news.ycombinator.com'
 <pre lang="bash"><code style="white-space: pre-line">
 <pre lang="bash"><code style="white-space: pre-line">
 # make sure you create and cd into a new empty directory first  
 # make sure you create and cd into a new empty directory first  
 <br/>
 <br/>
-# docker run -it -v $PWD:/data archivebox/archivebox [subcommand [--args]
+# docker run -it -v $PWD:/data archivebox/archivebox [subcommand] [--help]
 docker run -v $PWD:/data -it archivebox/archivebox init --setup
 docker run -v $PWD:/data -it archivebox/archivebox init --setup
 docker run -v $PWD:/data -it archivebox/archivebox version
 docker run -v $PWD:/data -it archivebox/archivebox version
 docker run -v $PWD:/data -it archivebox/archivebox help
 docker run -v $PWD:/data -it archivebox/archivebox help
@@ -675,7 +685,7 @@ It uses all available methods out-of-the-box, but you can disable extractors and
 <summary><i>Expand to see the full list of ways it saves each page...</i></summary>
 <summary><i>Expand to see the full list of ways it saves each page...</i></summary>
 
 
 
 
-<code>./archive/{Snapshot.id}/</code><br/>
+<code>data/archive/{Snapshot.id}/</code><br/>
 <ul>
 <ul>
 <li><strong>Index:</strong> <code>index.html</code> &amp; <code>index.json</code> HTML and JSON index files containing metadata and details</li>
 <li><strong>Index:</strong> <code>index.html</code> &amp; <code>index.json</code> HTML and JSON index files containing metadata and details</li>
 <li><strong>Title</strong>, <strong>Favicon</strong>, <strong>Headers</strong> Response headers, site favicon, and parsed site title</li>
 <li><strong>Title</strong>, <strong>Favicon</strong>, <strong>Headers</strong> Response headers, site favicon, and parsed site title</li>
@@ -806,18 +816,18 @@ All of ArchiveBox's state (SQLite DB, content, config, logs, etc.) is stored in
 <details>
 <details>
 <summary><i>Expand to learn more about the layout of Archivebox's data on-disk...</i></summary><br/>
 <summary><i>Expand to learn more about the layout of Archivebox's data on-disk...</i></summary><br/>
 
 
-Data folders can be created anywhere (`~/archivebox` or `$PWD/data` as seen in our examples), and you can create as many data folders as you want to hold different collections.
+Data folders can be created anywhere (`~/archivebox/data` or `$PWD/data` as seen in our examples), and you can create as many data folders as you want to hold different collections.
 All <code>archivebox</code> CLI commands are designed to be run from inside an ArchiveBox data folder, starting with <code>archivebox init</code> to initialize a new collection inside an empty directory.
 All <code>archivebox</code> CLI commands are designed to be run from inside an ArchiveBox data folder, starting with <code>archivebox init</code> to initialize a new collection inside an empty directory.
 
 
-<pre lang="bash"><code style="white-space: pre-line">mkdir ~/archivebox && cd ~/archivebox   # just an example, can be anywhere
+<pre lang="bash"><code style="white-space: pre-line">mkdir -p ~/archivebox/data && cd ~/archivebox/data   # just an example, can be anywhere
 archivebox init</code></pre>
 archivebox init</code></pre>
 
 
-The on-disk layout is optimized to be easy to browse by hand and durable long-term. The main index is a standard <code>index.sqlite3</code> database in the root of the data folder (it can also be <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Publishing-Your-Archive#2-export-and-host-it-as-static-html">exported as static JSON/HTML</a>), and the archive snapshots are organized by date-added timestamp in the <code>./archive/</code> subfolder.
+The on-disk layout is optimized to be easy to browse by hand and durable long-term. The main index is a standard <code>index.sqlite3</code> database in the root of the data folder (it can also be <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Publishing-Your-Archive#2-export-and-host-it-as-static-html">exported as static JSON/HTML</a>), and the archive snapshots are organized by date-added timestamp in the <code>data/archive/</code> subfolder.
 
 
 <img src="https://user-images.githubusercontent.com/511499/117453293-c7b91600-af12-11eb-8a3f-aa48b0f9da3c.png" width="400px" align="right" style="float: right"/>
 <img src="https://user-images.githubusercontent.com/511499/117453293-c7b91600-af12-11eb-8a3f-aa48b0f9da3c.png" width="400px" align="right" style="float: right"/>
 
 
 
 
-<pre lang="bash"><code style="white-space: pre-line">/data/
+<pre lang="bash"><code style="white-space: pre-line">data/
     index.sqlite3
     index.sqlite3
     ArchiveBox.conf
     ArchiveBox.conf
     archive/
     archive/
@@ -832,7 +842,7 @@ The on-disk layout is optimized to be easy to browse by hand and durable long-te
             ...
             ...
 </code></pre>
 </code></pre>
 
 
-Each snapshot subfolder <code>./archive/TIMESTAMP/</code> includes a static <code>index.json</code> and <code>index.html</code> describing its contents, and the snapshot extractor outputs are plain files within the folder.
+Each snapshot subfolder <code>data/archive/TIMESTAMP/</code> includes a static <code>index.json</code> and <code>index.html</code> describing its contents, and the snapshot extractor outputs are plain files within the folder.
 
 
 <h4>Learn More</h4>
 <h4>Learn More</h4>
 <ul>
 <ul>
@@ -1046,9 +1056,9 @@ Because ArchiveBox is designed to ingest a large volume of URLs with multiple co
 
 
 Disk usage can be reduced by using a compressed/deduplicated filesystem like ZFS/BTRFS, or by turning off extractor methods you don't need. You can also deduplicate content with a tool like [fdupes](https://github.com/adrianlopezroche/fdupes) or [rdfind](https://github.com/pauldreik/rdfind).  
 Disk usage can be reduced by using a compressed/deduplicated filesystem like ZFS/BTRFS, or by turning off extractor methods you don't need. You can also deduplicate content with a tool like [fdupes](https://github.com/adrianlopezroche/fdupes) or [rdfind](https://github.com/pauldreik/rdfind).  
 
 
-**Don't store large collections on older filesystems like EXT3/FAT** as they may not be able to handle more than 50k directory entries in the `archive/` folder.
+**Don't store large collections on older filesystems like EXT3/FAT** as they may not be able to handle more than 50k directory entries in the `data/archive/` folder.
 
 
-**Try to keep the `index.sqlite3` file on local drive (not a network mount)** or SSD for maximum performance, however the `archive/` folder can be on a network mount or slower HDD.
+**Try to keep the `data/index.sqlite3` file on a local drive (not a network mount)** or SSD for maximum performance; however, the `data/archive/` folder can be on a network mount or slower HDD.
 
 
 If using Docker or NFS/SMB/FUSE for the `data/archive/` folder, you may need to set [`PUID` & `PGID`](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#puid--pgid) and [disable `root_squash`](https://github.com/ArchiveBox/ArchiveBox/issues/1304) on your fileshare server.
 If using Docker or NFS/SMB/FUSE for the `data/archive/` folder, you may need to set [`PUID` & `PGID`](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#puid--pgid) and [disable `root_squash`](https://github.com/ArchiveBox/ArchiveBox/issues/1304) on your fileshare server.
 
 
@@ -1439,7 +1449,7 @@ https://stackoverflow.com/questions/1074212/how-can-i-see-the-raw-sql-queries-dj
 
 
 ArchiveBox [`extractors`](https://github.com/ArchiveBox/ArchiveBox/blob/dev/archivebox/extractors/media.py) are external binaries or Python/Node scripts that ArchiveBox runs to archive content on a page.
 ArchiveBox [`extractors`](https://github.com/ArchiveBox/ArchiveBox/blob/dev/archivebox/extractors/media.py) are external binaries or Python/Node scripts that ArchiveBox runs to archive content on a page.
 
 
-Extractors take the URL of a page to archive, write their output to the filesystem `archive/TIMESTAMP/EXTRACTOR/...`, and return an [`ArchiveResult`](https://github.com/ArchiveBox/ArchiveBox/blob/dev/archivebox/core/models.py#:~:text=return%20qs-,class%20ArchiveResult,-(models.Model)%3A) entry which is saved to the database (visible on the `Log` page in the UI).
+Extractors take the URL of a page to archive, write their output to the filesystem `data/archive/TIMESTAMP/EXTRACTOR/...`, and return an [`ArchiveResult`](https://github.com/ArchiveBox/ArchiveBox/blob/dev/archivebox/core/models.py#:~:text=return%20qs-,class%20ArchiveResult,-(models.Model)%3A) entry which is saved to the database (visible on the `Log` page in the UI).
 
 
 *Check out how we added **[`archivebox/extractors/singlefile.py`](https://github.com/ArchiveBox/ArchiveBox/blob/dev/archivebox/extractors/singlefile.py)** as an example of the process: [Issue #399](https://github.com/ArchiveBox/ArchiveBox/issues/399) + [PR #403](https://github.com/ArchiveBox/ArchiveBox/pull/403).*
 *Check out how we added **[`archivebox/extractors/singlefile.py`](https://github.com/ArchiveBox/ArchiveBox/blob/dev/archivebox/extractors/singlefile.py)** as an example of the process: [Issue #399](https://github.com/ArchiveBox/ArchiveBox/issues/399) + [PR #403](https://github.com/ArchiveBox/ArchiveBox/pull/403).*
 
 

+ 20 - 12
archivebox/config.py

@@ -154,7 +154,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
         'CHROME_SANDBOX':           {'type': bool,  'default': lambda c: not c['IN_DOCKER']},
         'CHROME_SANDBOX':           {'type': bool,  'default': lambda c: not c['IN_DOCKER']},
         'YOUTUBEDL_ARGS':           {'type': list,  'default': lambda c: [
         'YOUTUBEDL_ARGS':           {'type': list,  'default': lambda c: [
                                                                 '--restrict-filenames',
                                                                 '--restrict-filenames',
-                                                                '--trim-filenames',
+                                                                '--trim-filenames', '128',
                                                                 '--write-description',
                                                                 '--write-description',
                                                                 '--write-info-json',
                                                                 '--write-info-json',
                                                                 '--write-annotations',
                                                                 '--write-annotations',
@@ -366,24 +366,32 @@ ALLOWDENYLIST_REGEX_FLAGS: int = re.IGNORECASE | re.UNICODE | re.MULTILINE
 
 
 ############################## Version Config ##################################
 ############################## Version Config ##################################
 
 
-def get_system_user():
-    SYSTEM_USER = getpass.getuser() or os.getlogin()
+def get_system_user() -> str:
+    # some host OS's are unable to provide a username (k3s, Windows), making this complicated
+    # uid 999 is especially problematic and breaks many attempts
+    SYSTEM_USER = None
+    FALLBACK_USER_PLACHOLDER = f'user_{os.getuid()}'
+
+    # Option 1
     try:
     try:
         import pwd
         import pwd
-        return pwd.getpwuid(os.geteuid()).pw_name or SYSTEM_USER
-    except KeyError:
-        # Process' UID might not map to a user in cases such as running the Docker image
-        # (where `archivebox` is 999) as a different UID.
+        SYSTEM_USER = SYSTEM_USER or pwd.getpwuid(os.geteuid()).pw_name
+    except (ModuleNotFoundError, Exception):
         pass
         pass
-    except ModuleNotFoundError:
-        # pwd doesn't exist on windows
+
+    # Option 2
+    try:
+        SYSTEM_USER = SYSTEM_USER or getpass.getuser()
+    except Exception:
         pass
         pass
+
+    # Option 3
+    try:
+        SYSTEM_USER = SYSTEM_USER or os.getlogin()
     except Exception:
     except Exception:
-        # this should never happen, uncomment to debug
-        # raise
         pass
         pass
 
 
-    return SYSTEM_USER
+    return SYSTEM_USER or FALLBACK_USER_PLACHOLDER
 
 
 def get_version(config):
 def get_version(config):
     try:
     try:

+ 17 - 13
archivebox/system.py

@@ -146,20 +146,24 @@ def get_dir_size(path: Union[str, Path], recursive: bool=True, pattern: Optional
        recursively and limiting to a given filter list
        recursively and limiting to a given filter list
     """
     """
     num_bytes, num_dirs, num_files = 0, 0, 0
     num_bytes, num_dirs, num_files = 0, 0, 0
-    for entry in os.scandir(path):
-        if (pattern is not None) and (pattern not in entry.path):
-            continue
-        if entry.is_dir(follow_symlinks=False):
-            if not recursive:
+    try:
+        for entry in os.scandir(path):
+            if (pattern is not None) and (pattern not in entry.path):
                 continue
                 continue
-            num_dirs += 1
-            bytes_inside, dirs_inside, files_inside = get_dir_size(entry.path)
-            num_bytes += bytes_inside
-            num_dirs += dirs_inside
-            num_files += files_inside
-        else:
-            num_bytes += entry.stat(follow_symlinks=False).st_size
-            num_files += 1
+            if entry.is_dir(follow_symlinks=False):
+                if not recursive:
+                    continue
+                num_dirs += 1
+                bytes_inside, dirs_inside, files_inside = get_dir_size(entry.path)
+                num_bytes += bytes_inside
+                num_dirs += dirs_inside
+                num_files += files_inside
+            else:
+                num_bytes += entry.stat(follow_symlinks=False).st_size
+                num_files += 1
+    except OSError:
+        # e.g. FileNameTooLong or other error while trying to read dir
+        pass
     return num_bytes, num_dirs, num_files
     return num_bytes, num_dirs, num_files