
Merge pull request #680 from ArchiveBox/dev

Nick Sweeting 4 years ago
parent
commit
fe2b682b1f
74 changed files with 2061 additions and 767 deletions
  1. .github/ISSUE_TEMPLATE/bug_report.md  +2 -2
  2. .gitmodules  +3 -0
  3. Dockerfile  +17 -12
  4. README.md  +59 -29
  5. archivebox/cli/__init__.py  +5 -1
  6. archivebox/cli/archivebox_add.py  +12 -1
  7. archivebox/cli/archivebox_config.py  +4 -1
  8. archivebox/cli/archivebox_init.py  +6 -0
  9. archivebox/cli/archivebox_list.py  +11 -11
  10. archivebox/cli/archivebox_oneshot.py  +4 -1
  11. archivebox/cli/archivebox_remove.py  +4 -1
  12. archivebox/cli/archivebox_server.py  +13 -2
  13. archivebox/cli/archivebox_update.py  +7 -3
  14. archivebox/cli/tests.py  +227 -0
  15. archivebox/config.py  +62 -10
  16. archivebox/core/admin.py  +162 -71
  17. archivebox/core/forms.py  +2 -1
  18. archivebox/core/migrations/0009_auto_20210216_1038.py  +18 -0
  19. archivebox/core/migrations/0010_auto_20210216_1055.py  +18 -0
  20. archivebox/core/migrations/0011_auto_20210216_1331.py  +24 -0
  21. archivebox/core/migrations/0012_auto_20210216_1425.py  +23 -0
  22. archivebox/core/migrations/0013_auto_20210218_0729.py  +18 -0
  23. archivebox/core/migrations/0014_auto_20210218_0729.py  +18 -0
  24. archivebox/core/migrations/0015_auto_20210218_0730.py  +18 -0
  25. archivebox/core/migrations/0016_auto_20210218_1204.py  +18 -0
  26. archivebox/core/migrations/0017_auto_20210219_0211.py  +18 -0
  27. archivebox/core/migrations/0018_auto_20210327_0952.py  +23 -0
  28. archivebox/core/models.py  +90 -38
  29. archivebox/core/settings.py  +127 -1
  30. archivebox/core/urls.py  +43 -34
  31. archivebox/core/views.py  +161 -42
  32. archivebox/core/wsgi.py  +3 -3
  33. archivebox/extractors/__init__.py  +10 -3
  34. archivebox/extractors/archive_org.py  +1 -1
  35. archivebox/extractors/mercury.py  +11 -3
  36. archivebox/extractors/readability.py  +13 -5
  37. archivebox/index/__init__.py  +1 -0
  38. archivebox/index/html.py  +79 -72
  39. archivebox/index/json.py  +1 -2
  40. archivebox/index/schema.py  +10 -1
  41. archivebox/index/sql.py  +61 -21
  42. archivebox/logging_util.py  +64 -11
  43. archivebox/main.py  +156 -95
  44. archivebox/parsers/__init__.py  +45 -31
  45. archivebox/search/utils.py  +1 -1
  46. archivebox/system.py  +3 -2
  47. archivebox/templates/admin/actions_as_select.html  +0 -1
  48. archivebox/templates/admin/base.html  +6 -6
  49. archivebox/templates/core/add.html  +1 -1
  50. archivebox/templates/core/base.html  +2 -2
  51. archivebox/templates/core/index_row.html  +2 -2
  52. archivebox/templates/core/minimal_index.html  +1 -1
  53. archivebox/templates/core/public_index.html  +15 -10
  54. archivebox/templates/core/snapshot.html  +44 -18
  55. archivebox/templates/core/static_index.html  +2 -2
  56. archivebox/templates/static/add.css  +1 -1
  57. archivebox/templates/static/admin.css  +37 -0
  58. archivebox/templates/static/favicon.ico  BIN
  59. archivebox/templates/static/robots.txt  +2 -0
  60. archivebox/util.py  +3 -1
  61. archivebox/vendor/atomicwrites.py  +1 -0
  62. archivebox/vendor/python-atomicwrites  +1 -0
  63. bin/docker_entrypoint.sh  +5 -3
  64. docker-compose.yml  +23 -20
  65. package-lock.json  +142 -116
  66. package.json  +1 -1
  67. setup.py  +49 -40
  68. stdeb.cfg  +1 -1
  69. tests/test_add.py  +3 -3
  70. tests/test_extractors.py  +4 -4
  71. tests/test_init.py  +10 -10
  72. tests/test_list.py  +2 -2
  73. tests/test_remove.py  +14 -10
  74. uwsgi.ini  +13 -0

+ 2 - 2
.github/ISSUE_TEMPLATE/bug_report.md

@@ -1,8 +1,8 @@
 ---
 name: 🐞 Bug report
 about: Create a report to help us improve
-title: 'Bugfix: ...'
-labels: 'changes: bugfixes'
+title: 'Bug: ...'
+labels: 'bug'
 assignees: ''
 
 ---

+ 3 - 0
.gitmodules

@@ -23,3 +23,6 @@
 [submodule "archivebox/vendor/django-taggit"]
 	path = archivebox/vendor/django-taggit
 	url = https://github.com/jazzband/django-taggit
+[submodule "archivebox/vendor/python-atomicwrites"]
+	path = archivebox/vendor/python-atomicwrites
+	url = https://github.com/untitaker/python-atomicwrites

+ 17 - 12
Dockerfile

@@ -50,13 +50,6 @@ RUN apt-get update -qq \
         fontconfig fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-kacst fonts-symbola fonts-noto fonts-freefont-ttf \
     && rm -rf /var/lib/apt/lists/*
 
-# Install apt development dependencies
-# RUN apt-get install -qq \
-#     && apt-get install -qq -y --no-install-recommends \
-#         python3 python3-dev python3-pip python3-venv python3-all \
-#         dh-python debhelper devscripts dput software-properties-common \
-#         python3-distutils python3-setuptools python3-wheel python3-stdeb
-
 # Install Node environment
 RUN curl -s https://deb.nodesource.com/gpgkey/nodesource.gpg.key | apt-key add - \
     && echo 'deb https://deb.nodesource.com/node_15.x buster main' >> /etc/apt/sources.list \
@@ -79,17 +72,26 @@ WORKDIR "$CODE_DIR"
 ENV PATH="${PATH}:$VENV_PATH/bin"
 RUN python -m venv --clear --symlinks "$VENV_PATH" \
     && pip install --upgrade --quiet pip setuptools
-ADD ./pip_dist/archivebox.egg-info/requires.txt "$CODE_DIR/pip_dist/archivebox.egg-info/requires.txt"
+ADD "./setup.py" "$CODE_DIR/"
+ADD "./README.md" "./package.json" "$CODE_DIR/archivebox/"
 RUN apt-get update -qq \
     && apt-get install -qq -y --no-install-recommends \
         build-essential python-dev python3-dev \
-    # && pip install --upgrade pip \
-    && grep -B 1000 -E '^$' "$CODE_DIR/pip_dist/archivebox.egg-info/requires.txt" | pip install --quiet -r /dev/stdin \
-    && pip install --quiet "sonic-client==0.0.5" \
+    && python3 -c 'from distutils.core import run_setup; result = run_setup("./setup.py", stop_after="init"); print("\n".join(result.install_requires + result.extras_require["sonic"]))' > /tmp/requirements.txt \
+    && pip install --quiet -r /tmp/requirements.txt \
     && apt-get purge -y build-essential python-dev python3-dev \
     && apt-get autoremove -y \
     && rm -rf /var/lib/apt/lists/*
 
+# Install apt development dependencies
+# RUN apt-get install -qq \
+#     && apt-get install -qq -y --no-install-recommends \
+#         python3 python3-dev python3-pip python3-venv python3-all \
+#         dh-python debhelper devscripts dput software-properties-common \
+#         python3-distutils python3-setuptools python3-wheel python3-stdeb
+# RUN python3 -c 'from distutils.core import run_setup; result = run_setup("./setup.py", stop_after="init"); print("\n".join(result.extras_require["dev"]))' > /tmp/dev_requirements.txt \
+    # && pip install --quiet -r /tmp/dev_requirements.txt
+
 # Install ArchiveBox Python package and its dependencies
 WORKDIR "$CODE_DIR"
 ADD . "$CODE_DIR"
@@ -115,5 +117,8 @@ RUN /app/bin/docker_entrypoint.sh archivebox version
 VOLUME "$DATA_DIR"
 EXPOSE 8000
 
+HEALTHCHECK --interval=30s --timeout=20s --retries=15 \
+    CMD curl --silent 'http://localhost:8000/admin/login/' || exit 1
+
 ENTRYPOINT ["dumb-init", "--", "/app/bin/docker_entrypoint.sh"]
-CMD ["archivebox", "server", "0.0.0.0:8000"]
+CMD ["archivebox", "server", "--quick-init", "0.0.0.0:8000"]

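One notable change above: instead of ADDing a pre-built pip_dist/archivebox.egg-info/requires.txt, the image now derives its dependency list from setup.py at build time. A sketch of what that inline `python3 -c` one-liner does, expanded into a readable script (the "sonic" extra name comes from this repo's setup.py; run_setup is standard distutils behavior):

```python
# sketch: extract dependency pins from setup.py without building or installing it.
# run_setup(..., stop_after="init") parses setup.py and returns its Distribution
# object, so the declared dependency lists can be read straight off of it.
from distutils.core import run_setup

dist = run_setup("./setup.py", stop_after="init")

# install_requires holds the base deps; extras_require maps extra names
# (e.g. the "sonic" extra installed in the Dockerfile above) to optional deps
print("\n".join(dist.install_requires + dist.extras_require["sonic"]))
```

Piping that output to /tmp/requirements.txt lets pip install everything in one layer, after which the build-essential toolchain is purged to keep the image small.
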
+ 59 - 29
README.md

@@ -93,7 +93,7 @@ archivebox help
 
 ### Quickstart
 
-**🖥  Supported OSs:** Linux/BSD, macOS, Windows      **🎮  CPU Architectures:** x86, amd64, arm7, arm8 (raspi >=3)
+**🖥  Supported OSs:** Linux/BSD, macOS, Windows (w/ Docker)      **🎮  CPU Architectures:** x86, amd64, arm7, arm8 (raspi >=3)
 **📦  Distributions:** `docker`/`apt`/`brew`/`pip3`/`npm` (in order of completeness)
 
 *(click to expand your preferred **► `distribution`** below for full setup instructions)*
@@ -103,22 +103,29 @@ archivebox help
 
 <i>First make sure you have Docker installed: https://docs.docker.com/get-docker/</i>
 
+Download the [`docker-compose.yml`](https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/master/docker-compose.yml) file.
 <pre lang="bash"><code>
-# create a new empty directory and initalize your collection (can be anywhere)
-mkdir ~/archivebox && cd ~/archivebox
 curl -O 'https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/master/docker-compose.yml'
-docker-compose run archivebox init
-docker-compose run archivebox --version
+</code></pre>
 
-# start the webserver and open the UI (optional)
+Start the server.
+<pre lang="bash"><code>
+docker-compose run archivebox server --quick-init
 docker-compose run archivebox manage createsuperuser
-docker-compose up -d
-open 'http://127.0.0.1:8000'
+</code></pre>
 
+Open [`http://127.0.0.1:8000`](http://127.0.0.1:8000).
+
+<pre lang="bash"><code>
 # you can also add links and manage your archive via the CLI:
 docker-compose run archivebox add 'https://example.com'
+echo 'https://example.com' | docker-compose run archivebox -T add
 docker-compose run archivebox status
 docker-compose run archivebox help  # to see more options
+
+# when passing stdin/stdout via the cli, use the -T flag
+echo 'https://example.com' | docker-compose run -T archivebox add
+docker-compose run -T archivebox list --html --with-headers > index.html
 </code></pre>
 
 This is the recommended way to run ArchiveBox because it includes <i>all</i> the extractors like:<br/>
@@ -127,7 +134,7 @@ chrome, wget, youtube-dl, git, etc., full-text search w/ sonic, and many other g
 </details>
 
 <details>
-<summary><b>Get ArchiveBox with <code>docker</code> on any platform</b></summary>
+<summary><b>Get ArchiveBox with <code>docker</code> on macOS/Linux/Windows</b></summary>
 
 <i>First make sure you have Docker installed: https://docs.docker.com/get-docker/</i>
 
@@ -145,21 +152,30 @@ open http://127.0.0.1:8000
 docker run -v $PWD:/data -it archivebox/archivebox add 'https://example.com'
 docker run -v $PWD:/data -it archivebox/archivebox status
 docker run -v $PWD:/data -it archivebox/archivebox help  # to see more options
+
+# when passing stdin/stdout via the cli, use only -i (not -it)
+echo 'https://example.com' | docker run -v $PWD:/data -i archivebox/archivebox add
+docker run -v $PWD:/data -i archivebox/archivebox list --html --with-headers > index.html
 </code></pre>
 
 </details>
 
 <details>
-<summary><b>Get ArchiveBox with <code>apt</code> on Ubuntu >=20.04</b></summary>
+<summary><b>Get ArchiveBox with <code>apt</code> on Ubuntu/Debian</b></summary>
+
+This method should work on all Ubuntu/Debian based systems, including x86, amd64, arm7, and arm8 CPUs (e.g. Raspberry Pis >=3).
 
-<i>First make sure you're on Ubuntu >= 20.04, or scroll down for older/non-Ubuntu instructions.</i>
+If you're on Ubuntu >= 20.04, add the `apt` repository with `add-apt-repository`:
+<small><i>(on other Ubuntu/Debian-based systems follow the <b>♰ instructions</b> below)</i></small>
 
 <pre lang="bash"><code>
 # add the repo to your sources and install the archivebox package using apt
 sudo apt install software-properties-common
 sudo add-apt-repository -u ppa:archivebox/archivebox
 sudo apt install archivebox
+</code></pre>
 
+<pre lang="bash"><code>
 # create a new empty directory and initalize your collection (can be anywhere)
 mkdir ~/archivebox && cd ~/archivebox
 npm install --prefix . 'git+https://github.com/ArchiveBox/ArchiveBox.git'
@@ -178,21 +194,25 @@ archivebox list --json --with-headers > index.json
 archivebox help  # to see more options
 </code></pre>
 
-For other Debian-based systems or older Ubuntu systems you can add these sources to `/etc/apt/sources.list`:
+<i><b>♰ On other Ubuntu/Debian-based systems</b> add these sources directly to <code>/etc/apt/sources.list</code>:</i>
 
 <pre lang="bash"><code>
-deb http://ppa.launchpad.net/archivebox/archivebox/ubuntu focal main
-deb-src http://ppa.launchpad.net/archivebox/archivebox/ubuntu focal main
+echo "deb http://ppa.launchpad.net/archivebox/archivebox/ubuntu focal main" > /etc/apt/sources.list.d/archivebox.list
+echo "deb-src http://ppa.launchpad.net/archivebox/archivebox/ubuntu focal main" >> /etc/apt/sources.list.d/archivebox.list
+sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys C258F79DCC02E369
+sudo apt update
+sudo apt install archivebox
+sudo snap install chromium
+archivebox --version
+# then scroll back up and continue the initalization instructions above
 </code></pre>
 
-Then run `apt update; apt install archivebox; archivebox --version`.
-
 (you may need to install some other dependencies manually however)
 
 </details>
 
 <details>
-<summary><b>Get ArchiveBox with <code>brew</code> on macOS >=10.13</b></summary>
+<summary><b>Get ArchiveBox with <code>brew</code> on macOS</b></summary>
 
 <i>First make sure you have Homebrew installed: https://brew.sh/#install</i>
 
@@ -252,13 +272,12 @@ archivebox help  # to see more options
 
 No matter which install method you choose, they all roughly follow this 3-step process and all provide the same CLI, Web UI, and on-disk data format.
 
-<small>
-
-1. Install ArchiveBox: `apt/brew/pip3 install archivebox`
-2. Start a collection: `archivebox init`
-3. Start archiving: `archivebox add 'https://example.com'`
-
-</small>
+<small><ol>
+<li>Install ArchiveBox: <code>apt/brew/pip3 install archivebox</code></li>
+<li>Start a collection: <code>archivebox init</code></li>
+<li>Start archiving: <code>archivebox add 'https://example.com'</code></li>
+<li>View the archive: <code>archivebox server</code> or <code>archivebox list ...</code>, <code>ls ./archive/*/index.html</code></li>
+</ol></small>
 
 <br/>
 <div align="center">
@@ -307,8 +326,13 @@ archivebox add < ~/Downloads/firefox_bookmarks_export.html
 archivebox add < any_text_with_urls_in_it.txt
 archivebox add --depth=1 'https://example.com/some/downloads.html'
 archivebox add --depth=1 'https://news.ycombinator.com#2020-12-12'
-```
 
+# (if using docker add -i when passing via stdin)
+echo 'https://example.com' | docker run -v $PWD:/data -i archivebox/archivebox add
+
+# (if using docker-compose add -T when passing via stdin)
+echo 'https://example.com' | docker-compose run -T archivebox add
+```
 
 - <img src="https://nicksweeting.com/images/rss.svg" height="22px"/> TXT, RSS, XML, JSON, CSV, SQL, HTML, Markdown, or [any other text-based format...](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#Import-a-list-of-URLs-from-a-text-file)
 - <img src="https://nicksweeting.com/images/bookmarks.png" height="22px"/> [Browser history](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive) or [browser bookmarks](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive) (see instructions for: [Chrome](https://support.google.com/chrome/answer/96816?hl=en), [Firefox](https://support.mozilla.org/en-US/kb/export-firefox-bookmarks-to-backup-or-transfer), [Safari](http://i.imgur.com/AtcvUZA.png), [IE](https://support.microsoft.com/en-us/help/211089/how-to-import-and-export-the-internet-explorer-favorites-folder-to-a-32-bit-version-of-windows), [Opera](http://help.opera.com/Windows/12.10/en/importexport.html), [and more...](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive))
@@ -328,6 +352,8 @@ The on-disk layout is optimized to be easy to browse by hand and durable long-te
 # to browse your index statically without running the archivebox server, run:
 archivebox list --html --with-headers > index.html
 archivebox list --json --with-headers > index.json
+# if running these commands with docker-compose, add -T:
+# docker-compose run -T archivebox list ...
 
 # then open the static index in a browser
 open index.html
@@ -338,13 +364,13 @@ ls ./archive/<timestamp>/
 
 - **Index:** `index.html` & `index.json` HTML and JSON index files containing metadata and details
 - **Title**, **Favicon**, **Headers** Response headers, site favicon, and parsed site title
+- **SingleFile:** `singlefile.html` HTML snapshot rendered with headless Chrome using SingleFile
 - **Wget Clone:** `example.com/page-name.html` wget clone of the site with  `warc/<timestamp>.gz`
 - Chrome Headless
-  - **SingleFile:** `singlefile.html` HTML snapshot rendered with headless Chrome using SingleFile
   - **PDF:** `output.pdf` Printed PDF of site using headless chrome
   - **Screenshot:** `screenshot.png` 1440x900 screenshot of site using headless chrome
   - **DOM Dump:** `output.html` DOM Dump of the HTML after rendering using headless chrome
-  - **Readability:** `article.html/json` Article text extraction using Readability
+- **Article Text:** `article.html/json` Article text extraction using Readability & Mercury
 - **Archive.org Permalink:** `archive.org.txt` A link to the saved site on archive.org
 - **Audio & Video:** `media/` all audio/video files + playlists, including subtitles & metadata with youtube-dl
 - **Source Code:** `git/` clone of any repository found on github, bitbucket, or gitlab links
@@ -534,7 +560,8 @@ Whether you want to learn which organizations are the big players in the web arc
     _A collection of the most active internet archiving communities and initiatives._
 - Check out the ArchiveBox [Roadmap](https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap) and [Changelog](https://github.com/ArchiveBox/ArchiveBox/wiki/Changelog)
 - Learn why archiving the internet is important by reading the "[On the Importance of Web Archiving](https://parameters.ssrc.org/2018/09/on-the-importance-of-web-archiving/)" blog post.
-- Or reach out to me for questions and comments via [@ArchiveBoxApp](https://twitter.com/ArchiveBoxApp) or [@theSquashSH](https://twitter.com/thesquashSH) on Twitter.
+- Reach out to me for questions and comments via [@ArchiveBoxApp](https://twitter.com/ArchiveBoxApp) or [@theSquashSH](https://twitter.com/thesquashSH) on Twitter
+- Hire us to develop an internet archiving solution for you [@MonadicalSAS](https://twitter.com/MonadicalSAS) [Monadical.com](https://monadical.com)
 
 <br/>
 
@@ -719,7 +746,10 @@ archivebox manage dbshell
 <br/><br/>
 <img src="https://raw.githubusercontent.com/Monadical-SAS/redux-time/HEAD/examples/static/jeremy.jpg" height="40px"/>
 <br/>
-<sub><i>This project is maintained mostly in <a href="https://nicksweeting.com/blog#About">my spare time</a> with the help from generous contributors and Monadical.com.</i></sub>
+<i><sub>
+This project is maintained mostly in <a href="https://nicksweeting.com/blog#About">my spare time</a> with the help from generous contributors and Monadical (✨  <a href="https://monadical.com">hire them</a> for dev work!).
+</sub>
+</i>
 <br/><br/>
 
 <br/>

+ 5 - 1
archivebox/cli/__init__.py

@@ -63,7 +63,11 @@ def run_subcommand(subcommand: str,
 
     if subcommand not in meta_cmds:
         from ..config import setup_django
-        setup_django(in_memory_db=subcommand in fake_db, check_db=subcommand in archive_cmds)
+
+        cmd_requires_db = subcommand in archive_cmds
+        init_pending = '--init' in subcommand_args or '--quick-init' in subcommand_args
+
+        setup_django(in_memory_db=subcommand in fake_db, check_db=cmd_requires_db and not init_pending)
 
     module = import_module('.archivebox_{}'.format(subcommand), __package__)
     module.main(args=subcommand_args, stdin=stdin, pwd=pwd)    # type: ignore

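The effect of the new gating: subcommands that normally require an existing index database skip the check_db assertion when an `--init`/`--quick-init` flag is pending, so e.g. `archivebox server --quick-init` can start in a fresh, empty data directory. A tiny sketch of the same boolean logic in isolation:

```python
# sketch of the check_db gating for an archive_cmd like `server`
subcommand_args = ['--quick-init', '0.0.0.0:8000']

cmd_requires_db = True   # `server` is in archive_cmds
init_pending = '--init' in subcommand_args or '--quick-init' in subcommand_args

check_db = cmd_requires_db and not init_pending
print(check_db)  # False: the db-exists check is deferred, init will create the db
```
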
+ 12 - 1
archivebox/cli/archivebox_add.py

@@ -22,6 +22,12 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
         add_help=True,
         formatter_class=SmartFormatter,
     )
+    parser.add_argument(
+        '--tag', '-t',
+        type=str,
+        default='',
+        help="Tag the added URLs with the provided tags e.g. --tag=tag1,tag2,tag3",
+    )
     parser.add_argument(
         '--update-all', #'-n',
         action='store_true',
@@ -75,7 +81,11 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
     )
     command = parser.parse_args(args or ())
     urls = command.urls
-    stdin_urls = accept_stdin(stdin)
+
+    stdin_urls = ''
+    if not urls:
+        stdin_urls = accept_stdin(stdin)
+
     if (stdin_urls and urls) or (not stdin and not urls):
         stderr(
             '[X] You must pass URLs/paths to add via stdin or CLI arguments.\n',
@@ -85,6 +95,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
     add(
         urls=stdin_urls or urls,
         depth=command.depth,
+        tag=command.tag,
         update_all=command.update_all,
        index_only=command.index_only,
         overwrite=command.overwrite,

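A quick sketch of exercising the new `--tag` flag and the stdin guard by driving the CLI module directly, the same way the new archivebox/cli/tests.py does (assumes an installed archivebox and an initialized collection as the working directory):

```python
from archivebox.cli import archivebox_add

# tags are passed as a single comma-separated string, per the help text above
archivebox_add.main(['--tag=news,example', 'https://example.com'])

# URLs may come from stdin instead of argv, but not both at once:
# the guard above only reads stdin when no URL arguments were given
archivebox_add.main([], stdin='https://example.com')
```
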
+ 4 - 1
archivebox/cli/archivebox_config.py

@@ -45,7 +45,10 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
         help='KEY or KEY=VALUE formatted config values to get or set',
     )
     command = parser.parse_args(args or ())
-    config_options_str = accept_stdin(stdin)
+
+    config_options_str = ''
+    if not command.config_options:
+        config_options_str = accept_stdin(stdin)
 
     config(
         config_options_str=config_options_str,

+ 6 - 0
archivebox/cli/archivebox_init.py

@@ -27,11 +27,17 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
         action='store_true',
         help='Ignore unrecognized files in current directory and initialize anyway',
     )
+    parser.add_argument(
+        '--quick', '-q',
+        action='store_true',
+        help='Run any updates or migrations without rechecking all snapshot dirs',
+    )
     command = parser.parse_args(args or ())
     reject_stdin(__command__, stdin)
 
     init(
         force=command.force,
+        quick=command.quick,
         out_dir=pwd or OUTPUT_DIR,
     )
     

+ 11 - 11
archivebox/cli/archivebox_list.py

@@ -12,6 +12,7 @@ from ..main import list_all
 from ..util import docstring
 from ..config import OUTPUT_DIR
 from ..index import (
+    LINK_FILTERS,
     get_indexed_folders,
     get_archived_folders,
     get_unarchived_folders,
@@ -23,7 +24,7 @@ from ..index import (
     get_corrupted_folders,
     get_unrecognized_folders,
 )
-from ..logging_util import SmartFormatter, accept_stdin, stderr
+from ..logging_util import SmartFormatter, reject_stdin, stderr
 
 
 @docstring(list_all.__doc__)
@@ -44,7 +45,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
     group.add_argument(
         '--json', #'-j',
         action='store_true',
-        help="Print the output in JSON format with all columns included.",
+        help="Print the output in JSON format with all columns included",
     )
     group.add_argument(
         '--html',
@@ -59,19 +60,19 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
     parser.add_argument(
         '--sort', #'-s',
         type=str,
-        help="List the links sorted using the given key, e.g. timestamp or updated.",
+        help="List the links sorted using the given key, e.g. timestamp or updated",
         default=None,
     )
     parser.add_argument(
         '--before', #'-b',
         type=float,
-        help="List only links bookmarked before the given timestamp.",
+        help="List only links bookmarked before (less than) the given timestamp",
         default=None,
     )
     parser.add_argument(
         '--after', #'-a',
         type=float,
-        help="List only links bookmarked after the given timestamp.",
+        help="List only links bookmarked after (greater than or equal to) the given timestamp",
         default=None,
     )
     parser.add_argument(
@@ -96,9 +97,9 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
         )
     )
     parser.add_argument(
-        '--filter-type',
+        '--filter-type', '-t',
         type=str,
-        choices=('exact', 'substring', 'domain', 'regex', 'tag', 'search'),
+        choices=(*LINK_FILTERS.keys(), 'search'),
         default='exact',
         help='Type of pattern matching to use when filtering URLs',
     )
@@ -107,20 +108,19 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
         nargs='*',
         type=str,
         default=None,
-        help='List only URLs matching these filter patterns.'
+        help='List only URLs matching these filter patterns'
     )
     command = parser.parse_args(args or ())
-    filter_patterns_str = accept_stdin(stdin)
+    reject_stdin(stdin)
 
     if command.with_headers and not (command.json or command.html or command.csv):
         stderr(
-            '[X] --with-headers can only be used with --json, --html or --csv options.\n',
+            '[X] --with-headers can only be used with --json, --html or --csv options\n',
             color='red',
         )
         raise SystemExit(2)
 
     matching_folders = list_all(
-        filter_patterns_str=filter_patterns_str,
         filter_patterns=command.filter_patterns,
         filter_type=command.filter_type,
         status=command.status,

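Both `list` and `update` now build their `--filter-type` choices from a shared LINK_FILTERS mapping in archivebox/index rather than duplicating a hardcoded tuple. A sketch of the pattern (the lambda bodies here are placeholders; only the key names are implied by the tuple being replaced):

```python
import argparse
import re

# assumed shape -- the real mapping lives in archivebox/index
LINK_FILTERS = {
    'exact':     lambda pattern, url: pattern == url,
    'substring': lambda pattern, url: pattern in url,
    'regex':     lambda pattern, url: bool(re.fullmatch(pattern, url)),
    'domain':    lambda pattern, url: url.split('/')[2] == pattern,
    'tag':       lambda pattern, tags: pattern in tags.split(','),
}

parser = argparse.ArgumentParser()
parser.add_argument(
    '--filter-type', '-t',
    choices=(*LINK_FILTERS.keys(), 'search'),  # 'search' is handled by the search backend
    default='exact',
)
print(parser.parse_args(['-t', 'domain']).filter_type)  # -> domain
```

Keeping the choices derived from one mapping means adding a new filter type automatically exposes it in both commands.
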
+ 4 - 1
archivebox/cli/archivebox_oneshot.py

@@ -50,8 +50,11 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
         help= "Path to save the single archive folder to, e.g. ./example.com_archive"
     )
     command = parser.parse_args(args or ())
+    stdin_url = None
     url = command.url
-    stdin_url = accept_stdin(stdin)
+    if not url:
+        stdin_url = accept_stdin(stdin)
+
     if (stdin_url and url) or (not stdin and not url):
         stderr(
             '[X] You must pass a URL/path to add via stdin or CLI arguments.\n',

+ 4 - 1
archivebox/cli/archivebox_remove.py

@@ -61,7 +61,10 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
         help='URLs matching this filter pattern will be removed from the index.'
     )
     command = parser.parse_args(args or ())
-    filter_str = accept_stdin(stdin)
+    
+    filter_str = None
+    if not command.filter_patterns:
+        filter_str = accept_stdin(stdin)
 
     remove(
         filter_str=filter_str,

+ 13 - 2
archivebox/cli/archivebox_server.py

@@ -38,10 +38,20 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
         action='store_true',
         help='Enable DEBUG=True mode with more verbose errors',
     )
+    parser.add_argument(
+        '--nothreading',
+        action='store_true',
+        help='Force runserver to run in single-threaded mode',
+    )
     parser.add_argument(
         '--init',
         action='store_true',
-        help='Run archivebox init before starting the server',
+        help='Run a full archivebox init/upgrade before starting the server',
+    )
+    parser.add_argument(
+        '--quick-init', '-i',
+        action='store_true',
+        help='Run quick archivebox init/upgrade before starting the server',
     )
     parser.add_argument(
         '--createsuperuser',
@@ -52,10 +62,11 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
     reject_stdin(__command__, stdin)
     
     server(
-        runserver_args=command.runserver_args,
+        runserver_args=command.runserver_args + (['--nothreading'] if command.nothreading else []),
         reload=command.reload,
         debug=command.debug,
         init=command.init,
+        quick_init=command.quick_init,
         createsuperuser=command.createsuperuser,
         out_dir=pwd or OUTPUT_DIR,
     )

+ 7 - 3
archivebox/cli/archivebox_update.py

@@ -12,6 +12,7 @@ from ..main import update
 from ..util import docstring
 from ..config import OUTPUT_DIR
 from ..index import (
+    LINK_FILTERS,
     get_indexed_folders,
     get_archived_folders,
     get_unarchived_folders,
@@ -89,9 +90,9 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
         )
     )
     parser.add_argument(
-        '--filter-type',
+        '--filter-type', '-t',
         type=str,
-        choices=('exact', 'substring', 'domain', 'regex', 'tag', 'search'),
+        choices=(*LINK_FILTERS.keys(), 'search'),
         default='exact',
         help='Type of pattern matching to use when filtering URLs',
     )
@@ -110,7 +111,10 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
         default=""
     )
     command = parser.parse_args(args or ())
-    filter_patterns_str = accept_stdin(stdin)
+
+    filter_patterns_str = None
+    if not command.filter_patterns:
+        filter_patterns_str = accept_stdin(stdin)
 
     update(
         resume=command.resume,

+ 227 - 0
archivebox/cli/tests.py

@@ -0,0 +1,227 @@
+#!/usr/bin/env python3
+
+__package__ = 'archivebox.cli'
+
+
+import os
+import sys
+import shutil
+import unittest
+from pathlib import Path
+
+from contextlib import contextmanager
+
+TEST_CONFIG = {
+    'USE_COLOR': 'False',
+    'SHOW_PROGRESS': 'False',
+
+    'OUTPUT_DIR': 'data.tests',
+    
+    'SAVE_ARCHIVE_DOT_ORG': 'False',
+    'SAVE_TITLE': 'False',
+    
+    'USE_CURL': 'False',
+    'USE_WGET': 'False',
+    'USE_GIT': 'False',
+    'USE_CHROME': 'False',
+    'USE_YOUTUBEDL': 'False',
+}
+
+OUTPUT_DIR = 'data.tests'
+os.environ.update(TEST_CONFIG)
+
+from ..main import init
+from ..index import load_main_index
+from ..config import (
+    SQL_INDEX_FILENAME,
+    JSON_INDEX_FILENAME,
+    HTML_INDEX_FILENAME,
+)
+
+from . import (
+    archivebox_init,
+    archivebox_add,
+    archivebox_remove,
+)
+
+HIDE_CLI_OUTPUT = True
+
+test_urls = '''
+https://example1.com/what/is/happening.html?what=1#how-about-this=1
+https://example2.com/what/is/happening/?what=1#how-about-this=1
+HTtpS://example3.com/what/is/happening/?what=1#how-about-this=1f
+https://example4.com/what/is/happening.html
+https://example5.com/
+https://example6.com
+
+<test>http://example7.com</test>
+[https://example8.com/what/is/this.php?what=1]
+[and http://example9.com?what=1&other=3#and-thing=2]
+<what>https://example10.com#and-thing=2 "</about>
+abc<this["https://subb.example11.com/what/is#and-thing=2?whoami=23&where=1"]that>def
+sdflkf[what](https://subb.example12.com/who/what.php?whoami=1#whatami=2)?am=hi
+example13.bada
+and example14.badb
+<or>htt://example15.badc</that>
+'''
+
+stdout = sys.stdout
+stderr = sys.stderr
+
+
+@contextmanager
+def output_hidden(show_failing=True):
+    if not HIDE_CLI_OUTPUT:
+        yield
+        return
+
+    sys.stdout = open('stdout.txt', 'w+', encoding='utf-8')
+    sys.stderr = open('stderr.txt', 'w+', encoding='utf-8')
+    try:
+        yield
+        sys.stdout.close()
+        sys.stderr.close()
+        sys.stdout = stdout
+        sys.stderr = stderr
+    except Exception:
+        sys.stdout.close()
+        sys.stderr.close()
+        sys.stdout = stdout
+        sys.stderr = stderr
+        if show_failing:
+            with open('stdout.txt', 'r', encoding='utf-8') as f:
+                print(f.read())
+            with open('stderr.txt', 'r', encoding='utf-8') as f:
+                print(f.read())
+        raise
+    finally:
+        os.remove('stdout.txt')
+        os.remove('stderr.txt')
+
+
+class TestInit(unittest.TestCase):
+    def setUp(self):
+        os.makedirs(OUTPUT_DIR, exist_ok=True)
+
+    def tearDown(self):
+        shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
+
+    def test_basic_init(self):
+        with output_hidden():
+            archivebox_init.main([])
+
+        assert (Path(OUTPUT_DIR) / SQL_INDEX_FILENAME).exists()
+        assert (Path(OUTPUT_DIR) / JSON_INDEX_FILENAME).exists()
+        assert (Path(OUTPUT_DIR) / HTML_INDEX_FILENAME).exists()
+        assert len(load_main_index(out_dir=OUTPUT_DIR)) == 0
+
+    def test_conflicting_init(self):
+        with open(Path(OUTPUT_DIR) / 'test_conflict.txt', 'w+', encoding='utf-8') as f:
+            f.write('test')
+
+        try:
+            with output_hidden(show_failing=False):
+                archivebox_init.main([])
+            assert False, 'Init should have exited with an exception'
+        except SystemExit:
+            pass
+
+        assert not (Path(OUTPUT_DIR) / SQL_INDEX_FILENAME).exists()
+        assert not (Path(OUTPUT_DIR) / JSON_INDEX_FILENAME).exists()
+        assert not (Path(OUTPUT_DIR) / HTML_INDEX_FILENAME).exists()
+        try:
+            load_main_index(out_dir=OUTPUT_DIR)
+            assert False, 'load_main_index should raise an exception when no index is present'
+        except Exception:
+            pass
+
+    def test_no_dirty_state(self):
+        with output_hidden():
+            init()
+        shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
+        with output_hidden():
+            init()
+
+
+class TestAdd(unittest.TestCase):
+    def setUp(self):
+        os.makedirs(OUTPUT_DIR, exist_ok=True)
+        with output_hidden():
+            init()
+
+    def tearDown(self):
+        shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
+
+    def test_add_arg_url(self):
+        with output_hidden():
+            archivebox_add.main(['https://getpocket.com/users/nikisweeting/feed/all'])
+
+        all_links = load_main_index(out_dir=OUTPUT_DIR)
+        assert len(all_links) == 30
+
+    def test_add_arg_file(self):
+        test_file = Path(OUTPUT_DIR) / 'test.txt'
+        with open(test_file, 'w+', encoding='utf') as f:
+            f.write(test_urls)
+
+        with output_hidden():
+            archivebox_add.main([test_file])
+
+        all_links = load_main_index(out_dir=OUTPUT_DIR)
+        assert len(all_links) == 12
+        os.remove(test_file)
+
+    def test_add_stdin_url(self):
+        with output_hidden():
+            archivebox_add.main([], stdin=test_urls)
+
+        all_links = load_main_index(out_dir=OUTPUT_DIR)
+        assert len(all_links) == 12
+
+
+class TestRemove(unittest.TestCase):
+    def setUp(self):
+        os.makedirs(OUTPUT_DIR, exist_ok=True)
+        with output_hidden():
+            init()
+            archivebox_add.main([], stdin=test_urls)
+
+    # def tearDown(self):
+        # shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
+
+
+    def test_remove_exact(self):
+        with output_hidden():
+            archivebox_remove.main(['--yes', '--delete', 'https://example5.com/'])
+
+        all_links = load_main_index(out_dir=OUTPUT_DIR)
+        assert len(all_links) == 11
+
+    def test_remove_regex(self):
+        with output_hidden():
+            archivebox_remove.main(['--yes', '--delete', '--filter-type=regex', r'http(s)?:\/\/(.+\.)?(example\d\.com)'])
+
+        all_links = load_main_index(out_dir=OUTPUT_DIR)
+        assert len(all_links) == 4
+
+    def test_remove_domain(self):
+        with output_hidden():
+            archivebox_remove.main(['--yes', '--delete', '--filter-type=domain', 'example5.com', 'example6.com'])
+
+        all_links = load_main_index(out_dir=OUTPUT_DIR)
+        assert len(all_links) == 10
+
+    def test_remove_none(self):
+        try:
+            with output_hidden(show_failing=False):
+                archivebox_remove.main(['--yes', '--delete', 'https://doesntexist.com'])
+            assert False, 'Should raise if no URLs match'
+        except Exception:
+            pass
+
+
+if __name__ == '__main__':
+    if '--verbose' in sys.argv or '-v' in sys.argv:
+        HIDE_CLI_OUTPUT = False
+    
+    unittest.main()

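Per the `__main__` guard above, this suite should be runnable directly with `python -m archivebox.cli.tests` (add `--verbose`/`-v` to un-hide the captured CLI output via the HIDE_CLI_OUTPUT toggle), or through the standard `python -m unittest archivebox.cli.tests`.
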
+ 62 - 10
archivebox/config.py

@@ -29,10 +29,12 @@ import json
 import getpass
 import platform
 import shutil
+import sqlite3
 import django
 
 from hashlib import md5
 from pathlib import Path
+from datetime import datetime
 from typing import Optional, Type, Tuple, Dict, Union, List
 from subprocess import run, PIPE, DEVNULL
 from configparser import ConfigParser
@@ -77,6 +79,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
         'PUBLIC_SNAPSHOTS':         {'type': bool,  'default': True},
         'PUBLIC_ADD_VIEW':          {'type': bool,  'default': False},
         'FOOTER_INFO':              {'type': str,   'default': 'Content is hosted for personal archiving purposes only.  Contact server owner for any takedown requests.'},
+        'SNAPSHOTS_PER_PAGE':       {'type': int,   'default': 40},
     },
 
     'ARCHIVE_METHOD_TOGGLES': {
@@ -99,8 +102,9 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
 
     'ARCHIVE_METHOD_OPTIONS': {
         'RESOLUTION':               {'type': str,   'default': '1440,2000', 'aliases': ('SCREENSHOT_RESOLUTION',)},
-        'GIT_DOMAINS':              {'type': str,   'default': 'github.com,bitbucket.org,gitlab.com'},
+        'GIT_DOMAINS':              {'type': str,   'default': 'github.com,bitbucket.org,gitlab.com,gist.github.com'},
         'CHECK_SSL_VALIDITY':       {'type': bool,  'default': True},
+        'MEDIA_MAX_SIZE':           {'type': str,   'default': '750m'},
 
         'CURL_USER_AGENT':          {'type': str,   'default': 'ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) curl/{CURL_VERSION}'},
         'WGET_USER_AGENT':          {'type': str,   'default': 'ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) wget/{WGET_VERSION}'},
@@ -111,7 +115,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
 
         'CHROME_HEADLESS':          {'type': bool,  'default': True},
         'CHROME_SANDBOX':           {'type': bool,  'default': lambda c: not c['IN_DOCKER']},
-        'YOUTUBEDL_ARGS':           {'type': list,  'default': ['--write-description',
+        'YOUTUBEDL_ARGS':           {'type': list,  'default': lambda c: ['--write-description',
                                                                 '--write-info-json',
                                                                 '--write-annotations',
                                                                 '--write-thumbnail',
@@ -122,7 +126,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
                                                                 '--ignore-errors',
                                                                 '--geo-bypass',
                                                                 '--add-metadata',
-                                                                '--max-filesize=750m',
+                                                                '--max-filesize={}'.format(c['MEDIA_MAX_SIZE']),
                                                                 ]},
                                                                     
 
@@ -287,7 +291,6 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
 
     'ARCHIVEBOX_BINARY':        {'default': lambda c: sys.argv[0]},
     'VERSION':                  {'default': lambda c: json.loads((Path(c['PACKAGE_DIR']) / 'package.json').read_text().strip())['version']},
-    'GIT_SHA':                  {'default': lambda c: c['VERSION'].split('+')[-1] or 'unknown'},
 
     'PYTHON_BINARY':            {'default': lambda c: sys.executable},
     'PYTHON_ENCODING':          {'default': lambda c: sys.stdout.encoding.upper()},
@@ -459,7 +462,7 @@ def write_config_file(config: Dict[str, str], out_dir: str=None) -> ConfigDict:
     config_file.optionxform = str
     config_file.read(config_path)
 
-    with open(config_path, 'r') as old:
+    with open(config_path, 'r', encoding='utf-8') as old:
         atomic_write(f'{config_path}.bak', old.read())
 
     find_section = lambda key: [name for name, opts in CONFIG_SCHEMA.items() if key in opts][0]
@@ -480,14 +483,14 @@ def write_config_file(config: Dict[str, str], out_dir: str=None) -> ConfigDict:
 
     if (not existing_secret_key) or ('not a valid secret' in existing_secret_key):
         from django.utils.crypto import get_random_string
-        chars = 'abcdefghijklmnopqrstuvwxyz0123456789-_+!.'
+        chars = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_'
         random_secret_key = get_random_string(50, chars)
         if 'SERVER_CONFIG' in config_file:
             config_file['SERVER_CONFIG']['SECRET_KEY'] = random_secret_key
         else:
             config_file['SERVER_CONFIG'] = {'SECRET_KEY': random_secret_key}
 
-    with open(config_path, 'w+') as new:
+    with open(config_path, 'w+', encoding='utf-8') as new:
         config_file.write(new)
     
     try:
@@ -499,7 +502,7 @@ def write_config_file(config: Dict[str, str], out_dir: str=None) -> ConfigDict:
         }
     except:
         # something went horribly wrong, rever to the previous version
-        with open(f'{config_path}.bak', 'r') as old:
+        with open(f'{config_path}.bak', 'r', encoding='utf-8') as old:
             atomic_write(config_path, old.read())
 
     if Path(f'{config_path}.bak').exists():
@@ -1062,23 +1065,72 @@ def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG,
 
 
     try:
     try:
         import django
         import django
+        from django.core.management import call_command
+
         sys.path.append(str(config['PACKAGE_DIR']))
         sys.path.append(str(config['PACKAGE_DIR']))
         os.environ.setdefault('OUTPUT_DIR', str(output_dir))
         os.environ.setdefault('OUTPUT_DIR', str(output_dir))
         assert (config['PACKAGE_DIR'] / 'core' / 'settings.py').exists(), 'settings.py was not found at archivebox/core/settings.py'
         assert (config['PACKAGE_DIR'] / 'core' / 'settings.py').exists(), 'settings.py was not found at archivebox/core/settings.py'
         os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings')
         os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings')
 
 
+        # Check to make sure JSON extension is available in our Sqlite3 instance
+        try:
+            cursor = sqlite3.connect(':memory:').cursor()
+            cursor.execute('SELECT JSON(\'{"a": "b"}\')')
+        except sqlite3.OperationalError as exc:
+            stderr(f'[X] Your SQLite3 version is missing the required JSON1 extension: {exc}', color='red')
+            hint([
+                'Upgrade your Python version or install the extension manually:',
+                'https://code.djangoproject.com/wiki/JSON1Extension'
+            ])
+
         if in_memory_db:
         if in_memory_db:
-            # Put the db in memory and run migrations in case any command requires it
-            from django.core.management import call_command
+            # some commands (e.g. oneshot) dont store a long-lived sqlite3 db file on disk.
+            # in those cases we create a temporary in-memory db and run the migrations
+            # immediately to get a usable in-memory-database at startup
             os.environ.setdefault("ARCHIVEBOX_DATABASE_NAME", ":memory:")
             os.environ.setdefault("ARCHIVEBOX_DATABASE_NAME", ":memory:")
             django.setup()
             django.setup()
             call_command("migrate", interactive=False, verbosity=0)
             call_command("migrate", interactive=False, verbosity=0)
         else:
         else:
+            # Otherwise use default sqlite3 file-based database and initialize django
+            # without running migrations automatically (user runs them manually by calling init)
             django.setup()
             django.setup()
+            
+
+        from django.conf import settings
+
+        # log startup message to the error log
+        with open(settings.ERROR_LOG, "a+", encoding='utf-8') as f:
+            command = ' '.join(sys.argv)
+            ts = datetime.now().strftime('%Y-%m-%d__%H:%M:%S')
+            f.write(f"\n> {command}; ts={ts} version={config['VERSION']} docker={config['IN_DOCKER']} is_tty={config['IS_TTY']}\n")
+

         if check_db:
+            # Enable WAL mode in sqlite3
+            from django.db import connection
+            with connection.cursor() as cursor:
+                cursor.execute("PRAGMA journal_mode")
+                current_mode = cursor.fetchone()[0]
+                if current_mode != 'wal':
+                    cursor.execute("PRAGMA journal_mode=wal;")
+
+            # Create cache table in DB if needed
+            try:
+                from django.core.cache import cache
+                cache.get('test', None)
+            except django.db.utils.OperationalError:
+                call_command("createcachetable", verbosity=0)
+
+
+            # if archivebox gets imported multiple times, we have to close
+            # the sqlite3 connections whenever we init from scratch to avoid
+            # multiple threads accidentally sharing the same connection
+            from django.db import connections
+            for conn in connections.all():
+                conn.close_if_unusable_or_obsolete()
+
             sql_index_path = Path(output_dir) / SQL_INDEX_FILENAME
             assert sql_index_path.exists(), (
                 f'No database file {SQL_INDEX_FILENAME} found in: {config["OUTPUT_DIR"]} (Are you in an ArchiveBox collection directory?)')
+
     except KeyboardInterrupt:
         raise SystemExit(2)
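
The two sqlite3 capabilities that setup_django() probes above can be exercised standalone with nothing but the stdlib sqlite3 module. A minimal sketch (illustrative, not part of the diff; the JSON1 probe mirrors the throwaway query used above):

    import sqlite3

    conn = sqlite3.connect(':memory:')
    try:
        # raises OperationalError if the JSON1 extension was not compiled in
        conn.execute('SELECT JSON(\'{"a": "b"}\')')
        print('JSON1 extension available')
    except sqlite3.OperationalError as exc:
        print(f'JSON1 extension missing: {exc}')

    # PRAGMA journal_mode returns the resulting mode as a one-row result;
    # file-backed dbs report "wal" here, in-memory dbs always report "memory"
    mode = conn.execute('PRAGMA journal_mode=wal;').fetchone()[0]
    print(f'journal mode: {mode}')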

+ 162 - 71
archivebox/core/admin.py

@@ -1,6 +1,7 @@
 __package__ = 'archivebox.core'

 from io import StringIO
+from pathlib import Path
 from contextlib import redirect_stdout

 from django.contrib import admin
@@ -13,15 +14,15 @@ from django import forms

 from ..util import htmldecode, urldecode, ansi_to_html

-from core.models import Snapshot, Tag
-from core.forms import AddLinkForm, TagField
+from core.models import Snapshot, ArchiveResult, Tag
+from core.forms import AddLinkForm

 from core.mixins import SearchResultsAdminMixin

 from index.html import snapshot_icons
 from logging_util import printable_filesize
 from main import add, remove
-from config import OUTPUT_DIR
+from config import OUTPUT_DIR, SNAPSHOTS_PER_PAGE
 from extractors import archive_links

 # Admin URLs
@@ -36,77 +37,34 @@ from extractors import archive_links

 # TODO: https://stackoverflow.com/questions/40760880/add-custom-button-to-django-admin-panel

-def update_snapshots(modeladmin, request, queryset):
-    archive_links([
-        snapshot.as_link()
-        for snapshot in queryset
-    ], out_dir=OUTPUT_DIR)
-update_snapshots.short_description = "Archive"

-def update_titles(modeladmin, request, queryset):
-    archive_links([
-        snapshot.as_link()
-        for snapshot in queryset
-    ], overwrite=True, methods=('title','favicon'), out_dir=OUTPUT_DIR)
-update_titles.short_description = "Pull title"
+class ArchiveResultInline(admin.TabularInline):
+    model = ArchiveResult

-def overwrite_snapshots(modeladmin, request, queryset):
-    archive_links([
-        snapshot.as_link()
-        for snapshot in queryset
-    ], overwrite=True, out_dir=OUTPUT_DIR)
-overwrite_snapshots.short_description = "Re-archive (overwrite)"
+class TagInline(admin.TabularInline):
+    model = Snapshot.tags.through

-def verify_snapshots(modeladmin, request, queryset):
-    for snapshot in queryset:
-        print(snapshot.timestamp, snapshot.url, snapshot.is_archived, snapshot.archive_size, len(snapshot.history))
+from django.contrib.admin.helpers import ActionForm

-verify_snapshots.short_description = "Check"

-def delete_snapshots(modeladmin, request, queryset):
-    remove(snapshots=queryset, yes=True, delete=True, out_dir=OUTPUT_DIR)
-
-delete_snapshots.short_description = "Delete"
-
-
-class SnapshotAdminForm(forms.ModelForm):
-    tags = TagField(required=False)
-
-    class Meta:
-        model = Snapshot
-        fields = "__all__"
-
-    def save(self, commit=True):
-        # Based on: https://stackoverflow.com/a/49933068/3509554
-
-        # Get the unsave instance
-        instance = forms.ModelForm.save(self, False)
-        tags = self.cleaned_data.pop("tags")
-
-        #update save_m2m
-        def new_save_m2m():
-            instance.save_tags(tags)
-
-        # Do we need to save all changes now?
-        self.save_m2m = new_save_m2m
-        if commit:
-            instance.save()
-
-        return instance
+class SnapshotActionForm(ActionForm):
+    tag = forms.ModelChoiceField(queryset=Tag.objects.all(), required=False)


 class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
     list_display = ('added', 'title_str', 'url_str', 'files', 'size')
     sort_fields = ('title_str', 'url_str', 'added')
-    readonly_fields = ('id', 'url', 'timestamp', 'num_outputs', 'is_archived', 'url_hash', 'added', 'updated')
+    readonly_fields = ('uuid', 'num_outputs', 'is_archived', 'url_hash', 'added', 'updated')
     search_fields = ['url__icontains', 'timestamp', 'title', 'tags__name']
-    fields = (*readonly_fields, 'title', 'tags')
+    fields = ('timestamp', 'url', 'title', 'tags', *readonly_fields)
     list_filter = ('added', 'updated', 'tags')
     ordering = ['-added']
-    actions = [delete_snapshots, overwrite_snapshots, update_snapshots, update_titles, verify_snapshots]
-    actions_template = 'admin/actions_as_select.html'
-    form = SnapshotAdminForm
-    list_per_page = 40
+    actions = ['delete_snapshots', 'overwrite_snapshots', 'update_snapshots', 'update_titles', 'verify_snapshots', 'add_tag', 'remove_tag']
+    autocomplete_fields = ['tags']
+    inlines = [ArchiveResultInline]
+    list_per_page = SNAPSHOTS_PER_PAGE
+
+    action_form = SnapshotActionForm

     def get_urls(self):
         urls = super().get_urls()
@@ -116,21 +74,46 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
         return custom_urls + urls

     def get_queryset(self, request):
+        self.request = request
         return super().get_queryset(request).prefetch_related('tags')

     def tag_list(self, obj):
         return ', '.join(obj.tags.values_list('name', flat=True))

-    def id_str(self, obj):
+    # TODO: figure out a different way to do this, you can't nest forms so this doesn't work
+    # def action(self, obj):
+    #     # csrfmiddlewaretoken: Wa8UcQ4fD3FJibzxqHN3IYrrjLo4VguWynmbzzcPYoebfVUnDovon7GEMYFRgsh0
+    #     # action: update_snapshots
+    #     # select_across: 0
+    #     # _selected_action: 76d29b26-2a88-439e-877c-a7cca1b72bb3
+    #     return format_html(
+    #         '''
+    #             <form action="/admin/core/snapshot/" method="post" onsubmit="e => e.stopPropagation()">
+    #                 <input type="hidden" name="csrfmiddlewaretoken" value="{}">
+    #                 <input type="hidden" name="_selected_action" value="{}">
+    #                 <button name="update_snapshots">Check</button>
+    #                 <button name="update_titles">Pull title + favicon</button>
+    #                 <button name="update_snapshots">Update</button>
+    #                 <button name="overwrite_snapshots">Re-Archive (overwrite)</button>
+    #                 <button name="delete_snapshots">Permanently delete</button>
+    #             </form>
+    #         ''',
+    #         csrf.get_token(self.request),
+    #         obj.id,
+    #     )
+
+    def uuid(self, obj):
         return format_html(
-            '<code style="font-size: 10px">{}</code>',
-            obj.url_hash[:8],
+            '<code style="font-size: 10px">{}</code><br/><a href="/archive/{}">View index ➡️</a> &nbsp; &nbsp; <a href="/admin/core/snapshot/?id__exact={}">View actions ⚙️</a>',
+            obj.id,
+            obj.timestamp,
+            obj.id,
         )

     def title_str(self, obj):
         canon = obj.as_link().canonical_outputs()
         tags = ''.join(
-            format_html('<a href="/admin/core/snapshot/?tags__id__exact={}"><span class="tag">{}</span></a> ', tag.id, tag)
+            format_html('<a href="/admin/core/snapshot/?id__startswith={}"><span class="tag">{}</span></a> ', tag.id, tag)
             for tag in obj.tags.all()
             if str(tag).strip()
         )
@@ -152,7 +135,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
         return snapshot_icons(obj)

     def size(self, obj):
-        archive_size = obj.archive_size
+        archive_size = (Path(obj.link_dir) / 'index.html').exists() and obj.archive_size
         if archive_size:
             size_txt = printable_filesize(archive_size)
             if archive_size > 52428800:
@@ -190,28 +173,135 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
         rendered_response = self.changelist_view(request)

         # Restore values
-        self.change_list_template =  saved_change_list_template
+        self.change_list_template = saved_change_list_template
         self.list_per_page = saved_list_per_page
         self.list_max_show_all = saved_list_max_show_all

         return rendered_response
+
+
+    def update_snapshots(self, request, queryset):
+        archive_links([
+            snapshot.as_link()
+            for snapshot in queryset
+        ], out_dir=OUTPUT_DIR)
+    update_snapshots.short_description = "Archive"
+
+    def update_titles(self, request, queryset):
+        archive_links([
+            snapshot.as_link()
+            for snapshot in queryset
+        ], overwrite=True, methods=('title','favicon'), out_dir=OUTPUT_DIR)
+    update_titles.short_description = "Pull title"
+
+    def overwrite_snapshots(self, request, queryset):
+        archive_links([
+            snapshot.as_link()
+            for snapshot in queryset
+        ], overwrite=True, out_dir=OUTPUT_DIR)
+    overwrite_snapshots.short_description = "Re-archive (overwrite)"
+
+    def verify_snapshots(self, request, queryset):
+        for snapshot in queryset:
+            print(snapshot.timestamp, snapshot.url, snapshot.is_archived, snapshot.archive_size, len(snapshot.history))
+
+    verify_snapshots.short_description = "Check"
+
+    def delete_snapshots(self, request, queryset):
+        remove(snapshots=queryset, yes=True, delete=True, out_dir=OUTPUT_DIR)
+
+    delete_snapshots.short_description = "Delete"
+
+    def add_tag(self, request, queryset):
+        tag = request.POST['tag']
+        for obj in queryset:
+            obj.tags.add(tag)
+
+    add_tag.short_description = "Add tag"
+
+    def remove_tag(self, request, queryset):
+        tag = request.POST['tag']
+        for obj in queryset:
+            obj.tags.remove(tag)
+
+    remove_tag.short_description = "Remove tag"
+


-    id_str.short_description = 'ID'
     title_str.short_description = 'Title'
     url_str.short_description = 'Original URL'

-    id_str.admin_order_field = 'id'
     title_str.admin_order_field = 'title'
     url_str.admin_order_field = 'url'

+
+
 class TagAdmin(admin.ModelAdmin):
-    list_display = ('slug', 'name', 'id')
+    list_display = ('slug', 'name', 'num_snapshots', 'snapshots', 'id')
     sort_fields = ('id', 'name', 'slug')
-    readonly_fields = ('id',)
+    readonly_fields = ('id', 'num_snapshots', 'snapshots')
     search_fields = ('id', 'name', 'slug')
     fields = (*readonly_fields, 'name', 'slug')
+    actions = ['delete_selected']
+    ordering = ['-id']
+
+    def num_snapshots(self, obj):
+        return format_html(
+            '<a href="/admin/core/snapshot/?tags__id__exact={}">{} total</a>',
+            obj.id,
+            obj.snapshot_set.count(),
+        )
+
+    def snapshots(self, obj):
+        total_count = obj.snapshot_set.count()
+        return mark_safe('<br/>'.join(
+            format_html(
+                '{} <code><a href="/admin/core/snapshot/{}/change"><b>[{}]</b></a> {}</code>',
+                snap.updated.strftime('%Y-%m-%d %H:%M') if snap.updated else 'pending...',
+                snap.id,
+                snap.timestamp,
+                snap.url,
+            )
+            for snap in obj.snapshot_set.order_by('-updated')[:10]
+        ) + (f'<br/><a href="/admin/core/snapshot/?tags__id__exact={obj.id}">and {total_count-10} more...</a>' if obj.snapshot_set.count() > 10 else ''))
+
+
+class ArchiveResultAdmin(admin.ModelAdmin):
+    list_display = ('id', 'start_ts', 'extractor', 'snapshot_str', 'cmd_str', 'status', 'output_str')
+    sort_fields = ('start_ts', 'extractor', 'status')
+    readonly_fields = ('id', 'uuid', 'snapshot_str')
+    search_fields = ('id', 'uuid', 'snapshot__url', 'extractor', 'output', 'cmd_version', 'cmd', 'snapshot__timestamp')
+    fields = (*readonly_fields, 'snapshot', 'snapshot__tags', 'extractor', 'status', 'start_ts', 'end_ts', 'pwd', 'cmd', 'cmd_version', 'output')
+    autocomplete_fields = ['snapshot']
+
+    list_filter = ('status', 'extractor', 'start_ts', 'cmd_version')
+    ordering = ['-start_ts']
+    list_per_page = SNAPSHOTS_PER_PAGE
+
+    def snapshot_str(self, obj):
+        return format_html(
+            '<a href="/archive/{}/index.html"><b><code>[{}]</code></b></a><br/>'
+            '<small>{}</small>',
+            obj.snapshot.timestamp,
+            obj.snapshot.timestamp,
+            obj.snapshot.url[:128],
+        )
+
+    def cmd_str(self, obj):
+        return format_html(
+            '<pre>{}</pre>',
+            ' '.join(obj.cmd) if isinstance(obj.cmd, list) else str(obj.cmd),
+        )
+
+    def output_str(self, obj):
+        return format_html(
+            '<a href="/archive/{}/{}" class="output-link">↗️</a><pre>{}</pre>',
+            obj.snapshot.timestamp,
+            obj.output if (obj.status == 'succeeded') and obj.extractor not in ('title', 'archive_org') else 'index.html',
+            obj.output,
+        )

+    snapshot_str.short_description = 'snapshot'

 class ArchiveBoxAdmin(admin.AdminSite):
     site_header = 'ArchiveBox'
@@ -266,4 +356,5 @@ admin.site = ArchiveBoxAdmin()
 admin.site.register(get_user_model())
 admin.site.register(Snapshot, SnapshotAdmin)
 admin.site.register(Tag, TagAdmin)
+admin.site.register(ArchiveResult, ArchiveResultAdmin)
 admin.site.disable_action('delete_selected')
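
The new add_tag/remove_tag actions above work because Django's ActionForm hook lets the changelist submit an extra form field alongside the chosen action. A minimal sketch of the same pattern (model and field names illustrative, not ArchiveBox code):

    from django import forms
    from django.contrib import admin
    from django.contrib.admin.helpers import ActionForm

    class TagActionForm(ActionForm):
        # rendered next to the action <select> on the changelist page
        tag = forms.CharField(required=False)

    class ExampleAdmin(admin.ModelAdmin):
        action_form = TagActionForm
        actions = ['add_tag']

        def add_tag(self, request, queryset):
            tag = request.POST['tag']  # the extra field arrives in the same POST as the action
            for obj in queryset:
                obj.tags.add(tag)      # M2M .add() accepts primary keys as well as instances
        add_tag.short_description = "Add tag"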

+ 2 - 1
archivebox/core/forms.py

@@ -20,7 +20,8 @@ ARCHIVE_METHODS = [

 class AddLinkForm(forms.Form):
     url = forms.RegexField(label="URLs (one per line)", regex=URL_REGEX, min_length='6', strip=True, widget=forms.Textarea, required=True)
-    depth = forms.ChoiceField(label="Archive depth", choices=CHOICES, widget=forms.RadioSelect, initial='0')
+    tag = forms.CharField(label="Tags (comma separated tag1,tag2,tag3)", strip=True, required=False)
+    depth = forms.ChoiceField(label="Archive depth", choices=CHOICES, initial='0', widget=forms.RadioSelect(attrs={"class": "depth-selection"}))
     archive_methods = forms.MultipleChoiceField(
         label="Archive methods (select at least 1, otherwise all will be used by default)",
         required=False,
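
The new tag field accepts one comma-separated string; turning it into clean tag names is a one-liner. A sketch of the split (illustrative, assuming the consuming view does something equivalent):

    tag_string = "news,tech, archive "
    tag_list = [t.strip() for t in tag_string.split(',') if t.strip()]
    # -> ['news', 'tech', 'archive']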

+ 18 - 0
archivebox/core/migrations/0009_auto_20210216_1038.py

@@ -0,0 +1,18 @@
+# Generated by Django 3.1.3 on 2021-02-16 10:38
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('core', '0008_auto_20210105_1421'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='snapshot',
+            name='updated',
+            field=models.DateTimeField(auto_now=True, db_index=True, null=True),
+        ),
+    ]

+ 18 - 0
archivebox/core/migrations/0010_auto_20210216_1055.py

@@ -0,0 +1,18 @@
+# Generated by Django 3.1.3 on 2021-02-16 10:55
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('core', '0009_auto_20210216_1038'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='archiveresult',
+            name='start_ts',
+            field=models.DateTimeField(db_index=True),
+        ),
+    ]

+ 24 - 0
archivebox/core/migrations/0011_auto_20210216_1331.py

@@ -0,0 +1,24 @@
+# Generated by Django 3.1.3 on 2021-02-16 13:31
+
+from django.db import migrations, models
+import uuid
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('core', '0010_auto_20210216_1055'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='archiveresult',
+            name='uuid',
+            field=models.UUIDField(default=uuid.uuid4, editable=False),
+        ),
+        migrations.AlterField(
+            model_name='archiveresult',
+            name='extractor',
+            field=models.CharField(choices=[('title', 'title'), ('favicon', 'favicon'), ('headers', 'headers'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('wget', 'wget'), ('readability', 'readability'), ('mercury', 'mercury'), ('git', 'git'), ('media', 'media'), ('archive_org', 'archive_org')], max_length=32),
+        ),
+    ]

+ 23 - 0
archivebox/core/migrations/0012_auto_20210216_1425.py

@@ -0,0 +1,23 @@
+# Generated by Django 3.1.3 on 2021-02-16 14:25
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('core', '0011_auto_20210216_1331'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='archiveresult',
+            name='cmd_version',
+            field=models.CharField(blank=True, default=None, max_length=128, null=True),
+        ),
+        migrations.AlterField(
+            model_name='archiveresult',
+            name='output',
+            field=models.CharField(max_length=1024),
+        ),
+    ]

+ 18 - 0
archivebox/core/migrations/0013_auto_20210218_0729.py

@@ -0,0 +1,18 @@
+# Generated by Django 3.1.3 on 2021-02-18 07:29
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('core', '0012_auto_20210216_1425'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='snapshot',
+            name='title',
+            field=models.CharField(blank=True, db_index=True, max_length=256, null=True),
+        ),
+    ]

+ 18 - 0
archivebox/core/migrations/0014_auto_20210218_0729.py

@@ -0,0 +1,18 @@
+# Generated by Django 3.1.3 on 2021-02-18 07:29
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('core', '0013_auto_20210218_0729'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='snapshot',
+            name='title',
+            field=models.CharField(blank=True, db_index=True, max_length=1024, null=True),
+        ),
+    ]

+ 18 - 0
archivebox/core/migrations/0015_auto_20210218_0730.py

@@ -0,0 +1,18 @@
+# Generated by Django 3.1.3 on 2021-02-18 07:30
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('core', '0014_auto_20210218_0729'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='snapshot',
+            name='title',
+            field=models.CharField(blank=True, db_index=True, max_length=512, null=True),
+        ),
+    ]

+ 18 - 0
archivebox/core/migrations/0016_auto_20210218_1204.py

@@ -0,0 +1,18 @@
+# Generated by Django 3.1.3 on 2021-02-18 12:04
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('core', '0015_auto_20210218_0730'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='snapshot',
+            name='tags',
+            field=models.ManyToManyField(blank=True, to='core.Tag'),
+        ),
+    ]

+ 18 - 0
archivebox/core/migrations/0017_auto_20210219_0211.py

@@ -0,0 +1,18 @@
+# Generated by Django 3.1.3 on 2021-02-19 02:11
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('core', '0016_auto_20210218_1204'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='tag',
+            name='slug',
+            field=models.SlugField(blank=True, max_length=100, unique=True, verbose_name='slug'),
+        ),
+    ]

+ 23 - 0
archivebox/core/migrations/0018_auto_20210327_0952.py

@@ -0,0 +1,23 @@
+# Generated by Django 3.1.3 on 2021-03-27 09:52
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('core', '0017_auto_20210219_0211'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='tag',
+            name='name',
+            field=models.CharField(max_length=100, unique=True),
+        ),
+        migrations.AlterField(
+            model_name='tag',
+            name='slug',
+            field=models.SlugField(blank=True, max_length=100, unique=True),
+        ),
+    ]
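
All of the migrations above are standard Django autogenerated AddField/AlterField migrations. They are produced and applied with the usual management commands, shown here via call_command, the same API the setup code in config.py uses (paths and verbosity illustrative):

    from django.core.management import call_command

    call_command('makemigrations', 'core', verbosity=1)       # generates files like 0018_auto_*.py
    call_command('migrate', interactive=False, verbosity=0)   # applies them to the sqlite3 db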

+ 90 - 38
archivebox/core/models.py

@@ -2,12 +2,15 @@ __package__ = 'archivebox.core'

 import uuid

-from django.db import models, transaction
+from django.db import models
 from django.utils.functional import cached_property
 from django.utils.text import slugify
+from django.core.cache import cache
 from django.db.models import Case, When, Value, IntegerField

-from ..util import parse_date
+from ..config import ARCHIVE_DIR, ARCHIVE_DIR_NAME
+from ..system import get_dir_size
+from ..util import parse_date, base_url, hashurl
 from ..index.schema import Link
 from ..extractors import get_default_archive_methods, ARCHIVE_METHODS_INDEXING_PRECEDENCE

@@ -29,8 +32,11 @@ class Tag(models.Model):
     """
     """
     Based on django-taggit model
     Based on django-taggit model
     """
     """
-    name = models.CharField(verbose_name="name", unique=True, blank=False, max_length=100)
-    slug = models.SlugField(verbose_name="slug", unique=True, max_length=100)
+    name = models.CharField(unique=True, blank=False, max_length=100)
+
+    # slug is autoset on save from name, never set it manually
+    slug = models.SlugField(unique=True, blank=True, max_length=100)
+
 
 
     class Meta:
     class Meta:
         verbose_name = "Tag"
         verbose_name = "Tag"
@@ -49,20 +55,21 @@ class Tag(models.Model):
         if self._state.adding and not self.slug:
             self.slug = self.slugify(self.name)

-            with transaction.atomic():
-                slugs = set(
-                    type(self)
-                    ._default_manager.filter(slug__startswith=self.slug)
-                    .values_list("slug", flat=True)
-                )
-
-                i = None
-                while True:
-                    slug = self.slugify(self.name, i)
-                    if slug not in slugs:
-                        self.slug = slug
-                        return super().save(*args, **kwargs)
-                    i = 1 if i is None else i+1
+            # if the name is different but the slug conflicts with another tag's slug, append a counter
+            # with transaction.atomic():
+            slugs = set(
+                type(self)
+                ._default_manager.filter(slug__startswith=self.slug)
+                .values_list("slug", flat=True)
+            )
+
+            i = None
+            while True:
+                slug = self.slugify(self.name, i)
+                if slug not in slugs:
+                    self.slug = slug
+                    return super().save(*args, **kwargs)
+                i = 1 if i is None else i+1
         else:
             return super().save(*args, **kwargs)

@@ -73,11 +80,11 @@ class Snapshot(models.Model):
     url = models.URLField(unique=True)
     timestamp = models.CharField(max_length=32, unique=True, db_index=True)

-    title = models.CharField(max_length=128, null=True, blank=True, db_index=True)
+    title = models.CharField(max_length=512, null=True, blank=True, db_index=True)

     added = models.DateTimeField(auto_now_add=True, db_index=True)
-    updated = models.DateTimeField(null=True, blank=True, db_index=True)
-    tags = models.ManyToManyField(Tag)
+    updated = models.DateTimeField(auto_now=True, blank=True, null=True, db_index=True)
+    tags = models.ManyToManyField(Tag, blank=True)

     keys = ('url', 'timestamp', 'title', 'tags', 'updated')

@@ -109,13 +116,24 @@ class Snapshot(models.Model):
         from ..index import load_link_details
         return load_link_details(self.as_link())

-    def tags_str(self) -> str:
-        return ','.join(self.tags.order_by('name').values_list('name', flat=True))
+    def tags_str(self, nocache=True) -> str:
+        cache_key = f'{self.id}-{(self.updated or self.added).timestamp()}-tags'
+        calc_tags_str = lambda: ','.join(self.tags.order_by('name').values_list('name', flat=True))
+        if nocache:
+            tags_str = calc_tags_str()
+            cache.set(cache_key, tags_str)
+            return tags_str
+        return cache.get_or_set(cache_key, calc_tags_str)

     @cached_property
     def bookmarked(self):
         return parse_date(self.timestamp)

+    @cached_property
+    def bookmarked_date(self):
+        # TODO: remove this
+        return self.bookmarked
+
     @cached_property
     def is_archived(self):
         return self.as_link().is_archived
@@ -126,23 +144,31 @@ class Snapshot(models.Model):

     @cached_property
     def url_hash(self):
-        return self.as_link().url_hash
+        return hashurl(self.url)

     @cached_property
     def base_url(self):
-        return self.as_link().base_url
+        return base_url(self.url)

     @cached_property
     def link_dir(self):
-        return self.as_link().link_dir
+        return str(ARCHIVE_DIR / self.timestamp)

     @cached_property
     def archive_path(self):
-        return self.as_link().archive_path
+        return '{}/{}'.format(ARCHIVE_DIR_NAME, self.timestamp)

     @cached_property
     def archive_size(self):
-        return self.as_link().archive_size
+        cache_key = f'{str(self.id)[:12]}-{(self.updated or self.added).timestamp()}-size'
+
+        def calc_dir_size():
+            try:
+                return get_dir_size(self.link_dir)[0]
+            except Exception:
+                return 0
+
+        return cache.get_or_set(cache_key, calc_dir_size)

     @cached_property
     def history(self):
@@ -151,17 +177,40 @@ class Snapshot(models.Model):

     @cached_property
     def latest_title(self):
-        if ('title' in self.history
-            and self.history['title']
-            and (self.history['title'][-1].status == 'succeeded')
-            and self.history['title'][-1].output.strip()):
-            return self.history['title'][-1].output.strip()
+        if self.title:
+            return self.title   # whoopdedoo that was easy
+        
+        try:
+            # take longest successful title from ArchiveResult db history
+            return sorted(
+                self.archiveresult_set\
+                    .filter(extractor='title', status='succeeded', output__isnull=False)\
+                    .values_list('output', flat=True),
+                key=lambda r: len(r),
+            )[-1]
+        except IndexError:
+            pass
+
+        try:
+            # take longest successful title from Link json index file history
+            return sorted(
+                (
+                    result.output.strip()
+                    for result in self.history['title']
+                    if result.status == 'succeeded' and result.output.strip()
+                ),
+                key=lambda r: len(r),
+            )[-1]
+        except (KeyError, IndexError):
+            pass
+
         return None

     def save_tags(self, tags=()):
         tags_id = []
         for tag in tags:
-            tags_id.append(Tag.objects.get_or_create(name=tag)[0].id)
+            if tag.strip():
+                tags_id.append(Tag.objects.get_or_create(name=tag)[0].id)
         self.tags.clear()
         self.tags.add(*tags_id)

@@ -178,15 +227,18 @@ class ArchiveResultManager(models.Manager):


 class ArchiveResult(models.Model):
+    id = models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')
+    uuid = models.UUIDField(default=uuid.uuid4, editable=False)
+
     snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE)
+    extractor = models.CharField(choices=EXTRACTORS, max_length=32)
     cmd = JSONField()
     pwd = models.CharField(max_length=256)
-    cmd_version = models.CharField(max_length=32, default=None, null=True, blank=True)
-    output = models.CharField(max_length=512)
-    start_ts = models.DateTimeField()
+    cmd_version = models.CharField(max_length=128, default=None, null=True, blank=True)
+    output = models.CharField(max_length=1024)
+    start_ts = models.DateTimeField(db_index=True)
     end_ts = models.DateTimeField()
     status = models.CharField(max_length=16, choices=STATUS_CHOICES)
-    extractor = models.CharField(choices=EXTRACTORS, max_length=32)

     objects = ArchiveResultManager()


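tags_str() and archive_size above share one caching idiom: a cache key derived from the row id plus its last-modified timestamp, so stale entries simply stop being read after the next save and no explicit invalidation is needed. A minimal sketch of the idiom (function names illustrative, not ArchiveBox code):

    import os
    from django.core.cache import cache

    def dir_size(path):
        # walk the directory tree and sum file sizes
        return sum(
            os.path.getsize(os.path.join(root, f))
            for root, _, files in os.walk(path) for f in files
        )

    def cached_archive_size(snapshot):
        # the key embeds the last save time, so a re-save naturally produces a fresh key
        cache_key = f'{snapshot.id}-{(snapshot.updated or snapshot.added).timestamp()}-size'
        return cache.get_or_set(cache_key, lambda: dir_size(snapshot.link_dir))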
+ 127 - 1
archivebox/core/settings.py

@@ -2,6 +2,9 @@ __package__ = 'archivebox.core'

 import os
 import sys
+import re
+import logging
+import tempfile

 from pathlib import Path
 from django.utils.crypto import get_random_string
@@ -14,6 +17,7 @@ from ..config import (
     TEMPLATES_DIR_NAME,
     SQL_INDEX_FILENAME,
     OUTPUT_DIR,
+    LOGS_DIR,
 )


@@ -62,6 +66,40 @@ AUTHENTICATION_BACKENDS = [
     'django.contrib.auth.backends.ModelBackend',
 ]

+# only enable debug toolbar when in DEBUG mode with --nothreading (it doesn't work in multithreaded mode)
+DEBUG_TOOLBAR = DEBUG and ('--nothreading' in sys.argv) and ('--reload' not in sys.argv)
+if DEBUG_TOOLBAR:
+    try:
+        import debug_toolbar   # noqa
+        DEBUG_TOOLBAR = True
+    except ImportError:
+        DEBUG_TOOLBAR = False
+
+if DEBUG_TOOLBAR:
+    INSTALLED_APPS = [*INSTALLED_APPS, 'debug_toolbar']
+    INTERNAL_IPS = ['0.0.0.0', '127.0.0.1', '*']
+    DEBUG_TOOLBAR_CONFIG = {
+        "SHOW_TOOLBAR_CALLBACK": lambda request: True,
+        "RENDER_PANELS": True,
+    }
+    DEBUG_TOOLBAR_PANELS = [
+        'debug_toolbar.panels.history.HistoryPanel',
+        'debug_toolbar.panels.versions.VersionsPanel',
+        'debug_toolbar.panels.timer.TimerPanel',
+        'debug_toolbar.panels.settings.SettingsPanel',
+        'debug_toolbar.panels.headers.HeadersPanel',
+        'debug_toolbar.panels.request.RequestPanel',
+        'debug_toolbar.panels.sql.SQLPanel',
+        'debug_toolbar.panels.staticfiles.StaticFilesPanel',
+        # 'debug_toolbar.panels.templates.TemplatesPanel',
+        'debug_toolbar.panels.cache.CachePanel',
+        'debug_toolbar.panels.signals.SignalsPanel',
+        'debug_toolbar.panels.logging.LoggingPanel',
+        'debug_toolbar.panels.redirects.RedirectsPanel',
+        'debug_toolbar.panels.profiling.ProfilingPanel',
+        'djdt_flamegraph.FlamegraphPanel',
+    ]
+    MIDDLEWARE = [*MIDDLEWARE, 'debug_toolbar.middleware.DebugToolbarMiddleware']
 
 
 ################################################################################
 ### Staticfile and Template Settings
@@ -107,6 +145,22 @@ DATABASES = {
     'default': {
         'ENGINE': 'django.db.backends.sqlite3',
         'NAME': DATABASE_NAME,
+        'OPTIONS': {
+            'timeout': 60,
+            'check_same_thread': False,
+        },
+        # DB setup is sometimes modified at runtime by setup_django() in config.py
+    }
+}
+
+CACHE_BACKEND = 'django.core.cache.backends.locmem.LocMemCache'
+# CACHE_BACKEND = 'django.core.cache.backends.db.DatabaseCache'
+# CACHE_BACKEND = 'django.core.cache.backends.dummy.DummyCache'
+
+CACHES = {
+    'default': {
+        'BACKEND': CACHE_BACKEND,
+        'LOCATION': 'django_cache_default',
     }
 }

@@ -117,7 +171,7 @@ EMAIL_BACKEND = 'django.core.mail.backends.console.EmailBackend'
 ### Security Settings
 ################################################################################

-SECRET_KEY = SECRET_KEY or get_random_string(50, 'abcdefghijklmnopqrstuvwxyz0123456789-_+!.')
+SECRET_KEY = SECRET_KEY or get_random_string(50, 'abcdefghijklmnopqrstuvwxyz0123456789_')

 ALLOWED_HOSTS = ALLOWED_HOSTS.split(',')

@@ -131,6 +185,8 @@ SESSION_COOKIE_AGE = 1209600  # 2 weeks
 SESSION_EXPIRE_AT_BROWSER_CLOSE = False
 SESSION_SAVE_EVERY_REQUEST = True

+SESSION_ENGINE = "django.contrib.sessions.backends.db"
+
 AUTH_PASSWORD_VALIDATORS = [
     {'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator'},
     {'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator'},
@@ -163,3 +219,73 @@ USE_TZ = False

 DATETIME_FORMAT = 'Y-m-d g:iA'
 SHORT_DATETIME_FORMAT = 'Y-m-d h:iA'
+
+
+################################################################################
+### Logging Settings
+################################################################################
+
+IGNORABLE_404_URLS = [
+    re.compile(r'apple-touch-icon.*\.png$'),
+    re.compile(r'favicon\.ico$'),
+    re.compile(r'robots\.txt$'),
+    re.compile(r'.*\.(css|js)\.map$'),
+]
+
+class NoisyRequestsFilter(logging.Filter):
+    def filter(self, record):
+        logline = record.getMessage()
+
+        # ignore harmless 404s for the patterns in IGNORABLE_404_URLS
+        for ignorable_url_pattern in IGNORABLE_404_URLS:
+            ignorable_log_pattern = re.compile(f'^"GET /.*/?{ignorable_url_pattern.pattern[:-1]} HTTP/.*" (200|30.|404) .+$', re.I | re.M)
+            if ignorable_log_pattern.match(logline):
+                return 0
+
+        # ignore staticfile requests that 200 or 30*
+        ignorable_200_log_pattern = re.compile(r'"GET /static/.* HTTP/.*" (200|30.) .+', re.I | re.M)
+        if ignorable_200_log_pattern.match(logline):
+            return 0
+
+        return 1
+
+if LOGS_DIR.exists():
+    ERROR_LOG = (LOGS_DIR / 'errors.log')
+else:
+    # meh, too many edge cases here around creating the log dir w/ correct permissions,
+    # can't be bothered, just trash the log and let them figure it out via stdout/stderr
+    ERROR_LOG = tempfile.NamedTemporaryFile().name
+
+LOGGING = {
+    'version': 1,
+    'disable_existing_loggers': False,
+    'handlers': {
+        'console': {
+            'class': 'logging.StreamHandler',
+        },
+        'logfile': {
+            'level': 'ERROR',
+            'class': 'logging.handlers.RotatingFileHandler',
+            'filename': ERROR_LOG,
+            'maxBytes': 1024 * 1024 * 25,  # 25 MB
+            'backupCount': 10,
+        },
+    },
+    'filters': {
+        'noisyrequestsfilter': {
+            '()': NoisyRequestsFilter,
+        }
+    },
+    'loggers': {
+        'django': {
+            'handlers': ['console', 'logfile'],
+            'level': 'INFO',
+            'filters': ['noisyrequestsfilter'],
+        },
+        'django.server': {
+            'handlers': ['console', 'logfile'],
+            'level': 'INFO',
+            'filters': ['noisyrequestsfilter'],
+        }
+    },
+}
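
NoisyRequestsFilter above works because a logging.Filter whose filter() returns a falsy value suppresses the record; the same mechanism attaches to any handler. A minimal self-contained sketch (pattern and names illustrative):

    import logging

    class DropHealthChecks(logging.Filter):
        def filter(self, record):
            # returning 0/False drops the record, 1/True lets it through
            return 0 if '/healthcheck' in record.getMessage() else 1

    handler = logging.StreamHandler()
    handler.addFilter(DropHealthChecks())
    logger = logging.getLogger('demo')
    logger.addHandler(handler)
    logger.warning('"GET /healthcheck HTTP/1.1" 200 2')   # suppressed by the filter
    logger.warning('"GET /index.html HTTP/1.1" 200 512')  # printed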

+ 43 - 34
archivebox/core/urls.py

@@ -2,6 +2,7 @@ from django.contrib import admin

 from django.urls import path, include
 from django.views import static
+from django.contrib.staticfiles.urls import staticfiles_urlpatterns
 from django.conf import settings
 from django.views.generic.base import RedirectView

@@ -13,8 +14,8 @@ from core.views import HomepageView, SnapshotView, PublicIndexView, AddView
 urlpatterns = [
     path('public/', PublicIndexView.as_view(), name='public-index'),

-    path('robots.txt', static.serve, {'document_root': settings.OUTPUT_DIR, 'path': 'robots.txt'}),
-    path('favicon.ico', static.serve, {'document_root': settings.OUTPUT_DIR, 'path': 'favicon.ico'}),
+    path('robots.txt', static.serve, {'document_root': settings.STATICFILES_DIRS[0], 'path': 'robots.txt'}),
+    path('favicon.ico', static.serve, {'document_root': settings.STATICFILES_DIRS[0], 'path': 'favicon.ico'}),

     path('docs/', RedirectView.as_view(url='https://github.com/ArchiveBox/ArchiveBox/wiki'), name='Docs'),

@@ -35,35 +36,43 @@ urlpatterns = [
     path('index.json', static.serve, {'document_root': settings.OUTPUT_DIR, 'path': 'index.json'}),
     path('', HomepageView.as_view(), name='Home'),
 ]
-
-    # # Proposed UI URLs spec
-    # path('',                 HomepageView)
-    # path('/add',             AddView)
-    # path('/public',          PublicIndexView)
-    # path('/snapshot/:slug',  SnapshotView)
-    
-    # path('/admin',           admin.site.urls)
-    # path('/accounts',        django.contrib.auth.urls)
-
-    # # Prposed REST API spec
-    # # :slugs can be uuid, short_uuid, or any of the unique index_fields
-    # path('api/v1/'),
-    # path('api/v1/core/'                      [GET])
-    # path('api/v1/core/snapshot/',            [GET, POST, PUT]),
-    # path('api/v1/core/snapshot/:slug',       [GET, PATCH, DELETE]),
-    # path('api/v1/core/archiveresult',        [GET, POST, PUT]),
-    # path('api/v1/core/archiveresult/:slug',  [GET, PATCH, DELETE]),
-    # path('api/v1/core/tag/',                 [GET, POST, PUT]),
-    # path('api/v1/core/tag/:slug',            [GET, PATCH, DELETE]),
-
-    # path('api/v1/cli/',                      [GET])
-    # path('api/v1/cli/{add,list,config,...}', [POST]),  # pass query as kwargs directly to `run_subcommand` and return stdout, stderr, exitcode
-
-    # path('api/v1/extractors/',                    [GET])
-    # path('api/v1/extractors/:extractor/',         [GET]),
-    # path('api/v1/extractors/:extractor/:func',    [GET, POST]),  # pass query as args directly to chosen function
-
-    # future, just an idea:
-    # path('api/v1/scheduler/',                [GET])
-    # path('api/v1/scheduler/task/',           [GET, POST, PUT]),
-    # path('api/v1/scheduler/task/:slug',      [GET, PATCH, DELETE]),
+urlpatterns += staticfiles_urlpatterns()
+
+if settings.DEBUG_TOOLBAR:
+    import debug_toolbar
+    urlpatterns += [
+        path('__debug__/', include(debug_toolbar.urls)),
+    ]
+
+
+# # Proposed FUTURE URLs spec
+# path('',                 HomepageView)
+# path('/add',             AddView)
+# path('/public',          PublicIndexView)
+# path('/snapshot/:slug',  SnapshotView)
+
+# path('/admin',           admin.site.urls)
+# path('/accounts',        django.contrib.auth.urls)
+
+# # Proposed REST API spec
+# # :slugs can be uuid, short_uuid, or any of the unique index_fields
+# path('api/v1/'),
+# path('api/v1/core/'                      [GET])
+# path('api/v1/core/snapshot/',            [GET, POST, PUT]),
+# path('api/v1/core/snapshot/:slug',       [GET, PATCH, DELETE]),
+# path('api/v1/core/archiveresult',        [GET, POST, PUT]),
+# path('api/v1/core/archiveresult/:slug',  [GET, PATCH, DELETE]),
+# path('api/v1/core/tag/',                 [GET, POST, PUT]),
+# path('api/v1/core/tag/:slug',            [GET, PATCH, DELETE]),
+
+# path('api/v1/cli/',                      [GET])
+# path('api/v1/cli/{add,list,config,...}', [POST]),  # pass query as kwargs directly to `run_subcommand` and return stdout, stderr, exitcode
+
+# path('api/v1/extractors/',                    [GET])
+# path('api/v1/extractors/:extractor/',         [GET]),
+# path('api/v1/extractors/:extractor/:func',    [GET, POST]),  # pass query as args directly to chosen function
+
+# future, just an idea:
+# path('api/v1/scheduler/',                [GET])
+# path('api/v1/scheduler/task/',           [GET, POST, PUT]),
+# path('api/v1/scheduler/task/:slug',      [GET, PATCH, DELETE]),
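
The rewritten SnapshotView in views.py below resolves a slug by OR-ing several lookups together with Q objects, then lets DoesNotExist / MultipleObjectsReturned drive the helpful 404 and disambiguation pages. The core lookup, sketched minimally (import path illustrative):

    from django.db.models import Q
    from core.models import Snapshot

    def resolve_snapshot(slug):
        # matches either an exact timestamp or a UUID prefix in a single query;
        # callers catch Snapshot.DoesNotExist / Snapshot.MultipleObjectsReturned
        return Snapshot.objects.get(Q(timestamp=slug) | Q(id__startswith=slug))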

+ 161 - 42
archivebox/core/views.py

@@ -4,8 +4,8 @@ from io import StringIO
 from contextlib import redirect_stdout

 from django.shortcuts import render, redirect
-
-from django.http import HttpResponse
+from django.http import HttpResponse, Http404
+from django.utils.html import format_html, mark_safe
 from django.views import View, static
 from django.views.generic.list import ListView
 from django.views.generic import FormView
@@ -22,6 +22,7 @@ from ..config import (
     PUBLIC_ADD_VIEW,
     VERSION,
     FOOTER_INFO,
+    SNAPSHOTS_PER_PAGE,
 )
 from main import add
 from ..util import base_url, ansi_to_html
@@ -43,10 +44,6 @@ class SnapshotView(View):
     # render static html index from filesystem archive/<timestamp>/index.html

     def get(self, request, path):
-        # missing trailing slash -> redirect to index
-        if '/' not in path:
-            return redirect(f'{path}/index.html')
-
         if not request.user.is_authenticated and not PUBLIC_SNAPSHOTS:
             return redirect(f'/admin/login/?next={request.path}')

@@ -55,46 +52,163 @@ class SnapshotView(View):
         except (IndexError, ValueError):
             slug, archivefile = path.split('/', 1)[0], 'index.html'

-        all_pages = list(Snapshot.objects.all())
-
         # slug is a timestamp
-        by_ts = {page.timestamp: page for page in all_pages}
-        try:
-            # print('SERVING STATICFILE', by_ts[slug].link_dir, request.path, path)
-            response = static.serve(request, archivefile, document_root=by_ts[slug].link_dir, show_indexes=True)
-            response["Link"] = f'<{by_ts[slug].url}>; rel="canonical"'
-            return response
-        except KeyError:
-            pass
+        if slug.replace('.','').isdigit():

-        # slug is a hash
-        by_hash = {page.url_hash: page for page in all_pages}
-        try:
-            timestamp = by_hash[slug].timestamp
-            return redirect(f'/archive/{timestamp}/{archivefile}')
-        except KeyError:
-            pass

+                return redirect(f'{path}/index.html')
 
 
+            try:
+                try:
+                    snapshot = Snapshot.objects.get(Q(timestamp=slug) | Q(id__startswith=slug))
+                    response = static.serve(request, archivefile, document_root=snapshot.link_dir, show_indexes=True)
+                    response["Link"] = f'<{snapshot.url}>; rel="canonical"'
+                    return response
+                except Snapshot.DoesNotExist:
+                    if Snapshot.objects.filter(timestamp__startswith=slug).exists():
+                        raise Snapshot.MultipleObjectsReturned
+                    else:
+                        raise
+            except Snapshot.DoesNotExist:
+                # Snapshot does not exist
+                return HttpResponse(
+                    format_html(
+                        (
+                            '<center><br/><br/><br/>'
+                            'No Snapshot directories match the given timestamp or UUID: <code>{}</code><br/><br/>'
+                            'You can <a href="/add/" target="_top">add a new Snapshot</a>, or return to the <a href="/" target="_top">Main Index</a>'
+                            '</center>'
+                        ),
+                        slug,
+                        path,
+                    ),
+                    content_type="text/html",
+                    status=404,
+                )
+            except Snapshot.MultipleObjectsReturned:
+                snapshot_hrefs = mark_safe('<br/>').join(
+                    format_html(
+                        '{} <a href="/archive/{}/index.html"><b><code>{}</code></b></a> {} <b>{}</b>',
+                        snap.added.strftime('%Y-%m-%d %H:%M:%S'),
+                        snap.timestamp,
+                        snap.timestamp,
+                        snap.url,
+                        snap.title or '',
+                    )
+                    for snap in Snapshot.objects.filter(timestamp__startswith=slug).only('url', 'timestamp', 'title', 'added').order_by('-added')
+                )
+                return HttpResponse(
+                    format_html(
+                        (
+                            'Multiple Snapshots match the given timestamp/UUID <code>{}</code><br/><pre>'
+                        ),
+                        slug,
+                    ) + snapshot_hrefs + format_html(
+                        (
+                            '</pre><br/>'
+                            'Choose a Snapshot to proceed or go back to the <a href="/" target="_top">Main Index</a>'
+                        )
+                    ),
+                    content_type="text/html",
+                    status=404,
+                )
+            except Http404:
+                # Snapshot dir exists but file within does not e.g. 124235.324234/screenshot.png
+                return HttpResponse(
+                    format_html(
+                        (
+                            '<center><br/><br/><br/>'
+                            f'Snapshot <a href="/archive/{snapshot.timestamp}/index.html" target="_top"><b><code>[{snapshot.timestamp}]</code></b></a> exists in DB, but resource <b><code>{snapshot.timestamp}/'
+                            '{}'
+                            f'</code></b> does not exist in <a href="/archive/{snapshot.timestamp}/" target="_top">snapshot dir</a> yet.<br/><br/>'
+                            'Maybe this resource type is not available for this Snapshot,<br/>or the archiving process has not completed yet?<br/>'
+                            f'<pre><code># run this cmd to finish archiving this Snapshot<br/>archivebox update -t timestamp {snapshot.timestamp}</code></pre><br/><br/>'
+                            '<div class="text-align: left; width: 100%; max-width: 400px">'
+                            '<i><b>Next steps:</i></b><br/>'
+                            f'- list all the <a href="/archive/{snapshot.timestamp}/" target="_top">Snapshot files <code>.*</code></a><br/>'
+                            f'- view the <a href="/archive/{snapshot.timestamp}/index.html" target="_top">Snapshot <code>./index.html</code></a><br/>'
+                            f'- go to the <a href="/admin/core/snapshot/{snapshot.id}/change/" target="_top">Snapshot admin</a> to edit<br/>'
+                            f'- go to the <a href="/admin/core/snapshot/?id__startswith={snapshot.id}" target="_top">Snapshot actions</a> to re-archive<br/>'
+                            '- or return to <a href="/" target="_top">the main index...</a></div>'
+                            '</center>'
+                        ),
+                        archivefile,
+                    ),
+                    content_type="text/html",
+                    status=404,
+                )
         # slug is a URL
-        by_url = {page.base_url: page for page in all_pages}
         try:
-            # TODO: add multiple snapshot support by showing index of all snapshots
-            # for given url instead of redirecting to timestamp index
-            timestamp = by_url[base_url(path)].timestamp
-            return redirect(f'/archive/{timestamp}/index.html')
-        except KeyError:
-            pass
-
-        return HttpResponse(
-            'No archived link matches the given timestamp or hash.',
-            content_type="text/plain",
-            status=404,
-        )
+            try:
+                # try exact match on full url first
+                snapshot = Snapshot.objects.get(
+                    Q(url='http://' + path) | Q(url='https://' + path) | Q(id__startswith=path)
+                )
+            except Snapshot.DoesNotExist:
+                # fall back to match on exact base_url
+                try:
+                    snapshot = Snapshot.objects.get(
+                        Q(url='http://' + base_url(path)) | Q(url='https://' + base_url(path))
+                    )
+                except Snapshot.DoesNotExist:
+                    # fall back to matching base_url as prefix
+                    snapshot = Snapshot.objects.get(
+                        Q(url__startswith='http://' + base_url(path)) | Q(url__startswith='https://' + base_url(path))
+                    )
+            return redirect(f'/archive/{snapshot.timestamp}/index.html')
+        except Snapshot.DoesNotExist:
+            return HttpResponse(
+                format_html(
+                    (
+                        '<center><br/><br/><br/>'
+                        'No Snapshots match the given url: <code>{}</code><br/><br/><br/>'
+                        'Return to the <a href="/" target="_top">Main Index</a>, or:<br/><br/>'
+                        '+ <i><a href="/add/?url={}" target="_top">Add a new Snapshot for <code>{}</code></a><br/><br/></i>'
+                        '</center>'
+                    ),
+                    base_url(path),
+                    path if '://' in path else f'https://{path}',
+                    path,
+                ),
+                content_type="text/html",
+                status=404,
+            )
+        except Snapshot.MultipleObjectsReturned:
+            snapshot_hrefs = mark_safe('<br/>').join(
+                format_html(
+                    '{} <a href="/archive/{}/index.html"><b><code>{}</code></b></a> {} <b>{}</b>',
+                    snap.added.strftime('%Y-%m-%d %H:%M:%S'),
+                    snap.timestamp,
+                    snap.timestamp,
+                    snap.url,
+                    snap.title or '',
+                )
+                for snap in Snapshot.objects.filter(
+                    Q(url__startswith='http://' + base_url(path)) | Q(url__startswith='https://' + base_url(path))
+                ).only('url', 'timestamp', 'title', 'added').order_by('-added')
+            )
+            return HttpResponse(
+                format_html(
+                    (
+                        'Multiple Snapshots match the given URL <code>{}</code><br/><pre>'
+                    ),
+                    base_url(path),
+                ) + snapshot_hrefs + format_html(
+                    (
+                        '</pre><br/>'
+                        'Choose a Snapshot to proceed or go back to the <a href="/" target="_top">Main Index</a>'
+                    )
+                ),
+                content_type="text/html",
+                status=404,
+            )
+        
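
Note: the view above resolves a slug by trying progressively looser matches — an exact URL or Snapshot-id prefix first, then an exact base_url match, then base_url as a prefix. A condensed sketch of that pattern (find_snapshot is illustrative, not part of the codebase; Snapshot, Q, and base_url are assumed to be imported as in the code above):

    def find_snapshot(path):
        lookups = [
            # 1. exact full-URL match, or a Snapshot id prefix
            Q(url='http://' + path) | Q(url='https://' + path) | Q(id__startswith=path),
            # 2. exact match on the base_url
            Q(url='http://' + base_url(path)) | Q(url='https://' + base_url(path)),
            # 3. loosest: base_url as a prefix
            Q(url__startswith='http://' + base_url(path)) | Q(url__startswith='https://' + base_url(path)),
        ]
        for lookup in lookups:
            try:
                return Snapshot.objects.get(lookup)  # may raise MultipleObjectsReturned
            except Snapshot.DoesNotExist:
                continue
        raise Snapshot.DoesNotExist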
 
 
 class PublicIndexView(ListView):
     template_name = 'public_index.html'
     model = Snapshot
-    paginate_by = 100
+    paginate_by = SNAPSHOTS_PER_PAGE
     ordering = ['title']
 
     def get_context_data(self, **kwargs):
@@ -105,12 +219,14 @@ class PublicIndexView(ListView):
         }
 
     def get_queryset(self, **kwargs): 
-        qs = super().get_queryset(**kwargs) 
+        qs = super().get_queryset(**kwargs)
         query = self.request.GET.get('q')
         if query:
             qs = qs.filter(Q(title__icontains=query) | Q(url__icontains=query) | Q(timestamp__icontains=query) | Q(tags__name__icontains=query))
+        
         for snapshot in qs:
-            snapshot.icons = snapshot_icons(snapshot)
+            # lazy-load snapshot icons, otherwise icons for the entire index would be loaded at once
+            snapshot.icons = lambda: snapshot_icons(snapshot)
         return qs
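
Assigning a lambda defers the icon computation until the template actually calls it, so a page render only pays for the rows it displays. One caveat worth knowing (standard Python closure behavior, not specific to this diff): a bare lambda in a loop closes over the loop variable by reference, so if the rows were materialized into a list first, every lambda would see the last row. Binding the current value through a default argument is the usual defensive variant:

    for snapshot in qs:
        # snap=snapshot captures this iteration's object at definition time
        snapshot.icons = lambda snap=snapshot: snapshot_icons(snap)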
 
 
     def get(self, *args, **kwargs):
@@ -130,9 +246,9 @@ class AddView(UserPassesTestMixin, FormView):
         if self.request.method == 'GET':
             url = self.request.GET.get('url', None)
             if url:
-                return {'url': url}
-        else:
-            return super().get_initial()
+                return {'url': url if '://' in url else f'https://{url}'}
+        
+        return super().get_initial()
 
     def test_func(self):
         return PUBLIC_ADD_VIEW or self.request.user.is_authenticated
@@ -145,15 +261,18 @@ class AddView(UserPassesTestMixin, FormView):
             'absolute_add_path': self.request.build_absolute_uri(self.request.path),
             'VERSION': VERSION,
             'FOOTER_INFO': FOOTER_INFO,
+            'stdout': '',
         }
 
     def form_valid(self, form):
         url = form.cleaned_data["url"]
         print(f'[+] Adding URL: {url}')
+        tag = form.cleaned_data["tag"]
         depth = 0 if form.cleaned_data["depth"] == "0" else 1
         extractors = ','.join(form.cleaned_data["archive_methods"])
         input_kwargs = {
             "urls": url,
+            "tag": tag,
             "depth": depth,
             "update_all": False,
             "out_dir": OUTPUT_DIR,

+ 3 - 3
archivebox/core/wsgi.py

@@ -7,10 +7,10 @@ For more information on this file, see
 https://docs.djangoproject.com/en/2.1/howto/deployment/wsgi/
 """
 
-import os
 
-from django.core.wsgi import get_wsgi_application
+from archivebox.config import setup_django
+setup_django(in_memory_db=False, check_db=True)
 
-os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')
+from django.core.wsgi import get_wsgi_application
 
 application = get_wsgi_application()
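
With this change the WSGI module runs setup_django() itself before importing get_wsgi_application, instead of relying on a DJANGO_SETTINGS_MODULE environment variable being set by the caller. Any WSGI server can then load the app directly; for example (assuming gunicorn is installed, which this diff does not imply):

    gunicorn archivebox.core.wsgi:application --bind 127.0.0.1:8000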

+ 10 - 3
archivebox/extractors/__init__.py

@@ -44,16 +44,16 @@ def get_default_archive_methods():
     return [
         ('title', should_save_title, save_title),
         ('favicon', should_save_favicon, save_favicon),
-        ('wget', should_save_wget, save_wget),
+        ('headers', should_save_headers, save_headers),
         ('singlefile', should_save_singlefile, save_singlefile),
         ('pdf', should_save_pdf, save_pdf),
         ('screenshot', should_save_screenshot, save_screenshot),
         ('dom', should_save_dom, save_dom),
-        ('readability', should_save_readability, save_readability), #keep readability below wget and singlefile, as it depends on them
+        ('wget', should_save_wget, save_wget),
+        ('readability', should_save_readability, save_readability),  # keep readability below wget and singlefile, as it depends on them
         ('mercury', should_save_mercury, save_mercury),
         ('git', should_save_git, save_git),
         ('media', should_save_media, save_media),
-        ('headers', should_save_headers, save_headers),
         ('archive_org', should_save_archive_dot_org, save_archive_dot_org),
     ]
 
@@ -115,6 +115,13 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
                     ArchiveResult.objects.create(snapshot=snapshot, extractor=method_name, cmd=result.cmd, cmd_version=result.cmd_version,
                                                  output=result.output, pwd=result.pwd, start_ts=result.start_ts, end_ts=result.end_ts, status=result.status)
 
+
+                    # bump the updated time on the main Snapshot here, this is critical
+                    # to be able to cache summaries of the ArchiveResults for a given
+                    # snapshot without having to load all the results from the DB each time.
+                    # (we use {Snapshot.id}-{Snapshot.updated} as the cache key and assume
+                    # ArchiveResults are unchanged as long as the updated timestamp is unchanged)
+                    snapshot.save()
                 else:
                     # print('{black}      X {}{reset}'.format(method_name, **ANSI))
                     stats['skipped'] += 1

+ 1 - 1
archivebox/extractors/archive_org.py

@@ -31,7 +31,7 @@ def should_save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, overwr
 
 
     out_dir = out_dir or Path(link.link_dir)
     if not overwrite and (out_dir / 'archive.org.txt').exists():
-        # if open(path, 'r').read().strip() != 'None':
+        # if open(path, 'r', encoding='utf-8').read().strip() != 'None':
         return False
 
     return SAVE_ARCHIVE_DOT_ORG

+ 11 - 3
archivebox/extractors/mercury.py

@@ -54,11 +54,13 @@ def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT)
 
 
     out_dir = Path(out_dir or link.link_dir)
     output_folder = out_dir.absolute() / "mercury"
-    output = str(output_folder)
+    output = "mercury"
 
     status = 'succeeded'
     timer = TimedProgress(timeout, prefix='      ')
     try:
+        output_folder.mkdir(exist_ok=True)
+
         # Get plain text version of article
         cmd = [
             DEPENDENCIES['MERCURY_BINARY']['path'],
@@ -71,6 +73,11 @@ def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT)
         except json.JSONDecodeError:
             raise ShellError(cmd, result)
         
+        if article_text.get('failed'):
+            raise ArchiveError('Mercury was not able to get article text from the URL')
+
+        atomic_write(str(output_folder / "content.txt"), article_text["content"])
+
         # Get HTML version of article
         cmd = [
             DEPENDENCIES['MERCURY_BINARY']['path'],
@@ -82,9 +89,10 @@ def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT)
         except json.JSONDecodeError:
             raise ShellError(cmd, result)
 
-        output_folder.mkdir(exist_ok=True)
+        if article_text.get('failed'):
+            raise ArchiveError('Mercury was not able to get article HTML from the URL')
+
         atomic_write(str(output_folder / "content.html"), article_json.pop("content"))
-        atomic_write(str(output_folder / "content.txt"), article_text["content"])
         atomic_write(str(output_folder / "article.json"), article_json)
 
         # Check for common failure cases
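
The Mercury extractor now follows a run → parse → validate → write pipeline for each output format, failing fast when the tool reports a soft failure instead of writing empty files. The generalized shape, as a simplified sketch (run, ShellError, ArchiveError, and atomic_write are used here as they appear above):

    result = run(cmd, cwd=out_dir, timeout=timeout)             # run the CLI tool
    try:
        article = json.loads(result.stdout)                     # parse its JSON output
    except json.JSONDecodeError:
        raise ShellError(cmd, result)                           # bad JSON: surface the raw output
    if article.get('failed'):
        raise ArchiveError('extractor reported a failure')      # soft failure: treat as an error
    atomic_write(str(output_folder / 'article.json'), article)  # persist atomically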

+ 13 - 5
archivebox/extractors/readability.py

@@ -35,7 +35,7 @@ def get_html(link: Link, path: Path) -> str:
     document = None
     for source in sources:
         try:
-            with open(abs_path / source, "r") as f:
+            with open(abs_path / source, "r", encoding="utf-8") as f:
                 document = f.read()
                 break
         except (FileNotFoundError, TypeError):
@@ -63,7 +63,7 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
 
 
     out_dir = Path(out_dir or link.link_dir)
     output_folder = out_dir.absolute() / "readability"
-    output = str(output_folder)
+    output = "readability"
 
     # Readability Docs: https://github.com/mozilla/readability
 
@@ -81,13 +81,20 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
         temp_doc.write(document.encode("utf-8"))
         temp_doc.close()
 
+        if not document or len(document) < 10:
+            raise ArchiveError('Readability could not find HTML to parse for article text')
+
         cmd = [
             DEPENDENCIES['READABILITY_BINARY']['path'],
-            temp_doc.name
+            temp_doc.name,
         ]
 
         result = run(cmd, cwd=out_dir, timeout=timeout)
-        result_json = json.loads(result.stdout)
+        try:
+            result_json = json.loads(result.stdout)
+        except json.JSONDecodeError:
+            raise ArchiveError('Readability was not able to archive the page', result.stdout + result.stderr)
+
         output_folder.mkdir(exist_ok=True)
         readability_content = result_json.pop("textContent")
         atomic_write(str(output_folder / "content.html"), result_json.pop("content"))
@@ -112,6 +119,7 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
     except (Exception, OSError) as err:
         status = 'failed'
         output = err
+        cmd = [cmd[0], './{singlefile,dom}.html']
     finally:
         timer.end()
 
 
@@ -121,6 +129,6 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
         cmd_version=READABILITY_VERSION,
         output=output,
         status=status,
-        index_texts= [readability_content] if readability_content else [],
+        index_texts=[readability_content] if readability_content else [],
         **timer.stats,  
     )

+ 1 - 0
archivebox/index/__init__.py

@@ -356,6 +356,7 @@ LINK_FILTERS = {
     'regex': lambda pattern: Q(url__iregex=pattern),
     'domain': lambda pattern: Q(url__istartswith=f"http://{pattern}") | Q(url__istartswith=f"https://{pattern}") | Q(url__istartswith=f"ftp://{pattern}"),
     'tag': lambda pattern: Q(tags__name=pattern),
+    'timestamp': lambda pattern: Q(timestamp=pattern),
 }
 
 @enforce_types
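
Each LINK_FILTERS entry maps a filter type to a function returning a Django Q object, so filters can be composed with | and & before touching the database. A hypothetical usage sketch:

    q = LINK_FILTERS['domain']('example.com') | LINK_FILTERS['timestamp']('1611234567')
    matching = Snapshot.objects.filter(q)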

+ 79 - 72
archivebox/index/html.py

@@ -1,11 +1,12 @@
 __package__ = 'archivebox.index'
 
+from pathlib import Path
 from datetime import datetime
+from collections import defaultdict
 from typing import List, Optional, Iterator, Mapping
-from pathlib import Path
 
 from django.utils.html import format_html, mark_safe
-from collections import defaultdict
+from django.core.cache import cache
 
 from .schema import Link
 from ..system import atomic_write
@@ -20,7 +21,6 @@ from ..util import (
 from ..config import (
     OUTPUT_DIR,
     VERSION,
-    GIT_SHA,
     FOOTER_INFO,
     HTML_INDEX_FILENAME,
     SAVE_ARCHIVE_DOT_ORG,
@@ -60,7 +60,7 @@ def main_index_template(links: List[Link], template: str=MAIN_INDEX_TEMPLATE) ->
 
 
     return render_django_template(template, {
         'version': VERSION,
-        'git_sha': GIT_SHA,
+        'git_sha': VERSION,  # not used anymore, but kept for backwards compatibility
         'num_links': str(len(links)),
         'date_updated': datetime.now().strftime('%Y-%m-%d'),
         'time_updated': datetime.now().strftime('%Y-%m-%d %H:%M'),
@@ -116,71 +116,78 @@ def render_django_template(template: str, context: Mapping[str, str]) -> str:
 
 
 
 
 def snapshot_icons(snapshot) -> str:
-    from core.models import EXTRACTORS
-
-    # start = datetime.now()
-
-    archive_results = snapshot.archiveresult_set.filter(status="succeeded")
-    link = snapshot.as_link()
-    path = link.archive_path
-    canon = link.canonical_outputs()
-    output = ""
-    output_template = '<a href="/{}/{}" class="exists-{}" title="{}">{}</a> &nbsp;'
-    icons = {
-        "singlefile": "❶",
-        "wget": "🆆",
-        "dom": "🅷",
-        "pdf": "📄",
-        "screenshot": "💻",
-        "media": "📼",
-        "git": "🅶",
-        "archive_org": "🏛",
-        "readability": "🆁",
-        "mercury": "🅼",
-        "warc": "📦"
-    }
-    exclude = ["favicon", "title", "headers", "archive_org"]
-    # Missing specific entry for WARC
-
-    extractor_outputs = defaultdict(lambda: None)
-    for extractor, _ in EXTRACTORS:
-        for result in archive_results:
-            if result.extractor == extractor and result:
-                extractor_outputs[extractor] = result
-
-    for extractor, _ in EXTRACTORS:
-        if extractor not in exclude:
-            existing = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
-            # Check filesystsem to see if anything is actually present (too slow, needs optimization/caching)
-            # if existing:
-            #     existing = (Path(path) / existing)
-            #     if existing.is_file():
-            #         existing = True
-            #     elif existing.is_dir():
-            #         existing = any(existing.glob('*.*'))
-            output += format_html(output_template, path, canon[f"{extractor}_path"], str(bool(existing)),
-                                         extractor, icons.get(extractor, "?"))
-        if extractor == "wget":
-            # warc isn't technically it's own extractor, so we have to add it after wget
-            
-            # get from db (faster but less thurthful)
-            exists = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
-            # get from filesystem (slower but more accurate)
-            # exists = list((Path(path) / canon["warc_path"]).glob("*.warc.gz"))
-            output += format_html(output_template, 'warc/', canon["warc_path"], str(bool(exists)), "warc", icons.get("warc", "?"))
-
-        if extractor == "archive_org":
-            # The check for archive_org is different, so it has to be handled separately
-
-            # get from db (faster)
-            exists = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
-            # get from filesystem (slower)
-            # target_path = Path(path) / "archive.org.txt"
-            # exists = target_path.exists()
-            output += '<a href="{}" class="exists-{}" title="{}">{}</a> '.format(canon["archive_org_path"], str(exists),
-                                                                                        "archive_org", icons.get("archive_org", "?"))
-
-    result = format_html('<span class="files-icons" style="font-size: 1.1em; opacity: 0.8; min-width: 240px; display: inline-block">{}<span>', mark_safe(output))
-    # end = datetime.now()
-    # print(((end - start).total_seconds()*1000) // 1, 'ms')
-    return result
+    cache_key = f'{str(snapshot.id)[:12]}-{(snapshot.updated or snapshot.added).timestamp()}-snapshot-icons'
+    
+    def calc_snapshot_icons():
+        from core.models import EXTRACTORS
+        # start = datetime.now()
+
+        archive_results = snapshot.archiveresult_set.filter(status="succeeded", output__isnull=False)
+        link = snapshot.as_link()
+        path = link.archive_path
+        canon = link.canonical_outputs()
+        output = ""
+        output_template = '<a href="/{}/{}" class="exists-{}" title="{}">{}</a> &nbsp;'
+        icons = {
+            "singlefile": "❶",
+            "wget": "🆆",
+            "dom": "🅷",
+            "pdf": "📄",
+            "screenshot": "💻",
+            "media": "📼",
+            "git": "🅶",
+            "archive_org": "🏛",
+            "readability": "🆁",
+            "mercury": "🅼",
+            "warc": "📦"
+        }
+        exclude = ["favicon", "title", "headers", "archive_org"]
+        # Missing specific entry for WARC
+
+        extractor_outputs = defaultdict(lambda: None)
+        for extractor, _ in EXTRACTORS:
+            for result in archive_results:
+                if result.extractor == extractor and result:
+                    extractor_outputs[extractor] = result
+
+        for extractor, _ in EXTRACTORS:
+            if extractor not in exclude:
+                existing = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
+                # Check filesystem to see if anything is actually present (too slow, needs optimization/caching)
+                # if existing:
+                #     existing = (Path(path) / existing)
+                #     if existing.is_file():
+                #         existing = True
+                #     elif existing.is_dir():
+                #         existing = any(existing.glob('*.*'))
+                output += format_html(output_template, path, canon[f"{extractor}_path"], str(bool(existing)),
+                                             extractor, icons.get(extractor, "?"))
+            if extractor == "wget":
+                # warc isn't technically its own extractor, so we have to add it after wget
+                
+                # get from db (faster but less truthful)
+                exists = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
+                # get from filesystem (slower but more accurate)
+                # exists = list((Path(path) / canon["warc_path"]).glob("*.warc.gz"))
+                output += format_html(output_template, path, canon["warc_path"], str(bool(exists)), "warc", icons.get("warc", "?"))
+
+            if extractor == "archive_org":
+                # The check for archive_org is different, so it has to be handled separately
+
+                # get from db (faster)
+                exists = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
+                # get from filesystem (slower)
+                # target_path = Path(path) / "archive.org.txt"
+                # exists = target_path.exists()
+                output += '<a href="{}" class="exists-{}" title="{}">{}</a> '.format(canon["archive_org_path"], str(exists),
+                                                                                            "archive_org", icons.get("archive_org", "?"))
+
+        result = format_html('<span class="files-icons" style="font-size: 1.1em; opacity: 0.8; min-width: 240px; display: inline-block">{}<span>', mark_safe(output))
+        # end = datetime.now()
+        # print(((end - start).total_seconds()*1000) // 1, 'ms')
+        return result
+
+    return cache.get_or_set(cache_key, calc_snapshot_icons)
+    # return calc_snapshot_icons()
+
+   
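
The cache key embeds both the Snapshot id and its updated timestamp, so whenever a snapshot is re-saved (see the snapshot.save() bump in extractors/__init__.py above) the key changes and the stale entry is simply never read again — no explicit invalidation is needed. Django's cache.get_or_set computes and stores the value only on a miss:

    from django.core.cache import cache

    key = f'{str(snapshot.id)[:12]}-{(snapshot.updated or snapshot.added).timestamp()}-snapshot-icons'
    html = cache.get_or_set(key, calc_snapshot_icons)  # calls calc_snapshot_icons() only on a cache miss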

+ 1 - 2
archivebox/index/json.py

@@ -15,7 +15,6 @@ from ..config import (
     VERSION,
     OUTPUT_DIR,
     FOOTER_INFO,
-    GIT_SHA,
     DEPENDENCIES,
     JSON_INDEX_FILENAME,
     ARCHIVE_DIR_NAME,
@@ -30,7 +29,7 @@ MAIN_INDEX_HEADER = {
     'meta': {
         'project': 'ArchiveBox',
         'version': VERSION,
-        'git_sha': GIT_SHA,
+        'git_sha': VERSION,  # not used anymore, but kept for backwards compatibility
         'website': 'https://ArchiveBox.io',
         'docs': 'https://github.com/ArchiveBox/ArchiveBox/wiki',
         'source': 'https://github.com/ArchiveBox/ArchiveBox',

+ 10 - 1
archivebox/index/schema.py

@@ -16,6 +16,7 @@ from typing import List, Dict, Any, Optional, Union
 
 
 from dataclasses import dataclass, asdict, field, fields
 
+from django.utils.functional import cached_property
 
 from ..system import get_dir_size
 
@@ -133,7 +134,6 @@ class Link:
     updated: Optional[datetime] = None
     schema: str = 'Link'
 
-
     def __str__(self) -> str:
         return f'[{self.timestamp}] {self.url} "{self.title}"'
 
@@ -190,6 +190,7 @@ class Link:
         }
         if extended:
             info.update({
+                'snapshot_id': self.snapshot_id,
                 'link_dir': self.link_dir,
                 'archive_path': self.archive_path,
                 
@@ -201,6 +202,9 @@ class Link:
                 'basename': self.basename,
                 'extension': self.extension,
                 'is_static': self.is_static,
+                
+                'tags_str': self.tags,   # only used to render static index in index/html.py, remove if no longer needed there
+                'icons': None,           # only used to render static index in index/html.py, remove if no longer needed there
 
                 'bookmarked_date': self.bookmarked_date,
                 'updated_date': self.updated_date,
@@ -255,6 +259,11 @@ class Link:
 
 
         return to_csv(self, cols=cols or self.field_names(), separator=separator, ljust=ljust)
 
+    @cached_property
+    def snapshot_id(self):
+        from core.models import Snapshot
+        return str(Snapshot.objects.only('id').get(url=self.url).id)
+
     @classmethod
     def field_names(cls):
         return [f.name for f in fields(cls)]
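
cached_property evaluates the getter once per instance and memoizes the result on first access, so the database is queried at most once per Link object. An illustrative sketch, assuming a Link instance is already in hand:

    link.snapshot_id   # first access: one SELECT against core.models.Snapshot
    link.snapshot_id   # later accesses: returns the memoized value, no second query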

+ 61 - 21
archivebox/index/sql.py

@@ -7,7 +7,7 @@ from django.db.models import QuerySet
 from django.db import transaction
 
 from .schema import Link
-from ..util import enforce_types
+from ..util import enforce_types, parse_date
 from ..config import OUTPUT_DIR
 
 
@@ -23,13 +23,15 @@ def parse_sql_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[Link]:
     )
 
 @enforce_types
-def remove_from_sql_main_index(snapshots: QuerySet, out_dir: Path=OUTPUT_DIR) -> None:
-    with transaction.atomic():
-        snapshots.delete()
+def remove_from_sql_main_index(snapshots: QuerySet, atomic: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
+    if atomic:
+        with transaction.atomic():
+            return snapshots.delete()
+    return snapshots.delete()
 
 @enforce_types
 def write_link_to_sql_index(link: Link):
-    from core.models import Snapshot
+    from core.models import Snapshot, ArchiveResult
     info = {k: v for k, v in link._asdict().items() if k in Snapshot.keys}
     tags = info.pop("tags")
     if tags is None:
@@ -41,36 +43,74 @@ def write_link_to_sql_index(link: Link):
         while Snapshot.objects.filter(timestamp=info["timestamp"]).exists():
             info["timestamp"] = str(float(info["timestamp"]) + 1.0)
 
-    snapshot, _ = Snapshot.objects.update_or_create(url=link.url, defaults=info)
+        snapshot, _ = Snapshot.objects.update_or_create(url=link.url, defaults=info)
     snapshot.save_tags(tags)
+
+    for extractor, entries in link.history.items():
+        for entry in entries:
+            if isinstance(entry, dict):
+                result, _ = ArchiveResult.objects.get_or_create(
+                    snapshot_id=snapshot.id,
+                    extractor=extractor,
+                    start_ts=parse_date(entry['start_ts']),
+                    defaults={
+                        'end_ts': parse_date(entry['end_ts']),
+                        'cmd': entry['cmd'],
+                        'output': entry['output'],
+                        'cmd_version': entry.get('cmd_version') or 'unknown',
+                        'pwd': entry['pwd'],
+                        'status': entry['status'],
+                    }
+                )
+            else:
+                result, _ = ArchiveResult.objects.update_or_create(
+                    snapshot_id=snapshot.id,
+                    extractor=extractor,
+                    start_ts=parse_date(entry.start_ts),
+                    defaults={
+                        'end_ts': parse_date(entry.end_ts),
+                        'cmd': entry.cmd,
+                        'output': entry.output,
+                        'cmd_version': entry.cmd_version or 'unknown',
+                        'pwd': entry.pwd,
+                        'status': entry.status,
+                    }
+                )
+
     return snapshot
 
 
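
Keying get_or_create on (snapshot_id, extractor, start_ts) makes history imports idempotent: re-running an import over the same JSON cannot create duplicate ArchiveResult rows, because an existing row with that key is returned instead of inserted. A minimal sketch with hypothetical entry data:

    result, created = ArchiveResult.objects.get_or_create(
        snapshot_id=snapshot.id,
        extractor='wget',
        start_ts=parse_date(entry['start_ts']),
        defaults={'status': entry['status'], 'output': entry['output']},  # only applied on insert
    )
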
 @enforce_types
 def write_sql_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR) -> None:
-    with transaction.atomic():
-        for link in links:
-            write_link_to_sql_index(link)
+    for link in links:
+        # with transaction.atomic():
+            # write_link_to_sql_index(link)
+        write_link_to_sql_index(link)
             
 
 @enforce_types
 def write_sql_link_details(link: Link, out_dir: Path=OUTPUT_DIR) -> None:
     from core.models import Snapshot
 
-    with transaction.atomic():
-        try:
-            snap = Snapshot.objects.get(url=link.url)
-        except Snapshot.DoesNotExist:
-            snap = write_link_to_sql_index(link)
-        snap.title = link.title
+    # with transaction.atomic():
+    #     try:
+    #         snap = Snapshot.objects.get(url=link.url)
+    #     except Snapshot.DoesNotExist:
+    #         snap = write_link_to_sql_index(link)
+    #     snap.title = link.title
+    try:
+        snap = Snapshot.objects.get(url=link.url)
+    except Snapshot.DoesNotExist:
+        snap = write_link_to_sql_index(link)
+    snap.title = link.title
 
-        tag_set = (
-            set(tag.strip() for tag in (link.tags or '').split(','))
-        )
-        tag_list = list(tag_set) or []
+    tag_set = (
+        set(tag.strip() for tag in (link.tags or '').split(','))
+    )
+    tag_list = list(tag_set) or []
 
-        snap.save()
-        snap.save_tags(tag_list)
+    snap.save()
+    snap.save_tags(tag_list)
 

+ 64 - 11
archivebox/logging_util.py

@@ -3,6 +3,7 @@ __package__ = 'archivebox'
 import re
 import os
 import sys
+import stat
 import time
 import argparse
 from math import log
@@ -11,18 +12,21 @@ from pathlib import Path
 
 
 from datetime import datetime
 from dataclasses import dataclass
-from typing import Optional, List, Dict, Union, IO, TYPE_CHECKING
+from typing import Any, Optional, List, Dict, Union, IO, TYPE_CHECKING
 
 if TYPE_CHECKING:
     from .index.schema import Link, ArchiveResult
 
+from .system import get_dir_size
 from .util import enforce_types
 from .config import (
     ConfigDict,
     OUTPUT_DIR,
     PYTHON_ENCODING,
+    VERSION,
     ANSI,
     IS_TTY,
+    IN_DOCKER,
     TERM_WIDTH,
     SHOW_PROGRESS,
     SOURCES_DIR_NAME,
@@ -50,6 +54,37 @@ class RuntimeStats:
 _LAST_RUN_STATS = RuntimeStats()
 
 
+def debug_dict_summary(obj: Dict[Any, Any]) -> None:
+    stderr(' '.join(f'{key}={str(val).ljust(6)}' for key, val in obj.items()))
+
+
+def get_fd_info(fd) -> Dict[str, Any]:
+    NAME = fd.name[1:-1]
+    FILENO = fd.fileno()
+    MODE = os.fstat(FILENO).st_mode
+    IS_TTY = hasattr(fd, 'isatty') and fd.isatty()
+    IS_PIPE = stat.S_ISFIFO(MODE)
+    IS_FILE = stat.S_ISREG(MODE)
+    IS_TERMINAL = not (IS_PIPE or IS_FILE)
+    IS_LINE_BUFFERED = fd.line_buffering
+    IS_READABLE = fd.readable()
+    return {
+        'NAME': NAME, 'FILENO': FILENO, 'MODE': MODE,
+        'IS_TTY': IS_TTY, 'IS_PIPE': IS_PIPE, 'IS_FILE': IS_FILE,
+        'IS_TERMINAL': IS_TERMINAL, 'IS_LINE_BUFFERED': IS_LINE_BUFFERED,
+        'IS_READABLE': IS_READABLE,
+    }
+    
+
+# # Log debug information about stdin, stdout, and stderr
+# sys.stdout.write('[>&1] this is python stdout\n')
+# sys.stderr.write('[>&2] this is python stderr\n')
+
+# debug_dict_summary(get_fd_info(sys.stdin))
+# debug_dict_summary(get_fd_info(sys.stdout))
+# debug_dict_summary(get_fd_info(sys.stderr))
+
+
 
 
 class SmartFormatter(argparse.HelpFormatter):
     """Patched formatter that prints newlines in argparse help strings"""
@@ -62,22 +97,40 @@ class SmartFormatter(argparse.HelpFormatter):
 def reject_stdin(caller: str, stdin: Optional[IO]=sys.stdin) -> None:
     """Tell the user they passed stdin to a command that doesn't accept it"""
 
-    if stdin and not stdin.isatty():
-        stdin_raw_text = stdin.read().strip()
+    if not stdin:
+        return None
+
+    if IN_DOCKER:
+        # when TTY is disabled in docker we cant tell if stdin is being piped in or not
+        # if we try to read stdin when its not piped we will hang indefinitely waiting for it
+        return None
+
+    if not stdin.isatty():
+        # stderr('READING STDIN TO REJECT...')
+        stdin_raw_text = stdin.read()
         if stdin_raw_text:
+            # stderr('GOT STDIN!', len(stdin_str))
             stderr(f'[X] The "{caller}" command does not accept stdin.', color='red')
             stderr(f'    Run archivebox "{caller} --help" to see usage and examples.')
             stderr()
             raise SystemExit(1)
+    return None
 
 
 
 
 def accept_stdin(stdin: Optional[IO]=sys.stdin) -> Optional[str]:
     """accept any standard input and return it as a string or None"""
+    
     if not stdin:
         return None
-    elif stdin and not stdin.isatty():
-        stdin_str = stdin.read().strip()
-        return stdin_str or None
+
+    if not stdin.isatty():
+        # stderr('READING STDIN TO ACCEPT...')
+        stdin_str = stdin.read()
+
+        if stdin_str:
+            # stderr('GOT STDIN...', len(stdin_str))
+            return stdin_str
+
     return None
 
 
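reject_stdin and accept_stdin now return early inside Docker (reading an unattached stdin would block indefinitely) and no longer strip the raw input. For commands that do accept stdin, piping works as before; for example:

    echo 'https://example.com' | archivebox add
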
@@ -174,7 +227,6 @@ def progress_bar(seconds: int, prefix: str='') -> None:
 
 
 
 
 def log_cli_command(subcommand: str, subcommand_args: List[str], stdin: Optional[str], pwd: str):
-    from .config import VERSION, ANSI
     cmd = ' '.join(('archivebox', subcommand, *subcommand_args))
     stderr('{black}[i] [{now}] ArchiveBox v{VERSION}: {cmd}{reset}'.format(
         now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
@@ -233,11 +285,11 @@ def log_indexing_process_finished():
 
 
 def log_indexing_started(out_path: str):
     if IS_TTY:
-        sys.stdout.write(f'    > {out_path}')
+        sys.stdout.write(f'    > ./{Path(out_path).relative_to(OUTPUT_DIR)}')
 
 
 
 
 def log_indexing_finished(out_path: str):
-    print(f'\r    √ {out_path}')
+    print(f'\r    √ ./{Path(out_path).relative_to(OUTPUT_DIR)}')
 
 
 
 
 ### Archiving Stage
@@ -272,8 +324,6 @@ def log_archiving_paused(num_links: int, idx: int, timestamp: str):
         total=num_links,
     ))
     print()
-    print('    {lightred}Hint:{reset} To view your archive index, run:'.format(**ANSI))
-    print('        archivebox server  # then visit http://127.0.0.1:8000')
     print('    Continue archiving where you left off by running:')
     print('        archivebox update --resume={}'.format(timestamp))
 
@@ -331,6 +381,9 @@ def log_link_archiving_finished(link: "Link", link_dir: str, is_new: bool, stats
     else:
         _LAST_RUN_STATS.succeeded += 1
 
+    size = get_dir_size(link_dir)
+    print('        {black}{} files ({}){reset}'.format(size[2], printable_filesize(size[0]), **ANSI))
+
 
 
 def log_archive_method_started(method: str):
     print('      > {}'.format(method))

+ 156 - 95
archivebox/main.py

@@ -67,6 +67,7 @@ from .config import (
     ConfigDict,
     ANSI,
     IS_TTY,
+    DEBUG,
     IN_DOCKER,
     USER,
     ARCHIVEBOX_BINARY,
@@ -76,6 +77,7 @@ from .config import (
     ARCHIVE_DIR,
     LOGS_DIR,
     CONFIG_FILE,
+    CONFIG_FILENAME,
     ARCHIVE_DIR_NAME,
     SOURCES_DIR_NAME,
     LOGS_DIR_NAME,
@@ -84,6 +86,7 @@ from .config import (
     SQL_INDEX_FILENAME,
     ROBOTS_TXT_FILENAME,
     FAVICON_FILENAME,
+    SEARCH_BACKEND_ENGINE,
     check_dependencies,
     check_data_folder,
     write_config_file,
@@ -125,14 +128,19 @@ ALLOWED_IN_OUTPUT_DIR = {
     'node_modules',
     'package-lock.json',
     'static',
+    'sonic',
     ARCHIVE_DIR_NAME,
     SOURCES_DIR_NAME,
     LOGS_DIR_NAME,
     SQL_INDEX_FILENAME,
+    f'{SQL_INDEX_FILENAME}-wal',
+    f'{SQL_INDEX_FILENAME}-shm',
     JSON_INDEX_FILENAME,
     HTML_INDEX_FILENAME,
     ROBOTS_TXT_FILENAME,
     FAVICON_FILENAME,
+    CONFIG_FILENAME,
+    f'{CONFIG_FILENAME}.bak',
 }
 
 @enforce_types
@@ -214,9 +222,23 @@ def version(quiet: bool=False,
     if quiet:
         print(VERSION)
     else:
+        # ArchiveBox v0.5.6
+        # Cpython Linux Linux-4.19.121-linuxkit-x86_64-with-glibc2.28 x86_64 (in Docker) (in TTY)
         print('ArchiveBox v{}'.format(VERSION))
         p = platform.uname()
-        print(sys.implementation.name.title(), p.system, platform.platform(), p.machine, '(in Docker)' if IN_DOCKER else '(not in Docker)')
+        print(
+            sys.implementation.name.title(),
+            p.system,
+            platform.platform(),
+            p.machine,
+        )
+        print(
+            f'IN_DOCKER={IN_DOCKER}',
+            f'DEBUG={DEBUG}',
+            f'IS_TTY={IS_TTY}',
+            f'TZ={os.environ.get("TZ", "UTC")}',
+            f'SEARCH_BACKEND_ENGINE={SEARCH_BACKEND_ENGINE}',
+        )
         print()
 
         print('{white}[i] Dependency versions:{reset}'.format(**ANSI))
@@ -261,7 +283,7 @@ def run(subcommand: str,
 
 
 
 
 @enforce_types
-def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
+def init(force: bool=False, quick: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
     """Initialize a new ArchiveBox collection in the current directory"""
     
     from core.models import Snapshot
@@ -276,13 +298,12 @@ def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
     existing_index = (Path(out_dir) / SQL_INDEX_FILENAME).exists()
 
     if is_empty and not existing_index:
-        print('{green}[+] Initializing a new ArchiveBox collection in this folder...{reset}'.format(**ANSI))
-        print(f'    {out_dir}')
-        print('{green}------------------------------------------------------------------{reset}'.format(**ANSI))
+        print('{green}[+] Initializing a new ArchiveBox v{} collection...{reset}'.format(VERSION, **ANSI))
+        print('{green}----------------------------------------------------------------------{reset}'.format(**ANSI))
     elif existing_index:
-        print('{green}[*] Updating existing ArchiveBox collection in this folder...{reset}'.format(**ANSI))
-        print(f'    {out_dir}')
-        print('{green}------------------------------------------------------------------{reset}'.format(**ANSI))
+        # TODO: properly detect and print the existing version in current index as well
+        print('{green}[^] Verifying and updating existing ArchiveBox collection to v{}...{reset}'.format(VERSION, **ANSI))
+        print('{green}----------------------------------------------------------------------{reset}'.format(**ANSI))
     else:
         if force:
             stderr('[!] This folder appears to already have files in it, but no index.sqlite3 is present.', color='lightyellow')
@@ -303,30 +324,25 @@ def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
     else:
         print('\n{green}[+] Building archive folder structure...{reset}'.format(**ANSI))
     
+    print(f'    + ./{ARCHIVE_DIR.relative_to(OUTPUT_DIR)}, ./{SOURCES_DIR.relative_to(OUTPUT_DIR)}, ./{LOGS_DIR.relative_to(OUTPUT_DIR)}...')
     Path(SOURCES_DIR).mkdir(exist_ok=True)
-    print(f'    √ {SOURCES_DIR}')
-    
     Path(ARCHIVE_DIR).mkdir(exist_ok=True)
-    print(f'    √ {ARCHIVE_DIR}')
-
     Path(LOGS_DIR).mkdir(exist_ok=True)
-    print(f'    √ {LOGS_DIR}')
-
+    print(f'    + ./{CONFIG_FILE.relative_to(OUTPUT_DIR)}...')
     write_config_file({}, out_dir=out_dir)
-    print(f'    √ {CONFIG_FILE}')
+
     if (Path(out_dir) / SQL_INDEX_FILENAME).exists():
-        print('\n{green}[*] Verifying main SQL index and running migrations...{reset}'.format(**ANSI))
+        print('\n{green}[*] Verifying main SQL index and running any migrations needed...{reset}'.format(**ANSI))
     else:
-        print('\n{green}[+] Building main SQL index and running migrations...{reset}'.format(**ANSI))
+        print('\n{green}[+] Building main SQL index and running initial migrations...{reset}'.format(**ANSI))
     
     DATABASE_FILE = Path(out_dir) / SQL_INDEX_FILENAME
-    print(f'    √ {DATABASE_FILE}')
-    print()
     for migration_line in apply_migrations(out_dir):
         print(f'    {migration_line}')
 
-
     assert DATABASE_FILE.exists()
+    print()
+    print(f'    √ ./{DATABASE_FILE.relative_to(OUTPUT_DIR)}')
     
     # from django.contrib.auth.models import User
     # if IS_TTY and not User.objects.filter(is_superuser=True).exists():
@@ -334,7 +350,7 @@ def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
     #     call_command("createsuperuser", interactive=True)
 
     print()
-    print('{green}[*] Collecting links from any existing indexes and archive folders...{reset}'.format(**ANSI))
+    print('{green}[*] Checking links from indexes and archive folders (safe to Ctrl+C)...{reset}'.format(**ANSI))
 
     all_links = Snapshot.objects.none()
     pending_links: Dict[str, Link] = {}
@@ -343,63 +359,77 @@ def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
         all_links = load_main_index(out_dir=out_dir, warn=False)
         print('    √ Loaded {} links from existing main index.'.format(all_links.count()))
 
-    # Links in data folders that dont match their timestamp
-    fixed, cant_fix = fix_invalid_folder_locations(out_dir=out_dir)
-    if fixed:
-        print('    {lightyellow}√ Fixed {} data directory locations that didn\'t match their link timestamps.{reset}'.format(len(fixed), **ANSI))
-    if cant_fix:
-        print('    {lightyellow}! Could not fix {} data directory locations due to conflicts with existing folders.{reset}'.format(len(cant_fix), **ANSI))
-
-    # Links in JSON index but not in main index
-    orphaned_json_links = {
-        link.url: link
-        for link in parse_json_main_index(out_dir)
-        if not all_links.filter(url=link.url).exists()
-    }
-    if orphaned_json_links:
-        pending_links.update(orphaned_json_links)
-        print('    {lightyellow}√ Added {} orphaned links from existing JSON index...{reset}'.format(len(orphaned_json_links), **ANSI))
-
-    # Links in data dir indexes but not in main index
-    orphaned_data_dir_links = {
-        link.url: link
-        for link in parse_json_links_details(out_dir)
-        if not all_links.filter(url=link.url).exists()
-    }
-    if orphaned_data_dir_links:
-        pending_links.update(orphaned_data_dir_links)
-        print('    {lightyellow}√ Added {} orphaned links from existing archive directories.{reset}'.format(len(orphaned_data_dir_links), **ANSI))
-
-    # Links in invalid/duplicate data dirs
-    invalid_folders = {
-        folder: link
-        for folder, link in get_invalid_folders(all_links, out_dir=out_dir).items()
-    }
-    if invalid_folders:
-        print('    {lightyellow}! Skipped adding {} invalid link data directories.{reset}'.format(len(invalid_folders), **ANSI))
-        print('        X ' + '\n        X '.join(f'{folder} {link}' for folder, link in invalid_folders.items()))
-        print()
-        print('    {lightred}Hint:{reset} For more information about the link data directories that were skipped, run:'.format(**ANSI))
-        print('        archivebox status')
-        print('        archivebox list --status=invalid')
-
-
-    write_main_index(list(pending_links.values()), out_dir=out_dir)
+    if quick:
+        print('    > Skipping full snapshot directory check (quick mode)')
+    else:
+        try:
+            # Links in data folders that dont match their timestamp
+            fixed, cant_fix = fix_invalid_folder_locations(out_dir=out_dir)
+            if fixed:
+                print('    {lightyellow}√ Fixed {} data directory locations that didn\'t match their link timestamps.{reset}'.format(len(fixed), **ANSI))
+            if cant_fix:
+                print('    {lightyellow}! Could not fix {} data directory locations due to conflicts with existing folders.{reset}'.format(len(cant_fix), **ANSI))
+
+            # Links in JSON index but not in main index
+            orphaned_json_links = {
+                link.url: link
+                for link in parse_json_main_index(out_dir)
+                if not all_links.filter(url=link.url).exists()
+            }
+            if orphaned_json_links:
+                pending_links.update(orphaned_json_links)
+                print('    {lightyellow}√ Added {} orphaned links from existing JSON index...{reset}'.format(len(orphaned_json_links), **ANSI))
+
+            # Links in data dir indexes but not in main index
+            orphaned_data_dir_links = {
+                link.url: link
+                for link in parse_json_links_details(out_dir)
+                if not all_links.filter(url=link.url).exists()
+            }
+            if orphaned_data_dir_links:
+                pending_links.update(orphaned_data_dir_links)
+                print('    {lightyellow}√ Added {} orphaned links from existing archive directories.{reset}'.format(len(orphaned_data_dir_links), **ANSI))
+
+            # Links in invalid/duplicate data dirs
+            invalid_folders = {
+                folder: link
+                for folder, link in get_invalid_folders(all_links, out_dir=out_dir).items()
+            }
+            if invalid_folders:
+                print('    {lightyellow}! Skipped adding {} invalid link data directories.{reset}'.format(len(invalid_folders), **ANSI))
+                print('        X ' + '\n        X '.join(f'./{Path(folder).relative_to(OUTPUT_DIR)} {link}' for folder, link in invalid_folders.items()))
+                print()
+                print('    {lightred}Hint:{reset} For more information about the link data directories that were skipped, run:'.format(**ANSI))
+                print('        archivebox status')
+                print('        archivebox list --status=invalid')
+
+        except (KeyboardInterrupt, SystemExit):
+            stderr()
+            stderr('[x] Stopped checking archive directories due to Ctrl-C/SIGTERM', color='red')
+            stderr('    Your archive data is safe, but you should re-run `archivebox init` to finish the process later.')
+            stderr()
+            stderr('    {lightred}Hint:{reset} In the future you can run a quick init without checking dirs like so:'.format(**ANSI))
+            stderr('        archivebox init --quick')
+            raise SystemExit(1)
+        
+        write_main_index(list(pending_links.values()), out_dir=out_dir)
 
 
-    print('\n{green}------------------------------------------------------------------{reset}'.format(**ANSI))
+    print('\n{green}----------------------------------------------------------------------{reset}'.format(**ANSI))
     if existing_index:
         print('{green}[√] Done. Verified and updated the existing ArchiveBox collection.{reset}'.format(**ANSI))
     else:
-        print('{green}[√] Done. A new ArchiveBox collection was initialized ({} links).{reset}'.format(len(all_links), **ANSI))
-    print()
-    print('    {lightred}Hint:{reset} To view your archive index, run:'.format(**ANSI))
-    print('        archivebox server  # then visit http://127.0.0.1:8000')
-    print()
-    print('    To add new links, you can run:')
-    print("        archivebox add ~/some/path/or/url/to/list_of_links.txt")
-    print()
-    print('    For more usage and examples, run:')
-    print('        archivebox help')
+        print('{green}[√] Done. A new ArchiveBox collection was initialized ({} links).{reset}'.format(len(all_links) + len(pending_links), **ANSI))
+    
+    if Snapshot.objects.count() < 25:     # hide the hints for experienced users
+        print()
+        print('    {lightred}Hint:{reset} To view your archive index, run:'.format(**ANSI))
+        print('        archivebox server  # then visit http://127.0.0.1:8000')
+        print()
+        print('    To add new links, you can run:')
+        print("        archivebox add ~/some/path/or/url/to/list_of_links.txt")
+        print()
+        print('    For more usage and examples, run:')
+        print('        archivebox help')
 
 
     json_index = Path(out_dir) / JSON_INDEX_FILENAME
     html_index = Path(out_dir) / HTML_INDEX_FILENAME
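
Wrapping the slow directory checks in try/except (KeyboardInterrupt, SystemExit) lets init be interrupted safely and resumed later. The general shape of that pattern, as a sketch (slow_consistency_checks is a hypothetical stand-in for the checks above):

    try:
        slow_consistency_checks()      # safe to Ctrl+C at any point
    except (KeyboardInterrupt, SystemExit):
        stderr('[x] Stopped early. Re-run `archivebox init` (or `archivebox init --quick`) to finish later.')
        raise SystemExit(1)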
@@ -531,6 +561,7 @@ def oneshot(url: str, extractors: str="", out_dir: Path=OUTPUT_DIR):
 
 
 @enforce_types
 def add(urls: Union[str, List[str]],
+        tag: str='',
         depth: int=0,
         update_all: bool=not ONLY_NEW,
         index_only: bool=False,
@@ -540,6 +571,8 @@ def add(urls: Union[str, List[str]],
         out_dir: Path=OUTPUT_DIR) -> List[Link]:
     """Add a new URL or list of URLs to your archive"""
 
+    from core.models import Tag
+
     assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)'
 
     extractors = extractors.split(",") if extractors else []
@@ -572,26 +605,48 @@ def add(urls: Union[str, List[str]],
             new_links_depth += parse_links_from_source(downloaded_file, root_url=new_link.url)
 
     imported_links = list({link.url: link for link in (new_links + new_links_depth)}.values())
+    
     new_links = dedupe_links(all_links, imported_links)
 
     write_main_index(links=new_links, out_dir=out_dir)
     all_links = load_main_index(out_dir=out_dir)
 
     if index_only:
-        return all_links
+        # mock archive all the links using the fake index_only extractor method in order to update their state
+        if overwrite:
+            archive_links(imported_links, overwrite=overwrite, methods=['index_only'], out_dir=out_dir)
+        else:
+            archive_links(new_links, overwrite=False, methods=['index_only'], out_dir=out_dir)
+    else:
+        # fully run the archive extractor methods for each link
+        archive_kwargs = {
+            "out_dir": out_dir,
+        }
+        if extractors:
+            archive_kwargs["methods"] = extractors
+
+        if update_all:
+            archive_links(all_links, overwrite=overwrite, **archive_kwargs)
+        elif overwrite:
+            archive_links(imported_links, overwrite=True, **archive_kwargs)
+        elif new_links:
+            archive_links(new_links, overwrite=False, **archive_kwargs)
+
+
+    # add any tags to imported links
+    tags = [
+        Tag.objects.get_or_create(name=name.strip())[0]
+        for name in tag.split(',')
+        if name.strip()
+    ]
+    if tags:
+        for link in imported_links:
+            snapshot = link.as_snapshot()
+            snapshot.tags.add(*tags)
+            snapshot.tags_str(nocache=True)
+            snapshot.save()
+        # print(f'    √ Tagged {len(imported_links)} Snapshots with {len(tags)} tags {tags_str}')
 
-    # Run the archive methods for each link
-    archive_kwargs = {
-        "out_dir": out_dir,
-    }
-    if extractors:
-        archive_kwargs["methods"] = extractors
-    if update_all:
-        archive_links(all_links, overwrite=overwrite, **archive_kwargs)
-    elif overwrite:
-        archive_links(imported_links, overwrite=True, **archive_kwargs)
-    elif new_links:
-        archive_links(new_links, overwrite=False, **archive_kwargs)
 
     return all_links
 
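For reference, the new `tag` parameter takes a comma-separated string of tag names (presumably exposed as a flag by the updated archivebox/cli/archivebox_add.py). A minimal sketch of the equivalent ORM calls, runnable from a Django shell inside an initialized collection; the tag names and URL filter here are illustrative assumptions, not taken from this diff:

    from core.models import Snapshot, Tag

    # same get_or_create-by-name logic as the tagging block above
    tags = [
        Tag.objects.get_or_create(name=name.strip())[0]
        for name in 'news,python'.split(',')
        if name.strip()
    ]
    for snapshot in Snapshot.objects.filter(url__icontains='example.com'):  # assumed filter
        snapshot.tags.add(*tags)  # many-to-many add, idempotent on re-runs
        snapshot.save()
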
@@ -811,11 +866,15 @@ def list_links(snapshots: Optional[QuerySet]=None,
         all_snapshots = load_main_index(out_dir=out_dir)
 
     if after is not None:
-        all_snapshots = all_snapshots.filter(timestamp__lt=after)
+        all_snapshots = all_snapshots.filter(timestamp__gte=after)
     if before is not None:
-        all_snapshots = all_snapshots.filter(timestamp__gt=before)
+        all_snapshots = all_snapshots.filter(timestamp__lt=before)
     if filter_patterns:
         all_snapshots = snapshot_filter(all_snapshots, filter_patterns, filter_type)
+
+    if not all_snapshots:
+        stderr('[!] No Snapshots matched your filters:', filter_patterns, f'({filter_type})', color='lightyellow')
+
     return all_snapshots
 
 @enforce_types
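The operator swap above fixes an inverted range filter: `after` now keeps snapshots with timestamp >= after, and `before` keeps those with timestamp < before. A quick illustration with made-up values:

    # given snapshots with timestamps 100, 200, 300:
    list_links(after=200)               # -> 200, 300  (timestamp__gte=200)
    list_links(before=200)              # -> 100       (timestamp__lt=200)
    list_links(after=100, before=300)   # -> 100, 200  (half-open range [100, 300))
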
@@ -1061,6 +1120,7 @@ def server(runserver_args: Optional[List[str]]=None,
            reload: bool=False,
            debug: bool=False,
            init: bool=False,
+           quick_init: bool=False,
            createsuperuser: bool=False,
            out_dir: Path=OUTPUT_DIR) -> None:
     """Run the ArchiveBox HTTP server"""
@@ -1069,9 +1129,14 @@ def server(runserver_args: Optional[List[str]]=None,
     
     if init:
         run_subcommand('init', stdin=None, pwd=out_dir)
+        print()
+    elif quick_init:
+        run_subcommand('init', subcommand_args=['--quick'], stdin=None, pwd=out_dir)
+        print()
 
     if createsuperuser:
         run_subcommand('manage', subcommand_args=['createsuperuser'], pwd=out_dir)
+        print()
 
     # setup config for django runserver
     from . import config
@@ -1083,12 +1148,9 @@ def server(runserver_args: Optional[List[str]]=None,
     from django.core.management import call_command
     from django.contrib.auth.models import User
 
-    admin_user = User.objects.filter(is_superuser=True).order_by('date_joined').only('username').last()
-
     print('{green}[+] Starting ArchiveBox webserver...{reset}'.format(**ANSI))
-    if admin_user:
-        hint('The admin username is{lightblue} {}{reset}\n'.format(admin_user.username, **ANSI))
-    else:
+    print('    > Logging errors to ./logs/errors.log')
+    if not User.objects.filter(is_superuser=True).exists():
         print('{lightyellow}[!] No admin users exist yet, you will not be able to edit links in the UI.{reset}'.format(**ANSI))
         print()
         print('    To create an admin user, run:')
@@ -1106,7 +1168,6 @@ def server(runserver_args: Optional[List[str]]=None,
     config.SHOW_PROGRESS = False
     config.DEBUG = config.DEBUG or debug
 
-
     call_command("runserver", *runserver_args)
 

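Taken together, the new flags let a fresh deployment bootstrap and serve in a single step; this is the same invocation the updated docker-compose.yml below uses:

    archivebox server --quick-init 0.0.0.0:8000
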
+ 45 - 31
archivebox/parsers/__init__.py

@@ -68,7 +68,6 @@ def parse_links_memory(urls: List[str], root_url: Optional[str]=None):
     """
     parse a list of URLS without touching the filesystem
     """
-    check_url_parsing_invariants()
 
     timer = TimedProgress(TIMEOUT * 4)
     #urls = list(map(lambda x: x + "\n", urls))
@@ -89,8 +88,6 @@ def parse_links(source_file: str, root_url: Optional[str]=None) -> Tuple[List[Li
        RSS feed, bookmarks export, or text file
     """
 
-    check_url_parsing_invariants()
-
     timer = TimedProgress(TIMEOUT * 4)
     with open(source_file, 'r', encoding='utf-8') as file:
         links, parser = run_parser_functions(file, timer, root_url=root_url)
@@ -173,31 +170,48 @@ def save_file_as_source(path: str, timeout: int=TIMEOUT, filename: str='{ts}-{ba
     return source_path
 
 
-def check_url_parsing_invariants() -> None:
-    """Check that plain text regex URL parsing works as expected"""
-
-    # this is last-line-of-defense to make sure the URL_REGEX isn't
-    # misbehaving, as the consequences could be disastrous and lead to many
-    # incorrect/badly parsed links being added to the archive
-
-    test_urls = '''
-    https://example1.com/what/is/happening.html?what=1#how-about-this=1
-    https://example2.com/what/is/happening/?what=1#how-about-this=1
-    HTtpS://example3.com/what/is/happening/?what=1#how-about-this=1f
-    https://example4.com/what/is/happening.html
-    https://example5.com/
-    https://example6.com
-
-    <test>http://example7.com</test>
-    [https://example8.com/what/is/this.php?what=1]
-    [and http://example9.com?what=1&other=3#and-thing=2]
-    <what>https://example10.com#and-thing=2 "</about>
-    abc<this["https://example11.com/what/is#and-thing=2?whoami=23&where=1"]that>def
-    sdflkf[what](https://example12.com/who/what.php?whoami=1#whatami=2)?am=hi
-    example13.bada
-    and example14.badb
-    <or>htt://example15.badc</that>
-    '''
-    # print('\n'.join(re.findall(URL_REGEX, test_urls)))
-    assert len(re.findall(URL_REGEX, test_urls)) == 12
-
+# Check that plain text regex URL parsing works as expected
+#   this is last-line-of-defense to make sure the URL_REGEX isn't
+#   misbehaving due to some OS-level or environment level quirks (e.g. bad regex lib)
+#   the consequences of bad URL parsing could be disastrous and lead to many
+#   incorrect/badly parsed links being added to the archive, so this is worth the cost of checking
+_test_url_strs = {
+    'example.com': 0,
+    '/example.com': 0,
+    '//example.com': 0,
+    ':/example.com': 0,
+    '://example.com': 0,
+    'htt://example8.com': 0,
+    '/htt://example.com': 0,
+    'https://example': 1,
+    'https://localhost/2345': 1,
+    'https://localhost:1234/123': 1,
+    '://': 0,
+    'https://': 0,
+    'http://': 0,
+    'ftp://': 0,
+    'ftp://example.com': 0,
+    'https://example.com': 1,
+    'https://example.com/': 1,
+    'https://a.example.com': 1,
+    'https://a.example.com/': 1,
+    'https://a.example.com/what/is/happening.html': 1,
+    'https://a.example.com/what/ís/happening.html': 1,
+    'https://a.example.com/what/is/happening.html?what=1&2%20b#höw-about-this=1a': 1,
+    'https://a.example.com/what/is/happéning/?what=1&2%20b#how-aboüt-this=1a': 1,
+    'HTtpS://a.example.com/what/is/happening/?what=1&2%20b#how-about-this=1af&2f%20b': 1,
+    'https://example.com/?what=1#how-about-this=1&2%20baf': 1,
+    'https://example.com?what=1#how-about-this=1&2%20baf': 1,
+    '<test>http://example7.com</test>': 1,
+    '[https://example8.com/what/is/this.php?what=1]': 1,
+    '[and http://example9.com?what=1&other=3#and-thing=2]': 1,
+    '<what>https://example10.com#and-thing=2 "</about>': 1,
+    'abc<this["https://example11.com/what/is#and-thing=2?whoami=23&where=1"]that>def': 1,
+    'sdflkf[what](https://example12.com/who/what.php?whoami=1#whatami=2)?am=hi': 1,
+    '<or>http://examplehttp://15.badc</that>': 2,
+    'https://a.example.com/one.html?url=http://example.com/inside/of/another?=http://': 2,
+    '[https://a.example.com/one.html?url=http://example.com/inside/of/another?=](http://a.example.com)': 3,
+}
+for url_str, num_urls in _test_url_strs.items():
+    assert len(re.findall(URL_REGEX, url_str)) == num_urls, (
+        f'{url_str} does not contain {num_urls} urls')

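Since the fixture table above now runs at import time (replacing the old per-call check_url_parsing_invariants()), a broken regex or regex library fails fast, before any links can be mis-parsed. One of the trickier cases can be reproduced by hand, assuming the new lookahead-based URL_REGEX from archivebox/util.py further below:

    import re
    from archivebox.util import URL_REGEX

    # two URLs fused together with no separator still yield 2 matches,
    # because the lookahead pattern permits overlapping results:
    assert len(re.findall(URL_REGEX, '<or>http://examplehttp://15.badc</that>')) == 2
    # -> ['http://examplehttp://15.badc', 'http://15.badc']
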
+ 1 - 1
archivebox/search/utils.py

@@ -16,7 +16,7 @@ def get_file_result_content(res, extra_path, use_pwd=False):
     if extra_path:
         fpath = f'{fpath}/{extra_path}'
 
-    with open(fpath, 'r') as file:
+    with open(fpath, 'r', encoding='utf-8') as file:
         data = file.read()
     if data:
         return [data]

+ 3 - 2
archivebox/system.py

@@ -10,7 +10,7 @@ from typing import Optional, Union, Set, Tuple
 from subprocess import run as subprocess_run
 
 from crontab import CronTab
-from atomicwrites import atomic_write as lib_atomic_write
+from .vendor.atomicwrites import atomic_write as lib_atomic_write
 
 from .util import enforce_types, ExtendedEncoder
 from .config import OUTPUT_PERMISSIONS
@@ -37,10 +37,11 @@ def atomic_write(path: Union[Path, str], contents: Union[dict, str, bytes], over
     """Safe atomic write to filesystem by writing to temp file + atomic rename"""
 
     mode = 'wb+' if isinstance(contents, bytes) else 'w'
+    encoding = None if isinstance(contents, bytes) else 'utf-8'  # enforce utf-8 on all text writes
 
     # print('\n> Atomic Write:', mode, path, len(contents), f'overwrite={overwrite}')
     try:
-        with lib_atomic_write(path, mode=mode, overwrite=overwrite) as f:
+        with lib_atomic_write(path, mode=mode, overwrite=overwrite, encoding=encoding) as f:
            if isinstance(contents, dict):
                dump(contents, f, indent=4, sort_keys=True, cls=ExtendedEncoder)
            elif isinstance(contents, (bytes, str)):

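With the vendored atomicwrites plus the explicit encoding, text output no longer depends on the host locale. A small sketch of the three supported content types (file paths here are illustrative only):

    from archivebox.system import atomic_write

    atomic_write('index.json', {'title': 'héllo wörld'})   # dict  -> JSON dump, utf-8
    atomic_write('notes.txt', 'plain unicode text')        # str   -> mode 'w', encoding='utf-8'
    atomic_write('blob.bin', b'\x00\x01')                  # bytes -> mode 'wb+', encoding=None
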
+ 0 - 1
archivebox/templates/admin/actions_as_select.html

@@ -1 +0,0 @@
-actions_as_select

+ 6 - 6
archivebox/templates/admin/base.html

@@ -20,7 +20,7 @@
 <body class="{% if is_popup %}popup {% endif %}{% block bodyclass %}{% endblock %}"
   data-admin-utc-offset="{% now "Z" %}">
 
-  <style nonce="{{nonce}}">
+  <style>
       /* Loading Progress Bar */
        #progress {
            position: absolute;
@@ -89,7 +89,7 @@
            <a href="{% url 'admin:Add' %}">Add ➕</a> /
            <a href="{% url 'Home' %}">Snapshots</a> /
            <a href="/admin/core/tag/">Tags</a> /
-            <a href="/admin/auth/user/">Users</a> /
+            <a href="/admin/">Admin</a> /
            <a href="{% url 'Docs' %}">Docs</a>
             &nbsp; &nbsp;
            {% block welcome-msg %}
@@ -157,15 +157,15 @@
        function fix_actions() {
            var container = $('div.actions');
 
-            if (container.find('option').length < 10) {
-                container.find('label, button').hide();
+            if (container.find('select[name=action] option').length < 10) {
+                container.find('label:nth-child(1), button[value=0]').hide();
 
                var buttons = $('<div></div>')
-                    .prependTo(container)
+                    .appendTo(container)
                    .css('display', 'inline')
                    .addClass('class', 'action-buttons');
 
-                container.find('option:gt(0)').reverse().each(function () {
+                container.find('select[name=action] option:gt(0)').reverse().each(function () {
                    const name = this.value
                    $('<button>')
                        .appendTo(buttons)

+ 1 - 1
archivebox/templates/core/add.html

@@ -15,7 +15,7 @@
 {% endblock %}
 
 {% block body %}
-    <div style="max-width: 550px; margin: auto; float: none">
+    <div style="max-width: 1440px; margin: auto; float: none">
        <br/><br/>
        {% if stdout %}
            <h1>Add new URLs to your archive: results</h1>

+ 2 - 2
archivebox/templates/core/base.html

@@ -38,7 +38,7 @@
            <div id="header">
                <div id="branding">
                    <h1 id="site-name">
-                        <a href="{% url 'public-index' %}" class="header-archivebox" title="Last updated: {{updated}}">
+                        <a href="{% url 'public-index' %}" class="header-archivebox">
                            <img src="{% static 'archive.png' %}" alt="Logo" style="height: 30px"/>
                            ArchiveBox
                        </a>
@@ -70,7 +70,7 @@
                    <center>
                        <small>
                            Archive created using <a href="https://github.com/ArchiveBox/ArchiveBox" title="Github">ArchiveBox</a> version
-                            <a href="https://github.com/ArchiveBox/ArchiveBox/releases" title="Releases">v{{VERSION}}</a>.
+                            <a href="https://github.com/ArchiveBox/ArchiveBox/releases/tag/v{{VERSION}}" title="Releases">v{{VERSION}}</a>.
                            <br/><br/>
                            {{FOOTER_INFO}}
                        </small>

+ 2 - 2
archivebox/templates/core/index_row.html

@@ -10,7 +10,7 @@
        {% endif %}
 
        <a href="archive/{{link.timestamp}}/index.html" title="{{link.title|default:'Not yet archived...'}}">
-            <span data-title-for="{{link.url}}" data-archived="{{link.is_archived}}">{{link.title|default:'Loading...'}}</span>
+            <span data-title-for="{{link.url}}" data-archived="{{link.is_archived}}">{{link.title|default:'Loading...'|truncatechars:128}}</span>
            {% if link.tags_str %}
                <span class="tags" style="float: right; border-radius: 5px; background-color: #bfdfff; padding: 2px 5px; margin-left: 4px; margin-top: 1px;">
                    {% if link.tags_str != None %}
@@ -33,5 +33,5 @@
            {% endif %}
        </span>
    </td>
-   <td style="text-align:left"><a href="{{link.url}}">{{link.url}}</a></td>
+   <td style="text-align:left; word-wrap: anywhere;"><a href="{{link.url}}">{{link.url|truncatechars:128}}</a></td>
 </tr>

+ 1 - 1
archivebox/templates/core/minimal_index.html

@@ -4,7 +4,7 @@
        <title>Archived Sites</title>
        <meta charset="utf-8" name="viewport" content="width=device-width, initial-scale=1">
    </head>
-    <body data-status="{{status}}">
+    <body>
        <table id="table-bookmarks">
            <thead>
                <tr class="thead-tr">

+ 15 - 10
archivebox/templates/core/public_index.html

@@ -2,6 +2,11 @@
 {% load static %}
 
 {% block body %}
+    <style>
+        #table-bookmarks_info {
+            display: none;
+        }
+    </style>
    <div id="toolbar">
        <form id="changelist-search" action="{% url 'public-index' %}" method="get">
            <div>
@@ -21,7 +26,7 @@
        <thead>
            <tr>
                <th style="width: 100px;">Bookmarked</th>
-                <th style="width: 26vw;">Snapshot ({{object_list|length}})</th>
+                <th style="width: 26vw;">Snapshot ({{page_obj.paginator.count}})</th>
                <th style="width: 140px">Files</th>
                <th style="width: 16vw;whitespace:nowrap;overflow-x:hidden;">Original URL</th>
            </tr>
@@ -33,26 +38,26 @@
            </tbody>
        </table>
        <center>
+            <br/>
+            Showing {{ page_obj.start_index }}-{{ page_obj.end_index }} of {{ page_obj.paginator.count }} total
+            <br/>
            <span class="step-links">
                {% if page_obj.has_previous %}
-                    <a href="{% url 'public-index' %}?page=1">&laquo; first</a>
+                    <a href="{% url 'public-index' %}?page=1">&laquo; first</a> &nbsp;
                    <a href="{% url 'public-index' %}?page={{ page_obj.previous_page_number }}">previous</a>
+                    &nbsp;
                {% endif %}
        
                <span class="current">
-                    Page {{ page_obj.number }} of {{ page_obj.paginator.num_pages }}.
+                    Page {{ page_obj.number }} of {{ page_obj.paginator.num_pages }}
                </span>
-        
+            
                {% if page_obj.has_next %}
-                    <a href="{% url 'public-index' %}?page={{ page_obj.next_page_number }}">next </a>
+                    &nbsp;
+                    <a href="{% url 'public-index' %}?page={{ page_obj.next_page_number }}">next </a> &nbsp;
                    <a href="{% url 'public-index' %}?page={{ page_obj.paginator.num_pages }}">last &raquo;</a>
                {% endif %}
            </span>
-    
-            {% if page_obj.has_next %}
-                <a href="{% url 'public-index' %}?page={{ page_obj.next_page_number }}">next </a>
-                <a href="{% url 'public-index' %}?page={{ page_obj.paginator.num_pages }}">last &raquo;</a>
-            {% endif %}
        </span>
        <br>
    </center>

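For context, the page_obj values used above come from Django's standard Paginator, so the new header count reflects all pages rather than just the rows on the current one. A minimal sketch of where those values originate (the ordering and page size here are assumptions, not taken from this diff):

    from django.core.paginator import Paginator
    from core.models import Snapshot

    paginator = Paginator(Snapshot.objects.order_by('-added'), 40)  # 40 rows/page (assumed)
    page_obj = paginator.get_page(1)

    page_obj.paginator.count   # total snapshots across all pages
    page_obj.start_index()     # 1-based index of the first row on this page
    page_obj.end_index()       # index of the last row on this page
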
+ 44 - 18
archivebox/templates/core/snapshot.html

@@ -279,7 +279,7 @@
                    <div class="col-lg-8">
                        <img src="favicon.ico" onerror="this.style.opacity=0" alt="Favicon">
                        &nbsp;&nbsp;
-                        {{title}}
+                        {{title|safe}}
                        &nbsp;&nbsp;
                        <a href="#" class="header-toggle">▾</a>
                        <br/>
@@ -335,20 +335,21 @@
                    </div>
                    <div class="col-lg-4">
                        <div class="info-chunk">
-                            <h5>🗃 Files</h5>
+                            <h5>🗃 Snapshot ID: <a href="/admin/core/snapshot/{{snapshot_id}}/change/"><code style="color: rgba(255,255,255,0.6); font-weight: 200; font-size: 12px; background-color: #1a1a1a"><b>[{{timestamp}}]</b> <small>{{snapshot_id|truncatechars:24}}</small></code></a></h5>
                            <a href="index.json" title="JSON summary of archived link.">JSON</a> | 
                            <a href="warc/" title="Any WARC archives for the page">WARC</a> | 
                            <a href="media/" title="Audio, Video, and Subtitle files.">Media</a> | 
                            <a href="git/" title="Any git repos at the url">Git</a> | 
-                            <a href="favicon.ico" title="Any git repos at the url">Favicon</a> | 
-                            <a href="." title="Webserver-provided index of files directory.">See all...</a>
+                            <a href="/admin/core/snapshot/?id__startswith={{snapshot_id}}" title="Go to the Snapshot admin to update, overwrite, or delete this Snapshot">Actions</a> | 
+                            <a href="/admin/core/snapshot/{{snapshot_id}}/change/" title="Edit this snapshot in the Admin UI">Admin</a> | 
+                            <a href="." title="Webserver-provided index of files directory.">See all files...</a><br/>
                        </div>
                    </div>
                </div>
                <div class="row header-bottom-frames">
                    <div class="col-lg-2">
                        <div class="card selected-card">
-                            <iframe class="card-img-top" src="{{singlefile_path}}" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
+                            <iframe class="card-img-top" src="{{singlefile_path}}" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" scrolling="no"></iframe>
                            <div class="card-body">
                                <a href="{{singlefile_path}}" title="Open in new tab..." target="_blank" rel="noopener">
                                    <p class="card-text"><code>./singlefile.html</code></p>
@@ -381,7 +382,7 @@
                    </div>
                    <div class="col-lg-2">
                        <div class="card">
-                          <iframe class="card-img-top" src="{{archive_url}}" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
+                          <iframe class="card-img-top" src="{{archive_url}}" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" scrolling="no"></iframe>
                          <div class="card-body">
                                <a href="{{archive_url}}" title="Open in new tab..." target="_blank" rel="noopener">
                                    <p class="card-text"><code>./{{domain}}</code></p>
@@ -393,30 +394,30 @@
                    {% if SAVE_ARCHIVE_DOT_ORG %}
                    <div class="col-lg-2">
                        <div class="card">
-                            <iframe class="card-img-top" src="{{archive_org_path}}" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
+                            <iframe class="card-img-top" src="{{archive_org_path}}" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" scrolling="no"></iframe>
                            <div class="card-body">
                                <a href="{{archive_org_path}}" title="Open in new tab..." target="_blank" rel="noopener">
                                    <p class="card-text"><code>🌐 web.archive.org/web/...</code></p>
                                </a>
-                                <a href="{{archive_org_path}}" target="preview"><h4 class="card-title">Archive.Org</h4></a>
+                                <a href="{{archive_org_path}}" target="preview" id="archive_dot_org-btn"><h4 class="card-title">Archive.Org</h4></a>
                          </div>
                        </div>
                    </div>
                    {% endif %}
                    <div class="col-lg-2">
                        <div class="card">
-                            <iframe class="card-img-top" src="{{url}}" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
+                            <iframe class="card-img-top" src="{{url}}" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" scrolling="no"></iframe>
                            <div class="card-body">
                                <a href="{{url}}" title="Open in new tab..." target="_blank" rel="noopener">
                                    <p class="card-text"><code>🌐 {{domain}}</code></p>
                                </a>
-                                <a href="{{url}}" target="preview"><h4 class="card-title">Original</h4></a>
+                                <a href="{{url}}" target="preview" id="original-btn"><h4 class="card-title">Original</h4></a>
                          </div>
                        </div>
                    </div>
                    <div class="col-lg-2">
                        <div class="card">
-                            <iframe class="card-img-top" src="{{headers_path}}" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
+                            <iframe class="card-img-top" src="{{headers_path}}" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" scrolling="no"></iframe>
                            <div class="card-body">
                                <a href="{{headers_path}}" title="Open in new tab..." target="_blank" rel="noopener">
                                    <p class="card-text"><code>./headers.json</code></p>
@@ -427,7 +428,7 @@
                    </div>
                    <div class="col-lg-2">
                        <div class="card">
-                            <iframe class="card-img-top" src="{{dom_path}}" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
+                            <iframe class="card-img-top" src="{{dom_path}}" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" scrolling="no"></iframe>
                            <div class="card-body">
                                <a href="{{dom_path}}" title="Open in new tab..." target="_blank" rel="noopener">
                                    <p class="card-text"><code>./output.html</code></p>
@@ -438,7 +439,7 @@
                    </div>
                    <div class="col-lg-2">
                        <div class="card">
-                            <iframe class="card-img-top" src="{{readability_path}}" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
+                            <iframe class="card-img-top" src="{{readability_path}}" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" scrolling="no"></iframe>
                            <div class="card-body">
                                <a href="{{readability_path}}" title="Open in new tab..." target="_blank" rel="noopener">
                                    <p class="card-text"><code>./readability/content.html</code></p>
@@ -450,7 +451,7 @@
                    <br/>
                    <div class="col-lg-2">
                        <div class="card">
-                            <iframe class="card-img-top" src="{{mercury_path}}" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
+                            <iframe class="card-img-top" src="{{mercury_path}}" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" scrolling="no"></iframe>
                            <div class="card-body">
                                <a href="{{mercury_path}}" title="Open in new tab..." target="_blank" rel="noopener">
                                    <p class="card-text"><code>./mercury/content.html</code></p>
@@ -461,7 +462,7 @@
                    </div>
                    <div class="col-lg-2">
                        <div class="card">
-                            <iframe class="card-img-top" src="{{media_path}}" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
+                            <iframe class="card-img-top" src="{{media_path}}" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" scrolling="no"></iframe>
                            <div class="card-body">
                                <a href="{{media_path}}" title="Open in new tab..." target="_blank" rel="noopener">
                                    <p class="card-text"><code>./media/*.mp4</code></p>
@@ -472,7 +473,7 @@
                    </div>
                    <div class="col-lg-2">
                        <div class="card">
-                            <iframe class="card-img-top" src="{{git_path}}" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
+                            <iframe class="card-img-top" src="{{git_path}}" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" scrolling="no"></iframe>
                            <div class="card-body">
                                <a href="{{git_path}}" title="Open in new tab..." target="_blank" rel="noopener">
                                    <p class="card-text"><code>./git/*.git</code></p>
@@ -484,7 +485,7 @@
                </div>
            </div>
        </header>
-        <iframe sandbox="allow-same-origin allow-scripts allow-forms" class="full-page-iframe" src="{{singlefile_path}}" name="preview"></iframe>
+        <iframe sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" class="full-page-iframe" src="{{singlefile_path}}" name="preview"></iframe>
    
        <script
              src="https://code.jquery.com/jquery-3.2.1.slim.min.js"
@@ -493,6 +494,16 @@
        <script src="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0-alpha.6/js/bootstrap.min.js" integrity="sha384-vBWWzlZJ8ea9aCX4pEW3rVHjgjt7zpkNpZk+02D9phzyeVkE+jo0ieGizqPLForn" crossorigin="anonymous"></script>
 
        <script>
+            function getPreviewTypeFromPath(link) {
+                if (link.id == 'original-btn') {
+                    return 'original'
+                }
+                if (link.id == 'archive_dot_org-btn') {
+                    return 'archive_dot_org'
+                }
+                return link.pathname.split('/').filter(a => a.length).slice(-1)[0].toLowerCase()
+            }
+
            // show selected file in iframe when preview card is clicked
            jQuery('.card').on('click', function(e) {
                jQuery('.selected-card').removeClass('selected-card')
@@ -502,11 +513,26 @@
                if (e.currentTarget.href.endsWith('.pdf')) {
                    jQuery('.full-page-iframe')[0].removeAttribute('sandbox')
                } else {
-                    jQuery('.full-page-iframe')[0].sandbox = "allow-same-origin allow-scripts allow-forms"
+                    jQuery('.full-page-iframe')[0].sandbox = "allow-same-origin allow-scripts allow-forms allow-top-navigation-by-user-activation"
                }
+                window.location.hash = getPreviewTypeFromPath(e.currentTarget)
                return true
            })
 
+            // check URL for hash e.g. #git and load relevant preview
+            jQuery(document).ready(function() {
+                if (window.location.hash) {
+                    for (const link of jQuery('a[target=preview]')) {
+                        console.log(link.pathname)
+                        if (getPreviewTypeFromPath(link) == window.location.hash.slice(1).toLowerCase()) {
+                            jQuery(link).closest('.card').click()
+                            jQuery(link).click()
+                            link.click()
+                        }
+                    }
+                }
+            })
+
            // un-sandbox iframes showing pdfs (required to display pdf viewer)
            jQuery('iframe').map(function() {
                if (this.src.endsWith('.pdf')) {

+ 2 - 2
archivebox/templates/core/static_index.html

@@ -209,7 +209,7 @@
            <div class="header-top container-fluid">
                <div class="row nav">
                    <div class="col-sm-2">
-                        <a href="/" class="header-archivebox" title="Last updated: {{updated}}">
+                        <a href="/" class="header-archivebox">
                            <img src="{% static 'archive.png' %}" alt="Logo"/>
                            ArchiveBox: Index
                        </a>
@@ -243,7 +243,7 @@
            <center>
                <small>
                    Archive created using <a href="https://github.com/ArchiveBox/ArchiveBox" title="Github">ArchiveBox</a>
-                    version <a href="https://github.com/ArchiveBox/ArchiveBox/tree/v{{version}}" title="Git commit">v{{version}}</a> &nbsp; | &nbsp; 
+                    version <a href="https://github.com/ArchiveBox/ArchiveBox/releases/tag/v{{version}}" title="View source code and release info">v{{version}}</a> &nbsp; | &nbsp; 
                    Download index as <a href="index.json" title="JSON summary of archived links.">JSON</a>
                    <br/><br/>
                    {{FOOTER_INFO}}

+ 1 - 1
archivebox/templates/static/add.css

@@ -42,7 +42,7 @@ header {
  background-color: #f5dd5d;
 }
 #stdout {
-  background-color: #ded;
+  background-color: #fbfbfb;
  padding: 10px 10px;
  border-radius: 4px;
  white-space: normal;

+ 37 - 0
archivebox/templates/static/admin.css

@@ -237,3 +237,40 @@ body.model-snapshot.change-list #content .object-tools {
    opacity: 0.1;
    filter: grayscale(100%);
 }
+
+
+#result_list tbody td.field-cmd_str pre,
+#result_list tbody td.field-output_str pre {
+    max-width: 22vw;
+    word-wrap: anywhere;
+    white-space: break-spaces;
+    max-height: 40px;
+    overflow: hidden;
+    margin: 2px;
+    background-color: rgba(0,0,0,0.05);
+    padding: 1px 4px 16px 8px;
+    border-radius: 4px;
+}
+
+#result_list tbody td.field-extractor {
+    font-weight: 800;
+    font-variant: small-caps;
+}
+
+#result_list tbody td.field-status {
+    font-variant: small-caps;
+}
+
+.inline-group .tabular td.original p {
+    margin-top: -33px;
+}
+
+tbody .output-link {
+    float: right;
+    margin-bottom: -25px;
+    margin-right: -3px;
+    margin-top: -4px;
+    opacity: 0.4;
+    box-shadow:   4px 4px 4px rgba(0,0,0,0.1);
+}
+tbody .output-link:hover {opacity: 1;}

BIN
archivebox/templates/static/favicon.ico


+ 2 - 0
archivebox/templates/static/robots.txt

@@ -0,0 +1,2 @@
+User-agent: *
+Disallow: /

+ 3 - 1
archivebox/util.py

@@ -56,11 +56,13 @@ ts_to_iso = lambda ts: ts and parse_date(ts).isoformat()
 
 
 URL_REGEX = re.compile(
+    r'(?=('
    r'http[s]?://'                    # start matching from allowed schemes
    r'(?:[a-zA-Z]|[0-9]'              # followed by allowed alphanum characters
    r'|[$-_@.&+]|[!*\(\),]'           #    or allowed symbols
    r'|(?:%[0-9a-fA-F][0-9a-fA-F]))'  #    or allowed unicode bytes
-    r'[^\]\[\(\)<>"\'\s]+',         # stop parsing at these symbols
+    r'[^\]\[\(\)<>"\'\s]+'          # stop parsing at these symbols
+    r'))',
    re.IGNORECASE,
 )
 

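Wrapping the whole pattern in (?=(...)) makes each match zero-width while the capture group still returns the URL, so re.findall() can report overlapping matches instead of consuming them. A simplified before/after comparison (the patterns here are abbreviated for readability, not the full expressions above):

    import re

    OLD = re.compile(r'http[s]?://[^\]\[\(\)<>"\'\s]+', re.IGNORECASE)
    NEW = re.compile(r'(?=(http[s]?://[^\]\[\(\)<>"\'\s]+))', re.IGNORECASE)

    s = '<or>http://examplehttp://15.badc</that>'
    re.findall(OLD, s)  # ['http://examplehttp://15.badc']  (2nd URL swallowed)
    re.findall(NEW, s)  # ['http://examplehttp://15.badc', 'http://15.badc']
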
+ 1 - 0
archivebox/vendor/atomicwrites.py

@@ -0,0 +1 @@
+python-atomicwrites/atomicwrites/__init__.py

+ 1 - 0
archivebox/vendor/python-atomicwrites

@@ -0,0 +1 @@
+Subproject commit c35cd32eb364d5a4210e64bf38fd1a55f329f316

+ 5 - 3
bin/docker_entrypoint.sh

@@ -3,6 +3,7 @@
 DATA_DIR="${DATA_DIR:-/data}"
 ARCHIVEBOX_USER="${ARCHIVEBOX_USER:-archivebox}"
 
+
 # Set the archivebox user UID & GID
 if [[ -n "$PUID" && "$PUID" != 0 ]]; then
    usermod -u "$PUID" "$ARCHIVEBOX_USER" > /dev/null 2>&1
@@ -11,6 +12,7 @@ if [[ -n "$PGID" && "$PGID" != 0 ]]; then
    groupmod -g "$PGID" "$ARCHIVEBOX_USER" > /dev/null 2>&1
 fi
 
+
 # Set the permissions of the data dir to match the archivebox user
 if [[ -d "$DATA_DIR/archive" ]]; then
    # check data directory permissions
@@ -21,7 +23,7 @@ if [[ -d "$DATA_DIR/archive" ]]; then
    fi
 else
    # create data directory
-    mkdir -p "$DATA_DIR"
+    mkdir -p "$DATA_DIR/logs"
    chown -R $ARCHIVEBOX_USER:$ARCHIVEBOX_USER "$DATA_DIR"
 fi
 chown $ARCHIVEBOX_USER:$ARCHIVEBOX_USER "$DATA_DIR"
@@ -33,11 +35,11 @@ if [[ "$1" == /* || "$1" == "echo" || "$1" == "archivebox" ]]; then
    # e.g. "archivebox init"
    #      "/bin/bash"
    #      "echo"
-    gosu "$ARCHIVEBOX_USER" bash -c "$*"
+    exec gosu "$ARCHIVEBOX_USER" bash -c "$*"
 else
    # no command given, assume args were meant to be passed to archivebox cmd
    # e.g. "add https://example.com"
    #      "manage createsuperuser"
    #      "server 0.0.0.0:8000"
-    gosu "$ARCHIVEBOX_USER" bash -c "archivebox $*"
+    exec gosu "$ARCHIVEBOX_USER" bash -c "archivebox $*"
 fi

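Note on the exec change above: exec replaces the wrapper shell with the gosu child process, so signals from the container runtime (e.g. SIGTERM on docker stop) reach archivebox directly and it can shut down cleanly instead of being killed after the grace-period timeout.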
+ 23 - 20
docker-compose.yml

@@ -11,36 +11,39 @@ version: '3.7'
 
 services:
    archivebox:
-        # build: .
+        # build: .                              # for developers working on archivebox
        image: ${DOCKER_IMAGE:-archivebox/archivebox:latest} 
-        command: server 0.0.0.0:8000
+        command: server --quick-init 0.0.0.0:8000
        stdin_open: true
        tty: true
        ports:
            - 8000:8000
        environment:
-            - USE_COLOR=True
-            - SHOW_PROGRESS=False
-            - SEARCH_BACKEND_ENGINE=sonic
-            - SEARCH_BACKEND_HOST_NAME=sonic
-            - SEARCH_BACKEND_PASSWORD=SecretPassword
+            - ALLOWED_HOSTS=*                   # add any config options you want as env vars
+            - MEDIA_MAX_SIZE=750m
+            # - SHOW_PROGRESS=False
+            # - SEARCH_BACKEND_ENGINE=sonic     # uncomment these if you enable sonic below
+            # - SEARCH_BACKEND_HOST_NAME=sonic
+            # - SEARCH_BACKEND_PASSWORD=SecretPassword
        volumes:
            - ./data:/data
-        depends_on:
-            - sonic
+            # - ./archivebox:/app/archivebox    # for developers working on archivebox
    
-    # Run sonic search backend
-    sonic:
-        image: valeriansaliou/sonic:v1.3.0    
-        ports:
-            - 1491:1491
-        environment:
-            - SEARCH_BACKEND_PASSWORD=SecretPassword
-        volumes:
-            - ./etc/sonic/config.cfg:/etc/sonic.cfg
-            - ./data:/var/lib/sonic/store/
+    # To run the Sonic full-text search backend, create an ./etc/sonic folder
+    # and download the sonic config file from here into that folder:
+    # https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/master/etc/sonic/config.cfg
+    # sonic:
+    #     image: valeriansaliou/sonic:v1.3.0    
+    #     expose:
+    #         - 1491
+    #     environment:
+    #         - SEARCH_BACKEND_PASSWORD=SecretPassword
+    #     volumes:
+    #         - ./etc/sonic/config.cfg:/etc/sonic.cfg
+    #         - ./data/sonic:/var/lib/sonic/store
+
 
-    # Optional Addons: tweak these examples as needed for your specific use case
+    ### Optional Addons: tweak these examples as needed for your specific use case
 
    # Example: Run scheduled imports in a docker instead of using cron on the
    # host machine, add tasks and see more info with archivebox schedule --help

+ 142 - 116
package-lock.json

@@ -1,6 +1,6 @@
 {
 	"name": "archivebox",
-	"version": "0.4.21",
+	"version": "0.6.0",
 	"lockfileVersion": 1,
 	"requires": true,
 	"dependencies": {
@@ -14,9 +14,9 @@
 			}
 		},
 		"@mozilla/readability": {
-			"version": "0.3.0",
-			"resolved": "https://registry.npmjs.org/@mozilla/readability/-/readability-0.3.0.tgz",
-			"integrity": "sha512-q8f1CAZsRKK1j+O0BmikGIlKSK03RpT4woT0PCQwhw0nH0z4+rG026AkxoPcjT7Dsgh1ifGscW8tOpvjoyOjvw=="
+			"version": "0.4.1",
+			"resolved": "https://registry.npmjs.org/@mozilla/readability/-/readability-0.4.1.tgz",
+			"integrity": "sha512-yar/f0w0fRUVM895s6yd5Z2oIxjG/6c3ROB/uQboSOBaDlri/nqI4aKtdqrldWciTLcdpjB2Z6MiVF2Bl9b8LA=="
 		},
 		"@postlight/ci-failed-test-reporter": {
 			"version": "1.0.26",
@@ -116,15 +116,10 @@
 				"safe-buffer": "^5.0.1"
 			}
 		},
-		"@types/color-name": {
-			"version": "1.1.1",
-			"resolved": "https://registry.npmjs.org/@types/color-name/-/color-name-1.1.1.tgz",
-			"integrity": "sha512-rr+OQyAjxze7GgWrSaJwydHStIhHq2lvY3BOC2Mj7KnzI7XK0Uw1TOOdI9lDoajEbSWLiYgoo4f1R51erQfhPQ=="
-		},
 		"@types/node": {
-			"version": "14.11.2",
-			"resolved": "https://registry.npmjs.org/@types/node/-/node-14.11.2.tgz",
-			"integrity": "sha512-jiE3QIxJ8JLNcb1Ps6rDbysDhN4xa8DJJvuC9prr6w+1tIh+QAbYyNF3tyiZNLDBIuBCf4KEcV2UvQm/V60xfA==",
+			"version": "14.14.37",
+			"resolved": "https://registry.npmjs.org/@types/node/-/node-14.14.37.tgz",
+			"integrity": "sha512-XYmBiy+ohOR4Lh5jE379fV2IU+6Jn4g5qASinhitfyO71b/sCo6MKsMLF5tc7Zf2CE8hViVQyYSobJNke8OvUw==",
 			"optional": true
 		},
 		"@types/yauzl": {
@@ -142,9 +137,9 @@
 			"integrity": "sha512-Eu9ELJWCz/c1e9gTiCY+FceWxcqzjYEbqMgtndnuSqZSUCOL73TWNK2mHfIj4Cw2E/ongOp+JISVNCmovt2KYQ=="
 		},
 		"acorn": {
-			"version": "7.4.0",
-			"resolved": "https://registry.npmjs.org/acorn/-/acorn-7.4.0.tgz",
-			"integrity": "sha512-+G7P8jJmCHr+S+cLfQxygbWhXy+8YTVGzAkpEbcLo2mLoL7tij/VG41QSHACSf5QgYRhMZYHuNc6drJaO0Da+w=="
+			"version": "8.1.0",
+			"resolved": "https://registry.npmjs.org/acorn/-/acorn-8.1.0.tgz",
+			"integrity": "sha512-LWCF/Wn0nfHOmJ9rzQApGnxnvgfROzGilS8936rqN/lfcYkY9MYZzdMqN+2NJ4SlTc+m5HiSa+kNfDtI64dwUA=="
 		},
 		"acorn-globals": {
 			"version": "6.0.0",
@@ -153,6 +148,13 @@
 			"requires": {
 				"acorn": "^7.1.1",
 				"acorn-walk": "^7.1.1"
+			},
+			"dependencies": {
+				"acorn": {
+					"version": "7.4.1",
+					"resolved": "https://registry.npmjs.org/acorn/-/acorn-7.4.1.tgz",
+					"integrity": "sha512-nQyp0o1/mNdbTO1PO6kHkwSrmgZ0MT/jCCpNiwbUjGoRN4dlBhqJtoQuCnEOKzgTVwg0ZWiCoQy6SxMebQVh8A=="
+				}
 			}
 		},
 		"acorn-walk": {
@@ -182,11 +184,10 @@
 			"integrity": "sha512-bY6fj56OUQ0hU1KjFNDQuJFezqKdrAyFdIevADiqrWHwSlbmBNMHp5ak2f40Pm8JTFyM2mqxkG6ngkHO11f/lg=="
 			"integrity": "sha512-bY6fj56OUQ0hU1KjFNDQuJFezqKdrAyFdIevADiqrWHwSlbmBNMHp5ak2f40Pm8JTFyM2mqxkG6ngkHO11f/lg=="
 		},
 		},
 		"ansi-styles": {
 		"ansi-styles": {
-			"version": "4.2.1",
-			"resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.2.1.tgz",
-			"integrity": "sha512-9VGjrMsG1vePxcSweQsN20KY/c4zN0h9fLjqAbwbPfahM3t+NL+M9HC8xeXG2I8pX5NoamTGNuomEUFI7fcUjA==",
+			"version": "4.3.0",
+			"resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz",
+			"integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==",
 			"requires": {
 			"requires": {
-				"@types/color-name": "^1.1.1",
 				"color-convert": "^2.0.1"
 				"color-convert": "^2.0.1"
 			}
 			}
 		},
 		},
@@ -247,9 +248,9 @@
 			}
 		},
 		"bl": {
-			"version": "4.0.3",
-			"resolved": "https://registry.npmjs.org/bl/-/bl-4.0.3.tgz",
-			"integrity": "sha512-fs4G6/Hu4/EE+F75J8DuN/0IpQqNjAdC7aEQv7Qt8MHGUH7Ckv2MwTEEeN9QehD0pfIDkMI1bkHYkKy7xHyKIg==",
+			"version": "4.1.0",
+			"resolved": "https://registry.npmjs.org/bl/-/bl-4.1.0.tgz",
+			"integrity": "sha512-1W07cM9gS6DcLperZfFSj+bWLtaPGSOHWhPiGzXmvVJbRLdG82sH/Kn8EtW1VqWVA54AKf2h5k5BbnIbwF3h6w==",
 			"requires": {
 			"requires": {
 				"buffer": "^5.5.0",
 				"buffer": "^5.5.0",
 				"inherits": "^2.0.4",
 				"inherits": "^2.0.4",
@@ -296,12 +297,12 @@
 			}
 		},
 		"buffer": {
-			"version": "5.6.0",
-			"resolved": "https://registry.npmjs.org/buffer/-/buffer-5.6.0.tgz",
-			"integrity": "sha512-/gDYp/UtU0eA1ys8bOs9J6a+E/KWIY+DZ+Q2WESNUA0jFRsJOc0SNUO6xJ5SGA1xueg3NL65W6s+NY5l9cunuw==",
+			"version": "5.7.1",
+			"resolved": "https://registry.npmjs.org/buffer/-/buffer-5.7.1.tgz",
+			"integrity": "sha512-EHcyIPBQ4BSGlvjB16k5KgAJ27CIsHY/2JBmCRReo48y9rQ3MaUzWX3KVlBa4U7MyX02HdVj0K7C3WaB3ju7FQ==",
 			"requires": {
 			"requires": {
-				"base64-js": "^1.0.2",
-				"ieee754": "^1.1.4"
+				"base64-js": "^1.3.1",
+				"ieee754": "^1.1.13"
 			}
 		},
 		"buffer-crc32": {
@@ -348,9 +349,9 @@
 			"integrity": "sha512-jJ0bqzaylmJtVnNgzTeSOs8DPavpbYgEr/b0YL8/2GO3xJEhInFmhKMUnEJQjZumK7KXGFhUy89PrsJWlakBVg=="
 			"integrity": "sha512-jJ0bqzaylmJtVnNgzTeSOs8DPavpbYgEr/b0YL8/2GO3xJEhInFmhKMUnEJQjZumK7KXGFhUy89PrsJWlakBVg=="
 		},
 		},
 		"cliui": {
 		"cliui": {
-			"version": "7.0.1",
-			"resolved": "https://registry.npmjs.org/cliui/-/cliui-7.0.1.tgz",
-			"integrity": "sha512-rcvHOWyGyid6I1WjT/3NatKj2kDt9OdSHSXpyLXaMWFbKpGACNW8pRhhdPUq9MWUOdwn8Rz9AVETjF4105rZZQ==",
+			"version": "7.0.4",
+			"resolved": "https://registry.npmjs.org/cliui/-/cliui-7.0.4.tgz",
+			"integrity": "sha512-OcRE68cOsVMXp1Yvonl/fzkQOyjLSu/8bhPDfQt0e0/Eb283TKP20Fs2MqoPsr9SwA595rRCA+QMzYc9nBP+JQ==",
 			"requires": {
 			"requires": {
 				"string-width": "^4.2.0",
 				"string-width": "^4.2.0",
 				"strip-ansi": "^6.0.0",
 				"strip-ansi": "^6.0.0",
@@ -448,9 +449,9 @@
 			}
 		},
 		"debug": {
-			"version": "4.2.0",
-			"resolved": "https://registry.npmjs.org/debug/-/debug-4.2.0.tgz",
-			"integrity": "sha512-IX2ncY78vDTjZMFUdmsvIRFY2Cf4FnD0wRs+nQwJU8Lu99/tPFdb0VybiiMTPe3I6rQmwsqQqRBvxU+bZ/I8sg==",
+			"version": "4.3.1",
+			"resolved": "https://registry.npmjs.org/debug/-/debug-4.3.1.tgz",
+			"integrity": "sha512-doEwdvm4PCeK4K3RQN2ZC2BYUBaxwLARCqZmMjtF8a51J2Rb0xpVloFRnCODwqjpwnAoao4pelN8l3RJdv3gRQ==",
 			"requires": {
 			"requires": {
 				"ms": "2.1.2"
 				"ms": "2.1.2"
 			}
 			}
@@ -461,9 +462,9 @@
 			"integrity": "sha1-9lNNFRSCabIDUue+4m9QH5oZEpA="
 			"integrity": "sha1-9lNNFRSCabIDUue+4m9QH5oZEpA="
 		},
 		},
 		"decimal.js": {
 		"decimal.js": {
-			"version": "10.2.0",
-			"resolved": "https://registry.npmjs.org/decimal.js/-/decimal.js-10.2.0.tgz",
-			"integrity": "sha512-vDPw+rDgn3bZe1+F/pyEwb1oMG2XTlRVgAa6B4KccTEpYgF8w6eQllVbQcfIJnZyvzFtFpxnpGtx8dd7DJp/Rw=="
+			"version": "10.2.1",
+			"resolved": "https://registry.npmjs.org/decimal.js/-/decimal.js-10.2.1.tgz",
+			"integrity": "sha512-KaL7+6Fw6i5A2XSnsbhm/6B+NuEA7TZ4vqxnd5tXz9sbKtrN9Srj8ab4vKVdK8YAqZO9P1kg45Y6YLoduPf+kw=="
 		},
 		"deep-is": {
 			"version": "0.1.3",
@@ -476,9 +477,9 @@
 			"integrity": "sha1-3zrhmayt+31ECqrgsp4icrJOxhk="
 			"integrity": "sha1-3zrhmayt+31ECqrgsp4icrJOxhk="
 		},
 		},
 		"devtools-protocol": {
 		"devtools-protocol": {
-			"version": "0.0.799653",
-			"resolved": "https://registry.npmjs.org/devtools-protocol/-/devtools-protocol-0.0.799653.tgz",
-			"integrity": "sha512-t1CcaZbvm8pOlikqrsIM9GOa7Ipp07+4h/q9u0JXBWjPCjHdBl9KkddX87Vv9vBHoBGtwV79sYQNGnQM6iS5gg=="
+			"version": "0.0.818844",
+			"resolved": "https://registry.npmjs.org/devtools-protocol/-/devtools-protocol-0.0.818844.tgz",
+			"integrity": "sha512-AD1hi7iVJ8OD0aMLQU5VK0XH9LDlA1+BcPIgrAxPfaibx2DbWucuyOhc4oyQCbnvDDO68nN6/LcKfqTP343Jjg=="
 		},
 		"difflib": {
 			"version": "github:postlight/difflib.js#32e8e38c7fcd935241b9baab71bb432fd9b166ed",
@@ -525,9 +526,9 @@
 			}
 		},
 		"dompurify": {
-			"version": "2.1.0",
-			"resolved": "https://registry.npmjs.org/dompurify/-/dompurify-2.1.0.tgz",
-			"integrity": "sha512-wKExRhOwUnfm1icoISSXnlmM1P2l07W2tFQqbU+8oySnvy7tHwj2iHJ1kJQi8EfcTlojsHKESOJwCGVJmNUdPQ=="
+			"version": "2.2.7",
+			"resolved": "https://registry.npmjs.org/dompurify/-/dompurify-2.2.7.tgz",
+			"integrity": "sha512-jdtDffdGNY+C76jvodNTu9jt5yYj59vuTUyx+wXdzcSwAGTYZDAQkQ7Iwx9zcGrA4ixC1syU4H3RZROqRxokxg=="
 		},
 		"domutils": {
 			"version": "1.5.1",
@@ -576,9 +577,9 @@
 			"integrity": "sha512-f2LZMYl1Fzu7YSBKg+RoROelpOaNrcGmE9AZubeDfrCEia483oW4MI4VyFd5VNHIgQ/7qm1I0wUHK1eJnn2y2w=="
 			"integrity": "sha512-f2LZMYl1Fzu7YSBKg+RoROelpOaNrcGmE9AZubeDfrCEia483oW4MI4VyFd5VNHIgQ/7qm1I0wUHK1eJnn2y2w=="
 		},
 		},
 		"escalade": {
 		"escalade": {
-			"version": "3.1.0",
-			"resolved": "https://registry.npmjs.org/escalade/-/escalade-3.1.0.tgz",
-			"integrity": "sha512-mAk+hPSO8fLDkhV7V0dXazH5pDc6MrjBTPyD3VeKzxnVFjH1MIxbCdqGZB9O8+EwWakZs3ZCbDS4IpRt79V1ig=="
+			"version": "3.1.1",
+			"resolved": "https://registry.npmjs.org/escalade/-/escalade-3.1.1.tgz",
+			"integrity": "sha512-k0er2gUkLf8O0zKJiAhmkTnJlTvINGv7ygDNPbeIsX/TJjGJZHuh9B2UxbsaEkmlEo9MfhrSzmhIlhRlI2GXnw=="
 		},
 		"escodegen": {
 			"version": "1.14.3",
@@ -800,9 +801,9 @@
 			}
 		},
 		"ieee754": {
-			"version": "1.1.13",
-			"resolved": "https://registry.npmjs.org/ieee754/-/ieee754-1.1.13.tgz",
-			"integrity": "sha512-4vf7I2LYV/HaWerSo3XmlMkp5eZ83i+/CDluXi/IGTs/O1sejBNhTtnxzmRZfvOUqj7lZjqHkeTvpgSFDlWZTg=="
+			"version": "1.2.1",
+			"resolved": "https://registry.npmjs.org/ieee754/-/ieee754-1.2.1.tgz",
+			"integrity": "sha512-dcyqhDvX1C46lXZcVqCpK+FtMRQVdIMN6/Df5js2zouUsqG7I6sFxitIC+7KYK29KdXOLHdu9zL4sFnoVQnqaA=="
 		},
 		"immediate": {
 			"version": "3.0.6",
@@ -823,11 +824,6 @@
 			"resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.4.tgz",
 			"resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.4.tgz",
 			"integrity": "sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ=="
 			"integrity": "sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ=="
 		},
 		},
-		"ip-regex": {
-			"version": "2.1.0",
-			"resolved": "https://registry.npmjs.org/ip-regex/-/ip-regex-2.1.0.tgz",
-			"integrity": "sha1-+ni/XS5pE8kRzp+BnuUUa7bYROk="
-		},
 		"is-fullwidth-code-point": {
 		"is-fullwidth-code-point": {
 			"version": "3.0.0",
 			"version": "3.0.0",
 			"resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-3.0.0.tgz",
 			"resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-3.0.0.tgz",
@@ -859,36 +855,60 @@
 			"integrity": "sha1-peZUwuWi3rXyAdls77yoDA7y9RM="
 			"integrity": "sha1-peZUwuWi3rXyAdls77yoDA7y9RM="
 		},
 		},
 		"jsdom": {
 		"jsdom": {
-			"version": "16.4.0",
-			"resolved": "https://registry.npmjs.org/jsdom/-/jsdom-16.4.0.tgz",
-			"integrity": "sha512-lYMm3wYdgPhrl7pDcRmvzPhhrGVBeVhPIqeHjzeiHN3DFmD1RBpbExbi8vU7BJdH8VAZYovR8DMt0PNNDM7k8w==",
+			"version": "16.5.2",
+			"resolved": "https://registry.npmjs.org/jsdom/-/jsdom-16.5.2.tgz",
+			"integrity": "sha512-JxNtPt9C1ut85boCbJmffaQ06NBnzkQY/MWO3YxPW8IWS38A26z+B1oBvA9LwKrytewdfymnhi4UNH3/RAgZrg==",
 			"requires": {
 			"requires": {
-				"abab": "^2.0.3",
-				"acorn": "^7.1.1",
+				"abab": "^2.0.5",
+				"acorn": "^8.1.0",
 				"acorn-globals": "^6.0.0",
 				"acorn-globals": "^6.0.0",
 				"cssom": "^0.4.4",
 				"cssom": "^0.4.4",
-				"cssstyle": "^2.2.0",
+				"cssstyle": "^2.3.0",
 				"data-urls": "^2.0.0",
 				"data-urls": "^2.0.0",
-				"decimal.js": "^10.2.0",
+				"decimal.js": "^10.2.1",
 				"domexception": "^2.0.1",
 				"domexception": "^2.0.1",
-				"escodegen": "^1.14.1",
+				"escodegen": "^2.0.0",
 				"html-encoding-sniffer": "^2.0.1",
 				"html-encoding-sniffer": "^2.0.1",
 				"is-potential-custom-element-name": "^1.0.0",
 				"is-potential-custom-element-name": "^1.0.0",
 				"nwsapi": "^2.2.0",
 				"nwsapi": "^2.2.0",
-				"parse5": "5.1.1",
+				"parse5": "6.0.1",
 				"request": "^2.88.2",
 				"request": "^2.88.2",
-				"request-promise-native": "^1.0.8",
-				"saxes": "^5.0.0",
+				"request-promise-native": "^1.0.9",
+				"saxes": "^5.0.1",
 				"symbol-tree": "^3.2.4",
 				"symbol-tree": "^3.2.4",
-				"tough-cookie": "^3.0.1",
+				"tough-cookie": "^4.0.0",
 				"w3c-hr-time": "^1.0.2",
 				"w3c-hr-time": "^1.0.2",
 				"w3c-xmlserializer": "^2.0.0",
 				"w3c-xmlserializer": "^2.0.0",
 				"webidl-conversions": "^6.1.0",
 				"webidl-conversions": "^6.1.0",
 				"whatwg-encoding": "^1.0.5",
 				"whatwg-encoding": "^1.0.5",
 				"whatwg-mimetype": "^2.3.0",
 				"whatwg-mimetype": "^2.3.0",
-				"whatwg-url": "^8.0.0",
-				"ws": "^7.2.3",
+				"whatwg-url": "^8.5.0",
+				"ws": "^7.4.4",
 				"xml-name-validator": "^3.0.0"
 				"xml-name-validator": "^3.0.0"
+			},
+			"dependencies": {
+				"abab": {
+					"version": "2.0.5",
+					"resolved": "https://registry.npmjs.org/abab/-/abab-2.0.5.tgz",
+					"integrity": "sha512-9IK9EadsbHo6jLWIpxpR6pL0sazTXV6+SQv25ZB+F7Bj9mJNaOc4nCRabwd5M/JwmUa8idz6Eci6eKfJryPs6Q=="
+				},
+				"escodegen": {
+					"version": "2.0.0",
+					"resolved": "https://registry.npmjs.org/escodegen/-/escodegen-2.0.0.tgz",
+					"integrity": "sha512-mmHKys/C8BFUGI+MAWNcSYoORYLMdPzjrknd2Vc+bUsjN5bXcr8EhrNB+UTqfL1y3I9c4fw2ihgtMPQLBRiQxw==",
+					"requires": {
+						"esprima": "^4.0.1",
+						"estraverse": "^5.2.0",
+						"esutils": "^2.0.2",
+						"optionator": "^0.8.1",
+						"source-map": "~0.6.1"
+					}
+				},
+				"estraverse": {
+					"version": "5.2.0",
+					"resolved": "https://registry.npmjs.org/estraverse/-/estraverse-5.2.0.tgz",
+					"integrity": "sha512-BxbNGGNm0RyRYvUdHpIwv9IWzeM9XClbOxwoATuFdOE7ZE6wHL+HQ5T8hoPM+zHvmKzzsEqhgy0GrQ5X13afiQ=="
+				}
 			}
 		},
 		"json-schema": {
@@ -918,9 +938,9 @@
 			}
 		},
 		"jszip": {
-			"version": "3.5.0",
-			"resolved": "https://registry.npmjs.org/jszip/-/jszip-3.5.0.tgz",
-			"integrity": "sha512-WRtu7TPCmYePR1nazfrtuF216cIVon/3GWOvHS9QR5bIwSbnxtdpma6un3jyGGNhHsKCSzn5Ypk+EkDRvTGiFA==",
+			"version": "3.6.0",
+			"resolved": "https://registry.npmjs.org/jszip/-/jszip-3.6.0.tgz",
+			"integrity": "sha512-jgnQoG9LKnWO3mnVNBnfhkh0QknICd1FGSrXcgrl67zioyJ4wgx25o9ZqwNtrROSflGBCGYnJfjrIyRIby1OoQ==",
 			"requires": {
 			"requires": {
 				"lie": "~3.3.0",
 				"lie": "~3.3.0",
 				"pako": "~1.0.2",
 				"pako": "~1.0.2",
@@ -1174,9 +1194,9 @@
 			"integrity": "sha512-4hLB8Py4zZce5s4yd9XzopqwVv/yGNhV1Bl8NTmCq1763HeK2+EwVTv+leGeL13Dnh2wfbqowVPXCIO0z4taYw=="
 			"integrity": "sha512-4hLB8Py4zZce5s4yd9XzopqwVv/yGNhV1Bl8NTmCq1763HeK2+EwVTv+leGeL13Dnh2wfbqowVPXCIO0z4taYw=="
 		},
 		},
 		"parse5": {
 		"parse5": {
-			"version": "5.1.1",
-			"resolved": "https://registry.npmjs.org/parse5/-/parse5-5.1.1.tgz",
-			"integrity": "sha512-ugq4DFI0Ptb+WWjAdOK16+u/nHfiIrcE+sh8kZMaM0WllQKLI9rOUq6c2b7cwPkXdzfQESqvoqK6ug7U/Yyzug=="
+			"version": "6.0.1",
+			"resolved": "https://registry.npmjs.org/parse5/-/parse5-6.0.1.tgz",
+			"integrity": "sha512-Ofn/CTFzRGTTxwpNEs9PP93gXShHcTq255nzRYSKe8AkVpZY7e1fpmTfOyoIvjP5HG7Z2ZM7VS9PPhQGW2pOpw=="
 		},
 		"path-exists": {
 			"version": "4.0.0",
@@ -1301,14 +1321,15 @@
 			"integrity": "sha512-XRsRjdf+j5ml+y/6GKHPZbrF/8p2Yga0JPtdqTIY2Xe5ohJPD9saDJJLPvp9+NSBprVvevdXZybnj2cv8OEd0A=="
 			"integrity": "sha512-XRsRjdf+j5ml+y/6GKHPZbrF/8p2Yga0JPtdqTIY2Xe5ohJPD9saDJJLPvp9+NSBprVvevdXZybnj2cv8OEd0A=="
 		},
 		},
 		"puppeteer-core": {
 		"puppeteer-core": {
-			"version": "5.3.1",
-			"resolved": "https://registry.npmjs.org/puppeteer-core/-/puppeteer-core-5.3.1.tgz",
-			"integrity": "sha512-YE6c6FvHAFKQUyNTqFs78SgGmpcqOPhhmVfEVNYB4abv7bV2V+B3r72T3e7vlJkEeTloy4x9bQLrGbHHoKSg1w==",
+			"version": "5.5.0",
+			"resolved": "https://registry.npmjs.org/puppeteer-core/-/puppeteer-core-5.5.0.tgz",
+			"integrity": "sha512-tlA+1n+ziW/Db03hVV+bAecDKse8ihFRXYiEypBe9IlLRvOCzYFG6qrCMBYK34HO/Q/Ecjc+tvkHRAfLVH+NgQ==",
 			"requires": {
 			"requires": {
 				"debug": "^4.1.0",
 				"debug": "^4.1.0",
-				"devtools-protocol": "0.0.799653",
+				"devtools-protocol": "0.0.818844",
 				"extract-zip": "^2.0.0",
 				"extract-zip": "^2.0.0",
 				"https-proxy-agent": "^4.0.0",
 				"https-proxy-agent": "^4.0.0",
+				"node-fetch": "^2.6.1",
 				"pkg-dir": "^4.2.0",
 				"pkg-dir": "^4.2.0",
 				"progress": "^2.0.1",
 				"progress": "^2.0.1",
 				"proxy-from-env": "^1.0.0",
 				"proxy-from-env": "^1.0.0",
@@ -1332,9 +1353,9 @@
 			"version": "git+https://github.com/pirate/readability-extractor.git#0098f142b0a015c8c90766d3b74d9eb6fb7b7e6a",
 			"version": "git+https://github.com/pirate/readability-extractor.git#0098f142b0a015c8c90766d3b74d9eb6fb7b7e6a",
 			"from": "git+https://github.com/pirate/readability-extractor.git",
 			"from": "git+https://github.com/pirate/readability-extractor.git",
 			"requires": {
 			"requires": {
-				"@mozilla/readability": "^0.3.0",
-				"dompurify": "^2.1.0",
-				"jsdom": "^16.4.0"
+				"@mozilla/readability": "^0.4.1",
+				"dompurify": "^2.2.7",
+				"jsdom": "^16.5.2"
 			}
 		},
 		"readable-stream": {
@@ -1571,9 +1592,9 @@
 			"integrity": "sha1-PYRT5ydKLkShQrPchEnftk2a3jo="
 			"integrity": "sha1-PYRT5ydKLkShQrPchEnftk2a3jo="
 		},
 		},
 		"string-width": {
 		"string-width": {
-			"version": "4.2.0",
-			"resolved": "https://registry.npmjs.org/string-width/-/string-width-4.2.0.tgz",
-			"integrity": "sha512-zUz5JD+tgqtuDjMhwIg5uFVV3dtqZ9yQJlZVfq4I01/K5Paj5UHj7VyrQOJvzawSVlKpObApbfD0Ed6yJc+1eg==",
+			"version": "4.2.2",
+			"resolved": "https://registry.npmjs.org/string-width/-/string-width-4.2.2.tgz",
+			"integrity": "sha512-XBJbT3N4JhVumXE0eoLU9DCjcaF92KLNqTmFCnG1pf8duUxFGwtP6AD6nkjw9a3IdiRtL3E2w3JDiE/xi3vOeA==",
 			"requires": {
 			"requires": {
 				"emoji-regex": "^8.0.0",
 				"emoji-regex": "^8.0.0",
 				"is-fullwidth-code-point": "^3.0.0",
 				"is-fullwidth-code-point": "^3.0.0",
@@ -1610,20 +1631,20 @@
 			"integrity": "sha512-9QNk5KwDF+Bvz+PyObkmSYjI5ksVUYtjW7AU22r2NKcfLJcXp96hkDWU3+XndOsUb+AQ9QhfzfCT2O+CNWT5Tw=="
 			"integrity": "sha512-9QNk5KwDF+Bvz+PyObkmSYjI5ksVUYtjW7AU22r2NKcfLJcXp96hkDWU3+XndOsUb+AQ9QhfzfCT2O+CNWT5Tw=="
 		},
 		},
 		"tar-fs": {
 		"tar-fs": {
-			"version": "2.1.0",
-			"resolved": "https://registry.npmjs.org/tar-fs/-/tar-fs-2.1.0.tgz",
-			"integrity": "sha512-9uW5iDvrIMCVpvasdFHW0wJPez0K4JnMZtsuIeDI7HyMGJNxmDZDOCQROr7lXyS+iL/QMpj07qcjGYTSdRFXUg==",
+			"version": "2.1.1",
+			"resolved": "https://registry.npmjs.org/tar-fs/-/tar-fs-2.1.1.tgz",
+			"integrity": "sha512-V0r2Y9scmbDRLCNex/+hYzvp/zyYjvFbHPNgVTKfQvVrb6guiE/fxP+XblDNR011utopbkex2nM4dHNV6GDsng==",
 			"requires": {
 			"requires": {
 				"chownr": "^1.1.1",
 				"chownr": "^1.1.1",
 				"mkdirp-classic": "^0.5.2",
 				"mkdirp-classic": "^0.5.2",
 				"pump": "^3.0.0",
 				"pump": "^3.0.0",
-				"tar-stream": "^2.0.0"
+				"tar-stream": "^2.1.4"
 			}
 		},
 		"tar-stream": {
-			"version": "2.1.4",
-			"resolved": "https://registry.npmjs.org/tar-stream/-/tar-stream-2.1.4.tgz",
-			"integrity": "sha512-o3pS2zlG4gxr67GmFYBLlq+dM8gyRGUOvsrHclSkvtVtQbjV0s/+ZE8OpICbaj8clrX3tjeHngYGP7rweaBnuw==",
+			"version": "2.2.0",
+			"resolved": "https://registry.npmjs.org/tar-stream/-/tar-stream-2.2.0.tgz",
+			"integrity": "sha512-ujeqbceABgwMZxEJnk2HDY2DlnUZ+9oEcb1KzTVfYHio0UE6dG71n60d8D2I4qNvleWrrXpmjpt7vZeF1LnMZQ==",
 			"requires": {
 			"requires": {
 				"bl": "^4.0.3",
 				"bl": "^4.0.3",
 				"end-of-stream": "^1.4.1",
 				"end-of-stream": "^1.4.1",
@@ -1646,13 +1667,13 @@
 			}
 		},
 		"tough-cookie": {
-			"version": "3.0.1",
-			"resolved": "https://registry.npmjs.org/tough-cookie/-/tough-cookie-3.0.1.tgz",
-			"integrity": "sha512-yQyJ0u4pZsv9D4clxO69OEjLWYw+jbgspjTue4lTQZLfV0c5l1VmK2y1JK8E9ahdpltPOaAThPcp5nKPUgSnsg==",
+			"version": "4.0.0",
+			"resolved": "https://registry.npmjs.org/tough-cookie/-/tough-cookie-4.0.0.tgz",
+			"integrity": "sha512-tHdtEpQCMrc1YLrMaqXXcj6AxhYi/xgit6mZu1+EDWUn+qhUf8wMQoFIy9NXuq23zAwtcB0t/MjACGR18pcRbg==",
 			"requires": {
 			"requires": {
-				"ip-regex": "^2.1.0",
-				"psl": "^1.1.28",
-				"punycode": "^2.1.1"
+				"psl": "^1.1.33",
+				"punycode": "^2.1.1",
+				"universalify": "^0.1.2"
 			}
 		},
 		"tr46": {
@@ -1863,6 +1884,11 @@
 				"through": "^2.3.8"
 				"through": "^2.3.8"
 			}
 			}
 		},
 		},
+		"universalify": {
+			"version": "0.1.2",
+			"resolved": "https://registry.npmjs.org/universalify/-/universalify-0.1.2.tgz",
+			"integrity": "sha512-rBJeI5CXAlmy1pV+617WB9J63U6XcazHHF2f2dbJix4XzpUF0RS3Zbj0FGIOCAva5P/d/GBOYaACQ1w+0azUkg=="
+		},
 		"uri-js": {
 		"uri-js": {
 			"version": "4.4.0",
 			"version": "4.4.0",
 			"resolved": "https://registry.npmjs.org/uri-js/-/uri-js-4.4.0.tgz",
 			"resolved": "https://registry.npmjs.org/uri-js/-/uri-js-4.4.0.tgz",
@@ -1947,11 +1973,11 @@
 			"integrity": "sha512-M4yMwr6mAnQz76TbJm914+gPpB/nCwvZbJU28cUD6dR004SAxDLOOSUaB1JDRqLtaOV/vi0IC5lEAGFgrjGv/g=="
 			"integrity": "sha512-M4yMwr6mAnQz76TbJm914+gPpB/nCwvZbJU28cUD6dR004SAxDLOOSUaB1JDRqLtaOV/vi0IC5lEAGFgrjGv/g=="
 		},
 		},
 		"whatwg-url": {
 		"whatwg-url": {
-			"version": "8.2.2",
-			"resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-8.2.2.tgz",
-			"integrity": "sha512-PcVnO6NiewhkmzV0qn7A+UZ9Xx4maNTI+O+TShmfE4pqjoCMwUMjkvoNhNHPTvgR7QH9Xt3R13iHuWy2sToFxQ==",
+			"version": "8.5.0",
+			"resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-8.5.0.tgz",
+			"integrity": "sha512-fy+R77xWv0AiqfLl4nuGUlQ3/6b5uNfQ4WAbGQVMYshCTCCPK9psC1nWh3XHuxGVCtlcDDQPQW1csmmIQo+fwg==",
 			"requires": {
 			"requires": {
-				"lodash.sortby": "^4.7.0",
+				"lodash": "^4.7.0",
 				"tr46": "^2.0.2",
 				"tr46": "^2.0.2",
 				"webidl-conversions": "^6.1.0"
 				"webidl-conversions": "^6.1.0"
 			}
 			}
@@ -1977,9 +2003,9 @@
 			"integrity": "sha1-tSQ9jz7BqjXxNkYFvA0QNuMKtp8="
 			"integrity": "sha1-tSQ9jz7BqjXxNkYFvA0QNuMKtp8="
 		},
 		},
 		"ws": {
 		"ws": {
-			"version": "7.3.1",
-			"resolved": "https://registry.npmjs.org/ws/-/ws-7.3.1.tgz",
-			"integrity": "sha512-D3RuNkynyHmEJIpD2qrgVkc9DQ23OrN/moAwZX4L8DfvszsJxpjQuUq3LMx6HoYji9fbIOBY18XWBsAux1ZZUA=="
+			"version": "7.4.4",
+			"resolved": "https://registry.npmjs.org/ws/-/ws-7.4.4.tgz",
+			"integrity": "sha512-Qm8k8ojNQIMx7S+Zp8u/uHOx7Qazv3Yv4q68MiWWWOJhiwG5W3x7iqmRtJo8xxrciZUY4vRxUTJCKuRnF28ZZw=="
 		},
 		"wuzzy": {
 			"version": "0.1.6",
@@ -2000,28 +2026,28 @@
 			"integrity": "sha512-JZnDKK8B0RCDw84FNdDAIpZK+JuJw+s7Lz8nksI7SIuU3UXJJslUthsi+uWBUYOwPFwW7W7PRLRfUKpxjtjFCw=="
 			"integrity": "sha512-JZnDKK8B0RCDw84FNdDAIpZK+JuJw+s7Lz8nksI7SIuU3UXJJslUthsi+uWBUYOwPFwW7W7PRLRfUKpxjtjFCw=="
 		},
 		},
 		"y18n": {
 		"y18n": {
-			"version": "5.0.1",
-			"resolved": "https://registry.npmjs.org/y18n/-/y18n-5.0.1.tgz",
-			"integrity": "sha512-/jJ831jEs4vGDbYPQp4yGKDYPSCCEQ45uZWJHE1AoYBzqdZi8+LDWas0z4HrmJXmKdpFsTiowSHXdxyFhpmdMg=="
+			"version": "5.0.5",
+			"resolved": "https://registry.npmjs.org/y18n/-/y18n-5.0.5.tgz",
+			"integrity": "sha512-hsRUr4FFrvhhRH12wOdfs38Gy7k2FFzB9qgN9v3aLykRq0dRcdcpz5C9FxdS2NuhOrI/628b/KSTJ3rwHysYSg=="
 		},
 		"yargs": {
-			"version": "16.0.3",
-			"resolved": "https://registry.npmjs.org/yargs/-/yargs-16.0.3.tgz",
-			"integrity": "sha512-6+nLw8xa9uK1BOEOykaiYAJVh6/CjxWXK/q9b5FpRgNslt8s22F2xMBqVIKgCRjNgGvGPBy8Vog7WN7yh4amtA==",
+			"version": "16.2.0",
+			"resolved": "https://registry.npmjs.org/yargs/-/yargs-16.2.0.tgz",
+			"integrity": "sha512-D1mvvtDG0L5ft/jGWkLpG1+m0eQxOfaBvTNELraWj22wSVUMWxZUvYgJYcKh6jGGIkJFhH4IZPQhR4TKpc8mBw==",
 			"requires": {
 			"requires": {
-				"cliui": "^7.0.0",
-				"escalade": "^3.0.2",
+				"cliui": "^7.0.2",
+				"escalade": "^3.1.1",
 				"get-caller-file": "^2.0.5",
 				"get-caller-file": "^2.0.5",
 				"require-directory": "^2.1.1",
 				"require-directory": "^2.1.1",
 				"string-width": "^4.2.0",
 				"string-width": "^4.2.0",
-				"y18n": "^5.0.1",
-				"yargs-parser": "^20.0.0"
+				"y18n": "^5.0.5",
+				"yargs-parser": "^20.2.2"
 			}
 		},
 		"yargs-parser": {
-			"version": "20.2.0",
-			"resolved": "https://registry.npmjs.org/yargs-parser/-/yargs-parser-20.2.0.tgz",
-			"integrity": "sha512-2agPoRFPoIcFzOIp6656gcvsg2ohtscpw2OINr/q46+Sq41xz2OYLqx5HRHabmFU1OARIPAYH5uteICE7mn/5A=="
+			"version": "20.2.7",
+			"resolved": "https://registry.npmjs.org/yargs-parser/-/yargs-parser-20.2.7.tgz",
+			"integrity": "sha512-FiNkvbeHzB/syOjIUxFDCnhSfzAL8R5vs40MgLFBorXACCOAEaWu0gRZl14vG8MR9AOJIZbmkjhusqBYZ3HTHw=="
 		},
 		"yauzl": {
 			"version": "2.10.0",

+ 1 - 1
package.json

@@ -1,6 +1,6 @@
 {
   "name": "archivebox",
-  "version": "0.5.6",
+  "version": "0.6.0",
   "description": "ArchiveBox: The self-hosted internet archive",
   "author": "Nick Sweeting <[email protected]>",
   "license": "MIT",

+ 49 - 40
setup.py

@@ -27,6 +27,48 @@ PACKAGE_DIR = ROOT_DIR / PKG_NAME
 README = (PACKAGE_DIR / "README.md").read_text(encoding='utf-8', errors='ignore')
 VERSION = json.loads((PACKAGE_DIR / "package.json").read_text().strip())['version']
 
+PYTHON_REQUIRES = ">=3.7"
+SETUP_REQUIRES = ["wheel"]
+INSTALL_REQUIRES = [
+    # only add things here that have corresponding apt python3-packages available
+    # anything added here also needs to be added to our package dependencies in
+    # stdeb.cfg (apt), archivebox.rb (brew), Dockerfile, etc.
+    # if there is no apt python3-package equivalent, then vendor it instead in
+    # ./archivebox/vendor/
+    "requests>=2.24.0",
+    "mypy-extensions>=0.4.3",
+    "django>=3.1.3",
+    "django-extensions>=3.0.3",
+    "dateparser",
+    "ipython",
+    "youtube-dl",
+    "python-crontab>=2.5.1",
+    "croniter>=0.3.34",
+    "w3lib>=1.22.0",
+]
+EXTRAS_REQUIRE = {
+    'sonic': [
+        "sonic-client>=0.0.5",
+    ],
+    'dev': [
+        "setuptools",
+        "twine",
+        "wheel",
+        "flake8",
+        "ipdb",
+        "mypy",
+        "django-stubs",
+        "sphinx",
+        "sphinx-rtd-theme",
+        "recommonmark",
+        "pytest",
+        "bottle",
+        "stdeb",
+        "django-debug-toolbar",
+        "djdt_flamegraph",
+    ],
+}
+
 # To see when setup.py gets called (uncomment for debugging):
 # import sys
 # print(PACKAGE_DIR, f"     (v{VERSION})")
@@ -36,7 +78,9 @@ VERSION = json.loads((PACKAGE_DIR / "package.json").read_text().strip())['versio
 class DisabledTestCommand(test):
     def run(self):
         # setup.py test is deprecated, disable it here by force so stdeb doesnt run it
-        print('Use the ./bin/test.sh script to run tests, not setup.py test.')
+        print()
+        print('[X] Running tests via setup.py test is deprecated.')
+        print('    Hint: Use the ./bin/test.sh script or pytest instead')
 
 
 setuptools.setup(
@@ -50,45 +94,10 @@ setuptools.setup(
     long_description_content_type="text/markdown",
     url=REPO_URL,
     project_urls=PROJECT_URLS,
-    python_requires=">=3.7",
-    setup_requires=[
-        "wheel",
-    ],
-    install_requires=[
-        # only add things here that have corresponding apt python3-packages available
-        # anything added here also needs to be added to our package dependencies in
-        # stdeb.cfg (apt), archivebox.rb (brew), Dockerfile, etc.
-        # if there is no apt python3-package equivalent, then vendor it instead in
-        # ./archivebox/vendor/
-        "requests==2.24.0",
-        "atomicwrites==1.4.0",
-        "mypy-extensions==0.4.3",
-        "django==3.1.3",
-        "django-extensions==3.0.3",
-        "dateparser",
-        "ipython",
-        "youtube-dl",
-        "python-crontab==2.5.1",
-        "croniter==0.3.34",
-        "w3lib==1.22.0",
-    ],
-    extras_require={
-        'dev': [
-            "setuptools",
-            "twine",
-            "wheel",
-            "flake8",
-            "ipdb",
-            "mypy",
-            "django-stubs",
-            "sphinx",
-            "sphinx-rtd-theme",
-            "recommonmark",
-            "pytest",
-            "bottle",
-            "stdeb",
-        ],
-    },
+    python_requires=PYTHON_REQUIRES,
+    setup_requires=SETUP_REQUIRES,
+    install_requires=INSTALL_REQUIRES,
+    extras_require=EXTRAS_REQUIRE,
     packages=[PKG_NAME],
     include_package_data=True,   # see MANIFEST.in
     entry_points={

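A note on the setup.py refactor above: the pin lists now live in module-level constants (PYTHON_REQUIRES, SETUP_REQUIRES, INSTALL_REQUIRES, EXTRAS_REQUIRE), the exact `==` pins are relaxed to `>=` minimums, a new 'sonic' extra joins 'dev', and the version stays single-sourced from package.json. A minimal sketch of reading that version the same way setup.py does (assuming it is run from the repo root, where archivebox/package.json lives):

    import json
    from pathlib import Path

    # same single source of truth that setup.py reads above
    PACKAGE_DIR = Path("archivebox")
    VERSION = json.loads((PACKAGE_DIR / "package.json").read_text().strip())["version"]
    print(VERSION)  # "0.6.0" after this release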
+ 1 - 1
stdeb.cfg

@@ -5,6 +5,6 @@ Package3: archivebox
 Suite: focal
 Suite3: focal
 Build-Depends: dh-python, python3-pip, python3-setuptools, python3-wheel, python3-stdeb
-Depends3: nodejs, chromium-browser, wget, curl, git, ffmpeg, youtube-dl, python3-atomicwrites, python3-croniter, python3-crontab, python3-dateparser, python3-django, python3-django-extensions, python3-django-jsonfield, python3-mypy-extensions, python3-requests, python3-w3lib, ripgrep
+Depends3: nodejs, chromium-browser, wget, curl, git, ffmpeg, youtube-dl, python3-croniter, python3-crontab, python3-dateparser, python3-django, python3-django-extensions, python3-django-jsonfield, python3-mypy-extensions, python3-requests, python3-w3lib, ripgrep
 XS-Python-Version: >= 3.7
 Setup-Env-Vars: DEB_BUILD_OPTIONS=nocheck

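Two packaging details worth noting here: dropping python3-atomicwrites from Depends3 mirrors the removal of the atomicwrites pin from install_requires in setup.py above, consistent with the vendoring comment there (dependencies without a package equivalent go under ./archivebox/vendor/). And the Setup-Env-Vars: DEB_BUILD_OPTIONS=nocheck line pairs with the DisabledTestCommand change, so a Debian package build (typically invoked as python3 setup.py --command-packages=stdeb.command bdist_deb) never runs the deprecated setup.py test step.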
+ 3 - 3
tests/test_add.py

@@ -33,7 +33,7 @@ def test_depth_flag_0_crawls_only_the_arg_page(tmp_path, process, disable_extrac
     )
     
     archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
-    with open(archived_item_path / "index.json", "r") as f:
+    with open(archived_item_path / "index.json", "r", encoding='utf-8') as f:
         output_json = json.load(f)
     assert output_json["base_url"] == "127.0.0.1:8080/static/example.com.html"
 
 
@@ -79,7 +79,7 @@ def test_add_updates_history_json_index(tmp_path, process, disable_extractors_di
 
 
     archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
     archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
 
 
-    with open(archived_item_path / "index.json", "r") as f:
+    with open(archived_item_path / "index.json", "r", encoding="utf-8") as f:
         output_json = json.load(f)
         output_json = json.load(f)
     assert output_json["history"] != {}
     assert output_json["history"] != {}
 
 
@@ -90,4 +90,4 @@ def test_extract_input_uses_only_passed_extractors(tmp_path, process):
     archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
 
     assert (archived_item_path / "warc").exists()
-    assert not (archived_item_path / "singlefile.html").exists()
+    assert not (archived_item_path / "singlefile.html").exists()

+ 4 - 4
tests/test_extractors.py

@@ -86,7 +86,7 @@ def test_headers_retrieved(tmp_path, process, disable_extractors_dict):
     output_file = archived_item_path / "headers.json"
     assert output_file.exists()
     headers_file = archived_item_path / 'headers.json'
-    with open(headers_file) as f:
+    with open(headers_file, 'r', encoding='utf-8') as f:
         headers = pyjson.load(f)
     assert headers['Content-Language'] == 'en'
     assert headers['Content-Script-Type'] == 'text/javascript'
@@ -98,7 +98,7 @@ def test_headers_redirect_chain(tmp_path, process, disable_extractors_dict):
                                   capture_output=True, env=disable_extractors_dict)
     archived_item_path = list(tmp_path.glob("archive/**/*"))[0]
     output_file = archived_item_path / "headers.json" 
-    with open(output_file) as f:
+    with open(output_file, 'r', encoding='utf-8') as f:
         headers = pyjson.load(f)
     assert headers['Content-Language'] == 'en'
     assert headers['Content-Script-Type'] == 'text/javascript'
@@ -110,6 +110,6 @@ def test_headers_400_plus(tmp_path, process, disable_extractors_dict):
                                   capture_output=True, env=disable_extractors_dict)
     archived_item_path = list(tmp_path.glob("archive/**/*"))[0]
     output_file = archived_item_path / "headers.json" 
-    with open(output_file) as f:
+    with open(output_file, 'r', encoding='utf-8') as f:
         headers = pyjson.load(f)
-    assert headers["Status-Code"] == "200"
+    assert headers["Status-Code"] == "200"

+ 10 - 10
tests/test_init.py

@@ -12,12 +12,12 @@ from archivebox.config import OUTPUT_PERMISSIONS
 from .fixtures import *
 
 def test_init(tmp_path, process):
-    assert "Initializing a new ArchiveBox collection in this folder..." in process.stdout.decode("utf-8")
+    assert "Initializing a new ArchiveBox" in process.stdout.decode("utf-8")
     
 def test_update(tmp_path, process):
     os.chdir(tmp_path)
     update_process = subprocess.run(['archivebox', 'init'], capture_output=True)
-    assert "Updating existing ArchiveBox collection in this folder" in update_process.stdout.decode("utf-8")
+    assert "updating existing ArchiveBox" in update_process.stdout.decode("utf-8")
 
 def test_add_link(tmp_path, process, disable_extractors_dict):
     disable_extractors_dict.update({"USE_WGET": "true"})
@@ -28,11 +28,11 @@ def test_add_link(tmp_path, process, disable_extractors_dict):
 
 
     assert "index.json" in [x.name for x in archived_item_path.iterdir()]
     assert "index.json" in [x.name for x in archived_item_path.iterdir()]
 
 
-    with open(archived_item_path / "index.json", "r") as f:
+    with open(archived_item_path / "index.json", "r", encoding="utf-8") as f:
         output_json = json.load(f)
         output_json = json.load(f)
     assert "Example Domain" == output_json['history']['title'][0]['output']
     assert "Example Domain" == output_json['history']['title'][0]['output']
 
 
-    with open(archived_item_path / "index.html", "r") as f:
+    with open(archived_item_path / "index.html", "r", encoding="utf-8") as f:
         output_html = f.read()
         output_html = f.read()
     assert "Example Domain" in output_html
     assert "Example Domain" in output_html
 
 
@@ -47,7 +47,7 @@ def test_add_link_support_stdin(tmp_path, process, disable_extractors_dict):
 
 
     assert "index.json" in [x.name for x in archived_item_path.iterdir()]
     assert "index.json" in [x.name for x in archived_item_path.iterdir()]
 
 
-    with open(archived_item_path / "index.json", "r") as f:
+    with open(archived_item_path / "index.json", "r", encoding="utf-8") as f:
         output_json = json.load(f)
         output_json = json.load(f)
     assert "Example Domain" == output_json['history']['title'][0]['output']
     assert "Example Domain" == output_json['history']['title'][0]['output']
 
 
@@ -75,11 +75,11 @@ def test_collision_urls_different_timestamps(tmp_path, process, disable_extracto
     
     
     first_archive = tmp_path / "archive" / str(min([float(folder) for folder in archive_folders]))
     json_index = str(first_archive / "index.json")
-    with open(json_index, "r") as f:
+    with open(json_index, "r", encoding="utf-8") as f:
         link_details = json.loads(f.read())
 
     link_details["url"] = "http://127.0.0.1:8080/static/iana.org.html"
-    with open(json_index, "w") as f:
+    with open(json_index, "w", encoding="utf-8") as f:
         json.dump(link_details, f)
 
     init_process = subprocess.run(['archivebox', 'init'], capture_output=True, env=disable_extractors_dict)
@@ -98,12 +98,12 @@ def test_collision_timestamps_different_urls(tmp_path, process, disable_extracto
     archive_folders.remove(first_archive.name)
     json_index = str(first_archive / "index.json")
 
-    with open(json_index, "r") as f:
+    with open(json_index, "r", encoding="utf-8") as f:
         link_details = json.loads(f.read())
 
     link_details["timestamp"] = archive_folders[0]
 
-    with open(json_index, "w") as f:
+    with open(json_index, "w", encoding="utf-8") as f:
         json.dump(link_details, f)
 
     init_process = subprocess.run(['archivebox', 'init'], capture_output=True, env=disable_extractors_dict)
@@ -173,4 +173,4 @@ def test_tags_migration(tmp_path, disable_extractors_dict):
         snapshot_id = tag["id"]
         snapshot_id = tag["id"]
         tag_name = tag["name"]
         tag_name = tag["name"]
         # Check each tag migrated is in the previous field
         # Check each tag migrated is in the previous field
-        assert tag_name in snapshots_dict[snapshot_id]
+        assert tag_name in snapshots_dict[snapshot_id]

+ 2 - 2
tests/test_list.py

@@ -50,7 +50,7 @@ def test_list_csv_headers(process, disable_extractors_dict):
 
 
 def test_list_index_with_wrong_flags(process):
     list_process = subprocess.run(["archivebox", "list", "--with-headers"], capture_output=True)
-    assert "--with-headers can only be used with --json, --html or --csv options." in list_process.stderr.decode("utf-8")
+    assert "--with-headers can only be used with --json, --html or --csv options" in list_process.stderr.decode("utf-8")
 
 def test_link_sort_by_url(process, disable_extractors_dict):
     subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/iana.org.html", "--depth=0"],
@@ -64,4 +64,4 @@ def test_link_sort_by_url(process, disable_extractors_dict):
 
 
     list_process = subprocess.run(["archivebox", "list", "--sort=url"], capture_output=True)
     link_list = list_process.stdout.decode("utf-8").split("\n")
-    assert "http://127.0.0.1:8080/static/example.com.html" in link_list[0]
+    assert "http://127.0.0.1:8080/static/example.com.html" in link_list[0]

+ 14 - 10
tests/test_remove.py

@@ -100,16 +100,18 @@ def test_remove_before(tmp_path, process, disable_extractors_dict):
 
 
     conn = sqlite3.connect("index.sqlite3")
     c = conn.cursor()
-    timestamp = c.execute("SELECT timestamp FROM core_snapshot ORDER BY timestamp ASC").fetchall()
+    higherts, lowerts = c.execute("SELECT timestamp FROM core_snapshot ORDER BY timestamp DESC").fetchall()
     conn.commit()
     conn.close()
 
-    before = list(map(lambda x: int(x[0].split(".")[0]), timestamp))
+    lowerts = lowerts[0]
+    higherts = higherts[0]
 
-    subprocess.run(['archivebox', 'remove', '--filter-type=regex', '.*', '--yes', '--delete', '--before', str(before[1])], capture_output=True)
+    # before is less than, so only the lower snapshot gets deleted
+    subprocess.run(['archivebox', 'remove', '--filter-type=regex', '.*', '--yes', '--delete', '--before', higherts], capture_output=True)
 
-    assert (tmp_path / "archive" / timestamp[0][0]).exists()
-    assert not (tmp_path / "archive" / timestamp[1][0]).exists()
+    assert not (tmp_path / "archive" / lowerts).exists()
+    assert (tmp_path / "archive" / higherts).exists()
 
 def test_remove_after(tmp_path, process, disable_extractors_dict):
     subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True, env=disable_extractors_dict)
@@ -118,13 +120,15 @@ def test_remove_after(tmp_path, process, disable_extractors_dict):
 
 
     conn = sqlite3.connect("index.sqlite3")
     c = conn.cursor()
-    timestamp = c.execute("SELECT timestamp FROM core_snapshot ORDER BY timestamp ASC").fetchall()
+    higherts, lowerts = c.execute("SELECT timestamp FROM core_snapshot ORDER BY timestamp DESC").fetchall()
     conn.commit()
     conn.close()
 
-    after = list(map(lambda x: int(x[0].split(".")[0]), timestamp))
+    lowerts = lowerts[0].split(".")[0]
+    higherts = higherts[0].split(".")[0]
 
-    subprocess.run(['archivebox', 'remove', '--filter-type=regex', '.*', '--yes', '--delete', '--after', str(after[1])], capture_output=True)
+    # after is greater than or equal to, so both snapshots get deleted
+    subprocess.run(['archivebox', 'remove', '--filter-type=regex', '.*', '--yes', '--delete', '--after', lowerts], capture_output=True)
 
-    assert (tmp_path / "archive" / timestamp[1][0]).exists()
-    assert not (tmp_path / "archive" / timestamp[0][0]).exists()
+    assert not (tmp_path / "archive" / lowerts).exists()
+    assert not (tmp_path / "archive" / higherts).exists()

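The boundary semantics that the new comments in these tests spell out (--before removes snapshots with timestamps strictly less than the cutoff, --after removes those greater than or equal to it) can be checked standalone; a small illustrative sketch with hypothetical timestamps, not repo code:

    older, newer = "1600000000.0", "1600000001.0"  # hypothetical snapshot timestamps
    snapshots = [older, newer]

    # 'remove ... --before CUTOFF' targets timestamps strictly below the cutoff,
    # so passing the higher timestamp deletes only the older snapshot:
    assert [ts for ts in snapshots if float(ts) < float(newer)] == [older]

    # 'remove ... --after CUTOFF' targets timestamps at or above the cutoff,
    # so passing the lower timestamp deletes both snapshots:
    assert [ts for ts in snapshots if float(ts) >= float(older)] == [older, newer]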
+ 13 - 0
uwsgi.ini

@@ -0,0 +1,13 @@
+[uwsgi]
+socket = 127.0.0.1:3031
+chdir = ../
+http = 0.0.0.0:8001
+env = OUTPUT_DIR=./data
+wsgi-file = archivebox/core/wsgi.py
+processes = 4
+threads = 1
+stats = 127.0.0.1:9191
+static-map /static=./archivebox/templates/static
+harakiri = 172800
+post-buffering = 1
+disable-logging = True
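With the new uwsgi.ini above, the server can be started with uwsgi --ini uwsgi.ini (assuming the uwsgi package is installed). Per the config it loads the Django app from archivebox/core/wsgi.py with OUTPUT_DIR=./data, answers HTTP on 0.0.0.0:8001 across 4 single-threaded workers, serves /static directly from archivebox/templates/static, and exposes a uWSGI stats socket on 127.0.0.1:9191.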