
Merge pull request #655 from ArchiveBox/debug-toolbar

Nick Sweeting 4 years ago
parent
commit
6fb7bbf2fb
65 files changed with 1827 additions and 615 deletions
  1. Dockerfile (+17 -12)
  2. archivebox/cli/__init__.py (+5 -1)
  3. archivebox/cli/archivebox_add.py (+12 -1)
  4. archivebox/cli/archivebox_config.py (+4 -1)
  5. archivebox/cli/archivebox_init.py (+6 -0)
  6. archivebox/cli/archivebox_list.py (+11 -11)
  7. archivebox/cli/archivebox_oneshot.py (+4 -1)
  8. archivebox/cli/archivebox_remove.py (+4 -1)
  9. archivebox/cli/archivebox_server.py (+13 -2)
  10. archivebox/cli/archivebox_update.py (+7 -3)
  11. archivebox/cli/tests.py (+227 -0)
  12. archivebox/config.py (+62 -10)
  13. archivebox/core/admin.py (+163 -71)
  14. archivebox/core/forms.py (+2 -1)
  15. archivebox/core/migrations/0009_auto_20210216_1038.py (+18 -0)
  16. archivebox/core/migrations/0010_auto_20210216_1055.py (+18 -0)
  17. archivebox/core/migrations/0011_auto_20210216_1331.py (+24 -0)
  18. archivebox/core/migrations/0012_auto_20210216_1425.py (+23 -0)
  19. archivebox/core/migrations/0013_auto_20210218_0729.py (+18 -0)
  20. archivebox/core/migrations/0014_auto_20210218_0729.py (+18 -0)
  21. archivebox/core/migrations/0015_auto_20210218_0730.py (+18 -0)
  22. archivebox/core/migrations/0016_auto_20210218_1204.py (+18 -0)
  23. archivebox/core/migrations/0017_auto_20210219_0211.py (+18 -0)
  24. archivebox/core/models.py (+90 -38)
  25. archivebox/core/settings.py (+127 -1)
  26. archivebox/core/urls.py (+43 -34)
  27. archivebox/core/views.py (+161 -42)
  28. archivebox/core/wsgi.py (+3 -3)
  29. archivebox/extractors/__init__.py (+10 -3)
  30. archivebox/extractors/archive_org.py (+1 -1)
  31. archivebox/extractors/mercury.py (+11 -3)
  32. archivebox/extractors/readability.py (+13 -5)
  33. archivebox/index/__init__.py (+1 -0)
  34. archivebox/index/html.py (+79 -72)
  35. archivebox/index/json.py (+1 -2)
  36. archivebox/index/schema.py (+10 -1)
  37. archivebox/index/sql.py (+61 -21)
  38. archivebox/logging_util.py (+64 -11)
  39. archivebox/main.py (+156 -95)
  40. archivebox/parsers/__init__.py (+45 -31)
  41. archivebox/search/utils.py (+1 -1)
  42. archivebox/system.py (+2 -1)
  43. archivebox/templates/admin/actions_as_select.html (+0 -1)
  44. archivebox/templates/admin/base.html (+6 -6)
  45. archivebox/templates/core/add.html (+1 -1)
  46. archivebox/templates/core/base.html (+2 -2)
  47. archivebox/templates/core/index_row.html (+2 -2)
  48. archivebox/templates/core/minimal_index.html (+1 -1)
  49. archivebox/templates/core/public_index.html (+15 -10)
  50. archivebox/templates/core/snapshot.html (+44 -18)
  51. archivebox/templates/core/static_index.html (+2 -2)
  52. archivebox/templates/static/add.css (+1 -1)
  53. archivebox/templates/static/admin.css (+37 -0)
  54. archivebox/templates/static/favicon.ico (BIN)
  55. archivebox/templates/static/robots.txt (+2 -0)
  56. archivebox/util.py (+3 -1)
  57. bin/docker_entrypoint.sh (+4 -2)
  58. docker-compose.yml (+23 -20)
  59. package.json (+1 -1)
  60. setup.py (+50 -40)
  61. tests/test_add.py (+3 -3)
  62. tests/test_extractors.py (+4 -4)
  63. tests/test_init.py (+10 -10)
  64. tests/test_remove.py (+14 -10)
  65. uwsgi.ini (+13 -0)

+ 17 - 12
Dockerfile

@@ -50,13 +50,6 @@ RUN apt-get update -qq \
         fontconfig fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-kacst fonts-symbola fonts-noto fonts-freefont-ttf \
     && rm -rf /var/lib/apt/lists/*
 
-# Install apt development dependencies
-# RUN apt-get install -qq \
-#     && apt-get install -qq -y --no-install-recommends \
-#         python3 python3-dev python3-pip python3-venv python3-all \
-#         dh-python debhelper devscripts dput software-properties-common \
-#         python3-distutils python3-setuptools python3-wheel python3-stdeb
-
 # Install Node environment
 RUN curl -s https://deb.nodesource.com/gpgkey/nodesource.gpg.key | apt-key add - \
     && echo 'deb https://deb.nodesource.com/node_15.x buster main' >> /etc/apt/sources.list \
@@ -79,17 +72,26 @@ WORKDIR "$CODE_DIR"
 ENV PATH="${PATH}:$VENV_PATH/bin"
 RUN python -m venv --clear --symlinks "$VENV_PATH" \
     && pip install --upgrade --quiet pip setuptools
-ADD ./pip_dist/archivebox.egg-info/requires.txt "$CODE_DIR/pip_dist/archivebox.egg-info/requires.txt"
+ADD "./setup.py" "$CODE_DIR/"
+ADD "./README.md" "./package.json" "$CODE_DIR/archivebox/"
 RUN apt-get update -qq \
     && apt-get install -qq -y --no-install-recommends \
         build-essential python-dev python3-dev \
-    # && pip install --upgrade pip \
-    && grep -B 1000 -E '^$' "$CODE_DIR/pip_dist/archivebox.egg-info/requires.txt" | pip install --quiet -r /dev/stdin \
-    && pip install --quiet "sonic-client==0.0.5" \
+    && python3 -c 'from distutils.core import run_setup; result = run_setup("./setup.py", stop_after="init"); print("\n".join(result.install_requires + result.extras_require["sonic"]))' > /tmp/requirements.txt \
+    && pip install --quiet -r /tmp/requirements.txt \
     && apt-get purge -y build-essential python-dev python3-dev \
     && apt-get autoremove -y \
     && rm -rf /var/lib/apt/lists/*
 
+# Install apt development dependencies
+# RUN apt-get install -qq \
+#     && apt-get install -qq -y --no-install-recommends \
+#         python3 python3-dev python3-pip python3-venv python3-all \
+#         dh-python debhelper devscripts dput software-properties-common \
+#         python3-distutils python3-setuptools python3-wheel python3-stdeb
+# RUN python3 -c 'from distutils.core import run_setup; result = run_setup("./setup.py", stop_after="init"); print("\n".join(result.extras_require["dev"]))' > /tmp/dev_requirements.txt \
    # && pip install --quiet -r /tmp/dev_requirements.txt
+
 # Install ArchiveBox Python package and its dependencies
 WORKDIR "$CODE_DIR"
 ADD . "$CODE_DIR"
@@ -115,5 +117,8 @@ RUN /app/bin/docker_entrypoint.sh archivebox version
 VOLUME "$DATA_DIR"
 EXPOSE 8000
 
+HEALTHCHECK --interval=30s --timeout=20s --retries=15 \
+    CMD curl --silent 'http://localhost:8000/admin/login/' || exit 1
+
 ENTRYPOINT ["dumb-init", "--", "/app/bin/docker_entrypoint.sh"]
-CMD ["archivebox", "server", "0.0.0.0:8000"]
+CMD ["archivebox", "server", "--quick-init", "0.0.0.0:8000"]

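Note: the new RUN step reads the dependency pins straight out of setup.py via distutils' run_setup, instead of relying on a pre-generated pip_dist/requires.txt. A minimal standalone sketch of the same trick (assuming a setup.py that passes install_requires and an extras_require dict to setup(), as ArchiveBox's does):

    # sketch: print a package's dependencies from setup.py without installing it
    # (stop_after="init" parses the setup() kwargs but runs no build commands)
    from distutils.core import run_setup

    dist = run_setup("./setup.py", stop_after="init")
    deps = list(dist.install_requires) + list(dist.extras_require.get("sonic", []))
    print("\n".join(deps))
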
+ 5 - 1
archivebox/cli/__init__.py

@@ -63,7 +63,11 @@ def run_subcommand(subcommand: str,
 
     if subcommand not in meta_cmds:
         from ..config import setup_django
-        setup_django(in_memory_db=subcommand in fake_db, check_db=subcommand in archive_cmds)
+
+        cmd_requires_db = subcommand in archive_cmds
+        init_pending = '--init' in subcommand_args or '--quick-init' in subcommand_args
+
+        setup_django(in_memory_db=subcommand in fake_db, check_db=cmd_requires_db and not init_pending)
 
     module = import_module('.archivebox_{}'.format(subcommand), __package__)
     module.main(args=subcommand_args, stdin=stdin, pwd=pwd)    # type: ignore

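Note: the intent of the new guard is that subcommands which normally require an existing index.sqlite3 skip the check when invoked with --init/--quick-init, since they are about to create the database themselves. A distilled sketch (names match the diff; archive_cmds is assumed to be the existing set of db-backed subcommands):

    # sketch: only demand an existing db when the command needs one AND won't create it
    def should_check_db(subcommand: str, subcommand_args: list, archive_cmds: set) -> bool:
        cmd_requires_db = subcommand in archive_cmds
        init_pending = '--init' in subcommand_args or '--quick-init' in subcommand_args
        return cmd_requires_db and not init_pending
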
+ 12 - 1
archivebox/cli/archivebox_add.py

@@ -22,6 +22,12 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
         add_help=True,
         formatter_class=SmartFormatter,
     )
+    parser.add_argument(
+        '--tag', '-t',
+        type=str,
+        default='',
+        help="Tag the added URLs with the provided tags e.g. --tag=tag1,tag2,tag3",
+    )
     parser.add_argument(
         '--update-all', #'-n',
         action='store_true',
@@ -75,7 +81,11 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
     )
     command = parser.parse_args(args or ())
     urls = command.urls
-    stdin_urls = accept_stdin(stdin)
+
+    stdin_urls = ''
+    if not urls:
+        stdin_urls = accept_stdin(stdin)
+
     if (stdin_urls and urls) or (not stdin and not urls):
         stderr(
             '[X] You must pass URLs/paths to add via stdin or CLI arguments.\n',
@@ -85,6 +95,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
     add(
         urls=stdin_urls or urls,
         depth=command.depth,
+        tag=command.tag,
         update_all=command.update_all,
         index_only=command.index_only,
         overwrite=command.overwrite,

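Note: with stdin only read when no URL arguments are given, piping and passing args can no longer silently conflict. A hypothetical invocation of the new flag (the URL and tag names are illustrative only):

    # hypothetical usage: tag everything added in this call with two tags
    from archivebox.cli import archivebox_add

    archivebox_add.main(['--tag=news,python', 'https://example.com'])
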
+ 4 - 1
archivebox/cli/archivebox_config.py

@@ -45,7 +45,10 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
         help='KEY or KEY=VALUE formatted config values to get or set',
     )
     command = parser.parse_args(args or ())
-    config_options_str = accept_stdin(stdin)
+
+    config_options_str = ''
+    if not command.config_options:
+        config_options_str = accept_stdin(stdin)
 
     config(
         config_options_str=config_options_str,

+ 6 - 0
archivebox/cli/archivebox_init.py

@@ -27,11 +27,17 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
         action='store_true',
         help='Ignore unrecognized files in current directory and initialize anyway',
     )
+    parser.add_argument(
+        '--quick', '-q',
+        action='store_true',
+        help='Run any updates or migrations without rechecking all snapshot dirs',
+    )
     command = parser.parse_args(args or ())
     reject_stdin(__command__, stdin)
 
     init(
         force=command.force,
+        quick=command.quick,
         out_dir=pwd or OUTPUT_DIR,
     )

+ 11 - 11
archivebox/cli/archivebox_list.py

@@ -12,6 +12,7 @@ from ..main import list_all
 from ..util import docstring
 from ..config import OUTPUT_DIR
 from ..index import (
+    LINK_FILTERS,
     get_indexed_folders,
     get_archived_folders,
     get_unarchived_folders,
@@ -23,7 +24,7 @@ from ..index import (
     get_corrupted_folders,
     get_unrecognized_folders,
 )
-from ..logging_util import SmartFormatter, accept_stdin, stderr
+from ..logging_util import SmartFormatter, reject_stdin, stderr
 
 
 @docstring(list_all.__doc__)
@@ -44,7 +45,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
     group.add_argument(
         '--json', #'-j',
         action='store_true',
-        help="Print the output in JSON format with all columns included.",
+        help="Print the output in JSON format with all columns included",
     )
     group.add_argument(
         '--html',
@@ -59,19 +60,19 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
     parser.add_argument(
         '--sort', #'-s',
         type=str,
-        help="List the links sorted using the given key, e.g. timestamp or updated.",
+        help="List the links sorted using the given key, e.g. timestamp or updated",
         default=None,
     )
     parser.add_argument(
         '--before', #'-b',
         type=float,
-        help="List only links bookmarked before the given timestamp.",
+        help="List only links bookmarked before (less than) the given timestamp",
         default=None,
     )
     parser.add_argument(
         '--after', #'-a',
         type=float,
-        help="List only links bookmarked after the given timestamp.",
+        help="List only links bookmarked after (greater than or equal to) the given timestamp",
         default=None,
     )
     parser.add_argument(
@@ -96,9 +97,9 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
         )
     )
     parser.add_argument(
-        '--filter-type',
+        '--filter-type', '-t',
         type=str,
-        choices=('exact', 'substring', 'domain', 'regex', 'tag', 'search'),
+        choices=(*LINK_FILTERS.keys(), 'search'),
         default='exact',
         help='Type of pattern matching to use when filtering URLs',
     )
@@ -107,20 +108,19 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
         nargs='*',
         type=str,
         default=None,
-        help='List only URLs matching these filter patterns.'
+        help='List only URLs matching these filter patterns'
     )
     command = parser.parse_args(args or ())
-    filter_patterns_str = accept_stdin(stdin)
+    reject_stdin(stdin)
 
     if command.with_headers and not (command.json or command.html or command.csv):
         stderr(
-            '[X] --with-headers can only be used with --json, --html or --csv options.\n',
+            '[X] --with-headers can only be used with --json, --html or --csv options\n',
             color='red',
         )
         raise SystemExit(2)
 
     matching_folders = list_all(
-        filter_patterns_str=filter_patterns_str,
         filter_patterns=command.filter_patterns,
         filter_type=command.filter_type,
         status=command.status,

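Note: deriving the argparse choices from LINK_FILTERS keeps the CLI in sync with the filters the index code actually implements, instead of maintaining a second hardcoded tuple. A hedged sketch of the pattern (the dict values here are placeholders; only the key names come from the old hardcoded tuple):

    # illustrative only: a single source-of-truth mapping drives the CLI choices
    import argparse

    LINK_FILTERS = {'exact': ..., 'substring': ..., 'domain': ..., 'regex': ..., 'tag': ...}

    parser = argparse.ArgumentParser()
    parser.add_argument('--filter-type', '-t', choices=(*LINK_FILTERS.keys(), 'search'), default='exact')
    print(parser.parse_args(['-t', 'regex']).filter_type)  # -> 'regex'
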
+ 4 - 1
archivebox/cli/archivebox_oneshot.py

@@ -50,8 +50,11 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
         help= "Path to save the single archive folder to, e.g. ./example.com_archive"
     )
     command = parser.parse_args(args or ())
+    stdin_url = None
     url = command.url
-    stdin_url = accept_stdin(stdin)
+    if not url:
+        stdin_url = accept_stdin(stdin)
+
     if (stdin_url and url) or (not stdin and not url):
         stderr(
             '[X] You must pass a URL/path to add via stdin or CLI arguments.\n',

+ 4 - 1
archivebox/cli/archivebox_remove.py

@@ -61,7 +61,10 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
         help='URLs matching this filter pattern will be removed from the index.'
     )
     command = parser.parse_args(args or ())
-    filter_str = accept_stdin(stdin)
+
+    filter_str = None
+    if not command.filter_patterns:
+        filter_str = accept_stdin(stdin)
 
     remove(
         filter_str=filter_str,

+ 13 - 2
archivebox/cli/archivebox_server.py

@@ -38,10 +38,20 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
         action='store_true',
         help='Enable DEBUG=True mode with more verbose errors',
     )
+    parser.add_argument(
+        '--nothreading',
+        action='store_true',
+        help='Force runserver to run in single-threaded mode',
+    )
     parser.add_argument(
         '--init',
         action='store_true',
-        help='Run archivebox init before starting the server',
+        help='Run a full archivebox init/upgrade before starting the server',
+    )
+    parser.add_argument(
+        '--quick-init', '-i',
+        action='store_true',
+        help='Run quick archivebox init/upgrade before starting the server',
     )
     parser.add_argument(
         '--createsuperuser',
@@ -52,10 +62,11 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
     reject_stdin(__command__, stdin)
 
     server(
-        runserver_args=command.runserver_args,
+        runserver_args=command.runserver_args + (['--nothreading'] if command.nothreading else []),
         reload=command.reload,
         debug=command.debug,
         init=command.init,
+        quick_init=command.quick_init,
         createsuperuser=command.createsuperuser,
         out_dir=pwd or OUTPUT_DIR,
     )
+ 7 - 3
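Note: --nothreading is not passed to server() as its own kwarg; it is appended to runserver_args so Django's runserver receives it natively. A hypothetical invocation combining the new flags (address is illustrative; this would actually start a dev server):

    # hypothetical usage: quick init, then a single-threaded dev server
    from archivebox.cli import archivebox_server

    archivebox_server.main(['--quick-init', '--nothreading', '127.0.0.1:8000'])
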
archivebox/cli/archivebox_update.py

@@ -12,6 +12,7 @@ from ..main import update
 from ..util import docstring
 from ..config import OUTPUT_DIR
 from ..index import (
+    LINK_FILTERS,
     get_indexed_folders,
     get_archived_folders,
     get_unarchived_folders,
@@ -89,9 +90,9 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
         )
     )
     parser.add_argument(
-        '--filter-type',
+        '--filter-type', '-t',
         type=str,
-        choices=('exact', 'substring', 'domain', 'regex', 'tag', 'search'),
+        choices=(*LINK_FILTERS.keys(), 'search'),
         default='exact',
         help='Type of pattern matching to use when filtering URLs',
     )
@@ -110,7 +111,10 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
         default=""
     )
     command = parser.parse_args(args or ())
-    filter_patterns_str = accept_stdin(stdin)
+
+    filter_patterns_str = None
+    if not command.filter_patterns:
+        filter_patterns_str = accept_stdin(stdin)
 
     update(
         resume=command.resume,

+ 227 - 0
archivebox/cli/tests.py

@@ -0,0 +1,227 @@
+#!/usr/bin/env python3
+
+__package__ = 'archivebox.cli'
+
+
+import os
+import sys
+import shutil
+import unittest
+from pathlib import Path
+
+from contextlib import contextmanager
+
+TEST_CONFIG = {
+    'USE_COLOR': 'False',
+    'SHOW_PROGRESS': 'False',
+
+    'OUTPUT_DIR': 'data.tests',
+    
+    'SAVE_ARCHIVE_DOT_ORG': 'False',
+    'SAVE_TITLE': 'False',
+    
+    'USE_CURL': 'False',
+    'USE_WGET': 'False',
+    'USE_GIT': 'False',
+    'USE_CHROME': 'False',
+    'USE_YOUTUBEDL': 'False',
+}
+
+OUTPUT_DIR = 'data.tests'
+os.environ.update(TEST_CONFIG)
+
+from ..main import init
+from ..index import load_main_index
+from ..config import (
+    SQL_INDEX_FILENAME,
+    JSON_INDEX_FILENAME,
+    HTML_INDEX_FILENAME,
+)
+
+from . import (
+    archivebox_init,
+    archivebox_add,
+    archivebox_remove,
+)
+
+HIDE_CLI_OUTPUT = True
+
+test_urls = '''
+https://example1.com/what/is/happening.html?what=1#how-about-this=1
+https://example2.com/what/is/happening/?what=1#how-about-this=1
+HTtpS://example3.com/what/is/happening/?what=1#how-about-this=1f
+https://example4.com/what/is/happening.html
+https://example5.com/
+https://example6.com
+
+<test>http://example7.com</test>
+[https://example8.com/what/is/this.php?what=1]
+[and http://example9.com?what=1&other=3#and-thing=2]
+<what>https://example10.com#and-thing=2 "</about>
+abc<this["https://subb.example11.com/what/is#and-thing=2?whoami=23&where=1"]that>def
+sdflkf[what](https://subb.example12.com/who/what.php?whoami=1#whatami=2)?am=hi
+example13.bada
+and example14.badb
+<or>htt://example15.badc</that>
+'''
+
+stdout = sys.stdout
+stderr = sys.stderr
+
+
+@contextmanager
+def output_hidden(show_failing=True):
+    if not HIDE_CLI_OUTPUT:
+        yield
+        return
+
+    sys.stdout = open('stdout.txt', 'w+', encoding='utf-8')
+    sys.stderr = open('stderr.txt', 'w+', encoding='utf-8')
+    try:
+        yield
+        sys.stdout.close()
+        sys.stderr.close()
+        sys.stdout = stdout
+        sys.stderr = stderr
+    except Exception:
+        sys.stdout.close()
+        sys.stderr.close()
+        sys.stdout = stdout
+        sys.stderr = stderr
+        if show_failing:
+            with open('stdout.txt', 'r', encoding='utf-8') as f:
+                print(f.read())
+            with open('stderr.txt', 'r', encoding='utf-8') as f:
+                print(f.read())
+        raise
+    finally:
+        os.remove('stdout.txt')
+        os.remove('stderr.txt')
+
+
+class TestInit(unittest.TestCase):
+    def setUp(self):
+        os.makedirs(OUTPUT_DIR, exist_ok=True)
+
+    def tearDown(self):
+        shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
+
+    def test_basic_init(self):
+        with output_hidden():
+            archivebox_init.main([])
+
+        assert (Path(OUTPUT_DIR) / SQL_INDEX_FILENAME).exists()
+        assert (Path(OUTPUT_DIR) / JSON_INDEX_FILENAME).exists()
+        assert (Path(OUTPUT_DIR) / HTML_INDEX_FILENAME).exists()
+        assert len(load_main_index(out_dir=OUTPUT_DIR)) == 0
+
+    def test_conflicting_init(self):
+        with open(Path(OUTPUT_DIR) / 'test_conflict.txt', 'w+', encoding='utf-8') as f:
+            f.write('test')
+
+        try:
+            with output_hidden(show_failing=False):
+                archivebox_init.main([])
+            assert False, 'Init should have exited with an exception'
+        except SystemExit:
+            pass
+
+        assert not (Path(OUTPUT_DIR) / SQL_INDEX_FILENAME).exists()
+        assert not (Path(OUTPUT_DIR) / JSON_INDEX_FILENAME).exists()
+        assert not (Path(OUTPUT_DIR) / HTML_INDEX_FILENAME).exists()
+        try:
+            load_main_index(out_dir=OUTPUT_DIR)
+            assert False, 'load_main_index should raise an exception when no index is present'
+        except Exception:
+            pass
+
+    def test_no_dirty_state(self):
+        with output_hidden():
+            init()
+        shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
+        with output_hidden():
+            init()
+
+
+class TestAdd(unittest.TestCase):
+    def setUp(self):
+        os.makedirs(OUTPUT_DIR, exist_ok=True)
+        with output_hidden():
+            init()
+
+    def tearDown(self):
+        shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
+
+    def test_add_arg_url(self):
+        with output_hidden():
+            archivebox_add.main(['https://getpocket.com/users/nikisweeting/feed/all'])
+
+        all_links = load_main_index(out_dir=OUTPUT_DIR)
+        assert len(all_links) == 30
+
+    def test_add_arg_file(self):
+        test_file = Path(OUTPUT_DIR) / 'test.txt'
+        with open(test_file, 'w+', encoding='utf') as f:
+            f.write(test_urls)
+
+        with output_hidden():
+            archivebox_add.main([test_file])
+
+        all_links = load_main_index(out_dir=OUTPUT_DIR)
+        assert len(all_links) == 12
+        os.remove(test_file)
+
+    def test_add_stdin_url(self):
+        with output_hidden():
+            archivebox_add.main([], stdin=test_urls)
+
+        all_links = load_main_index(out_dir=OUTPUT_DIR)
+        assert len(all_links) == 12
+
+
+class TestRemove(unittest.TestCase):
+    def setUp(self):
+        os.makedirs(OUTPUT_DIR, exist_ok=True)
+        with output_hidden():
+            init()
+            archivebox_add.main([], stdin=test_urls)
+
+    # def tearDown(self):
+        # shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
+
+
+    def test_remove_exact(self):
+        with output_hidden():
+            archivebox_remove.main(['--yes', '--delete', 'https://example5.com/'])
+
+        all_links = load_main_index(out_dir=OUTPUT_DIR)
+        assert len(all_links) == 11
+
+    def test_remove_regex(self):
+        with output_hidden():
+            archivebox_remove.main(['--yes', '--delete', '--filter-type=regex', r'http(s)?:\/\/(.+\.)?(example\d\.com)'])
+
+        all_links = load_main_index(out_dir=OUTPUT_DIR)
+        assert len(all_links) == 4
+
+    def test_remove_domain(self):
+        with output_hidden():
+            archivebox_remove.main(['--yes', '--delete', '--filter-type=domain', 'example5.com', 'example6.com'])
+
+        all_links = load_main_index(out_dir=OUTPUT_DIR)
+        assert len(all_links) == 10
+
+    def test_remove_none(self):
+        try:
+            with output_hidden(show_failing=False):
+                archivebox_remove.main(['--yes', '--delete', 'https://doesntexist.com'])
+            assert False, 'Should raise if no URLs match'
+        except Exception:
+            pass
+
+
+if __name__ == '__main__':
+    if '--verbose' in sys.argv or '-v' in sys.argv:
+        HIDE_CLI_OUTPUT = False
+    
+    unittest.main()

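Note: the suite is plain unittest, so it can presumably be run as a module (python3 -m archivebox.cli.tests); passing -v/--verbose flips HIDE_CLI_OUTPUT off so the captured stdout/stderr is printed. A programmatic equivalent:

    # run the new CLI tests programmatically (module path assumed from __package__ above)
    import unittest

    unittest.main(module='archivebox.cli.tests', argv=['archivebox-tests'], exit=False)
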
+ 62 - 10
archivebox/config.py

@@ -29,10 +29,12 @@ import json
 import getpass
 import platform
 import shutil
+import sqlite3
 import django
 
 from hashlib import md5
 from pathlib import Path
+from datetime import datetime
 from typing import Optional, Type, Tuple, Dict, Union, List
 from subprocess import run, PIPE, DEVNULL
 from configparser import ConfigParser
@@ -77,6 +79,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
         'PUBLIC_SNAPSHOTS':         {'type': bool,  'default': True},
         'PUBLIC_ADD_VIEW':          {'type': bool,  'default': False},
         'FOOTER_INFO':              {'type': str,   'default': 'Content is hosted for personal archiving purposes only.  Contact server owner for any takedown requests.'},
+        'SNAPSHOTS_PER_PAGE':       {'type': int,   'default': 40},
     },
 
     'ARCHIVE_METHOD_TOGGLES': {
@@ -99,8 +102,9 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
 
     'ARCHIVE_METHOD_OPTIONS': {
         'RESOLUTION':               {'type': str,   'default': '1440,2000', 'aliases': ('SCREENSHOT_RESOLUTION',)},
-        'GIT_DOMAINS':              {'type': str,   'default': 'github.com,bitbucket.org,gitlab.com'},
+        'GIT_DOMAINS':              {'type': str,   'default': 'github.com,bitbucket.org,gitlab.com,gist.github.com'},
         'CHECK_SSL_VALIDITY':       {'type': bool,  'default': True},
+        'MEDIA_MAX_SIZE':           {'type': str,   'default': '750m'},
 
         'CURL_USER_AGENT':          {'type': str,   'default': 'ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) curl/{CURL_VERSION}'},
         'WGET_USER_AGENT':          {'type': str,   'default': 'ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) wget/{WGET_VERSION}'},
@@ -111,7 +115,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
 
         'CHROME_HEADLESS':          {'type': bool,  'default': True},
         'CHROME_SANDBOX':           {'type': bool,  'default': lambda c: not c['IN_DOCKER']},
-        'YOUTUBEDL_ARGS':           {'type': list,  'default': ['--write-description',
+        'YOUTUBEDL_ARGS':           {'type': list,  'default': lambda c: ['--write-description',
                                                                 '--write-info-json',
                                                                 '--write-annotations',
                                                                 '--write-thumbnail',
@@ -122,7 +126,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
                                                                 '--ignore-errors',
                                                                 '--geo-bypass',
                                                                 '--add-metadata',
-                                                                '--max-filesize=750m',
+                                                                '--max-filesize={}'.format(c['MEDIA_MAX_SIZE']),
                                                                 ]},
 
 
@@ -287,7 +291,6 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
 
     'ARCHIVEBOX_BINARY':        {'default': lambda c: sys.argv[0]},
     'VERSION':                  {'default': lambda c: json.loads((Path(c['PACKAGE_DIR']) / 'package.json').read_text().strip())['version']},
-    'GIT_SHA':                  {'default': lambda c: c['VERSION'].split('+')[-1] or 'unknown'},
 
     'PYTHON_BINARY':            {'default': lambda c: sys.executable},
     'PYTHON_ENCODING':          {'default': lambda c: sys.stdout.encoding.upper()},
@@ -459,7 +462,7 @@ def write_config_file(config: Dict[str, str], out_dir: str=None) -> ConfigDict:
     config_file.optionxform = str
     config_file.read(config_path)
 
-    with open(config_path, 'r') as old:
+    with open(config_path, 'r', encoding='utf-8') as old:
         atomic_write(f'{config_path}.bak', old.read())
 
     find_section = lambda key: [name for name, opts in CONFIG_SCHEMA.items() if key in opts][0]
@@ -480,14 +483,14 @@ def write_config_file(config: Dict[str, str], out_dir: str=None) -> ConfigDict:
 
     if (not existing_secret_key) or ('not a valid secret' in existing_secret_key):
         from django.utils.crypto import get_random_string
-        chars = 'abcdefghijklmnopqrstuvwxyz0123456789-_+!.'
+        chars = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_'
         random_secret_key = get_random_string(50, chars)
         if 'SERVER_CONFIG' in config_file:
             config_file['SERVER_CONFIG']['SECRET_KEY'] = random_secret_key
         else:
             config_file['SERVER_CONFIG'] = {'SECRET_KEY': random_secret_key}
 
-    with open(config_path, 'w+') as new:
+    with open(config_path, 'w+', encoding='utf-8') as new:
         config_file.write(new)
 
     try:
@@ -499,7 +502,7 @@ def write_config_file(config: Dict[str, str], out_dir: str=None) -> ConfigDict:
         }
     except:
         # something went horribly wrong, revert to the previous version
-        with open(f'{config_path}.bak', 'r') as old:
+        with open(f'{config_path}.bak', 'r', encoding='utf-8') as old:
             atomic_write(config_path, old.read())
 
     if Path(f'{config_path}.bak').exists():
@@ -1062,23 +1065,72 @@ def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG,
 
     try:
         import django
+        from django.core.management import call_command
+
         sys.path.append(str(config['PACKAGE_DIR']))
         os.environ.setdefault('OUTPUT_DIR', str(output_dir))
         assert (config['PACKAGE_DIR'] / 'core' / 'settings.py').exists(), 'settings.py was not found at archivebox/core/settings.py'
         os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings')
 
+        # Check to make sure JSON extension is available in our Sqlite3 instance
+        try:
+            cursor = sqlite3.connect(':memory:').cursor()
+            cursor.execute('SELECT JSON(\'{"a": "b"}\')')
+        except sqlite3.OperationalError as exc:
+            stderr('[X] Your SQLite3 version is missing the required JSON1 extension', color='red')
+            hint([
+                'Upgrade your Python version or install the extension manually:',
+                'https://code.djangoproject.com/wiki/JSON1Extension'
+            ])
+
         if in_memory_db:
-            # Put the db in memory and run migrations in case any command requires it
-            from django.core.management import call_command
+            # some commands (e.g. oneshot) don't store a long-lived sqlite3 db file on disk.
+            # in those cases we create a temporary in-memory db and run the migrations
+            # immediately to get a usable in-memory-database at startup
             os.environ.setdefault("ARCHIVEBOX_DATABASE_NAME", ":memory:")
             django.setup()
             call_command("migrate", interactive=False, verbosity=0)
         else:
+            # Otherwise use default sqlite3 file-based database and initialize django
+            # without running migrations automatically (user runs them manually by calling init)
             django.setup()
+
+        from django.conf import settings
+
+        # log startup message to the error log
+        with open(settings.ERROR_LOG, "a+", encoding='utf-8') as f:
+            command = ' '.join(sys.argv)
+            ts = datetime.now().strftime('%Y-%m-%d__%H:%M:%S')
+            f.write(f"\n> {command}; ts={ts} version={config['VERSION']} docker={config['IN_DOCKER']} is_tty={config['IS_TTY']}\n")
+
 
         if check_db:
+            # Enable WAL mode in sqlite3
+            from django.db import connection
+            with connection.cursor() as cursor:
+                current_mode = cursor.execute("PRAGMA journal_mode")
+                if current_mode != 'wal':
+                    cursor.execute("PRAGMA journal_mode=wal;")
+
+            # Create cache table in DB if needed
+            try:
+                from django.core.cache import cache
+                cache.get('test', None)
+            except django.db.utils.OperationalError:
+                call_command("createcachetable", verbosity=0)
+
+
+            # if archivebox gets imported multiple times, we have to close
+            # the sqlite3 whenever we init from scratch to avoid multiple threads
+            # sharing the same connection by accident
+            from django.db import connections
+            for conn in connections.all():
+                conn.close_if_unusable_or_obsolete()
+
             sql_index_path = Path(output_dir) / SQL_INDEX_FILENAME
             assert sql_index_path.exists(), (
                 f'No database file {SQL_INDEX_FILENAME} found in: {config["OUTPUT_DIR"]} (Are you in an ArchiveBox collection directory?)')
+
     except KeyboardInterrupt:
         raise SystemExit(2)

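Note: both new database checks are plain sqlite3 calls; a self-contained sketch of what setup_django now probes for is below. (Also worth noting: the diff appears to compare the cursor object returned by execute() to 'wal' rather than its fetched value, so the PRAGMA is effectively applied unconditionally; this is harmless since re-enabling WAL is a no-op.)

    # sketch of the two sqlite3 probes added above: JSON1 support and WAL journaling
    import sqlite3

    conn = sqlite3.connect(':memory:')
    try:
        conn.execute('SELECT JSON(\'{"a": "b"}\')')  # raises OperationalError without the JSON1 extension
        print('JSON1 extension available')
    except sqlite3.OperationalError:
        print('JSON1 missing: https://code.djangoproject.com/wiki/JSON1Extension')

    # the PRAGMA returns the resulting mode: file-backed dbs report 'wal',
    # in-memory dbs like this one report 'memory'
    print(conn.execute('PRAGMA journal_mode=wal;').fetchone()[0])
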
+ 163 - 71
archivebox/core/admin.py

@@ -1,6 +1,7 @@
 __package__ = 'archivebox.core'
 
 from io import StringIO
+from pathlib import Path
 from contextlib import redirect_stdout
 
 from django.contrib import admin
@@ -13,15 +14,15 @@ from django import forms
 
 from ..util import htmldecode, urldecode, ansi_to_html
 
-from core.models import Snapshot, Tag
-from core.forms import AddLinkForm, TagField
+from core.models import Snapshot, ArchiveResult, Tag
+from core.forms import AddLinkForm
 
 from core.mixins import SearchResultsAdminMixin
 
 from index.html import snapshot_icons
 from logging_util import printable_filesize
 from main import add, remove
-from config import OUTPUT_DIR
+from config import OUTPUT_DIR, SNAPSHOTS_PER_PAGE
 from extractors import archive_links
 
 # Admin URLs
@@ -36,77 +37,34 @@ from extractors import archive_links
 
 # TODO: https://stackoverflow.com/questions/40760880/add-custom-button-to-django-admin-panel
 
-def update_snapshots(modeladmin, request, queryset):
-    archive_links([
-        snapshot.as_link()
-        for snapshot in queryset
-    ], out_dir=OUTPUT_DIR)
-update_snapshots.short_description = "Archive"
 
-def update_titles(modeladmin, request, queryset):
-    archive_links([
-        snapshot.as_link()
-        for snapshot in queryset
-    ], overwrite=True, methods=('title','favicon'), out_dir=OUTPUT_DIR)
-update_titles.short_description = "Pull title"
+class ArchiveResultInline(admin.TabularInline):
+    model = ArchiveResult
 
-def overwrite_snapshots(modeladmin, request, queryset):
-    archive_links([
-        snapshot.as_link()
-        for snapshot in queryset
-    ], overwrite=True, out_dir=OUTPUT_DIR)
-overwrite_snapshots.short_description = "Re-archive (overwrite)"
+class TagInline(admin.TabularInline):
+    model = Snapshot.tags.through
 
-def verify_snapshots(modeladmin, request, queryset):
-    for snapshot in queryset:
-        print(snapshot.timestamp, snapshot.url, snapshot.is_archived, snapshot.archive_size, len(snapshot.history))
+from django.contrib.admin.helpers import ActionForm
 
-verify_snapshots.short_description = "Check"
 
-def delete_snapshots(modeladmin, request, queryset):
-    remove(snapshots=queryset, yes=True, delete=True, out_dir=OUTPUT_DIR)
-
-delete_snapshots.short_description = "Delete"
-
-
-class SnapshotAdminForm(forms.ModelForm):
-    tags = TagField(required=False)
-
-    class Meta:
-        model = Snapshot
-        fields = "__all__"
-
-    def save(self, commit=True):
-        # Based on: https://stackoverflow.com/a/49933068/3509554
-
-        # Get the unsave instance
-        instance = forms.ModelForm.save(self, False)
-        tags = self.cleaned_data.pop("tags")
-
-        #update save_m2m
-        def new_save_m2m():
-            instance.save_tags(tags)
-
-        # Do we need to save all changes now?
-        self.save_m2m = new_save_m2m
-        if commit:
-            instance.save()
-
-        return instance
+class SnapshotActionForm(ActionForm):
+    tag = forms.ModelChoiceField(queryset=Tag.objects.all(), required=False)
 
 
 class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
     list_display = ('added', 'title_str', 'url_str', 'files', 'size')
     sort_fields = ('title_str', 'url_str', 'added')
-    readonly_fields = ('id', 'url', 'timestamp', 'num_outputs', 'is_archived', 'url_hash', 'added', 'updated')
+    readonly_fields = ('uuid', 'num_outputs', 'is_archived', 'url_hash', 'added', 'updated')
     search_fields = ['url__icontains', 'timestamp', 'title', 'tags__name']
-    fields = (*readonly_fields, 'title', 'tags')
+    fields = ('timestamp', 'url', 'title', 'tags', *readonly_fields)
     list_filter = ('added', 'updated', 'tags')
    ordering = ['-added']
-    actions = [delete_snapshots, overwrite_snapshots, update_snapshots, update_titles, verify_snapshots]
-    actions_template = 'admin/actions_as_select.html'
-    form = SnapshotAdminForm
-    list_per_page = 40
+    actions = ['delete_snapshots', 'overwrite_snapshots', 'update_snapshots', 'update_titles', 'verify_snapshots', 'add_tag', 'remove_tag']
+    autocomplete_fields = ['tags']
+    inlines = [ArchiveResultInline]
+    list_per_page = SNAPSHOTS_PER_PAGE
+
+    action_form = SnapshotActionForm
 
     def get_urls(self):
         urls = super().get_urls()
@@ -116,21 +74,46 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
         return custom_urls + urls
 
     def get_queryset(self, request):
+        self.request = request
         return super().get_queryset(request).prefetch_related('tags')
 
     def tag_list(self, obj):
         return ', '.join(obj.tags.values_list('name', flat=True))
 
-    def id_str(self, obj):
+    # TODO: figure out a different way to do this, you can't nest forms so this doesn't work
+    # def action(self, obj):
+    #     # csrfmiddlewaretoken: Wa8UcQ4fD3FJibzxqHN3IYrrjLo4VguWynmbzzcPYoebfVUnDovon7GEMYFRgsh0
+    #     # action: update_snapshots
+    #     # select_across: 0
+    #     # _selected_action: 76d29b26-2a88-439e-877c-a7cca1b72bb3
+    #     return format_html(
+    #         '''
+    #             <form action="/admin/core/snapshot/" method="post" onsubmit="e => e.stopPropagation()">
+    #                 <input type="hidden" name="csrfmiddlewaretoken" value="{}">
+    #                 <input type="hidden" name="_selected_action" value="{}">
+    #                 <button name="update_snapshots">Check</button>
+    #                 <button name="update_titles">Pull title + favicon</button>
+    #                 <button name="update_snapshots">Update</button>
+    #                 <button name="overwrite_snapshots">Re-Archive (overwrite)</button>
+    #                 <button name="delete_snapshots">Permanently delete</button>
+    #             </form>
+    #         ''',
+    #         csrf.get_token(self.request),
+    #         obj.id,
+    #     )
+
+    def uuid(self, obj):
         return format_html(
-            '<code style="font-size: 10px">{}</code>',
-            obj.url_hash[:8],
+            '<code style="font-size: 10px">{}</code><br/><a href="/archive/{}">View index ➡️</a> &nbsp; &nbsp; <a href="/admin/core/snapshot/?id__exact={}">View actions ⚙️</a>',
+            obj.id,
+            obj.timestamp,
+            obj.id,
         )
 
     def title_str(self, obj):
         canon = obj.as_link().canonical_outputs()
         tags = ''.join(
-            format_html('<a href="/admin/core/snapshot/?tags__id__exact={}"><span class="tag">{}</span></a> ', tag.id, tag)
+            format_html('<a href="/admin/core/snapshot/?id__startswith={}"><span class="tag">{}</span></a> ', tag.id, tag)
             for tag in obj.tags.all()
             if str(tag).strip()
         )
@@ -152,7 +135,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
         return snapshot_icons(obj)
 
     def size(self, obj):
-        archive_size = obj.archive_size
+        archive_size = (Path(obj.link_dir) / 'index.html').exists() and obj.archive_size
         if archive_size:
             size_txt = printable_filesize(archive_size)
             if archive_size > 52428800:
@@ -190,28 +173,136 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
         rendered_response = self.changelist_view(request)
 
         # Restore values
-        self.change_list_template =  saved_change_list_template
+        self.change_list_template = saved_change_list_template
         self.list_per_page = saved_list_per_page
         self.list_max_show_all = saved_list_max_show_all
 
         return rendered_response
+
+
+    def update_snapshots(self, request, queryset):
+        archive_links([
+            snapshot.as_link()
+            for snapshot in queryset
+        ], out_dir=OUTPUT_DIR)
+    update_snapshots.short_description = "Archive"
+
+    def update_titles(self, request, queryset):
+        archive_links([
+            snapshot.as_link()
+            for snapshot in queryset
+        ], overwrite=True, methods=('title','favicon'), out_dir=OUTPUT_DIR)
+    update_titles.short_description = "Pull title"
+
+    def overwrite_snapshots(self, request, queryset):
+        archive_links([
+            snapshot.as_link()
+            for snapshot in queryset
+        ], overwrite=True, out_dir=OUTPUT_DIR)
+    overwrite_snapshots.short_description = "Re-archive (overwrite)"
+
+    def verify_snapshots(self, request, queryset):
+        for snapshot in queryset:
+            print(snapshot.timestamp, snapshot.url, snapshot.is_archived, snapshot.archive_size, len(snapshot.history))
+
+    verify_snapshots.short_description = "Check"
+
+    def delete_snapshots(self, request, queryset):
+        remove(snapshots=queryset, yes=True, delete=True, out_dir=OUTPUT_DIR)
+
+    delete_snapshots.short_description = "Delete"
+
+    def add_tag(self, request, queryset):
+        tag = request.POST['tag']
+        if tag:
+            for obj in queryset:
+                obj.tags.add(tag)
+
+    add_tag.short_description = "Add tag"
+
+    def remove_tag(self, request, queryset):
+        tag = request.POST['tag']
+        for obj in queryset:
+            obj.tags.remove(tag)
+
+    remove_tag.short_description = "Remove tag"
+
 
 
-    id_str.short_description = 'ID'
     title_str.short_description = 'Title'
     url_str.short_description = 'Original URL'
 
-    id_str.admin_order_field = 'id'
     title_str.admin_order_field = 'title'
     url_str.admin_order_field = 'url'
 
+
+
 class TagAdmin(admin.ModelAdmin):
-    list_display = ('slug', 'name', 'id')
+    list_display = ('slug', 'name', 'num_snapshots', 'snapshots', 'id')
     sort_fields = ('id', 'name', 'slug')
-    readonly_fields = ('id',)
+    readonly_fields = ('id', 'num_snapshots', 'snapshots')
     search_fields = ('id', 'name', 'slug')
     fields = (*readonly_fields, 'name', 'slug')
+    actions = ['delete_selected']
+    ordering = ['-id']
+
+    def num_snapshots(self, obj):
+        return format_html(
+            '<a href="/admin/core/snapshot/?tags__id__exact={}">{} total</a>',
+            obj.id,
+            obj.snapshot_set.count(),
+        )
+
+    def snapshots(self, obj):
+        total_count = obj.snapshot_set.count()
+        return mark_safe('<br/>'.join(
+            format_html(
+                '{} <code><a href="/admin/core/snapshot/{}/change"><b>[{}]</b></a> {}</code>',
+                snap.updated.strftime('%Y-%m-%d %H:%M') if snap.updated else 'pending...',
+                snap.id,
+                snap.timestamp,
+                snap.url,
+            )
+            for snap in obj.snapshot_set.order_by('-updated')[:10]
+        ) + (f'<br/><a href="/admin/core/snapshot/?tags__id__exact={obj.id}">and {total_count-10} more...</a>' if obj.snapshot_set.count() > 10 else ''))
+
+
+class ArchiveResultAdmin(admin.ModelAdmin):
+    list_display = ('id', 'start_ts', 'extractor', 'snapshot_str', 'cmd_str', 'status', 'output_str')
+    sort_fields = ('start_ts', 'extractor', 'status')
+    readonly_fields = ('id', 'uuid', 'snapshot_str')
+    search_fields = ('id', 'uuid', 'snapshot__url', 'extractor', 'output', 'cmd_version', 'cmd', 'snapshot__timestamp')
+    fields = (*readonly_fields, 'snapshot', 'snapshot__tags', 'extractor', 'status', 'start_ts', 'end_ts', 'pwd', 'cmd', 'cmd_version', 'output')
+    autocomplete_fields = ['snapshot']
+
+    list_filter = ('status', 'extractor', 'start_ts', 'cmd_version')
+    ordering = ['-start_ts']
+    list_per_page = SNAPSHOTS_PER_PAGE
+
+    def snapshot_str(self, obj):
+        return format_html(
+            '<a href="/archive/{}/index.html"><b><code>[{}]</code></b></a><br/>'
+            '<small>{}</small>',
+            obj.snapshot.timestamp,
+            obj.snapshot.timestamp,
+            obj.snapshot.url[:128],
+        )
+
+    def cmd_str(self, obj):
+        return format_html(
+            '<pre>{}</pre>',
+            ' '.join(obj.cmd) if isinstance(obj.cmd, list) else str(obj.cmd),
+        )
+
+    def output_str(self, obj):
+        return format_html(
+            '<a href="/archive/{}/{}" class="output-link">↗️</a><pre>{}</pre>',
+            obj.snapshot.timestamp,
+            obj.output if (obj.status == 'succeeded') and obj.extractor not in ('title', 'archive_org') else 'index.html',
+            obj.output,
        )
 
+    snapshot_str.short_description = 'snapshot'
 
 class ArchiveBoxAdmin(admin.AdminSite):
     site_header = 'ArchiveBox'
@@ -266,4 +357,5 @@ admin.site = ArchiveBoxAdmin()
 admin.site.register(get_user_model())
 admin.site.register(Snapshot, SnapshotAdmin)
 admin.site.register(Tag, TagAdmin)
+admin.site.register(ArchiveResult, ArchiveResultAdmin)
 admin.site.disable_action('delete_selected')

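Note: the new add_tag/remove_tag actions rely on Django's ActionForm hook, which renders extra inputs next to the actions dropdown and makes their values available on request.POST inside every action. A minimal sketch of the pattern in isolation (model and class names reuse the ones from the diff above):

    # minimal sketch of the ActionForm pattern used by SnapshotAdmin above
    from django import forms
    from django.contrib.admin.helpers import ActionForm
    from core.models import Tag

    class SnapshotActionForm(ActionForm):
        # rendered alongside the action <select>; submitted with the action POST
        tag = forms.ModelChoiceField(queryset=Tag.objects.all(), required=False)

    # in the ModelAdmin:
    #   action_form = SnapshotActionForm
    # in any action method:
    #   tag_pk = request.POST.get('tag')  # pk of the chosen Tag (a string), or ''
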
+ 2 - 1
archivebox/core/forms.py

@@ -20,7 +20,8 @@ ARCHIVE_METHODS = [
 
 class AddLinkForm(forms.Form):
     url = forms.RegexField(label="URLs (one per line)", regex=URL_REGEX, min_length='6', strip=True, widget=forms.Textarea, required=True)
-    depth = forms.ChoiceField(label="Archive depth", choices=CHOICES, widget=forms.RadioSelect, initial='0')
+    tag = forms.CharField(label="Tags (comma separated tag1,tag2,tag3)", strip=True, required=False)
+    depth = forms.ChoiceField(label="Archive depth", choices=CHOICES, initial='0', widget=forms.RadioSelect(attrs={"class": "depth-selection"}))
     archive_methods = forms.MultipleChoiceField(
         label="Archive methods (select at least 1, otherwise all will be used by default)",
         required=False,

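The new tag field accepts a single comma-separated string; AddView.form_valid() (further down in this diff) passes it straight through to add(). Blank segments get dropped when the tags are eventually saved (see Snapshot.save_tags() below, which skips empty names), so the implied normalization looks roughly like this sketch (the helper name is hypothetical):

    def parse_tags(tag_str):
        # 'news, tech ,, ' -> ['news', 'tech']  (whitespace trimmed, empty segments dropped)
        return [t.strip() for t in tag_str.split(',') if t.strip()]

    assert parse_tags('news, tech ,, ') == ['news', 'tech']
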
+ 18 - 0
archivebox/core/migrations/0009_auto_20210216_1038.py

@@ -0,0 +1,18 @@
+# Generated by Django 3.1.3 on 2021-02-16 10:38
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('core', '0008_auto_20210105_1421'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='snapshot',
+            name='updated',
+            field=models.DateTimeField(auto_now=True, db_index=True, null=True),
+        ),
+    ]

+ 18 - 0
archivebox/core/migrations/0010_auto_20210216_1055.py

@@ -0,0 +1,18 @@
+# Generated by Django 3.1.3 on 2021-02-16 10:55
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('core', '0009_auto_20210216_1038'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='archiveresult',
+            name='start_ts',
+            field=models.DateTimeField(db_index=True),
+        ),
+    ]

+ 24 - 0
archivebox/core/migrations/0011_auto_20210216_1331.py

@@ -0,0 +1,24 @@
+# Generated by Django 3.1.3 on 2021-02-16 13:31
+
+from django.db import migrations, models
+import uuid
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('core', '0010_auto_20210216_1055'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='archiveresult',
+            name='uuid',
+            field=models.UUIDField(default=uuid.uuid4, editable=False),
+        ),
+        migrations.AlterField(
+            model_name='archiveresult',
+            name='extractor',
+            field=models.CharField(choices=[('title', 'title'), ('favicon', 'favicon'), ('headers', 'headers'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('wget', 'wget'), ('readability', 'readability'), ('mercury', 'mercury'), ('git', 'git'), ('media', 'media'), ('archive_org', 'archive_org')], max_length=32),
+        ),
+    ]
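
One caveat worth knowing about this AddField: Django evaluates a callable default like uuid.uuid4 only once per operation, so every ArchiveResult row that already exists receives the same UUID; only rows created afterwards get distinct values. That is harmless here because the column is not unique, but if distinct values were ever required for old rows, a follow-up data migration along these lines (hypothetical, not part of this PR) would backfill them:

    import uuid
    from django.db import migrations

    def backfill_uuids(apps, schema_editor):
        # give each pre-existing row its own UUID
        ArchiveResult = apps.get_model('core', 'ArchiveResult')
        for result in ArchiveResult.objects.all().iterator():
            result.uuid = uuid.uuid4()
            result.save(update_fields=['uuid'])

    class Migration(migrations.Migration):
        dependencies = [('core', '0011_auto_20210216_1331')]
        operations = [migrations.RunPython(backfill_uuids, migrations.RunPython.noop)]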

+ 23 - 0
archivebox/core/migrations/0012_auto_20210216_1425.py

@@ -0,0 +1,23 @@
+# Generated by Django 3.1.3 on 2021-02-16 14:25
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('core', '0011_auto_20210216_1331'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='archiveresult',
+            name='cmd_version',
+            field=models.CharField(blank=True, default=None, max_length=128, null=True),
+        ),
+        migrations.AlterField(
+            model_name='archiveresult',
+            name='output',
+            field=models.CharField(max_length=1024),
+        ),
+    ]

+ 18 - 0
archivebox/core/migrations/0013_auto_20210218_0729.py

@@ -0,0 +1,18 @@
+# Generated by Django 3.1.3 on 2021-02-18 07:29
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('core', '0012_auto_20210216_1425'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='snapshot',
+            name='title',
+            field=models.CharField(blank=True, db_index=True, max_length=256, null=True),
+        ),
+    ]

+ 18 - 0
archivebox/core/migrations/0014_auto_20210218_0729.py

@@ -0,0 +1,18 @@
+# Generated by Django 3.1.3 on 2021-02-18 07:29
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('core', '0013_auto_20210218_0729'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='snapshot',
+            name='title',
+            field=models.CharField(blank=True, db_index=True, max_length=1024, null=True),
+        ),
+    ]

+ 18 - 0
archivebox/core/migrations/0015_auto_20210218_0730.py

@@ -0,0 +1,18 @@
+# Generated by Django 3.1.3 on 2021-02-18 07:30
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('core', '0014_auto_20210218_0729'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='snapshot',
+            name='title',
+            field=models.CharField(blank=True, db_index=True, max_length=512, null=True),
+        ),
+    ]
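
Migrations 0013-0015 adjust Snapshot.title's max_length three times in quick succession (256, then 1024, then 512, matching the model definition later in this diff). On SQLite the declared VARCHAR length is not enforced by the database itself, so the practical effect of the final limit is on Django-side validation, as in this sketch (assumes the final max_length=512):

    from django.core.exceptions import ValidationError

    snap = Snapshot(url='https://example.com', timestamp='1600000000.0', title='x' * 2000)
    try:
        snap.full_clean()  # raises: title exceeds max_length=512
    except ValidationError as e:
        print(e.message_dict['title'])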

+ 18 - 0
archivebox/core/migrations/0016_auto_20210218_1204.py

@@ -0,0 +1,18 @@
+# Generated by Django 3.1.3 on 2021-02-18 12:04
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('core', '0015_auto_20210218_0730'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='snapshot',
+            name='tags',
+            field=models.ManyToManyField(blank=True, to='core.Tag'),
+        ),
+    ]

+ 18 - 0
archivebox/core/migrations/0017_auto_20210219_0211.py

@@ -0,0 +1,18 @@
+# Generated by Django 3.1.3 on 2021-02-19 02:11
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('core', '0016_auto_20210218_1204'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='tag',
+            name='slug',
+            field=models.SlugField(blank=True, max_length=100, unique=True, verbose_name='slug'),
+        ),
+    ]

+ 90 - 38
archivebox/core/models.py

@@ -2,12 +2,15 @@ __package__ = 'archivebox.core'
 
 import uuid
 
-from django.db import models, transaction
+from django.db import models
 from django.utils.functional import cached_property
 from django.utils.text import slugify
+from django.core.cache import cache
 from django.db.models import Case, When, Value, IntegerField
 
-from ..util import parse_date
+from ..config import ARCHIVE_DIR, ARCHIVE_DIR_NAME
+from ..system import get_dir_size
+from ..util import parse_date, base_url, hashurl
 from ..index.schema import Link
 from ..extractors import get_default_archive_methods, ARCHIVE_METHODS_INDEXING_PRECEDENCE
 
@@ -29,8 +32,11 @@ class Tag(models.Model):
     """
     Based on django-taggit model
     """
-    name = models.CharField(verbose_name="name", unique=True, blank=False, max_length=100)
-    slug = models.SlugField(verbose_name="slug", unique=True, max_length=100)
+    name = models.CharField(unique=True, blank=False, max_length=100)
+
+    # slug is autoset on save from name, never set it manually
+    slug = models.SlugField(unique=True, blank=True, max_length=100)
+
 
     class Meta:
         verbose_name = "Tag"
@@ -49,20 +55,21 @@
         if self._state.adding and not self.slug:
             self.slug = self.slugify(self.name)
 
-            with transaction.atomic():
-                slugs = set(
-                    type(self)
-                    ._default_manager.filter(slug__startswith=self.slug)
-                    .values_list("slug", flat=True)
-                )
-
-                i = None
-                while True:
-                    slug = self.slugify(self.name, i)
-                    if slug not in slugs:
-                        self.slug = slug
-                        return super().save(*args, **kwargs)
-                    i = 1 if i is None else i+1
+            # if the name is different but the slug conflicts with another tag's slug, append a counter
+            # with transaction.atomic():
+            slugs = set(
+                type(self)
+                ._default_manager.filter(slug__startswith=self.slug)
+                .values_list("slug", flat=True)
+            )
+
+            i = None
+            while True:
+                slug = self.slugify(self.name, i)
+                if slug not in slugs:
+                    self.slug = slug
+                    return super().save(*args, **kwargs)
+                i = 1 if i is None else i+1
         else:
             return super().save(*args, **kwargs)
 
@@ -73,11 +80,11 @@ class Snapshot(models.Model):
     url = models.URLField(unique=True)
     timestamp = models.CharField(max_length=32, unique=True, db_index=True)
 
-    title = models.CharField(max_length=128, null=True, blank=True, db_index=True)
+    title = models.CharField(max_length=512, null=True, blank=True, db_index=True)
 
     added = models.DateTimeField(auto_now_add=True, db_index=True)
-    updated = models.DateTimeField(null=True, blank=True, db_index=True)
-    tags = models.ManyToManyField(Tag)
+    updated = models.DateTimeField(auto_now=True, blank=True, null=True, db_index=True)
+    tags = models.ManyToManyField(Tag, blank=True)
 
     keys = ('url', 'timestamp', 'title', 'tags', 'updated')
 
@@ -109,13 +116,24 @@ class Snapshot(models.Model):
         from ..index import load_link_details
         return load_link_details(self.as_link())
 
-    def tags_str(self) -> str:
-        return ','.join(self.tags.order_by('name').values_list('name', flat=True))
+    def tags_str(self, nocache=True) -> str:
+        cache_key = f'{self.id}-{(self.updated or self.added).timestamp()}-tags'
+        calc_tags_str = lambda: ','.join(self.tags.order_by('name').values_list('name', flat=True))
+        if nocache:
+            tags_str = calc_tags_str()
+            cache.set(cache_key, tags_str)
+            return tags_str
+        return cache.get_or_set(cache_key, calc_tags_str)
 
     @cached_property
     def bookmarked(self):
         return parse_date(self.timestamp)
 
+    @cached_property
+    def bookmarked_date(self):
+        # TODO: remove this
+        return self.bookmarked
+
     @cached_property
     def is_archived(self):
         return self.as_link().is_archived
@@ -126,23 +144,31 @@
 
     @cached_property
     def url_hash(self):
-        return self.as_link().url_hash
+        return hashurl(self.url)
 
     @cached_property
     def base_url(self):
-        return self.as_link().base_url
+        return base_url(self.url)
 
     @cached_property
     def link_dir(self):
-        return self.as_link().link_dir
+        return str(ARCHIVE_DIR / self.timestamp)
 
     @cached_property
     def archive_path(self):
-        return self.as_link().archive_path
+        return '{}/{}'.format(ARCHIVE_DIR_NAME, self.timestamp)
 
     @cached_property
     def archive_size(self):
-        return self.as_link().archive_size
+        cache_key = f'{str(self.id)[:12]}-{(self.updated or self.added).timestamp()}-size'
+
+        def calc_dir_size():
+            try:
+                return get_dir_size(self.link_dir)[0]
+            except Exception:
+                return 0
+
+        return cache.get_or_set(cache_key, calc_dir_size)
 
     @cached_property
     def history(self):
@@ -151,17 +177,40 @@
 
     @cached_property
     def latest_title(self):
-        if ('title' in self.history
-            and self.history['title']
-            and (self.history['title'][-1].status == 'succeeded')
-            and self.history['title'][-1].output.strip()):
-            return self.history['title'][-1].output.strip()
+        if self.title:
+            return self.title   # whoopdedoo that was easy
+        
+        try:
+            # take longest successful title from ArchiveResult db history
+            return sorted(
+                self.archiveresult_set\
+                    .filter(extractor='title', status='succeeded', output__isnull=False)\
+                    .values_list('output', flat=True),
+                key=lambda r: len(r),
+            )[-1]
+        except IndexError:
+            pass
+
+        try:
+            # take longest successful title from Link json index file history
+            return sorted(
+                (
+                    result.output.strip()
+                    for result in self.history['title']
+                    if result.status == 'succeeded' and result.output.strip()
+                ),
+                key=lambda r: len(r),
+            )[-1]
+        except (KeyError, IndexError):
+            pass
+
         return None
 
     def save_tags(self, tags=()):
         tags_id = []
         for tag in tags:
-            tags_id.append(Tag.objects.get_or_create(name=tag)[0].id)
+            if tag.strip():
+                tags_id.append(Tag.objects.get_or_create(name=tag)[0].id)
         self.tags.clear()
         self.tags.add(*tags_id)
 
@@ -178,15 +227,18 @@ class ArchiveResultManager(models.Manager):
 
 
 class ArchiveResult(models.Model):
+    id = models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')
+    uuid = models.UUIDField(default=uuid.uuid4, editable=False)
+
     snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE)
+    extractor = models.CharField(choices=EXTRACTORS, max_length=32)
     cmd = JSONField()
     pwd = models.CharField(max_length=256)
-    cmd_version = models.CharField(max_length=32, default=None, null=True, blank=True)
-    output = models.CharField(max_length=512)
-    start_ts = models.DateTimeField()
+    cmd_version = models.CharField(max_length=128, default=None, null=True, blank=True)
+    output = models.CharField(max_length=1024)
+    start_ts = models.DateTimeField(db_index=True)
     end_ts = models.DateTimeField()
     status = models.CharField(max_length=16, choices=STATUS_CHOICES)
-    extractor = models.CharField(choices=EXTRACTORS, max_length=32)
 
     objects = ArchiveResultManager()
 

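tags_str() and archive_size above share one caching idea: the cache key embeds the Snapshot's updated timestamp, and since updated now has auto_now=True, every save() produces a fresh key, so stale entries are simply never read again and no explicit invalidation is needed. Distilled into a standalone sketch (the helper name is hypothetical):

    from django.core.cache import cache

    def cached_per_version(obj, suffix, compute):
        # the key changes whenever the row is saved, so old entries just expire unused
        key = f'{obj.id}-{(obj.updated or obj.added).timestamp()}-{suffix}'
        return cache.get_or_set(key, compute)

    # e.g. cached_per_version(snapshot, 'size', lambda: get_dir_size(snapshot.link_dir)[0])
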
+ 127 - 1
archivebox/core/settings.py

@@ -2,6 +2,9 @@ __package__ = 'archivebox.core'
 
 import os
 import sys
+import re
+import logging
+import tempfile
 
 from pathlib import Path
 from django.utils.crypto import get_random_string
@@ -14,6 +17,7 @@ from ..config import (
     TEMPLATES_DIR_NAME,
     SQL_INDEX_FILENAME,
     OUTPUT_DIR,
+    LOGS_DIR,
 )
 
 
@@ -62,6 +66,40 @@ AUTHENTICATION_BACKENDS = [
     'django.contrib.auth.backends.ModelBackend',
 ]
 
+# only enable the debug toolbar when in DEBUG mode with --nothreading (it doesn't work in multithreaded mode)
+DEBUG_TOOLBAR = DEBUG and ('--nothreading' in sys.argv) and ('--reload' not in sys.argv)
+if DEBUG_TOOLBAR:
+    try:
+        import debug_toolbar   # noqa
+        DEBUG_TOOLBAR = True
+    except ImportError:
+        DEBUG_TOOLBAR = False
+
+if DEBUG_TOOLBAR:
+    INSTALLED_APPS = [*INSTALLED_APPS, 'debug_toolbar']
+    INTERNAL_IPS = ['0.0.0.0', '127.0.0.1', '*']
+    DEBUG_TOOLBAR_CONFIG = {
+        "SHOW_TOOLBAR_CALLBACK": lambda request: True,
+        "RENDER_PANELS": True,
+    }
+    DEBUG_TOOLBAR_PANELS = [
+        'debug_toolbar.panels.history.HistoryPanel',
+        'debug_toolbar.panels.versions.VersionsPanel',
+        'debug_toolbar.panels.timer.TimerPanel',
+        'debug_toolbar.panels.settings.SettingsPanel',
+        'debug_toolbar.panels.headers.HeadersPanel',
+        'debug_toolbar.panels.request.RequestPanel',
+        'debug_toolbar.panels.sql.SQLPanel',
+        'debug_toolbar.panels.staticfiles.StaticFilesPanel',
+        # 'debug_toolbar.panels.templates.TemplatesPanel',
+        'debug_toolbar.panels.cache.CachePanel',
+        'debug_toolbar.panels.signals.SignalsPanel',
+        'debug_toolbar.panels.logging.LoggingPanel',
+        'debug_toolbar.panels.redirects.RedirectsPanel',
+        'debug_toolbar.panels.profiling.ProfilingPanel',
+        'djdt_flamegraph.FlamegraphPanel',
+    ]
+    MIDDLEWARE = [*MIDDLEWARE, 'debug_toolbar.middleware.DebugToolbarMiddleware']
 
 
 ################################################################################
 ################################################################################
 ### Staticfile and Template Settings
 ### Staticfile and Template Settings
@@ -107,6 +145,22 @@ DATABASES = {
     'default': {
     'default': {
         'ENGINE': 'django.db.backends.sqlite3',
         'NAME': DATABASE_NAME,
+            'timeout': 60,
+            'check_same_thread': False,
+        },
+        # DB setup is sometimes modified at runtime by setup_django() in config.py
+    }
+}
+
+CACHE_BACKEND = 'django.core.cache.backends.locmem.LocMemCache'
+# CACHE_BACKEND = 'django.core.cache.backends.db.DatabaseCache'
+# CACHE_BACKEND = 'django.core.cache.backends.dummy.DummyCache'
+
+CACHES = {
+    'default': {
+        'BACKEND': CACHE_BACKEND,
+        'LOCATION': 'django_cache_default',
     }
     }
 }
 
 ### Security Settings
 ### Security Settings
 ################################################################################
 
+SECRET_KEY = SECRET_KEY or get_random_string(50, 'abcdefghijklmnopqrstuvwxyz0123456789_')
 
 
 ALLOWED_HOSTS = ALLOWED_HOSTS.split(',')
 ALLOWED_HOSTS = ALLOWED_HOSTS.split(',')
 
 
@@ -131,6 +185,8 @@ SESSION_COOKIE_AGE = 1209600  # 2 weeks
 SESSION_EXPIRE_AT_BROWSER_CLOSE = False
 SESSION_EXPIRE_AT_BROWSER_CLOSE = False
 SESSION_SAVE_EVERY_REQUEST = True
 
+
 AUTH_PASSWORD_VALIDATORS = [
 AUTH_PASSWORD_VALIDATORS = [
     {'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator'},
     {'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator'},
 
 
 DATETIME_FORMAT = 'Y-m-d g:iA'
 DATETIME_FORMAT = 'Y-m-d g:iA'
 SHORT_DATETIME_FORMAT = 'Y-m-d h:iA'
 SHORT_DATETIME_FORMAT = 'Y-m-d h:iA'
+
+
+################################################################################
+### Logging Settings
+################################################################################
+
+IGNORABLE_404_URLS = [
+    re.compile(r'apple-touch-icon.*\.png$'),
+    re.compile(r'favicon\.ico$'),
+    re.compile(r'robots\.txt$'),
+    re.compile(r'.*\.(css|js)\.map$'),
+]
+
+class NoisyRequestsFilter(logging.Filter):
+    def filter(self, record):
+        logline = record.getMessage()
+
+        # ignore harmless 404s for the patterns in IGNORABLE_404_URLS
+        for ignorable_url_pattern in IGNORABLE_404_URLS:
+            ignorable_log_pattern = re.compile(f'^"GET /.*/?{ignorable_url_pattern.pattern[:-1]} HTTP/.*" (200|30.|404) .+$', re.I | re.M)
+            if ignorable_log_pattern.match(logline):
+                return 0
+
+        # ignore staticfile requests that 200 or 30*
+        ignorable_200_log_pattern = re.compile(r'"GET /static/.* HTTP/.*" (200|30.) .+', re.I | re.M)
+        if ignorable_200_log_pattern.match(logline):
+            return 0
+
+        return 1
+
+if LOGS_DIR.exists():
+    ERROR_LOG = (LOGS_DIR / 'errors.log')
+else:
+    # too many edge cases here around creating the log dir with the correct permissions,
+    # so just write errors to a throwaway tempfile and let users rely on stdout/stderr instead
+    ERROR_LOG = tempfile.NamedTemporaryFile().name
+
+LOGGING = {
+    'version': 1,
+    'disable_existing_loggers': False,
+    'handlers': {
+        'console': {
+            'class': 'logging.StreamHandler',
+        },
+        'logfile': {
+            'level': 'ERROR',
+            'class': 'logging.handlers.RotatingFileHandler',
+            'filename': ERROR_LOG,
+            'maxBytes': 1024 * 1024 * 25,  # 25 MB
+            'backupCount': 10,
+        },
+    },
+    'filters': {
+        'noisyrequestsfilter': {
+            '()': NoisyRequestsFilter,
+        }
+    },
+    'loggers': {
+        'django': {
+            'handlers': ['console', 'logfile'],
+            'level': 'INFO',
+            'filters': ['noisyrequestsfilter'],
+        },
+        'django.server': {
+            'handlers': ['console', 'logfile'],
+            'level': 'INFO',
+            'filters': ['noisyrequestsfilter'],
+        }
+    },
+}

+ 43 - 34
archivebox/core/urls.py

@@ -2,6 +2,7 @@ from django.contrib import admin
 
 
 from django.urls import path, include
 from django.urls import path, include
 from django.views import static
 from django.views import static
+from django.contrib.staticfiles.urls import staticfiles_urlpatterns
 from django.conf import settings
 from django.conf import settings
 from django.views.generic.base import RedirectView
 from django.views.generic.base import RedirectView
 
 
@@ -13,8 +14,8 @@ from core.views import HomepageView, SnapshotView, PublicIndexView, AddView
 urlpatterns = [
 urlpatterns = [
     path('public/', PublicIndexView.as_view(), name='public-index'),
 
-    path('favicon.ico', static.serve, {'document_root': settings.OUTPUT_DIR, 'path': 'favicon.ico'}),
+    path('robots.txt', static.serve, {'document_root': settings.STATICFILES_DIRS[0], 'path': 'robots.txt'}),
+    path('favicon.ico', static.serve, {'document_root': settings.STATICFILES_DIRS[0], 'path': 'favicon.ico'}),
 
 
     path('docs/', RedirectView.as_view(url='https://github.com/ArchiveBox/ArchiveBox/wiki'), name='Docs'),
     path('docs/', RedirectView.as_view(url='https://github.com/ArchiveBox/ArchiveBox/wiki'), name='Docs'),
 
 
@@ -35,35 +36,43 @@ urlpatterns = [
     path('index.json', static.serve, {'document_root': settings.OUTPUT_DIR, 'path': 'index.json'}),
     path('index.json', static.serve, {'document_root': settings.OUTPUT_DIR, 'path': 'index.json'}),
     path('', HomepageView.as_view(), name='Home'),
 ]
-    # # Proposed UI URLs spec
-    # path('',                 HomepageView)
-    # path('/add',             AddView)
-    # path('/public',          PublicIndexView)
-    # path('/snapshot/:slug',  SnapshotView)
-    
-    # path('/admin',           admin.site.urls)
-    # path('/accounts',        django.contrib.auth.urls)
-
-    # # Prposed REST API spec
-    # # :slugs can be uuid, short_uuid, or any of the unique index_fields
-    # path('api/v1/'),
-    # path('api/v1/core/'                      [GET])
-    # path('api/v1/core/snapshot/',            [GET, POST, PUT]),
-    # path('api/v1/core/snapshot/:slug',       [GET, PATCH, DELETE]),
-    # path('api/v1/core/archiveresult',        [GET, POST, PUT]),
-    # path('api/v1/core/archiveresult/:slug',  [GET, PATCH, DELETE]),
-    # path('api/v1/core/tag/',                 [GET, POST, PUT]),
-    # path('api/v1/core/tag/:slug',            [GET, PATCH, DELETE]),
-
-    # path('api/v1/cli/',                      [GET])
-    # path('api/v1/cli/{add,list,config,...}', [POST]),  # pass query as kwargs directly to `run_subcommand` and return stdout, stderr, exitcode
-
-    # path('api/v1/extractors/',                    [GET])
-    # path('api/v1/extractors/:extractor/',         [GET]),
-    # path('api/v1/extractors/:extractor/:func',    [GET, POST]),  # pass query as args directly to chosen function
-
-    # future, just an idea:
-    # path('api/v1/scheduler/',                [GET])
-    # path('api/v1/scheduler/task/',           [GET, POST, PUT]),
-    # path('api/v1/scheduler/task/:slug',      [GET, PATCH, DELETE]),
+urlpatterns += staticfiles_urlpatterns()
+
+if settings.DEBUG_TOOLBAR:
+    import debug_toolbar
+    urlpatterns += [
+        path('__debug__/', include(debug_toolbar.urls)),
+    ]
+
+
+# # Proposed FUTURE URLs spec
+# path('',                 HomepageView)
+# path('/add',             AddView)
+# path('/public',          PublicIndexView)
+# path('/snapshot/:slug',  SnapshotView)
+
+# path('/admin',           admin.site.urls)
+# path('/accounts',        django.contrib.auth.urls)
+
+# # Proposed REST API spec
+# # :slugs can be uuid, short_uuid, or any of the unique index_fields
+# path('api/v1/'),
+# path('api/v1/core/'                      [GET])
+# path('api/v1/core/snapshot/',            [GET, POST, PUT]),
+# path('api/v1/core/snapshot/:slug',       [GET, PATCH, DELETE]),
+# path('api/v1/core/archiveresult',        [GET, POST, PUT]),
+# path('api/v1/core/archiveresult/:slug',  [GET, PATCH, DELETE]),
+# path('api/v1/core/tag/',                 [GET, POST, PUT]),
+# path('api/v1/core/tag/:slug',            [GET, PATCH, DELETE]),
+
+# path('api/v1/cli/',                      [GET])
+# path('api/v1/cli/{add,list,config,...}', [POST]),  # pass query as kwargs directly to `run_subcommand` and return stdout, stderr, exitcode
+
+# path('api/v1/extractors/',                    [GET])
+# path('api/v1/extractors/:extractor/',         [GET]),
+# path('api/v1/extractors/:extractor/:func',    [GET, POST]),  # pass query as args directly to chosen function
+
+# future, just an idea:
+# path('api/v1/scheduler/',                [GET])
+# path('api/v1/scheduler/task/',           [GET, POST, PUT]),
+# path('api/v1/scheduler/task/:slug',      [GET, PATCH, DELETE]),
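
Note that staticfiles_urlpatterns() added above is DEBUG-only by design: with DEBUG=False it returns an empty list, so static assets in production have to be served another way (a webserver in front, or explicit static.serve routes like the robots.txt/favicon.ico ones above). A sketch of that behavior:

    from django.conf import settings
    from django.contrib.staticfiles.urls import staticfiles_urlpatterns

    # with DEBUG=False the helper contributes no routes at all
    assert settings.DEBUG or staticfiles_urlpatterns() == []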

+ 161 - 42
archivebox/core/views.py

@@ -4,8 +4,8 @@ from io import StringIO
 from contextlib import redirect_stdout
 
 from django.shortcuts import render, redirect
-
-from django.http import HttpResponse
+from django.http import HttpResponse, Http404
+from django.utils.html import format_html, mark_safe
 from django.views import View, static
 from django.views.generic.list import ListView
 from django.views.generic import FormView
@@ -22,6 +22,7 @@ from ..config import (
     PUBLIC_ADD_VIEW,
     VERSION,
     FOOTER_INFO,
+    SNAPSHOTS_PER_PAGE,
 )
 from main import add
 from ..util import base_url, ansi_to_html
@@ -43,10 +44,6 @@ class SnapshotView(View):
     # render static html index from filesystem archive/<timestamp>/index.html
 
     def get(self, request, path):
-        # missing trailing slash -> redirect to index
-        if '/' not in path:
-            return redirect(f'{path}/index.html')
-
         if not request.user.is_authenticated and not PUBLIC_SNAPSHOTS:
             return redirect(f'/admin/login/?next={request.path}')
 
@@ -55,46 +52,163 @@ class SnapshotView(View):
         except (IndexError, ValueError):
             slug, archivefile = path.split('/', 1)[0], 'index.html'
 
-        all_pages = list(Snapshot.objects.all())
-
         # slug is a timestamp
-        by_ts = {page.timestamp: page for page in all_pages}
-        try:
-            # print('SERVING STATICFILE', by_ts[slug].link_dir, request.path, path)
-            response = static.serve(request, archivefile, document_root=by_ts[slug].link_dir, show_indexes=True)
-            response["Link"] = f'<{by_ts[slug].url}>; rel="canonical"'
-            return response
-        except KeyError:
-            pass
+        if slug.replace('.','').isdigit():
 
-        # slug is a hash
-        by_hash = {page.url_hash: page for page in all_pages}
-        try:
-            timestamp = by_hash[slug].timestamp
-            return redirect(f'/archive/{timestamp}/{archivefile}')
-        except KeyError:
-            pass
+            # missing trailing slash -> redirect to index
+            if '/' not in path:
+                return redirect(f'{path}/index.html')
 
+            try:
+                try:
+                    snapshot = Snapshot.objects.get(Q(timestamp=slug) | Q(id__startswith=slug))
+                    response = static.serve(request, archivefile, document_root=snapshot.link_dir, show_indexes=True)
+                    response["Link"] = f'<{snapshot.url}>; rel="canonical"'
+                    return response
+                except Snapshot.DoesNotExist:
+                    if Snapshot.objects.filter(timestamp__startswith=slug).exists():
+                        raise Snapshot.MultipleObjectsReturned
+                    else:
+                        raise
+            except Snapshot.DoesNotExist:
+                # Snapshot does not exist
+                return HttpResponse(
+                    format_html(
+                        (
+                            '<center><br/><br/><br/>'
+                            'No Snapshot directories match the given timestamp or UUID: <code>{}</code><br/><br/>'
+                            'You can <a href="/add/" target="_top">add a new Snapshot</a>, or return to the <a href="/" target="_top">Main Index</a>'
+                            '</center>'
+                        ),
+                        slug,
+                        path,
+                    ),
+                    content_type="text/html",
+                    status=404,
+                )
+            except Snapshot.MultipleObjectsReturned:
+                snapshot_hrefs = mark_safe('<br/>').join(
+                    format_html(
+                        '{} <a href="/archive/{}/index.html"><b><code>{}</code></b></a> {} <b>{}</b>',
+                        snap.added.strftime('%Y-%m-%d %H:%M:%S'),
+                        snap.timestamp,
+                        snap.timestamp,
+                        snap.url,
+                        snap.title or '',
+                    )
+                    for snap in Snapshot.objects.filter(timestamp__startswith=slug).only('url', 'timestamp', 'title', 'added').order_by('-added')
+                )
+                return HttpResponse(
+                    format_html(
+                        (
+                            'Multiple Snapshots match the given timestamp/UUID <code>{}</code><br/><pre>'
+                        ),
+                        slug,
+                    ) + snapshot_hrefs + format_html(
+                        (
+                            '</pre><br/>'
+                            'Choose a Snapshot to proceed or go back to the <a href="/" target="_top">Main Index</a>'
+                        )
+                    ),
+                    content_type="text/html",
+                    status=404,
+                )
+            except Http404:
+                # Snapshot dir exists but file within does not e.g. 124235.324234/screenshot.png
+                return HttpResponse(
+                    format_html(
+                        (
+                            '<center><br/><br/><br/>'
+                            f'Snapshot <a href="/archive/{snapshot.timestamp}/index.html" target="_top"><b><code>[{snapshot.timestamp}]</code></b></a> exists in DB, but resource <b><code>{snapshot.timestamp}/'
+                            '{}'
+                            f'</code></b> does not exist in <a href="/archive/{snapshot.timestamp}/" target="_top">snapshot dir</a> yet.<br/><br/>'
+                            'Maybe this resource type is not available for this Snapshot,<br/>or the archiving process has not completed yet?<br/>'
+                            f'<pre><code># run this cmd to finish archiving this Snapshot<br/>archivebox update -t timestamp {snapshot.timestamp}</code></pre><br/><br/>'
+                            '<div style="text-align: left; width: 100%; max-width: 400px">'
+                            '<i><b>Next steps:</b></i><br/>'
+                            f'- list all the <a href="/archive/{snapshot.timestamp}/" target="_top">Snapshot files <code>.*</code></a><br/>'
+                            f'- view the <a href="/archive/{snapshot.timestamp}/index.html" target="_top">Snapshot <code>./index.html</code></a><br/>'
+                            f'- go to the <a href="/admin/core/snapshot/{snapshot.id}/change/" target="_top">Snapshot admin</a> to edit<br/>'
+                            f'- go to the <a href="/admin/core/snapshot/?id__startswith={snapshot.id}" target="_top">Snapshot actions</a> to re-archive<br/>'
+                            '- or return to <a href="/" target="_top">the main index...</a></div>'
+                            '</center>'
+                        ),
+                        archivefile,
+                    ),
+                    content_type="text/html",
+                    status=404,
+                )
         # slug is a URL
-        by_url = {page.base_url: page for page in all_pages}
         try:
-            # TODO: add multiple snapshot support by showing index of all snapshots
-            # for given url instead of redirecting to timestamp index
-            timestamp = by_url[base_url(path)].timestamp
-            return redirect(f'/archive/{timestamp}/index.html')
-        except KeyError:
-            pass
-
-        return HttpResponse(
-            'No archived link matches the given timestamp or hash.',
-            content_type="text/plain",
-            status=404,
-        )
+            try:
+                # try exact match on full url first
+                snapshot = Snapshot.objects.get(
+                    Q(url='http://' + path) | Q(url='https://' + path) | Q(id__startswith=path)
+                )
+            except Snapshot.DoesNotExist:
+                # fall back to match on exact base_url
+                try:
+                    snapshot = Snapshot.objects.get(
+                        Q(url='http://' + base_url(path)) | Q(url='https://' + base_url(path))
+                    )
+                except Snapshot.DoesNotExist:
+                    # fall back to matching base_url as prefix
+                    snapshot = Snapshot.objects.get(
+                        Q(url__startswith='http://' + base_url(path)) | Q(url__startswith='https://' + base_url(path))
+                    )
+            return redirect(f'/archive/{snapshot.timestamp}/index.html')
+        except Snapshot.DoesNotExist:
+            return HttpResponse(
+                format_html(
+                    (
+                        '<center><br/><br/><br/>'
+                        'No Snapshots match the given url: <code>{}</code><br/><br/><br/>'
+                        'Return to the <a href="/" target="_top">Main Index</a>, or:<br/><br/>'
+                        '+ <i><a href="/add/?url={}" target="_top">Add a new Snapshot for <code>{}</code></a><br/><br/></i>'
+                        '</center>'
+                    ),
+                    base_url(path),
+                    path if '://' in path else f'https://{path}',
+                    path,
+                ),
+                content_type="text/html",
+                status=404,
+            )
+        except Snapshot.MultipleObjectsReturned:
+            snapshot_hrefs = mark_safe('<br/>').join(
+                format_html(
+                    '{} <a href="/archive/{}/index.html"><b><code>{}</code></b></a> {} <b>{}</b>',
+                    snap.added.strftime('%Y-%m-%d %H:%M:%S'),
+                    snap.timestamp,
+                    snap.timestamp,
+                    snap.url,
+                    snap.title or '',
+                )
+                for snap in Snapshot.objects.filter(
+                    Q(url__startswith='http://' + base_url(path)) | Q(url__startswith='https://' + base_url(path))
+                ).only('url', 'timestamp', 'title', 'added').order_by('-added')
+            )
+            return HttpResponse(
+                format_html(
+                    (
+                        'Multiple Snapshots match the given URL <code>{}</code><br/><pre>'
+                    ),
+                    base_url(path),
+                ) + snapshot_hrefs + format_html(
+                    (
+                        '</pre><br/>'
+                        'Choose a Snapshot to proceed or go back to the <a href="/" target="_top">Main Index</a>'
+                    )
+                ),
+                content_type="text/html",
+                status=404,
+            )
+        
 
 
 class PublicIndexView(ListView):
 class PublicIndexView(ListView):
     template_name = 'public_index.html'
     template_name = 'public_index.html'
     model = Snapshot
     model = Snapshot
-    paginate_by = 100
+    paginate_by = SNAPSHOTS_PER_PAGE
     ordering = ['title']
     ordering = ['title']
 
 
     def get_context_data(self, **kwargs):
     def get_context_data(self, **kwargs):
@@ -105,12 +219,14 @@ class PublicIndexView(ListView):
         }
         }
 
     def get_queryset(self, **kwargs): 
-        qs = super().get_queryset(**kwargs) 
+        qs = super().get_queryset(**kwargs)
         query = self.request.GET.get('q')
         if query:
             qs = qs.filter(Q(title__icontains=query) | Q(url__icontains=query) | Q(timestamp__icontains=query) | Q(tags__name__icontains=query))
+
         for snapshot in qs:
-            snapshot.icons = snapshot_icons(snapshot)
+            # lazy-load snapshot icons, otherwise icons for the entire index would be rendered at once
+            # (bound via a default argument so each lambda keeps its own snapshot, not the loop's last one)
+            snapshot.icons = lambda snapshot=snapshot: snapshot_icons(snapshot)
         return qs
 
     def get(self, *args, **kwargs):
@@ -130,9 +246,9 @@ class AddView(UserPassesTestMixin, FormView):
         if self.request.method == 'GET':
             url = self.request.GET.get('url', None)
             if url:
-                return {'url': url}
-        else:
-            return super().get_initial()
+                return {'url': url if '://' in url else f'https://{url}'}
+        
+        return super().get_initial()
 
     def test_func(self):
         return PUBLIC_ADD_VIEW or self.request.user.is_authenticated
@@ -145,15 +261,18 @@ class AddView(UserPassesTestMixin, FormView):
             'absolute_add_path': self.request.build_absolute_uri(self.request.path),
             'VERSION': VERSION,
             'FOOTER_INFO': FOOTER_INFO,
+            'stdout': '',
         }
 
     def form_valid(self, form):
         url = form.cleaned_data["url"]
         print(f'[+] Adding URL: {url}')
+        tag = form.cleaned_data["tag"]
         depth = 0 if form.cleaned_data["depth"] == "0" else 1
         extractors = ','.join(form.cleaned_data["archive_methods"])
         input_kwargs = {
             "urls": url,
+            "tag": tag,
             "depth": depth,
             "depth": depth,
             "update_all": False,
             "update_all": False,
             "out_dir": OUTPUT_DIR,
             "out_dir": OUTPUT_DIR,

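The rewritten SnapshotView boils down to a lookup ladder: a dotted-numeric slug is treated as an exact timestamp or a Snapshot UUID prefix, and anything else is treated as a URL matched from most to least specific. Condensed into a sketch (redirect handling and the multiple-match/404 error pages are omitted; assumes Snapshot, Q, and base_url imported as in views.py):

    def resolve_snapshot(path):
        slug = path.split('/', 1)[0]
        if slug.replace('.', '').isdigit():
            # numeric slug: exact timestamp, or a UUID prefix
            return Snapshot.objects.get(Q(timestamp=slug) | Q(id__startswith=slug))
        # otherwise treat the whole path as a URL, most specific match first
        for q in (
            Q(url='http://' + path) | Q(url='https://' + path),
            Q(url='http://' + base_url(path)) | Q(url='https://' + base_url(path)),
            Q(url__startswith='http://' + base_url(path)) | Q(url__startswith='https://' + base_url(path)),
        ):
            try:
                return Snapshot.objects.get(q)
            except Snapshot.DoesNotExist:
                continue
        raise Snapshot.DoesNotExist('no matching Snapshot')
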
+ 3 - 3
archivebox/core/wsgi.py

@@ -7,10 +7,10 @@ For more information on this file, see
 https://docs.djangoproject.com/en/2.1/howto/deployment/wsgi/
 """
 
-import os
 
-from django.core.wsgi import get_wsgi_application
+from archivebox.config import setup_django
+setup_django(in_memory_db=False, check_db=True)
 
-os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')
+from django.core.wsgi import get_wsgi_application
 
 application = get_wsgi_application()

+ 10 - 3
archivebox/extractors/__init__.py

@@ -44,16 +44,16 @@ def get_default_archive_methods():
     return [
         ('title', should_save_title, save_title),
         ('favicon', should_save_favicon, save_favicon),
-        ('wget', should_save_wget, save_wget),
+        ('headers', should_save_headers, save_headers),
         ('singlefile', should_save_singlefile, save_singlefile),
         ('pdf', should_save_pdf, save_pdf),
         ('screenshot', should_save_screenshot, save_screenshot),
         ('dom', should_save_dom, save_dom),
-        ('readability', should_save_readability, save_readability), #keep readability below wget and singlefile, as it depends on them
+        ('wget', should_save_wget, save_wget),
+        ('readability', should_save_readability, save_readability),  # keep readability below wget and singlefile, as it depends on them
         ('mercury', should_save_mercury, save_mercury),
         ('git', should_save_git, save_git),
         ('media', should_save_media, save_media),
-        ('headers', should_save_headers, save_headers),
         ('archive_org', should_save_archive_dot_org, save_archive_dot_org),
     ]
 
@@ -115,6 +115,13 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
                     ArchiveResult.objects.create(snapshot=snapshot, extractor=method_name, cmd=result.cmd, cmd_version=result.cmd_version,
                                                  output=result.output, pwd=result.pwd, start_ts=result.start_ts, end_ts=result.end_ts, status=result.status)
 
+
+                    # bump the updated time on the main Snapshot here, this is critical
+                    # to be able to cache summaries of the ArchiveResults for a given
+                    # snapshot without having to load all the results from the DB each time.
+                    # (we use {Snapshot.id}-{Snapshot.updated} as the cache key and assume
+                    # ArchiveResults are unchanged as long as the updated timestamp is unchanged)
+                    snapshot.save()
                 else:
                     # print('{black}      X {}{reset}'.format(method_name, **ANSI))
                     stats['skipped'] += 1

+ 1 - 1
archivebox/extractors/archive_org.py

@@ -31,7 +31,7 @@ def should_save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, overwr
 
     out_dir = out_dir or Path(link.link_dir)
     if not overwrite and (out_dir / 'archive.org.txt').exists():
-        # if open(path, 'r').read().strip() != 'None':
+        # if open(path, 'r', encoding='utf-8').read().strip() != 'None':
         return False
 
     return SAVE_ARCHIVE_DOT_ORG

+ 11 - 3
archivebox/extractors/mercury.py

@@ -54,11 +54,13 @@ def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT)
 
     out_dir = Path(out_dir or link.link_dir)
     output_folder = out_dir.absolute() / "mercury"
-    output = str(output_folder)
+    output = "mercury"
 
     status = 'succeeded'
     timer = TimedProgress(timeout, prefix='      ')
     try:
+        output_folder.mkdir(exist_ok=True)
+
         # Get plain text version of article
         cmd = [
             DEPENDENCIES['MERCURY_BINARY']['path'],
@@ -71,6 +73,11 @@ def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT)
         except json.JSONDecodeError:
             raise ShellError(cmd, result)
 
+        if article_text.get('failed'):
+            raise ArchiveError('Mercury was not able to get article text from the URL')
+
+        atomic_write(str(output_folder / "content.txt"), article_text["content"])
+
         # Get HTML version of article
         cmd = [
             DEPENDENCIES['MERCURY_BINARY']['path'],
@@ -82,9 +89,10 @@ def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT)
         except json.JSONDecodeError:
             raise ShellError(cmd, result)
 
-        output_folder.mkdir(exist_ok=True)
+        if article_json.get('failed'):
+            raise ArchiveError('Mercury was not able to get article HTML from the URL')
+
         atomic_write(str(output_folder / "content.html"), article_json.pop("content"))
-        atomic_write(str(output_folder / "content.txt"), article_text["content"])
         atomic_write(str(output_folder / "article.json"), article_json)
 
         # Check for common failure cases

+ 13 - 5
archivebox/extractors/readability.py

@@ -35,7 +35,7 @@ def get_html(link: Link, path: Path) -> str:
     document = None
     for source in sources:
         try:
-            with open(abs_path / source, "r") as f:
+            with open(abs_path / source, "r", encoding="utf-8") as f:
                 document = f.read()
                 break
         except (FileNotFoundError, TypeError):
@@ -63,7 +63,7 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
 
     out_dir = Path(out_dir or link.link_dir)
     output_folder = out_dir.absolute() / "readability"
-    output = str(output_folder)
+    output = "readability"
 
     # Readability Docs: https://github.com/mozilla/readability
 
@@ -81,13 +81,20 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
         temp_doc.write(document.encode("utf-8"))
         temp_doc.close()
 
+        if not document or len(document) < 10:
+            raise ArchiveError('Readability could not find HTML to parse for article text')
+
         cmd = [
             DEPENDENCIES['READABILITY_BINARY']['path'],
-            temp_doc.name
+            temp_doc.name,
         ]
 
         result = run(cmd, cwd=out_dir, timeout=timeout)
-        result_json = json.loads(result.stdout)
+        try:
+            result_json = json.loads(result.stdout)
+        except json.JSONDecodeError:
+            raise ArchiveError('Readability was not able to archive the page', result.stdout + result.stderr)
+
         output_folder.mkdir(exist_ok=True)
         readability_content = result_json.pop("textContent")
         atomic_write(str(output_folder / "content.html"), result_json.pop("content"))
@@ -112,6 +119,7 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
     except (Exception, OSError) as err:
         status = 'failed'
         output = err
+        cmd = [cmd[0], './{singlefile,dom}.html']
     finally:
         timer.end()
 
@@ -121,6 +129,6 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
         cmd_version=READABILITY_VERSION,
         output=output,
         status=status,
-        index_texts= [readability_content] if readability_content else [],
+        index_texts=[readability_content] if readability_content else [],
         **timer.stats,
     )

+ 1 - 0
archivebox/index/__init__.py

@@ -356,6 +356,7 @@ LINK_FILTERS = {
     'regex': lambda pattern: Q(url__iregex=pattern),
     'domain': lambda pattern: Q(url__istartswith=f"http://{pattern}") | Q(url__istartswith=f"https://{pattern}") | Q(url__istartswith=f"ftp://{pattern}"),
     'tag': lambda pattern: Q(tags__name=pattern),
+    'timestamp': lambda pattern: Q(timestamp=pattern),
 }
 
 @enforce_types

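The new 'timestamp' entry makes exact-timestamp lookups available wherever LINK_FILTERS is consulted (list/remove/update filtering); the Snapshot 404 page added in views.py above already suggests "archivebox update -t timestamp <ts>", which relies on it. The Python side, as a sketch (assumes Snapshot imported from core.models; the timestamp value is illustrative):

    q = LINK_FILTERS['timestamp']('1612345678.0')  # build the Q object for an exact match
    matching = Snapshot.objects.filter(q)          # queryset of Snapshots with that timestamp
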
+ 79 - 72
archivebox/index/html.py

@@ -1,11 +1,12 @@
 __package__ = 'archivebox.index'
 
+from pathlib import Path
 from datetime import datetime
+from collections import defaultdict
 from typing import List, Optional, Iterator, Mapping
-from pathlib import Path
 
 from django.utils.html import format_html, mark_safe
-from collections import defaultdict
+from django.core.cache import cache
 
 from .schema import Link
 from ..system import atomic_write
@@ -20,7 +21,6 @@ from ..util import (
 from ..config import (
     OUTPUT_DIR,
     VERSION,
-    GIT_SHA,
     FOOTER_INFO,
     HTML_INDEX_FILENAME,
     SAVE_ARCHIVE_DOT_ORG,
@@ -60,7 +60,7 @@ def main_index_template(links: List[Link], template: str=MAIN_INDEX_TEMPLATE) ->
 
     return render_django_template(template, {
         'version': VERSION,
-        'git_sha': GIT_SHA,
+        'git_sha': VERSION,  # not used anymore, but kept for backwards compatibility
         'num_links': str(len(links)),
         'date_updated': datetime.now().strftime('%Y-%m-%d'),
         'time_updated': datetime.now().strftime('%Y-%m-%d %H:%M'),
@@ -116,71 +116,78 @@ def render_django_template(template: str, context: Mapping[str, str]) -> str:
 
 
 def snapshot_icons(snapshot) -> str:
-    from core.models import EXTRACTORS
-
-    # start = datetime.now()
-
-    archive_results = snapshot.archiveresult_set.filter(status="succeeded")
-    link = snapshot.as_link()
-    path = link.archive_path
-    canon = link.canonical_outputs()
-    output = ""
-    output_template = '<a href="/{}/{}" class="exists-{}" title="{}">{}</a> &nbsp;'
-    icons = {
-        "singlefile": "❶",
-        "wget": "🆆",
-        "dom": "🅷",
-        "pdf": "📄",
-        "screenshot": "💻",
-        "media": "📼",
-        "git": "🅶",
-        "archive_org": "🏛",
-        "readability": "🆁",
-        "mercury": "🅼",
-        "warc": "📦"
-    }
-    exclude = ["favicon", "title", "headers", "archive_org"]
-    # Missing specific entry for WARC
-
-    extractor_outputs = defaultdict(lambda: None)
-    for extractor, _ in EXTRACTORS:
-        for result in archive_results:
-            if result.extractor == extractor and result:
-                extractor_outputs[extractor] = result
-
-    for extractor, _ in EXTRACTORS:
-        if extractor not in exclude:
-            existing = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
-            # Check filesystem to see if anything is actually present (too slow, needs optimization/caching)
-            # if existing:
-            #     existing = (Path(path) / existing)
-            #     if existing.is_file():
-            #         existing = True
-            #     elif existing.is_dir():
-            #         existing = any(existing.glob('*.*'))
-            output += format_html(output_template, path, canon[f"{extractor}_path"], str(bool(existing)),
-                                         extractor, icons.get(extractor, "?"))
-        if extractor == "wget":
-            # warc isn't technically its own extractor, so we have to add it after wget
-            
-            # get from db (faster but less truthful)
-            exists = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
-            # get from filesystem (slower but more accurate)
-            # exists = list((Path(path) / canon["warc_path"]).glob("*.warc.gz"))
-            output += format_html(output_template, 'warc/', canon["warc_path"], str(bool(exists)), "warc", icons.get("warc", "?"))
-
-        if extractor == "archive_org":
-            # The check for archive_org is different, so it has to be handled separately
-
-            # get from db (faster)
-            exists = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
-            # get from filesystem (slower)
-            # target_path = Path(path) / "archive.org.txt"
-            # exists = target_path.exists()
-            output += '<a href="{}" class="exists-{}" title="{}">{}</a> '.format(canon["archive_org_path"], str(exists),
-                                                                                        "archive_org", icons.get("archive_org", "?"))
-
-    result = format_html('<span class="files-icons" style="font-size: 1.1em; opacity: 0.8; min-width: 240px; display: inline-block">{}</span>', mark_safe(output))
-    # end = datetime.now()
-    # print(((end - start).total_seconds()*1000) // 1, 'ms')
-    return result
+    cache_key = f'{str(snapshot.id)[:12]}-{(snapshot.updated or snapshot.added).timestamp()}-snapshot-icons'
+    
+    def calc_snapshot_icons():
+        from core.models import EXTRACTORS
+        # start = datetime.now()
+
+        archive_results = snapshot.archiveresult_set.filter(status="succeeded", output__isnull=False)
+        link = snapshot.as_link()
+        path = link.archive_path
+        canon = link.canonical_outputs()
+        output = ""
+        output_template = '<a href="/{}/{}" class="exists-{}" title="{}">{}</a> &nbsp;'
+        icons = {
+            "singlefile": "❶",
+            "wget": "🆆",
+            "dom": "🅷",
+            "pdf": "📄",
+            "screenshot": "💻",
+            "media": "📼",
+            "git": "🅶",
+            "archive_org": "🏛",
+            "readability": "🆁",
+            "mercury": "🅼",
+            "warc": "📦"
+        }
+        exclude = ["favicon", "title", "headers", "archive_org"]
+        # Missing specific entry for WARC
+
+        extractor_outputs = defaultdict(lambda: None)
+        for extractor, _ in EXTRACTORS:
+            for result in archive_results:
+                if result.extractor == extractor and result:
+                    extractor_outputs[extractor] = result
+
+        for extractor, _ in EXTRACTORS:
+            if extractor not in exclude:
+                existing = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
+                # Check filesystem to see if anything is actually present (too slow, needs optimization/caching)
+                # if existing:
+                #     existing = (Path(path) / existing)
+                #     if existing.is_file():
+                #         existing = True
+                #     elif existing.is_dir():
+                #         existing = any(existing.glob('*.*'))
+                output += format_html(output_template, path, canon[f"{extractor}_path"], str(bool(existing)),
+                                             extractor, icons.get(extractor, "?"))
+            if extractor == "wget":
+                # warc isn't technically its own extractor, so we have to add it after wget
+                
+                # get from db (faster but less truthful)
+                exists = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
+                # get from filesystem (slower but more accurate)
+                # exists = list((Path(path) / canon["warc_path"]).glob("*.warc.gz"))
+                output += format_html(output_template, path, canon["warc_path"], str(bool(exists)), "warc", icons.get("warc", "?"))
+
+            if extractor == "archive_org":
+                # The check for archive_org is different, so it has to be handled separately
+
+                # get from db (faster)
+                exists = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
+                # get from filesystem (slower)
+                # target_path = Path(path) / "archive.org.txt"
+                # exists = target_path.exists()
+                output += '<a href="{}" class="exists-{}" title="{}">{}</a> '.format(canon["archive_org_path"], str(exists),
+                                                                                            "archive_org", icons.get("archive_org", "?"))
+
+        result = format_html('<span class="files-icons" style="font-size: 1.1em; opacity: 0.8; min-width: 240px; display: inline-block">{}</span>', mark_safe(output))
+        # end = datetime.now()
+        # print(((end - start).total_seconds()*1000) // 1, 'ms')
+        return result
+
+    return cache.get_or_set(cache_key, calc_snapshot_icons)
+    # return calc_snapshot_icons()
+
+   

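The key change in snapshot_icons() is the cache wrapper: the cache key embeds the snapshot's id and last-updated timestamp, so re-archiving a snapshot produces a fresh key and the stale entry simply ages out unused. A minimal sketch of the same pattern using Django's low-level cache API (the HTML payload here is a placeholder):

from django.core.cache import cache

def cached_icons(snapshot):
    # the timestamp in the key acts as a version: any update to the snapshot
    # changes the key, so no explicit invalidation is needed
    key = f'{str(snapshot.id)[:12]}-{(snapshot.updated or snapshot.added).timestamp()}-snapshot-icons'
    return cache.get_or_set(key, lambda: '<span>...</span>')  # callable only runs on a cache miss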
+ 1 - 2
archivebox/index/json.py

@@ -15,7 +15,6 @@ from ..config import (
     VERSION,
     OUTPUT_DIR,
     FOOTER_INFO,
-    GIT_SHA,
     DEPENDENCIES,
     JSON_INDEX_FILENAME,
     ARCHIVE_DIR_NAME,
@@ -30,7 +29,7 @@ MAIN_INDEX_HEADER = {
     'meta': {
         'project': 'ArchiveBox',
         'version': VERSION,
-        'git_sha': GIT_SHA,
+        'git_sha': VERSION,  # not used anymore, but kept for backwards compatibility
         'website': 'https://ArchiveBox.io',
         'docs': 'https://github.com/ArchiveBox/ArchiveBox/wiki',
         'source': 'https://github.com/ArchiveBox/ArchiveBox',

+ 10 - 1
archivebox/index/schema.py

@@ -16,6 +16,7 @@ from typing import List, Dict, Any, Optional, Union

 from dataclasses import dataclass, asdict, field, fields

+from django.utils.functional import cached_property

 from ..system import get_dir_size

@@ -133,7 +134,6 @@ class Link:
     updated: Optional[datetime] = None
     schema: str = 'Link'

-
     def __str__(self) -> str:
         return f'[{self.timestamp}] {self.url} "{self.title}"'

@@ -190,6 +190,7 @@ class Link:
         }
         if extended:
             info.update({
+                'snapshot_id': self.snapshot_id,
                 'link_dir': self.link_dir,
                 'archive_path': self.archive_path,

@@ -201,6 +202,9 @@ class Link:
                 'basename': self.basename,
                 'extension': self.extension,
                 'is_static': self.is_static,
+
+                'tags_str': self.tags,   # only used to render static index in index/html.py, remove if no longer needed there
+                'icons': None,           # only used to render static index in index/html.py, remove if no longer needed there

                 'bookmarked_date': self.bookmarked_date,
                 'updated_date': self.updated_date,
@@ -255,6 +259,11 @@ class Link:

         return to_csv(self, cols=cols or self.field_names(), separator=separator, ljust=ljust)

+    @cached_property
+    def snapshot_id(self):
+        from core.models import Snapshot
+        return str(Snapshot.objects.only('id').get(url=self.url).id)
+
     @classmethod
     def field_names(cls):
         return [f.name for f in fields(cls)]

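snapshot_id is declared as a cached_property so the Snapshot lookup runs at most once per Link instance, and .only('id') restricts the query to the primary-key column. A standalone sketch of the idiom (Example is a hypothetical class, not the real Link):

from django.utils.functional import cached_property

class Example:
    def __init__(self, url):
        self.url = url

    @cached_property
    def snapshot_id(self):
        from core.models import Snapshot   # deferred import avoids circular dependencies
        return str(Snapshot.objects.only('id').get(url=self.url).id)

# first access hits the DB, later accesses reuse the stored value:
# example.snapshot_id; example.snapshot_id   # one query total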
+ 61 - 21
archivebox/index/sql.py

@@ -7,7 +7,7 @@ from django.db.models import QuerySet
 from django.db import transaction

 from .schema import Link
-from ..util import enforce_types
+from ..util import enforce_types, parse_date
 from ..config import OUTPUT_DIR


@@ -23,13 +23,15 @@ def parse_sql_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[Link]:
     )

 @enforce_types
-def remove_from_sql_main_index(snapshots: QuerySet, out_dir: Path=OUTPUT_DIR) -> None:
-    with transaction.atomic():
-        snapshots.delete()
+def remove_from_sql_main_index(snapshots: QuerySet, atomic: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
+    if atomic:
+        with transaction.atomic():
+            return snapshots.delete()
+    return snapshots.delete()

 @enforce_types
 def write_link_to_sql_index(link: Link):
-    from core.models import Snapshot
+    from core.models import Snapshot, ArchiveResult
     info = {k: v for k, v in link._asdict().items() if k in Snapshot.keys}
     tags = info.pop("tags")
     if tags is None:
@@ -41,36 +43,74 @@ def write_link_to_sql_index(link: Link):
         while Snapshot.objects.filter(timestamp=info["timestamp"]).exists():
             info["timestamp"] = str(float(info["timestamp"]) + 1.0)

-    snapshot, _ = Snapshot.objects.update_or_create(url=link.url, defaults=info)
+        snapshot, _ = Snapshot.objects.update_or_create(url=link.url, defaults=info)
     snapshot.save_tags(tags)
+
+    for extractor, entries in link.history.items():
+        for entry in entries:
+            if isinstance(entry, dict):
+                result, _ = ArchiveResult.objects.get_or_create(
+                    snapshot_id=snapshot.id,
+                    extractor=extractor,
+                    start_ts=parse_date(entry['start_ts']),
+                    defaults={
+                        'end_ts': parse_date(entry['end_ts']),
+                        'cmd': entry['cmd'],
+                        'output': entry['output'],
+                        'cmd_version': entry.get('cmd_version') or 'unknown',
+                        'pwd': entry['pwd'],
+                        'status': entry['status'],
+                    }
+                )
+            else:
+                result, _ = ArchiveResult.objects.update_or_create(
+                    snapshot_id=snapshot.id,
+                    extractor=extractor,
+                    start_ts=parse_date(entry.start_ts),
+                    defaults={
+                        'end_ts': parse_date(entry.end_ts),
+                        'cmd': entry.cmd,
+                        'output': entry.output,
+                        'cmd_version': entry.cmd_version or 'unknown',
+                        'pwd': entry.pwd,
+                        'status': entry.status,
+                    }
+                )
+
     return snapshot


 @enforce_types
 def write_sql_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR) -> None:
-    with transaction.atomic():
-        for link in links:
-            write_link_to_sql_index(link)
+    for link in links:
+        # with transaction.atomic():
+            # write_link_to_sql_index(link)
+        write_link_to_sql_index(link)


 @enforce_types
 def write_sql_link_details(link: Link, out_dir: Path=OUTPUT_DIR) -> None:
     from core.models import Snapshot

-    with transaction.atomic():
-        try:
-            snap = Snapshot.objects.get(url=link.url)
-        except Snapshot.DoesNotExist:
-            snap = write_link_to_sql_index(link)
-        snap.title = link.title
+    # with transaction.atomic():
+    #     try:
+    #         snap = Snapshot.objects.get(url=link.url)
+    #     except Snapshot.DoesNotExist:
+    #         snap = write_link_to_sql_index(link)
+    #     snap.title = link.title
+    try:
+        snap = Snapshot.objects.get(url=link.url)
+    except Snapshot.DoesNotExist:
+        snap = write_link_to_sql_index(link)
+    snap.title = link.title

-        tag_set = (
-            set(tag.strip() for tag in (link.tags or '').split(','))
-        )
-        tag_list = list(tag_set) or []
+    tag_set = (
+        set(tag.strip() for tag in (link.tags or '').split(','))
+    )
+    tag_list = list(tag_set) or []

-        snap.save()
-        snap.save_tags(tag_list)
+    snap.save()
+    snap.save_tags(tag_list)


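Note the asymmetry above: dict-shaped history entries (loaded from legacy JSON) use get_or_create, while ArchiveResult objects use update_or_create. The difference matters for idempotent re-imports; a compressed sketch with assumed snapshot/start values (the field names mirror the diff but this is illustrative, not the exact schema):

from core.models import ArchiveResult

# get_or_create: never touches an existing row (safe when re-reading old JSON indexes)
result, created = ArchiveResult.objects.get_or_create(
    snapshot_id=snapshot.id, extractor='wget', start_ts=start,
    defaults={'status': 'succeeded'},   # only applied when the row is newly created
)

# update_or_create: overwrites `defaults` on an existing row (keeps the DB in sync
# with freshly produced in-memory results)
result, created = ArchiveResult.objects.update_or_create(
    snapshot_id=snapshot.id, extractor='wget', start_ts=start,
    defaults={'status': 'succeeded'},
)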
+ 64 - 11
archivebox/logging_util.py

@@ -3,6 +3,7 @@ __package__ = 'archivebox'
 import re
 import os
 import sys
+import stat
 import time
 import argparse
 from math import log
@@ -11,18 +12,21 @@ from pathlib import Path

 from datetime import datetime
 from dataclasses import dataclass
-from typing import Optional, List, Dict, Union, IO, TYPE_CHECKING
+from typing import Any, Optional, List, Dict, Union, IO, TYPE_CHECKING

 if TYPE_CHECKING:
     from .index.schema import Link, ArchiveResult

+from .system import get_dir_size
 from .util import enforce_types
 from .config import (
     ConfigDict,
     OUTPUT_DIR,
     PYTHON_ENCODING,
+    VERSION,
     ANSI,
     IS_TTY,
+    IN_DOCKER,
     TERM_WIDTH,
     SHOW_PROGRESS,
     SOURCES_DIR_NAME,
@@ -50,6 +54,37 @@ class RuntimeStats:
 _LAST_RUN_STATS = RuntimeStats()


+def debug_dict_summary(obj: Dict[Any, Any]) -> None:
+    stderr(' '.join(f'{key}={str(val).ljust(6)}' for key, val in obj.items()))
+
+
+def get_fd_info(fd) -> Dict[str, Any]:
+    NAME = fd.name[1:-1]
+    FILENO = fd.fileno()
+    MODE = os.fstat(FILENO).st_mode
+    IS_TTY = hasattr(fd, 'isatty') and fd.isatty()
+    IS_PIPE = stat.S_ISFIFO(MODE)
+    IS_FILE = stat.S_ISREG(MODE)
+    IS_TERMINAL =  not (IS_PIPE or IS_FILE)
+    IS_LINE_BUFFERED = fd.line_buffering
+    IS_READABLE = fd.readable()
+    return {
+        'NAME': NAME, 'FILENO': FILENO, 'MODE': MODE,
+        'IS_TTY': IS_TTY, 'IS_PIPE': IS_PIPE, 'IS_FILE': IS_FILE,
+        'IS_TERMINAL': IS_TERMINAL, 'IS_LINE_BUFFERED': IS_LINE_BUFFERED,
+        'IS_READABLE': IS_READABLE,
+    }
+
+
+# # Log debug information about stdin, stdout, and stderr
+# sys.stdout.write('[>&1] this is python stdout\n')
+# sys.stderr.write('[>&2] this is python stderr\n')
+
+# debug_dict_summary(get_fd_info(sys.stdin))
+# debug_dict_summary(get_fd_info(sys.stdout))
+# debug_dict_summary(get_fd_info(sys.stderr))
+
+

 class SmartFormatter(argparse.HelpFormatter):
     """Patched formatter that prints newlines in argparse help strings"""
@@ -62,22 +97,40 @@ class SmartFormatter(argparse.HelpFormatter):
 def reject_stdin(caller: str, stdin: Optional[IO]=sys.stdin) -> None:
     """Tell the user they passed stdin to a command that doesn't accept it"""

-    if stdin and not stdin.isatty():
-        stdin_raw_text = stdin.read().strip()
+    if not stdin:
+        return None
+
+    if IN_DOCKER:
+        # when TTY is disabled in docker we can't tell if stdin is being piped in or not
+        # if we try to read stdin when it's not piped we will hang indefinitely waiting for it
+        return None
+
+    if not stdin.isatty():
+        # stderr('READING STDIN TO REJECT...')
+        stdin_raw_text = stdin.read()
         if stdin_raw_text:
+            # stderr('GOT STDIN!', len(stdin_str))
             stderr(f'[X] The "{caller}" command does not accept stdin.', color='red')
             stderr(f'    Run archivebox "{caller} --help" to see usage and examples.')
             stderr()
             raise SystemExit(1)
+    return None


 def accept_stdin(stdin: Optional[IO]=sys.stdin) -> Optional[str]:
     """accept any standard input and return it as a string or None"""
+
     if not stdin:
         return None
-    elif stdin and not stdin.isatty():
-        stdin_str = stdin.read().strip()
-        return stdin_str or None
+
+    if not stdin.isatty():
+        # stderr('READING STDIN TO ACCEPT...')
+        stdin_str = stdin.read()
+
+        if stdin_str:
+            # stderr('GOT STDIN...', len(stdin_str))
+            return stdin_str
+
     return None


@@ -174,7 +227,6 @@ def progress_bar(seconds: int, prefix: str='') -> None:


 def log_cli_command(subcommand: str, subcommand_args: List[str], stdin: Optional[str], pwd: str):
-    from .config import VERSION, ANSI
     cmd = ' '.join(('archivebox', subcommand, *subcommand_args))
     stderr('{black}[i] [{now}] ArchiveBox v{VERSION}: {cmd}{reset}'.format(
         now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
@@ -233,11 +285,11 @@ def log_indexing_process_finished():

 def log_indexing_started(out_path: str):
     if IS_TTY:
-        sys.stdout.write(f'    > {out_path}')
+        sys.stdout.write(f'    > ./{Path(out_path).relative_to(OUTPUT_DIR)}')


 def log_indexing_finished(out_path: str):
-    print(f'\r    √ {out_path}')
+    print(f'\r    √ ./{Path(out_path).relative_to(OUTPUT_DIR)}')


 ### Archiving Stage
@@ -272,8 +324,6 @@ def log_archiving_paused(num_links: int, idx: int, timestamp: str):
         total=num_links,
     ))
     print()
-    print('    {lightred}Hint:{reset} To view your archive index, run:'.format(**ANSI))
-    print('        archivebox server  # then visit http://127.0.0.1:8000')
     print('    Continue archiving where you left off by running:')
     print('        archivebox update --resume={}'.format(timestamp))

@@ -331,6 +381,9 @@ def log_link_archiving_finished(link: "Link", link_dir: str, is_new: bool, stats
     else:
         _LAST_RUN_STATS.succeeded += 1

+    size = get_dir_size(link_dir)
+    print('        {black}{} files ({}){reset}'.format(size[2], printable_filesize(size[0]), **ANSI))
+


 def log_archive_method_started(method: str):
     print('      > {}'.format(method))

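get_fd_info() classifies a file descriptor by its stat mode, which is what lets the CLI distinguish piped stdin from a redirected file or an interactive terminal. A runnable standalone version of the same check:

import os
import stat
import sys

mode = os.fstat(sys.stdin.fileno()).st_mode
if stat.S_ISFIFO(mode):
    print('stdin is a pipe')       # e.g. `echo hi | python script.py`
elif stat.S_ISREG(mode):
    print('stdin is a file')       # e.g. `python script.py < urls.txt`
else:
    print('stdin is a terminal')   # interactive session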
+ 156 - 95
archivebox/main.py

@@ -67,6 +67,7 @@ from .config import (
     ConfigDict,
     ANSI,
     IS_TTY,
+    DEBUG,
     IN_DOCKER,
     USER,
     ARCHIVEBOX_BINARY,
@@ -76,6 +77,7 @@ from .config import (
     ARCHIVE_DIR,
     LOGS_DIR,
     CONFIG_FILE,
+    CONFIG_FILENAME,
     ARCHIVE_DIR_NAME,
     SOURCES_DIR_NAME,
     LOGS_DIR_NAME,
@@ -84,6 +86,7 @@ from .config import (
     SQL_INDEX_FILENAME,
     ROBOTS_TXT_FILENAME,
     FAVICON_FILENAME,
+    SEARCH_BACKEND_ENGINE,
     check_dependencies,
     check_data_folder,
     write_config_file,
@@ -125,14 +128,19 @@ ALLOWED_IN_OUTPUT_DIR = {
     'node_modules',
     'package-lock.json',
     'static',
+    'sonic',
     ARCHIVE_DIR_NAME,
     SOURCES_DIR_NAME,
     LOGS_DIR_NAME,
     SQL_INDEX_FILENAME,
+    f'{SQL_INDEX_FILENAME}-wal',
+    f'{SQL_INDEX_FILENAME}-shm',
     JSON_INDEX_FILENAME,
     HTML_INDEX_FILENAME,
     ROBOTS_TXT_FILENAME,
     FAVICON_FILENAME,
+    CONFIG_FILENAME,
+    f'{CONFIG_FILENAME}.bak',
 }

 @enforce_types
@@ -214,9 +222,23 @@ def version(quiet: bool=False,
     if quiet:
         print(VERSION)
     else:
+        # ArchiveBox v0.5.6
+        # Cpython Linux Linux-4.19.121-linuxkit-x86_64-with-glibc2.28 x86_64 (in Docker) (in TTY)
         print('ArchiveBox v{}'.format(VERSION))
         p = platform.uname()
-        print(sys.implementation.name.title(), p.system, platform.platform(), p.machine, '(in Docker)' if IN_DOCKER else '(not in Docker)')
+        print(
+            sys.implementation.name.title(),
+            p.system,
+            platform.platform(),
+            p.machine,
+        )
+        print(
+            f'IN_DOCKER={IN_DOCKER}',
+            f'DEBUG={DEBUG}',
+            f'IS_TTY={IS_TTY}',
+            f'TZ={os.environ.get("TZ", "UTC")}',
+            f'SEARCH_BACKEND_ENGINE={SEARCH_BACKEND_ENGINE}',
+        )
         print()

         print('{white}[i] Dependency versions:{reset}'.format(**ANSI))
@@ -261,7 +283,7 @@ def run(subcommand: str,


 @enforce_types
-def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
+def init(force: bool=False, quick: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
     """Initialize a new ArchiveBox collection in the current directory"""

     from core.models import Snapshot
@@ -276,13 +298,12 @@ def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
     existing_index = (Path(out_dir) / SQL_INDEX_FILENAME).exists()

     if is_empty and not existing_index:
-        print('{green}[+] Initializing a new ArchiveBox collection in this folder...{reset}'.format(**ANSI))
-        print(f'    {out_dir}')
-        print('{green}------------------------------------------------------------------{reset}'.format(**ANSI))
+        print('{green}[+] Initializing a new ArchiveBox v{} collection...{reset}'.format(VERSION, **ANSI))
+        print('{green}----------------------------------------------------------------------{reset}'.format(**ANSI))
     elif existing_index:
-        print('{green}[*] Updating existing ArchiveBox collection in this folder...{reset}'.format(**ANSI))
-        print(f'    {out_dir}')
-        print('{green}------------------------------------------------------------------{reset}'.format(**ANSI))
+        # TODO: properly detect and print the existing version in current index as well
+        print('{green}[^] Verifying and updating existing ArchiveBox collection to v{}...{reset}'.format(VERSION, **ANSI))
+        print('{green}----------------------------------------------------------------------{reset}'.format(**ANSI))
     else:
         if force:
             stderr('[!] This folder appears to already have files in it, but no index.sqlite3 is present.', color='lightyellow')
@@ -303,30 +324,25 @@ def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
     else:
         print('\n{green}[+] Building archive folder structure...{reset}'.format(**ANSI))

+    print(f'    + ./{ARCHIVE_DIR.relative_to(OUTPUT_DIR)}, ./{SOURCES_DIR.relative_to(OUTPUT_DIR)}, ./{LOGS_DIR.relative_to(OUTPUT_DIR)}...')
     Path(SOURCES_DIR).mkdir(exist_ok=True)
-    print(f'    √ {SOURCES_DIR}')
-    
     Path(ARCHIVE_DIR).mkdir(exist_ok=True)
-    print(f'    √ {ARCHIVE_DIR}')
-
     Path(LOGS_DIR).mkdir(exist_ok=True)
-    print(f'    √ {LOGS_DIR}')
-
+    print(f'    + ./{CONFIG_FILE.relative_to(OUTPUT_DIR)}...')
     write_config_file({}, out_dir=out_dir)
-    print(f'    √ {CONFIG_FILE}')
+
     if (Path(out_dir) / SQL_INDEX_FILENAME).exists():
-        print('\n{green}[*] Verifying main SQL index and running migrations...{reset}'.format(**ANSI))
+        print('\n{green}[*] Verifying main SQL index and running any migrations needed...{reset}'.format(**ANSI))
     else:
-        print('\n{green}[+] Building main SQL index and running migrations...{reset}'.format(**ANSI))
+        print('\n{green}[+] Building main SQL index and running initial migrations...{reset}'.format(**ANSI))

     DATABASE_FILE = Path(out_dir) / SQL_INDEX_FILENAME
-    print(f'    √ {DATABASE_FILE}')
-    print()
     for migration_line in apply_migrations(out_dir):
         print(f'    {migration_line}')

-
     assert DATABASE_FILE.exists()
+    print()
+    print(f'    √ ./{DATABASE_FILE.relative_to(OUTPUT_DIR)}')

     # from django.contrib.auth.models import User
     # if IS_TTY and not User.objects.filter(is_superuser=True).exists():
@@ -334,7 +350,7 @@ def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
     #     call_command("createsuperuser", interactive=True)

     print()
-    print('{green}[*] Collecting links from any existing indexes and archive folders...{reset}'.format(**ANSI))
+    print('{green}[*] Checking links from indexes and archive folders (safe to Ctrl+C)...{reset}'.format(**ANSI))

     all_links = Snapshot.objects.none()
     pending_links: Dict[str, Link] = {}
@@ -343,63 +359,77 @@ def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
         all_links = load_main_index(out_dir=out_dir, warn=False)
         print('    √ Loaded {} links from existing main index.'.format(all_links.count()))

-    # Links in data folders that dont match their timestamp
-    fixed, cant_fix = fix_invalid_folder_locations(out_dir=out_dir)
-    if fixed:
-        print('    {lightyellow}√ Fixed {} data directory locations that didn\'t match their link timestamps.{reset}'.format(len(fixed), **ANSI))
-    if cant_fix:
-        print('    {lightyellow}! Could not fix {} data directory locations due to conflicts with existing folders.{reset}'.format(len(cant_fix), **ANSI))
-
-    # Links in JSON index but not in main index
-    orphaned_json_links = {
-        link.url: link
-        for link in parse_json_main_index(out_dir)
-        if not all_links.filter(url=link.url).exists()
-    }
-    if orphaned_json_links:
-        pending_links.update(orphaned_json_links)
-        print('    {lightyellow}√ Added {} orphaned links from existing JSON index...{reset}'.format(len(orphaned_json_links), **ANSI))
-
-    # Links in data dir indexes but not in main index
-    orphaned_data_dir_links = {
-        link.url: link
-        for link in parse_json_links_details(out_dir)
-        if not all_links.filter(url=link.url).exists()
-    }
-    if orphaned_data_dir_links:
-        pending_links.update(orphaned_data_dir_links)
-        print('    {lightyellow}√ Added {} orphaned links from existing archive directories.{reset}'.format(len(orphaned_data_dir_links), **ANSI))
-
-    # Links in invalid/duplicate data dirs
-    invalid_folders = {
-        folder: link
-        for folder, link in get_invalid_folders(all_links, out_dir=out_dir).items()
-    }
-    if invalid_folders:
-        print('    {lightyellow}! Skipped adding {} invalid link data directories.{reset}'.format(len(invalid_folders), **ANSI))
-        print('        X ' + '\n        X '.join(f'{folder} {link}' for folder, link in invalid_folders.items()))
-        print()
-        print('    {lightred}Hint:{reset} For more information about the link data directories that were skipped, run:'.format(**ANSI))
-        print('        archivebox status')
-        print('        archivebox list --status=invalid')
-
-
-    write_main_index(list(pending_links.values()), out_dir=out_dir)
+    if quick:
+        print('    > Skipping full snapshot directory check (quick mode)')
+    else:
+        try:
+            # Links in data folders that dont match their timestamp
+            fixed, cant_fix = fix_invalid_folder_locations(out_dir=out_dir)
+            if fixed:
+                print('    {lightyellow}√ Fixed {} data directory locations that didn\'t match their link timestamps.{reset}'.format(len(fixed), **ANSI))
+            if cant_fix:
+                print('    {lightyellow}! Could not fix {} data directory locations due to conflicts with existing folders.{reset}'.format(len(cant_fix), **ANSI))
+
+            # Links in JSON index but not in main index
+            orphaned_json_links = {
+                link.url: link
+                for link in parse_json_main_index(out_dir)
+                if not all_links.filter(url=link.url).exists()
+            }
+            if orphaned_json_links:
+                pending_links.update(orphaned_json_links)
+                print('    {lightyellow}√ Added {} orphaned links from existing JSON index...{reset}'.format(len(orphaned_json_links), **ANSI))
+
+            # Links in data dir indexes but not in main index
+            orphaned_data_dir_links = {
+                link.url: link
+                for link in parse_json_links_details(out_dir)
+                if not all_links.filter(url=link.url).exists()
+            }
+            if orphaned_data_dir_links:
+                pending_links.update(orphaned_data_dir_links)
+                print('    {lightyellow}√ Added {} orphaned links from existing archive directories.{reset}'.format(len(orphaned_data_dir_links), **ANSI))
+
+            # Links in invalid/duplicate data dirs
+            invalid_folders = {
+                folder: link
+                for folder, link in get_invalid_folders(all_links, out_dir=out_dir).items()
+            }
+            if invalid_folders:
+                print('    {lightyellow}! Skipped adding {} invalid link data directories.{reset}'.format(len(invalid_folders), **ANSI))
+                print('        X ' + '\n        X '.join(f'./{Path(folder).relative_to(OUTPUT_DIR)} {link}' for folder, link in invalid_folders.items()))
+                print()
+                print('    {lightred}Hint:{reset} For more information about the link data directories that were skipped, run:'.format(**ANSI))
+                print('        archivebox status')
+                print('        archivebox list --status=invalid')
+
+        except (KeyboardInterrupt, SystemExit):
+            stderr()
+            stderr('[x] Stopped checking archive directories due to Ctrl-C/SIGTERM', color='red')
+            stderr('    Your archive data is safe, but you should re-run `archivebox init` to finish the process later.')
+            stderr()
+            stderr('    {lightred}Hint:{reset} In the future you can run a quick init without checking dirs like so:'.format(**ANSI))
+            stderr('        archivebox init --quick')
+            raise SystemExit(1)
+        
+        write_main_index(list(pending_links.values()), out_dir=out_dir)

-    print('\n{green}------------------------------------------------------------------{reset}'.format(**ANSI))
+    print('\n{green}----------------------------------------------------------------------{reset}'.format(**ANSI))
     if existing_index:
         print('{green}[√] Done. Verified and updated the existing ArchiveBox collection.{reset}'.format(**ANSI))
     else:
-        print('{green}[√] Done. A new ArchiveBox collection was initialized ({} links).{reset}'.format(len(all_links), **ANSI))
-    print()
-    print('    {lightred}Hint:{reset} To view your archive index, run:'.format(**ANSI))
-    print('        archivebox server  # then visit http://127.0.0.1:8000')
-    print()
-    print('    To add new links, you can run:')
-    print("        archivebox add ~/some/path/or/url/to/list_of_links.txt")
-    print()
-    print('    For more usage and examples, run:')
-    print('        archivebox help')
+        print('{green}[√] Done. A new ArchiveBox collection was initialized ({} links).{reset}'.format(len(all_links) + len(pending_links), **ANSI))
+    
+    if Snapshot.objects.count() < 25:     # hide the hints for experienced users
+        print()
+        print('    {lightred}Hint:{reset} To view your archive index, run:'.format(**ANSI))
+        print('        archivebox server  # then visit http://127.0.0.1:8000')
+        print()
+        print('    To add new links, you can run:')
+        print("        archivebox add ~/some/path/or/url/to/list_of_links.txt")
+        print()
+        print('    For more usage and examples, run:')
+        print('        archivebox help')

     json_index = Path(out_dir) / JSON_INDEX_FILENAME
     html_index = Path(out_dir) / HTML_INDEX_FILENAME
@@ -531,6 +561,7 @@ def oneshot(url: str, extractors: str="", out_dir: Path=OUTPUT_DIR):

 @enforce_types
 def add(urls: Union[str, List[str]],
+        tag: str='',
         depth: int=0,
         update_all: bool=not ONLY_NEW,
         index_only: bool=False,
@@ -540,6 +571,8 @@ def add(urls: Union[str, List[str]],
         out_dir: Path=OUTPUT_DIR) -> List[Link]:
     """Add a new URL or list of URLs to your archive"""

+    from core.models import Tag
+
     assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)'

     extractors = extractors.split(",") if extractors else []
@@ -572,26 +605,48 @@ def add(urls: Union[str, List[str]],
             new_links_depth += parse_links_from_source(downloaded_file, root_url=new_link.url)

     imported_links = list({link.url: link for link in (new_links + new_links_depth)}.values())
+
     new_links = dedupe_links(all_links, imported_links)

     write_main_index(links=new_links, out_dir=out_dir)
     all_links = load_main_index(out_dir=out_dir)

     if index_only:
-        return all_links
+        # mock archive all the links using the fake index_only extractor method in order to update their state
+        if overwrite:
+            archive_links(imported_links, overwrite=overwrite, methods=['index_only'], out_dir=out_dir)
+        else:
+            archive_links(new_links, overwrite=False, methods=['index_only'], out_dir=out_dir)
+    else:
+        # fully run the archive extractor methods for each link
+        archive_kwargs = {
+            "out_dir": out_dir,
+        }
+        if extractors:
+            archive_kwargs["methods"] = extractors
+
+        if update_all:
+            archive_links(all_links, overwrite=overwrite, **archive_kwargs)
+        elif overwrite:
+            archive_links(imported_links, overwrite=True, **archive_kwargs)
+        elif new_links:
+            archive_links(new_links, overwrite=False, **archive_kwargs)
+
+
+    # add any tags to imported links
+    tags = [
+        Tag.objects.get_or_create(name=name.strip())[0]
+        for name in tag.split(',')
+        if name.strip()
+    ]
+    if tags:
+        for link in imported_links:
+            snapshot = link.as_snapshot()
+            snapshot.tags.add(*tags)
+            snapshot.tags_str(nocache=True)
+            snapshot.save()
+        # print(f'    √ Tagged {len(imported_links)} Snapshots with {len(tags)} tags {tags_str}')

-    # Run the archive methods for each link
-    archive_kwargs = {
-        "out_dir": out_dir,
-    }
-    if extractors:
-        archive_kwargs["methods"] = extractors
-    if update_all:
-        archive_links(all_links, overwrite=overwrite, **archive_kwargs)
-    elif overwrite:
-        archive_links(imported_links, overwrite=True, **archive_kwargs)
-    elif new_links:
-        archive_links(new_links, overwrite=False, **archive_kwargs)

     return all_links

@@ -811,11 +866,15 @@ def list_links(snapshots: Optional[QuerySet]=None,
         all_snapshots = load_main_index(out_dir=out_dir)

     if after is not None:
-        all_snapshots = all_snapshots.filter(timestamp__lt=after)
+        all_snapshots = all_snapshots.filter(timestamp__gte=after)
     if before is not None:
-        all_snapshots = all_snapshots.filter(timestamp__gt=before)
+        all_snapshots = all_snapshots.filter(timestamp__lt=before)
     if filter_patterns:
         all_snapshots = snapshot_filter(all_snapshots, filter_patterns, filter_type)
+
+    if not all_snapshots:
+        stderr('[!] No Snapshots matched your filters:', filter_patterns, f'({filter_type})', color='lightyellow')
+
     return all_snapshots

 @enforce_types
@@ -1061,6 +1120,7 @@ def server(runserver_args: Optional[List[str]]=None,
            reload: bool=False,
            debug: bool=False,
            init: bool=False,
+           quick_init: bool=False,
            createsuperuser: bool=False,
            out_dir: Path=OUTPUT_DIR) -> None:
     """Run the ArchiveBox HTTP server"""
@@ -1069,9 +1129,14 @@ def server(runserver_args: Optional[List[str]]=None,

     if init:
         run_subcommand('init', stdin=None, pwd=out_dir)
+        print()
+    elif quick_init:
+        run_subcommand('init', subcommand_args=['--quick'], stdin=None, pwd=out_dir)
+        print()

     if createsuperuser:
         run_subcommand('manage', subcommand_args=['createsuperuser'], pwd=out_dir)
+        print()

     # setup config for django runserver
     from . import config
@@ -1083,12 +1148,9 @@ def server(runserver_args: Optional[List[str]]=None,
     from django.core.management import call_command
     from django.contrib.auth.models import User

-    admin_user = User.objects.filter(is_superuser=True).order_by('date_joined').only('username').last()
-
     print('{green}[+] Starting ArchiveBox webserver...{reset}'.format(**ANSI))
-    if admin_user:
-        hint('The admin username is{lightblue} {}{reset}\n'.format(admin_user.username, **ANSI))
-    else:
+    print('    > Logging errors to ./logs/errors.log')
+    if not User.objects.filter(is_superuser=True).exists():
         print('{lightyellow}[!] No admin users exist yet, you will not be able to edit links in the UI.{reset}'.format(**ANSI))
         print()
         print('    To create an admin user, run:')
@@ -1106,7 +1168,6 @@ def server(runserver_args: Optional[List[str]]=None,
     config.SHOW_PROGRESS = False
     config.DEBUG = config.DEBUG or debug

-
     call_command("runserver", *runserver_args)


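In the list_links hunk above, the comparison operators were swapped as a genuine bug fix: --after now keeps snapshots with timestamp >= after, and --before keeps those with timestamp < before. The corrected logic in isolation (assuming a queryset with a comparable timestamp field, as in ArchiveBox):

def filter_by_date(all_snapshots, after=None, before=None):
    if after is not None:
        all_snapshots = all_snapshots.filter(timestamp__gte=after)   # on or after the bound
    if before is not None:
        all_snapshots = all_snapshots.filter(timestamp__lt=before)   # strictly before the bound
    return all_snapshots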
+ 45 - 31
archivebox/parsers/__init__.py

@@ -68,7 +68,6 @@ def parse_links_memory(urls: List[str], root_url: Optional[str]=None):
     """
     """
     parse a list of URLS without touching the filesystem
     parse a list of URLS without touching the filesystem
     """
     """
-    check_url_parsing_invariants()
 
 
     timer = TimedProgress(TIMEOUT * 4)
     timer = TimedProgress(TIMEOUT * 4)
     #urls = list(map(lambda x: x + "\n", urls))
     #urls = list(map(lambda x: x + "\n", urls))
@@ -89,8 +88,6 @@ def parse_links(source_file: str, root_url: Optional[str]=None) -> Tuple[List[Li
        RSS feed, bookmarks export, or text file
        RSS feed, bookmarks export, or text file
     """
     """
 
 
-    check_url_parsing_invariants()
-
     timer = TimedProgress(TIMEOUT * 4)
     timer = TimedProgress(TIMEOUT * 4)
     with open(source_file, 'r', encoding='utf-8') as file:
     with open(source_file, 'r', encoding='utf-8') as file:
         links, parser = run_parser_functions(file, timer, root_url=root_url)
         links, parser = run_parser_functions(file, timer, root_url=root_url)
@@ -173,31 +170,48 @@ def save_file_as_source(path: str, timeout: int=TIMEOUT, filename: str='{ts}-{ba
     return source_path

-def check_url_parsing_invariants() -> None:
-    """Check that plain text regex URL parsing works as expected"""
-
-    # this is last-line-of-defense to make sure the URL_REGEX isn't
-    # misbehaving, as the consequences could be disastrous and lead to many
-    # incorrect/badly parsed links being added to the archive
-
-    test_urls = '''
-    https://example1.com/what/is/happening.html?what=1#how-about-this=1
-    https://example2.com/what/is/happening/?what=1#how-about-this=1
-    HTtpS://example3.com/what/is/happening/?what=1#how-about-this=1f
-    https://example4.com/what/is/happening.html
-    https://example5.com/
-    https://example6.com
-
-    <test>http://example7.com</test>
-    [https://example8.com/what/is/this.php?what=1]
-    [and http://example9.com?what=1&other=3#and-thing=2]
-    <what>https://example10.com#and-thing=2 "</about>
-    abc<this["https://example11.com/what/is#and-thing=2?whoami=23&where=1"]that>def
-    sdflkf[what](https://example12.com/who/what.php?whoami=1#whatami=2)?am=hi
-    example13.bada
-    and example14.badb
-    <or>htt://example15.badc</that>
-    '''
-    # print('\n'.join(re.findall(URL_REGEX, test_urls)))
-    assert len(re.findall(URL_REGEX, test_urls)) == 12
-
+# Check that plain text regex URL parsing works as expected
+#   this is last-line-of-defense to make sure the URL_REGEX isn't
+#   misbehaving due to some OS-level or environment level quirks (e.g. bad regex lib)
+#   the consequences of bad URL parsing could be disastrous and lead to many
+#   incorrect/badly parsed links being added to the archive, so this is worth the cost of checking
+_test_url_strs = {
+    'example.com': 0,
+    '/example.com': 0,
+    '//example.com': 0,
+    ':/example.com': 0,
+    '://example.com': 0,
+    'htt://example8.com': 0,
+    '/htt://example.com': 0,
+    'https://example': 1,
+    'https://localhost/2345': 1,
+    'https://localhost:1234/123': 1,
+    '://': 0,
+    'https://': 0,
+    'http://': 0,
+    'ftp://': 0,
+    'ftp://example.com': 0,
+    'https://example.com': 1,
+    'https://example.com/': 1,
+    'https://a.example.com': 1,
+    'https://a.example.com/': 1,
+    'https://a.example.com/what/is/happening.html': 1,
+    'https://a.example.com/what/ís/happening.html': 1,
+    'https://a.example.com/what/is/happening.html?what=1&2%20b#höw-about-this=1a': 1,
+    'https://a.example.com/what/is/happéning/?what=1&2%20b#how-aboüt-this=1a': 1,
+    'HTtpS://a.example.com/what/is/happening/?what=1&2%20b#how-about-this=1af&2f%20b': 1,
+    'https://example.com/?what=1#how-about-this=1&2%20baf': 1,
+    'https://example.com?what=1#how-about-this=1&2%20baf': 1,
+    '<test>http://example7.com</test>': 1,
+    '[https://example8.com/what/is/this.php?what=1]': 1,
+    '[and http://example9.com?what=1&other=3#and-thing=2]': 1,
+    '<what>https://example10.com#and-thing=2 "</about>': 1,
+    'abc<this["https://example11.com/what/is#and-thing=2?whoami=23&where=1"]that>def': 1,
+    'sdflkf[what](https://example12.com/who/what.php?whoami=1#whatami=2)?am=hi': 1,
+    '<or>http://examplehttp://15.badc</that>': 2,
+    'https://a.example.com/one.html?url=http://example.com/inside/of/another?=http://': 2,
+    '[https://a.example.com/one.html?url=http://example.com/inside/of/another?=](http://a.example.com)': 3,
+}
+for url_str, num_urls in _test_url_strs.items():
+    assert len(re.findall(URL_REGEX, url_str)) == num_urls, (
+        f'{url_str} does not contain {num_urls} urls')

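The replacement turns a single aggregate assertion (12 matches across one blob) into a per-string table of expected URL counts that runs once at import time, so a misbehaving regex build fails loudly before anything is archived. The same table could also be expressed as a parametrized test; a hypothetical pytest version (the URL_REGEX import path is an assumption):

import re
import pytest
from archivebox.util import URL_REGEX   # assumed import path

@pytest.mark.parametrize("url_str,num_urls", [
    ('example.com', 0),                               # bare domains are not matched
    ('https://example.com', 1),
    ('<or>http://examplehttp://15.badc</that>', 2),   # adjacent schemes count separately
])
def test_url_regex(url_str, num_urls):
    assert len(re.findall(URL_REGEX, url_str)) == num_urls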
+ 1 - 1
archivebox/search/utils.py

@@ -16,7 +16,7 @@ def get_file_result_content(res, extra_path, use_pwd=False):
     if extra_path:
         fpath = f'{fpath}/{extra_path}'

-    with open(fpath, 'r') as file:
+    with open(fpath, 'r', encoding='utf-8') as file:
         data = file.read()
     if data:
         return [data]

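The explicit encoding='utf-8' matters because open() without one falls back to the process locale, which is often not UTF-8 inside minimal Docker images where LANG/LC_ALL are unset. A quick way to see the difference (archive.txt is a stand-in filename):

import locale
print(locale.getpreferredencoding(False))   # e.g. 'UTF-8' locally, 'ANSI_X3.4-1968' in bare containers

with open('archive.txt', 'r', encoding='utf-8') as f:   # deterministic on every host
    data = f.read()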
+ 2 - 1
archivebox/system.py

@@ -37,10 +37,11 @@ def atomic_write(path: Union[Path, str], contents: Union[dict, str, bytes], over
     """Safe atomic write to filesystem by writing to temp file + atomic rename"""
     """Safe atomic write to filesystem by writing to temp file + atomic rename"""
 
 
     mode = 'wb+' if isinstance(contents, bytes) else 'w'
     mode = 'wb+' if isinstance(contents, bytes) else 'w'
+    encoding = None if isinstance(contents, bytes) else 'utf-8'  # enforce utf-8 on all text writes
 
 
     # print('\n> Atomic Write:', mode, path, len(contents), f'overwrite={overwrite}')
     # print('\n> Atomic Write:', mode, path, len(contents), f'overwrite={overwrite}')
     try:
     try:
-        with lib_atomic_write(path, mode=mode, overwrite=overwrite) as f:
+        with lib_atomic_write(path, mode=mode, overwrite=overwrite, encoding=encoding) as f:
             if isinstance(contents, dict):
             if isinstance(contents, dict):
                 dump(contents, f, indent=4, sort_keys=True, cls=ExtendedEncoder)
                 dump(contents, f, indent=4, sort_keys=True, cls=ExtendedEncoder)
             elif isinstance(contents, (bytes, str)):
             elif isinstance(contents, (bytes, str)):

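The atomic_write() change only adds an explicit utf-8 encoding; the durability guarantee still comes from the write-to-temp-then-rename pattern that lib_atomic_write implements. A minimal illustrative version of that pattern (not ArchiveBox's actual code):

import os
import tempfile

def atomic_write_text(path: str, contents: str) -> None:
    # write to a temp file in the same directory, then rename over the target;
    # readers never observe a half-written file
    dirname = os.path.dirname(os.path.abspath(path))
    fd, tmp_path = tempfile.mkstemp(dir=dirname)
    try:
        with os.fdopen(fd, 'w', encoding='utf-8') as f:   # enforce utf-8 on text writes
            f.write(contents)
        os.replace(tmp_path, path)   # atomic on POSIX and Windows
    except BaseException:
        os.unlink(tmp_path)
        raise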
+ 0 - 1
archivebox/templates/admin/actions_as_select.html

@@ -1 +0,0 @@
-actions_as_select

+ 6 - 6
archivebox/templates/admin/base.html

@@ -20,7 +20,7 @@
 <body class="{% if is_popup %}popup {% endif %}{% block bodyclass %}{% endblock %}"
   data-admin-utc-offset="{% now "Z" %}">

-  <style nonce="{{nonce}}">
+  <style>
       /* Loading Progress Bar */
         #progress {
             position: absolute;
@@ -89,7 +89,7 @@
             <a href="{% url 'admin:Add' %}">Add ➕</a> /
             <a href="{% url 'Home' %}">Snapshots</a> /
             <a href="/admin/core/tag/">Tags</a> /
-            <a href="/admin/auth/user/">Users</a> /
+            <a href="/admin/">Admin</a> /
             <a href="{% url 'Docs' %}">Docs</a>
              &nbsp; &nbsp;
             {% block welcome-msg %}
@@ -157,15 +157,15 @@
         function fix_actions() {
             var container = $('div.actions');

-            if (container.find('option').length < 10) {
-                container.find('label, button').hide();
+            if (container.find('select[name=action] option').length < 10) {
+                container.find('label:nth-child(1), button[value=0]').hide();

                 var buttons = $('<div></div>')
-                    .prependTo(container)
+                    .appendTo(container)
                     .css('display', 'inline')
                     .addClass('class', 'action-buttons');

-                container.find('option:gt(0)').reverse().each(function () {
+                container.find('select[name=action] option:gt(0)').reverse().each(function () {
                     const name = this.value
                     $('<button>')
                         .appendTo(buttons)

+ 1 - 1
archivebox/templates/core/add.html

@@ -15,7 +15,7 @@
 {% endblock %}

 {% block body %}
-    <div style="max-width: 550px; margin: auto; float: none">
+    <div style="max-width: 1440px; margin: auto; float: none">
         <br/><br/>
         {% if stdout %}
             <h1>Add new URLs to your archive: results</h1>

+ 2 - 2
archivebox/templates/core/base.html

@@ -38,7 +38,7 @@
             <div id="header">
                 <div id="branding">
                     <h1 id="site-name">
-                        <a href="{% url 'public-index' %}" class="header-archivebox" title="Last updated: {{updated}}">
+                        <a href="{% url 'public-index' %}" class="header-archivebox">
                             <img src="{% static 'archive.png' %}" alt="Logo" style="height: 30px"/>
                             ArchiveBox
                         </a>
@@ -70,7 +70,7 @@
                     <center>
                         <small>
                             Archive created using <a href="https://github.com/ArchiveBox/ArchiveBox" title="Github">ArchiveBox</a> version
-                            <a href="https://github.com/ArchiveBox/ArchiveBox/releases" title="Releases">v{{VERSION}}</a>.
+                            <a href="https://github.com/ArchiveBox/ArchiveBox/releases/tag/v{{VERSION}}" title="Releases">v{{VERSION}}</a>.
                             <br/><br/>
                             {{FOOTER_INFO}}
                         </small>

+ 2 - 2
archivebox/templates/core/index_row.html

@@ -10,7 +10,7 @@
         {% endif %}

         <a href="archive/{{link.timestamp}}/index.html" title="{{link.title|default:'Not yet archived...'}}">
-            <span data-title-for="{{link.url}}" data-archived="{{link.is_archived}}">{{link.title|default:'Loading...'}}</span>
+            <span data-title-for="{{link.url}}" data-archived="{{link.is_archived}}">{{link.title|default:'Loading...'|truncatechars:128}}</span>
             {% if link.tags_str %}
                 <span class="tags" style="float: right; border-radius: 5px; background-color: #bfdfff; padding: 2px 5px; margin-left: 4px; margin-top: 1px;">
                     {% if link.tags_str != None %}
@@ -33,5 +33,5 @@
             {% endif %}
         </span>
     </td>
-   <td style="text-align:left"><a href="{{link.url}}">{{link.url}}</a></td>
+   <td style="text-align:left; word-wrap: anywhere;"><a href="{{link.url}}">{{link.url|truncatechars:128}}</a></td>
 </tr>
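
Note: truncatechars:128 (applied to both the title and URL columns above) is Django's built-in template filter; it caps the string at 128 characters, counting the trailing ellipsis, so one very long URL can no longer blow out the table layout. A standalone sketch of the filter's behavior with Django 3.1 as pinned below (settings.configure() is only needed to run it outside a project):

    import django
    from django.conf import settings

    settings.configure()   # minimal config so the filter runs outside a Django project
    django.setup()

    from django.template.defaultfilters import truncatechars

    long_url = 'https://example.com/' + 'a' * 300
    short = truncatechars(long_url, 128)
    print(short.endswith('…'), len(short))  # True 128: the ellipsis counts toward the limit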

+ 1 - 1
archivebox/templates/core/minimal_index.html

@@ -4,7 +4,7 @@
         <title>Archived Sites</title>
         <meta charset="utf-8" name="viewport" content="width=device-width, initial-scale=1">
     </head>
-    <body data-status="{{status}}">
+    <body>
         <table id="table-bookmarks">
             <thead>
                 <tr class="thead-tr">

+ 15 - 10
archivebox/templates/core/public_index.html

@@ -2,6 +2,11 @@
 {% load static %}

 {% block body %}
+    <style>
+        #table-bookmarks_info {
+            display: none;
+        }
+    </style>
     <div id="toolbar">
         <form id="changelist-search" action="{% url 'public-index' %}" method="get">
             <div>
@@ -21,7 +26,7 @@
         <thead>
             <tr>
                 <th style="width: 100px;">Bookmarked</th>
-                <th style="width: 26vw;">Snapshot ({{object_list|length}})</th>
+                <th style="width: 26vw;">Snapshot ({{page_obj.paginator.count}})</th>
                 <th style="width: 140px">Files</th>
                 <th style="width: 16vw;whitespace:nowrap;overflow-x:hidden;">Original URL</th>
             </tr>
@@ -33,26 +38,26 @@
             </tbody>
         </table>
         <center>
+            <br/>
+            Showing {{ page_obj.start_index }}-{{ page_obj.end_index }} of {{ page_obj.paginator.count }} total
+            <br/>
             <span class="step-links">
                 {% if page_obj.has_previous %}
-                    <a href="{% url 'public-index' %}?page=1">&laquo; first</a>
+                    <a href="{% url 'public-index' %}?page=1">&laquo; first</a> &nbsp;
                     <a href="{% url 'public-index' %}?page={{ page_obj.previous_page_number }}">previous</a>
+                    &nbsp;
                 {% endif %}

                 <span class="current">
-                    Page {{ page_obj.number }} of {{ page_obj.paginator.num_pages }}.
+                    Page {{ page_obj.number }} of {{ page_obj.paginator.num_pages }}
                 </span>
-        
+            
                 {% if page_obj.has_next %}
-                    <a href="{% url 'public-index' %}?page={{ page_obj.next_page_number }}">next </a>
+                    &nbsp;
+                    <a href="{% url 'public-index' %}?page={{ page_obj.next_page_number }}">next </a> &nbsp;
                     <a href="{% url 'public-index' %}?page={{ page_obj.paginator.num_pages }}">last &raquo;</a>
                 {% endif %}
             </span>
-    
-            {% if page_obj.has_next %}
-                <a href="{% url 'public-index' %}?page={{ page_obj.next_page_number }}">next </a>
-                <a href="{% url 'public-index' %}?page={{ page_obj.paginator.num_pages }}">last &raquo;</a>
-            {% endif %}
         </span>
         <br>
     </center>
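
The header count switches from {{object_list|length}} (rows on the current page only) to {{page_obj.paginator.count}} (total rows across all pages), and the new "Showing X-Y of N" line reads off the same Paginator page object. A minimal sketch of where these template variables come from, assuming the view paginates with Django's standard Paginator (as a paginated ListView does):

    from django.core.paginator import Paginator

    urls = ['https://example.com/%d' % i for i in range(45)]
    paginator = Paginator(urls, per_page=20)
    page_obj = paginator.get_page(2)

    print(page_obj.paginator.count)                       # 45: total across all pages
    print(len(page_obj.object_list))                      # 20: what {{object_list|length}} showed
    print(page_obj.start_index(), page_obj.end_index())   # 21 40: the "Showing" range
    print(page_obj.has_previous(), page_obj.has_next())   # True True: drive the nav links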

+ 44 - 18
archivebox/templates/core/snapshot.html

@@ -279,7 +279,7 @@
                     <div class="col-lg-8">
                         <img src="favicon.ico" onerror="this.style.opacity=0" alt="Favicon">
                         &nbsp;&nbsp;
-                        {{title}}
+                        {{title|safe}}
                         &nbsp;&nbsp;
                         <a href="#" class="header-toggle">▾</a>
                         <br/>
@@ -335,20 +335,21 @@
                     </div>
                     <div class="col-lg-4">
                         <div class="info-chunk">
-                            <h5>🗃 Files</h5>
+                            <h5>🗃 Snapshot ID: <a href="/admin/core/snapshot/{{snapshot_id}}/change/"><code style="color: rgba(255,255,255,0.6); font-weight: 200; font-size: 12px; background-color: #1a1a1a"><b>[{{timestamp}}]</b> <small>{{snapshot_id|truncatechars:24}}</small></code></a></h5>
                             <a href="index.json" title="JSON summary of archived link.">JSON</a> | 
                             <a href="warc/" title="Any WARC archives for the page">WARC</a> | 
                             <a href="media/" title="Audio, Video, and Subtitle files.">Media</a> | 
                             <a href="git/" title="Any git repos at the url">Git</a> | 
-                            <a href="favicon.ico" title="Any git repos at the url">Favicon</a> | 
-                            <a href="." title="Webserver-provided index of files directory.">See all...</a>
+                            <a href="/admin/core/snapshot/?id__startswith={{snapshot_id}}" title="Go to the Snapshot admin to update, overwrite, or delete this Snapshot">Actions</a> | 
+                            <a href="/admin/core/snapshot/{{snapshot_id}}/change/" title="Edit this snapshot in the Admin UI">Admin</a> | 
+                            <a href="." title="Webserver-provided index of files directory.">See all files...</a><br/>
                         </div>
                     </div>
                 </div>
                 <div class="row header-bottom-frames">
                     <div class="col-lg-2">
                         <div class="card selected-card">
-                            <iframe class="card-img-top" src="{{singlefile_path}}" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
+                            <iframe class="card-img-top" src="{{singlefile_path}}" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" scrolling="no"></iframe>
                             <div class="card-body">
                                 <a href="{{singlefile_path}}" title="Open in new tab..." target="_blank" rel="noopener">
                                     <p class="card-text"><code>./singlefile.html</code></p>
@@ -381,7 +382,7 @@
                     </div>
                     <div class="col-lg-2">
                         <div class="card">
-                          <iframe class="card-img-top" src="{{archive_url}}" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
+                          <iframe class="card-img-top" src="{{archive_url}}" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" scrolling="no"></iframe>
                           <div class="card-body">
                                 <a href="{{archive_url}}" title="Open in new tab..." target="_blank" rel="noopener">
                                     <p class="card-text"><code>./{{domain}}</code></p>
@@ -393,30 +394,30 @@
                     {% if SAVE_ARCHIVE_DOT_ORG %}
                     <div class="col-lg-2">
                         <div class="card">
-                            <iframe class="card-img-top" src="{{archive_org_path}}" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
+                            <iframe class="card-img-top" src="{{archive_org_path}}" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" scrolling="no"></iframe>
                             <div class="card-body">
                                 <a href="{{archive_org_path}}" title="Open in new tab..." target="_blank" rel="noopener">
                                     <p class="card-text"><code>🌐 web.archive.org/web/...</code></p>
                                 </a>
-                                <a href="{{archive_org_path}}" target="preview"><h4 class="card-title">Archive.Org</h4></a>
+                                <a href="{{archive_org_path}}" target="preview" id="archive_dot_org-btn"><h4 class="card-title">Archive.Org</h4></a>
                           </div>
                         </div>
                     </div>
                     {% endif %}
                     <div class="col-lg-2">
                         <div class="card">
-                            <iframe class="card-img-top" src="{{url}}" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
+                            <iframe class="card-img-top" src="{{url}}" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" scrolling="no"></iframe>
                             <div class="card-body">
                                 <a href="{{url}}" title="Open in new tab..." target="_blank" rel="noopener">
                                     <p class="card-text"><code>🌐 {{domain}}</code></p>
                                 </a>
-                                <a href="{{url}}" target="preview"><h4 class="card-title">Original</h4></a>
+                                <a href="{{url}}" target="preview" id="original-btn"><h4 class="card-title">Original</h4></a>
                           </div>
                         </div>
                     </div>
                     <div class="col-lg-2">
                         <div class="card">
-                            <iframe class="card-img-top" src="{{headers_path}}" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
+                            <iframe class="card-img-top" src="{{headers_path}}" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" scrolling="no"></iframe>
                             <div class="card-body">
                                 <a href="{{headers_path}}" title="Open in new tab..." target="_blank" rel="noopener">
                                     <p class="card-text"><code>./headers.json</code></p>
@@ -427,7 +428,7 @@
                     </div>
                     <div class="col-lg-2">
                         <div class="card">
-                            <iframe class="card-img-top" src="{{dom_path}}" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
+                            <iframe class="card-img-top" src="{{dom_path}}" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" scrolling="no"></iframe>
                             <div class="card-body">
                                 <a href="{{dom_path}}" title="Open in new tab..." target="_blank" rel="noopener">
                                     <p class="card-text"><code>./output.html</code></p>
@@ -438,7 +439,7 @@
                     </div>
                     <div class="col-lg-2">
                         <div class="card">
-                            <iframe class="card-img-top" src="{{readability_path}}" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
+                            <iframe class="card-img-top" src="{{readability_path}}" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" scrolling="no"></iframe>
                             <div class="card-body">
                                 <a href="{{readability_path}}" title="Open in new tab..." target="_blank" rel="noopener">
                                     <p class="card-text"><code>./readability/content.html</code></p>
@@ -450,7 +451,7 @@
                     <br/>
                     <div class="col-lg-2">
                         <div class="card">
-                            <iframe class="card-img-top" src="{{mercury_path}}" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
+                            <iframe class="card-img-top" src="{{mercury_path}}" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" scrolling="no"></iframe>
                             <div class="card-body">
                                 <a href="{{mercury_path}}" title="Open in new tab..." target="_blank" rel="noopener">
                                     <p class="card-text"><code>./mercury/content.html</code></p>
@@ -461,7 +462,7 @@
                     </div>
                     <div class="col-lg-2">
                         <div class="card">
-                            <iframe class="card-img-top" src="{{media_path}}" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
+                            <iframe class="card-img-top" src="{{media_path}}" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" scrolling="no"></iframe>
                             <div class="card-body">
                                 <a href="{{media_path}}" title="Open in new tab..." target="_blank" rel="noopener">
                                     <p class="card-text"><code>./media/*.mp4</code></p>
@@ -472,7 +473,7 @@
                     </div>
                     <div class="col-lg-2">
                         <div class="card">
-                            <iframe class="card-img-top" src="{{git_path}}" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
+                            <iframe class="card-img-top" src="{{git_path}}" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" scrolling="no"></iframe>
                             <div class="card-body">
                                 <a href="{{git_path}}" title="Open in new tab..." target="_blank" rel="noopener">
                                     <p class="card-text"><code>./git/*.git</code></p>
@@ -484,7 +485,7 @@
                 </div>
             </div>
         </header>
-        <iframe sandbox="allow-same-origin allow-scripts allow-forms" class="full-page-iframe" src="{{singlefile_path}}" name="preview"></iframe>
+        <iframe sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" class="full-page-iframe" src="{{singlefile_path}}" name="preview"></iframe>
    
         <script
               src="https://code.jquery.com/jquery-3.2.1.slim.min.js"
@@ -493,6 +494,16 @@
         <script src="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0-alpha.6/js/bootstrap.min.js" integrity="sha384-vBWWzlZJ8ea9aCX4pEW3rVHjgjt7zpkNpZk+02D9phzyeVkE+jo0ieGizqPLForn" crossorigin="anonymous"></script>

         <script>
+            function getPreviewTypeFromPath(link) {
+                if (link.id == 'original-btn') {
+                    return 'original'
+                }
+                if (link.id == 'archive_dot_org-btn') {
+                    return 'archive_dot_org'
+                }
+                return link.pathname.split('/').filter(a => a.length).slice(-1)[0].toLowerCase()
+            }
+
             // show selected file in iframe when preview card is clicked
             jQuery('.card').on('click', function(e) {
                 jQuery('.selected-card').removeClass('selected-card')
@@ -502,11 +513,26 @@
                 if (e.currentTarget.href.endsWith('.pdf')) {
                     jQuery('.full-page-iframe')[0].removeAttribute('sandbox')
                 } else {
-                    jQuery('.full-page-iframe')[0].sandbox = "allow-same-origin allow-scripts allow-forms"
+                    jQuery('.full-page-iframe')[0].sandbox = "allow-same-origin allow-scripts allow-forms allow-top-navigation-by-user-activation"
                 }
+                window.location.hash = getPreviewTypeFromPath(e.currentTarget)
                 return true
             })

+            // check URL for hash e.g. #git and load relevant preview
+            jQuery(document).ready(function() {
+                if (window.location.hash) {
+                    for (const link of jQuery('a[target=preview]')) {
+                        console.log(link.pathname)
+                        if (getPreviewTypeFromPath(link) == window.location.hash.slice(1).toLowerCase()) {
+                            jQuery(link).closest('.card').click()
+                            jQuery(link).click()
+                            link.click()
+                        }
+                    }
+                }
+            })
+
             // un-sandbox iframes showing pdfs (required to display pdf viewer)
             jQuery('iframe').map(function() {
                 if (this.src.endsWith('.pdf')) {

+ 2 - 2
archivebox/templates/core/static_index.html

@@ -209,7 +209,7 @@
             <div class="header-top container-fluid">
                 <div class="row nav">
                     <div class="col-sm-2">
-                        <a href="/" class="header-archivebox" title="Last updated: {{updated}}">
+                        <a href="/" class="header-archivebox">
                             <img src="{% static 'archive.png' %}" alt="Logo"/>
                             ArchiveBox: Index
                         </a>
@@ -243,7 +243,7 @@
             <center>
                 <small>
                     Archive created using <a href="https://github.com/ArchiveBox/ArchiveBox" title="Github">ArchiveBox</a>
-                    version <a href="https://github.com/ArchiveBox/ArchiveBox/tree/v{{version}}" title="Git commit">v{{version}}</a> &nbsp; | &nbsp; 
+                    version <a href="https://github.com/ArchiveBox/ArchiveBox/releases/tag/v{{version}}" title="View source code and release info">v{{version}}</a> &nbsp; | &nbsp; 
                     Download index as <a href="index.json" title="JSON summary of archived links.">JSON</a>
                     <br/><br/>
                     {{FOOTER_INFO}}

+ 1 - 1
archivebox/templates/static/add.css

@@ -42,7 +42,7 @@ header {
   background-color: #f5dd5d;
 }
 #stdout {
-  background-color: #ded;
+  background-color: #fbfbfb;
   padding: 10px 10px;
   border-radius: 4px;
   white-space: normal;

+ 37 - 0
archivebox/templates/static/admin.css

@@ -237,3 +237,40 @@ body.model-snapshot.change-list #content .object-tools {
     opacity: 0.1;
     opacity: 0.1;
     filter: grayscale(100%);
     filter: grayscale(100%);
 }
 }
+
+
+#result_list tbody td.field-cmd_str pre,
+#result_list tbody td.field-output_str pre {
+    max-width: 22vw;
+    word-wrap: anywhere;
+    white-space: break-spaces;
+    max-height: 40px;
+    overflow: hidden;
+    margin: 2px;
+    background-color: rgba(0,0,0,0.05);
+    padding: 1px 4px 16px 8px;
+    border-radius: 4px;
+}
+
+#result_list tbody td.field-extractor {
+    font-weight: 800;
+    font-variant: small-caps;
+}
+
+#result_list tbody td.field-status {
+    font-variant: small-caps;
+}
+
+.inline-group .tabular td.original p {
+    margin-top: -33px;
+}
+
+tbody .output-link {
+    float: right;
+    margin-bottom: -25px;
+    margin-right: -3px;
+    margin-top: -4px;
+    opacity: 0.4;
+    box-shadow:   4px 4px 4px rgba(0,0,0,0.1);
+}
+tbody .output-link:hover {opacity: 1;}

BIN
archivebox/templates/static/favicon.ico


+ 2 - 0
archivebox/templates/static/robots.txt

@@ -0,0 +1,2 @@
+User-agent: *
+Disallow: /

+ 3 - 1
archivebox/util.py

@@ -56,11 +56,13 @@ ts_to_iso = lambda ts: ts and parse_date(ts).isoformat()
 
 
 
 
 URL_REGEX = re.compile(
+    r'(?=('
     r'http[s]?://'                    # start matching from allowed schemes
     r'(?:[a-zA-Z]|[0-9]'              # followed by allowed alphanum characters
     r'|[$-_@.&+]|[!*\(\),]'           #    or allowed symbols
     r'|(?:%[0-9a-fA-F][0-9a-fA-F]))'  #    or allowed unicode bytes
-    r'[^\]\[\(\)<>"\'\s]+',         # stop parsing at these symbols
+    r'[^\]\[\(\)<>"\'\s]+'          # stop parsing at these symbols
+    r'))',
     re.IGNORECASE,
 )
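
Wrapping the whole pattern in a zero-width lookahead with a capture group, (?=(...)), lets findall() report URLs that overlap or nest, e.g. a wayback-style URL with another URL embedded in its path, which the old consuming pattern swallowed as a single match. A small demonstration using a simplified stand-in for the full pattern above:

    import re

    # simplified version of URL_REGEX above: same lookahead-with-capture trick
    URL_REGEX = re.compile(r'(?=(https?://[^\]\[\(\)<>"\'\s]+))', re.IGNORECASE)

    text = 'see https://web.archive.org/web/https://example.com/page for details'
    print(re.findall(URL_REGEX, text))
    # ['https://web.archive.org/web/https://example.com/page',
    #  'https://example.com/page']   <- the embedded URL is now matched too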
 
 

+ 4 - 2
bin/docker_entrypoint.sh

@@ -3,6 +3,7 @@
 DATA_DIR="${DATA_DIR:-/data}"
 ARCHIVEBOX_USER="${ARCHIVEBOX_USER:-archivebox}"

+
 # Set the archivebox user UID & GID
 if [[ -n "$PUID" && "$PUID" != 0 ]]; then
     usermod -u "$PUID" "$ARCHIVEBOX_USER" > /dev/null 2>&1
@@ -11,6 +12,7 @@ if [[ -n "$PGID" && "$PGID" != 0 ]]; then
     groupmod -g "$PGID" "$ARCHIVEBOX_USER" > /dev/null 2>&1
 fi

+
 # Set the permissions of the data dir to match the archivebox user
 if [[ -d "$DATA_DIR/archive" ]]; then
     # check data directory permissions
@@ -33,11 +35,11 @@ if [[ "$1" == /* || "$1" == "echo" || "$1" == "archivebox" ]]; then
     # e.g. "archivebox init"
     #      "/bin/bash"
     #      "echo"
-    gosu "$ARCHIVEBOX_USER" bash -c "$*"
+    exec gosu "$ARCHIVEBOX_USER" bash -c "$*"
 else
     # no command given, assume args were meant to be passed to archivebox cmd
     # e.g. "add https://example.com"
     #      "manage createsupseruser"
     #      "server 0.0.0.0:8000"
-    gosu "$ARCHIVEBOX_USER" bash -c "archivebox $*"
+    exec gosu "$ARCHIVEBOX_USER" bash -c "archivebox $*"
 fi
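
Adding exec makes the final command replace the entrypoint shell instead of running as its child, so the archivebox process keeps PID 1 in the container and receives Docker's SIGTERM directly on docker stop, rather than the shell swallowing the signal until Docker escalates to SIGKILL. For reference, the same replace-the-current-process semantics sketched in Python (not part of the entrypoint itself):

    import os, sys

    # os.execvp is Python's equivalent of the shell's `exec` builtin: the
    # current process image is replaced in place, the PID stays the same,
    # signals go straight to the new program, and the call never returns
    # on success.
    if len(sys.argv) > 1:
        os.execvp(sys.argv[1], sys.argv[1:])
    print('only reached when no command was given')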

+ 23 - 20
docker-compose.yml

@@ -11,36 +11,39 @@ version: '3.7'
 
 
 services:
     archivebox:
-        # build: .
+        # build: .                              # for developers working on archivebox
         image: ${DOCKER_IMAGE:-archivebox/archivebox:latest} 
-        command: server 0.0.0.0:8000
+        command: server --quick-init 0.0.0.0:8000
         stdin_open: true
         tty: true
         ports:
             - 8000:8000
         environment:
-            - USE_COLOR=True
-            - SHOW_PROGRESS=False
-            - SEARCH_BACKEND_ENGINE=sonic
-            - SEARCH_BACKEND_HOST_NAME=sonic
-            - SEARCH_BACKEND_PASSWORD=SecretPassword
+            - ALLOWED_HOSTS=*                   # add any config options you want as env vars
+            - MEDIA_MAX_SIZE=750m
+            # - SHOW_PROGRESS=False
+            # - SEARCH_BACKEND_ENGINE=sonic     # uncomment these if you enable sonic below
+            # - SEARCH_BACKEND_HOST_NAME=sonic
+            # - SEARCH_BACKEND_PASSWORD=SecretPassword
         volumes:
             - ./data:/data
-        depends_on:
-            - sonic
+            # - ./archivebox:/app/archivebox    # for developers working on archivebox
    
-    # Run sonic search backend
-    sonic:
-        image: valeriansaliou/sonic:v1.3.0    
-        ports:
-            - 1491:1491
-        environment:
-            - SEARCH_BACKEND_PASSWORD=SecretPassword
-        volumes:
-            - ./etc/sonic/config.cfg:/etc/sonic.cfg
-            - ./data:/var/lib/sonic/store/
+    # To run the Sonic full-text search backend, create an ./etc/sonic folder
+    # and download the sonic config file from here into that folder:
+    # https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/master/etc/sonic/config.cfg
+    # sonic:
+    #     image: valeriansaliou/sonic:v1.3.0    
+    #     expose:
+    #         - 1491
+    #     environment:
+    #         - SEARCH_BACKEND_PASSWORD=SecretPassword
+    #     volumes:
+    #         - ./etc/sonic/config.cfg:/etc/sonic.cfg
+    #         - ./data/sonic:/var/lib/sonic/store
+

-    # Optional Addons: tweak these examples as needed for your specific use case
+    ### Optional Addons: tweak these examples as needed for your specific use case

     # Example: Run scheduled imports in a docker instead of using cron on the
     # host machine, add tasks and see more info with archivebox schedule --help

+ 1 - 1
package.json

@@ -1,6 +1,6 @@
 {
   "name": "archivebox",
-  "version": "0.5.6",
+  "version": "0.6.0",
   "description": "ArchiveBox: The self-hosted internet archive",
   "author": "Nick Sweeting <[email protected]>",
   "license": "MIT",

+ 50 - 40
setup.py

@@ -27,6 +27,49 @@ PACKAGE_DIR = ROOT_DIR / PKG_NAME
 README = (PACKAGE_DIR / "README.md").read_text(encoding='utf-8', errors='ignore')
 README = (PACKAGE_DIR / "README.md").read_text(encoding='utf-8', errors='ignore')
 VERSION = json.loads((PACKAGE_DIR / "package.json").read_text().strip())['version']
 VERSION = json.loads((PACKAGE_DIR / "package.json").read_text().strip())['version']
 
 
+PYTHON_REQUIRES = ">=3.7"
+SETUP_REQUIRES = ["wheel"]
+INSTALL_REQUIRES = [
+    # only add things here that have corresponding apt python3-packages available
+    # anything added here also needs to be added to our package dependencies in
+    # stdeb.cfg (apt), archivebox.rb (brew), Dockerfile, etc.
+    # if there is no apt python3-package equivalent, then vendor it instead in
+    # ./archivebox/vendor/
+    "requests>=2.24.0",
+    "atomicwrites>=1.4.0",
+    "mypy-extensions>=0.4.3",
+    "django>=3.1.3",
+    "django-extensions>=3.0.3",
+    "dateparser",
+    "ipython",
+    "youtube-dl",
+    "python-crontab>=2.5.1",
+    "croniter>=0.3.34",
+    "w3lib>=1.22.0",
+]
+EXTRAS_REQUIRE = {
+    'sonic': [
+        "sonic-client>=0.0.5",
+    ],
+    'dev': [
+        "setuptools",
+        "twine",
+        "wheel",
+        "flake8",
+        "ipdb",
+        "mypy",
+        "django-stubs",
+        "sphinx",
+        "sphinx-rtd-theme",
+        "recommonmark",
+        "pytest",
+        "bottle",
+        "stdeb",
+        "django-debug-toolbar",
+        "djdt_flamegraph",
+    ],
+}
+
 # To see when setup.py gets called (uncomment for debugging):
 # To see when setup.py gets called (uncomment for debugging):
 # import sys
 # print(PACKAGE_DIR, f"     (v{VERSION})")
@@ -36,7 +79,9 @@ VERSION = json.loads((PACKAGE_DIR / "package.json").read_text().strip())['versio
 class DisabledTestCommand(test):
     def run(self):
         # setup.py test is deprecated, disable it here by force so stdeb doesnt run it
-        print('Use the ./bin/test.sh script to run tests, not setup.py test.')
+        print()
+        print('[X] Running tests via setup.py test is deprecated.')
+        print('    Hint: Use the ./bin/test.sh script or pytest instead')


 setuptools.setup(
@@ -50,45 +95,10 @@ setuptools.setup(
     long_description_content_type="text/markdown",
     url=REPO_URL,
     project_urls=PROJECT_URLS,
-    python_requires=">=3.7",
-    setup_requires=[
-        "wheel",
-    ],
-    install_requires=[
-        # only add things here that have corresponding apt python3-packages available
-        # anything added here also needs to be added to our package dependencies in
-        # stdeb.cfg (apt), archivebox.rb (brew), Dockerfile, etc.
-        # if there is no apt python3-package equivalent, then vendor it instead in
-        # ./archivebox/vendor/
-        "requests==2.24.0",
-        "atomicwrites==1.4.0",
-        "mypy-extensions==0.4.3",
-        "django==3.1.3",
-        "django-extensions==3.0.3",
-        "dateparser",
-        "ipython",
-        "youtube-dl",
-        "python-crontab==2.5.1",
-        "croniter==0.3.34",
-        "w3lib==1.22.0",
-    ],
-    extras_require={
-        'dev': [
-            "setuptools",
-            "twine",
-            "wheel",
-            "flake8",
-            "ipdb",
-            "mypy",
-            "django-stubs",
-            "sphinx",
-            "sphinx-rtd-theme",
-            "recommonmark",
-            "pytest",
-            "bottle",
-            "stdeb",
-        ],
-    },
+    python_requires=PYTHON_REQUIRES,
+    setup_requires=SETUP_REQUIRES,
+    install_requires=INSTALL_REQUIRES,
+    extras_require=EXTRAS_REQUIRE,
     packages=[PKG_NAME],
     include_package_data=True,   # see MANIFEST.in
     entry_points={

+ 3 - 3
tests/test_add.py

@@ -33,7 +33,7 @@ def test_depth_flag_0_crawls_only_the_arg_page(tmp_path, process, disable_extrac
     )
    
     archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
-    with open(archived_item_path / "index.json", "r") as f:
+    with open(archived_item_path / "index.json", "r", encoding='utf-8') as f:
         output_json = json.load(f)
     assert output_json["base_url"] == "127.0.0.1:8080/static/example.com.html"

@@ -79,7 +79,7 @@

     archived_item_path = list(tmp_path.glob('archive/**/*'))[0]

-    with open(archived_item_path / "index.json", "r") as f:
+    with open(archived_item_path / "index.json", "r", encoding="utf-8") as f:
         output_json = json.load(f)
     assert output_json["history"] != {}

@@ -90,4 +90,4 @@
     archived_item_path = list(tmp_path.glob('archive/**/*'))[0]

     assert (archived_item_path / "warc").exists()
-    assert not (archived_item_path / "singlefile.html").exists()
+    assert not (archived_item_path / "singlefile.html").exists()
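
The encoding='utf-8' arguments added throughout the tests matter because open() without one falls back to the platform's locale encoding (e.g. cp1252 on Windows), so index files containing non-ASCII titles could fail to parse depending on where the tests run. A minimal reproduction of the pitfall:

    import json, tempfile
    from pathlib import Path

    index = Path(tempfile.mkdtemp()) / 'index.json'
    index.write_text(json.dumps({'title': 'Füße ☃'}), encoding='utf-8')

    # Without encoding=..., this read uses locale.getpreferredencoding() and
    # can raise UnicodeDecodeError on Windows; with it, the read is portable:
    with open(index, 'r', encoding='utf-8') as f:
        print(json.load(f)['title'])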

+ 4 - 4
tests/test_extractors.py

@@ -86,7 +86,7 @@ def test_headers_retrieved(tmp_path, process, disable_extractors_dict):
     output_file = archived_item_path / "headers.json"
     assert output_file.exists()
     headers_file = archived_item_path / 'headers.json'
-    with open(headers_file) as f:
+    with open(headers_file, 'r', encoding='utf-8') as f:
         headers = pyjson.load(f)
     assert headers['Content-Language'] == 'en'
     assert headers['Content-Script-Type'] == 'text/javascript'
@@ -98,7 +98,7 @@
                                   capture_output=True, env=disable_extractors_dict)
     archived_item_path = list(tmp_path.glob("archive/**/*"))[0]
     output_file = archived_item_path / "headers.json" 
-    with open(output_file) as f:
+    with open(output_file, 'r', encoding='utf-8') as f:
         headers = pyjson.load(f)
     assert headers['Content-Language'] == 'en'
     assert headers['Content-Script-Type'] == 'text/javascript'
@@ -110,6 +110,6 @@
                                   capture_output=True, env=disable_extractors_dict)
     archived_item_path = list(tmp_path.glob("archive/**/*"))[0]
     output_file = archived_item_path / "headers.json" 
-    with open(output_file) as f:
+    with open(output_file, 'r', encoding='utf-8') as f:
         headers = pyjson.load(f)
-    assert headers["Status-Code"] == "200"
+    assert headers["Status-Code"] == "200"

+ 10 - 10
tests/test_init.py

@@ -12,12 +12,12 @@ from archivebox.config import OUTPUT_PERMISSIONS
 from .fixtures import *

 def test_init(tmp_path, process):
-    assert "Initializing a new ArchiveBox collection in this folder..." in process.stdout.decode("utf-8")
+    assert "Initializing a new ArchiveBox" in process.stdout.decode("utf-8")
    
 def test_update(tmp_path, process):
     os.chdir(tmp_path)
     update_process = subprocess.run(['archivebox', 'init'], capture_output=True)
-    assert "Updating existing ArchiveBox collection in this folder" in update_process.stdout.decode("utf-8")
+    assert "updating existing ArchiveBox" in update_process.stdout.decode("utf-8")

 def test_add_link(tmp_path, process, disable_extractors_dict):
     disable_extractors_dict.update({"USE_WGET": "true"})
@@ -28,11 +28,11 @@

     assert "index.json" in [x.name for x in archived_item_path.iterdir()]

-    with open(archived_item_path / "index.json", "r") as f:
+    with open(archived_item_path / "index.json", "r", encoding="utf-8") as f:
         output_json = json.load(f)
     assert "Example Domain" == output_json['history']['title'][0]['output']

-    with open(archived_item_path / "index.html", "r") as f:
+    with open(archived_item_path / "index.html", "r", encoding="utf-8") as f:
         output_html = f.read()
     assert "Example Domain" in output_html

@@ -47,7 +47,7 @@

     assert "index.json" in [x.name for x in archived_item_path.iterdir()]

-    with open(archived_item_path / "index.json", "r") as f:
+    with open(archived_item_path / "index.json", "r", encoding="utf-8") as f:
         output_json = json.load(f)
     assert "Example Domain" == output_json['history']['title'][0]['output']

@@ -75,11 +75,11 @@
    
     first_archive = tmp_path / "archive" / str(min([float(folder) for folder in archive_folders]))
     json_index = str(first_archive / "index.json")
-    with open(json_index, "r") as f:
+    with open(json_index, "r", encoding="utf-8") as f:
         link_details = json.loads(f.read())

     link_details["url"] = "http://127.0.0.1:8080/static/iana.org.html"
-    with open(json_index, "w") as f:
+    with open(json_index, "w", encoding="utf-8") as f:
         json.dump(link_details, f)

     init_process = subprocess.run(['archivebox', 'init'], capture_output=True, env=disable_extractors_dict)
@@ -98,12 +98,12 @@
     archive_folders.remove(first_archive.name)
     json_index = str(first_archive / "index.json")

-    with open(json_index, "r") as f:
+    with open(json_index, "r", encoding="utf-8") as f:
         link_details = json.loads(f.read())

     link_details["timestamp"] = archive_folders[0]

-    with open(json_index, "w") as f:
+    with open(json_index, "w", encoding="utf-8") as f:
         json.dump(link_details, f)

     init_process = subprocess.run(['archivebox', 'init'], capture_output=True, env=disable_extractors_dict)
@@ -173,4 +173,4 @@
         snapshot_id = tag["id"]
         tag_name = tag["name"]
         # Check each tag migrated is in the previous field
-        assert tag_name in snapshots_dict[snapshot_id]
+        assert tag_name in snapshots_dict[snapshot_id]

+ 14 - 10
tests/test_remove.py

@@ -100,16 +100,18 @@ def test_remove_before(tmp_path, process, disable_extractors_dict):
 
 
     conn = sqlite3.connect("index.sqlite3")
     c = conn.cursor()
-    timestamp = c.execute("SELECT timestamp FROM core_snapshot ORDER BY timestamp ASC").fetchall()
+    higherts, lowerts = timestamp = c.execute("SELECT timestamp FROM core_snapshot ORDER BY timestamp DESC").fetchall()
     conn.commit()
     conn.close()

-    before = list(map(lambda x: int(x[0].split(".")[0]), timestamp))
+    lowerts = lowerts[0].split(".")[0]
+    higherts = higherts[0].split(".")[0]

-    subprocess.run(['archivebox', 'remove', '--filter-type=regex', '.*', '--yes', '--delete', '--before', str(before[1])], capture_output=True)
+    # before is less than, so only the lower snapshot gets deleted
+    subprocess.run(['archivebox', 'remove', '--filter-type=regex', '.*', '--yes', '--delete', '--before', higherts], capture_output=True)

-    assert (tmp_path / "archive" / timestamp[0][0]).exists()
-    assert not (tmp_path / "archive" / timestamp[1][0]).exists()
+    assert not (tmp_path / "archive" / lowerts).exists()
+    assert (tmp_path / "archive" / higherts).exists()

 def test_remove_after(tmp_path, process, disable_extractors_dict):
     subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True, env=disable_extractors_dict)
@@ -118,13 +120,15 @@

     conn = sqlite3.connect("index.sqlite3")
     c = conn.cursor()
-    timestamp = c.execute("SELECT timestamp FROM core_snapshot ORDER BY timestamp ASC").fetchall()
+    higherts, lowerts = c.execute("SELECT timestamp FROM core_snapshot ORDER BY timestamp DESC").fetchall()
     conn.commit()
     conn.close()

-    after = list(map(lambda x: int(x[0].split(".")[0]), timestamp))
+    lowerts = lowerts[0].split(".")[0]
+    higherts = higherts[0].split(".")[0]

-    subprocess.run(['archivebox', 'remove', '--filter-type=regex', '.*', '--yes', '--delete', '--after', str(after[1])], capture_output=True)
+    # after is greater than or equal to, so both snapshots get deleted
+    subprocess.run(['archivebox', 'remove', '--filter-type=regex', '.*', '--yes', '--delete', '--after', lowerts], capture_output=True)

-    assert (tmp_path / "archive" / timestamp[1][0]).exists()
-    assert not (tmp_path / "archive" / timestamp[0][0]).exists()
+    assert not (tmp_path / "archive" / lowerts).exists()
+    assert not (tmp_path / "archive" / higherts).exists()
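
The rewritten assertions pin down the CLI's boundary semantics as stated in the new comments: remove --before deletes snapshots whose timestamp is strictly less than the cutoff, while remove --after deletes those greater than or equal to it, which is why passing the lower timestamp to --after deletes both snapshots. A sketch of that comparison logic (the tuples and helper here are illustrative, not ArchiveBox's actual queryset code):

    # hypothetical stand-in for two archived snapshots: (timestamp, url)
    snapshots = [('1600000000.0', 'http://a.example'), ('1600000009.0', 'http://b.example')]

    def removed(snapshots, before=None, after=None):
        out = []
        for ts, url in snapshots:
            t = float(ts)
            if before is not None and t < before:    # --before: strictly less than
                out.append(url)
            if after is not None and t >= after:     # --after: greater than or equal
                out.append(url)
        return out

    print(removed(snapshots, before=1600000009))  # ['http://a.example']: only the older one
    print(removed(snapshots, after=1600000000))   # both snapshots get deleted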

+ 13 - 0
uwsgi.ini

@@ -0,0 +1,13 @@
+[uwsgi]
+socket = 127.0.0.1:3031
+chdir = ../
+http = 0.0.0.0:8001
+env = OUTPUT_DIR=./data
+wsgi-file = archivebox/core/wsgi.py
+processes = 4
+threads = 1
+stats = 127.0.0.1:9191
+static-map /static=./archivebox/templates/static
+harakiri = 172800
+post-buffering = 1
+disable-logging = True