add pipenv, schedule cmd, logs dir, and lots more

Nick Sweeting 6 years ago
parent
commit
39a0ab3013

+ 22 - 0
Pipfile

@@ -0,0 +1,22 @@
+[[source]]
+name = "pypi"
+url = "https://pypi.org/simple"
+verify_ssl = true
+
+[dev-packages]
+ipdb = "*"
+flake8 = "*"
+mypy = "*"
+django-stubs = "*"
+setuptools = "*"
+
+[packages]
+dataclasses = "*"
+base32-crockford = "*"
+django = "*"
+youtube-dl = "*"
+python-crontab = "*"
+croniter = "*"
+
+[requires]
+python_version = ">=3.6"

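With the new Pipfile, the development environment is managed by pipenv instead of a bare requirements.txt. A minimal sketch of the intended workflow (assuming pipenv is installed):

    pipenv install --dev    # install [packages] and [dev-packages] from the Pipfile
    pipenv shell            # enter the project virtualenv
    pipenv lock             # regenerate Pipfile.lock after editing dependencies
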
+ 314 - 0
Pipfile.lock

@@ -0,0 +1,314 @@
+{
+    "_meta": {
+        "hash": {
+            "sha256": "7f25fb9c97e469fdb787e755c5756e2be4b0b649e3c5ad8feb17200b32d3bb36"
+        },
+        "pipfile-spec": 6,
+        "requires": {
+            "python_version": ">=3.6"
+        },
+        "sources": [
+            {
+                "name": "pypi",
+                "url": "https://pypi.org/simple",
+                "verify_ssl": true
+            }
+        ]
+    },
+    "default": {
+        "base32-crockford": {
+            "hashes": [
+                "sha256:115f5bd32ae32b724035cb02eb65069a8824ea08c08851eb80c8b9f63443a969",
+                "sha256:295ef5ffbf6ed96b6e739ffd36be98fa7e90a206dd18c39acefb15777eedfe6e"
+            ],
+            "index": "pypi",
+            "version": "==0.3.0"
+        },
+        "croniter": {
+            "hashes": [
+                "sha256:625949cbd38a0b2325295591940dfa5fa0dfca41d03150ae0284a924e0be10f0",
+                "sha256:66b6a9c6b2d1a85d4af51453b2328be775a173e688b69eb3a96a7ec752ba77a3"
+            ],
+            "index": "pypi",
+            "version": "==0.3.29"
+        },
+        "dataclasses": {
+            "hashes": [
+                "sha256:454a69d788c7fda44efd71e259be79577822f5e3f53f029a22d08004e951dc9f",
+                "sha256:6988bd2b895eef432d562370bb707d540f32f7360ab13da45340101bc2307d84"
+            ],
+            "index": "pypi",
+            "version": "==0.6"
+        },
+        "django": {
+            "hashes": [
+                "sha256:7c3543e4fb070d14e10926189a7fcf42ba919263b7473dceaefce34d54e8a119",
+                "sha256:a2814bffd1f007805b19194eb0b9a331933b82bd5da1c3ba3d7b7ba16e06dc4b"
+            ],
+            "index": "pypi",
+            "version": "==2.2"
+        },
+        "python-crontab": {
+            "hashes": [
+                "sha256:91ce4b245ee5e5c117aa0b21b485bc43f2d80df854a36e922b707643f50d7923"
+            ],
+            "index": "pypi",
+            "version": "==2.3.6"
+        },
+        "python-dateutil": {
+            "hashes": [
+                "sha256:7e6584c74aeed623791615e26efd690f29817a27c73085b78e4bad02493df2fb",
+                "sha256:c89805f6f4d64db21ed966fda138f8a5ed7a4fdbc1a8ee329ce1b74e3c74da9e"
+            ],
+            "version": "==2.8.0"
+        },
+        "pytz": {
+            "hashes": [
+                "sha256:303879e36b721603cc54604edcac9d20401bdbe31e1e4fdee5b9f98d5d31dfda",
+                "sha256:d747dd3d23d77ef44c6a3526e274af6efeb0a6f1afd5a69ba4d5be4098c8e141"
+            ],
+            "version": "==2019.1"
+        },
+        "six": {
+            "hashes": [
+                "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c",
+                "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73"
+            ],
+            "version": "==1.12.0"
+        },
+        "sqlparse": {
+            "hashes": [
+                "sha256:40afe6b8d4b1117e7dff5504d7a8ce07d9a1b15aeeade8a2d10f130a834f8177",
+                "sha256:7c3dca29c022744e95b547e867cee89f4fce4373f3549ccd8797d8eb52cdb873"
+            ],
+            "version": "==0.3.0"
+        },
+        "youtube-dl": {
+            "hashes": [
+                "sha256:0d25459093870bf560bccafe9015e59402d7de1b2c956593623ba4c2840153e5",
+                "sha256:ea0824ae9a166059ec754c267480198a074bd899c20b2ba497809bac099cde2e"
+            ],
+            "index": "pypi",
+            "version": "==2019.4.17"
+        }
+    },
+    "develop": {
+        "appnope": {
+            "hashes": [
+                "sha256:5b26757dc6f79a3b7dc9fab95359328d5747fcb2409d331ea66d0272b90ab2a0",
+                "sha256:8b995ffe925347a2138d7ac0fe77155e4311a0ea6d6da4f5128fe4b3cbe5ed71"
+            ],
+            "markers": "sys_platform == 'darwin'",
+            "version": "==0.1.0"
+        },
+        "backcall": {
+            "hashes": [
+                "sha256:38ecd85be2c1e78f77fd91700c76e14667dc21e2713b63876c0eb901196e01e4",
+                "sha256:bbbf4b1e5cd2bdb08f915895b51081c041bac22394fdfcfdfbe9f14b77c08bf2"
+            ],
+            "version": "==0.1.0"
+        },
+        "decorator": {
+            "hashes": [
+                "sha256:86156361c50488b84a3f148056ea716ca587df2f0de1d34750d35c21312725de",
+                "sha256:f069f3a01830ca754ba5258fde2278454a0b5b79e0d7f5c13b3b97e57d4acff6"
+            ],
+            "version": "==4.4.0"
+        },
+        "django-stubs": {
+            "hashes": [
+                "sha256:9c06a4b28fc8c18f6abee4f199f8ee29cb5cfcecf349e912ded31cb3526ea2b6",
+                "sha256:9ef230843a24b5d74f2ebd4c60f9bea09c21911bc119d0325e8bb47e2f495e70"
+            ],
+            "index": "pypi",
+            "version": "==0.12.1"
+        },
+        "entrypoints": {
+            "hashes": [
+                "sha256:589f874b313739ad35be6e0cd7efde2a4e9b6fea91edcc34e58ecbb8dbe56d19",
+                "sha256:c70dd71abe5a8c85e55e12c19bd91ccfeec11a6e99044204511f9ed547d48451"
+            ],
+            "version": "==0.3"
+        },
+        "flake8": {
+            "hashes": [
+                "sha256:859996073f341f2670741b51ec1e67a01da142831aa1fdc6242dbf88dffbe661",
+                "sha256:a796a115208f5c03b18f332f7c11729812c8c3ded6c46319c59b53efd3819da8"
+            ],
+            "index": "pypi",
+            "version": "==3.7.7"
+        },
+        "ipdb": {
+            "hashes": [
+                "sha256:dce2112557edfe759742ca2d0fee35c59c97b0cc7a05398b791079d78f1519ce"
+            ],
+            "index": "pypi",
+            "version": "==0.12"
+        },
+        "ipython": {
+            "hashes": [
+                "sha256:b038baa489c38f6d853a3cfc4c635b0cda66f2864d136fe8f40c1a6e334e2a6b",
+                "sha256:f5102c1cd67e399ec8ea66bcebe6e3968ea25a8977e53f012963e5affeb1fe38"
+            ],
+            "markers": "python_version >= '3.4'",
+            "version": "==7.4.0"
+        },
+        "ipython-genutils": {
+            "hashes": [
+                "sha256:72dd37233799e619666c9f639a9da83c34013a73e8bbc79a7a6348d93c61fab8",
+                "sha256:eb2e116e75ecef9d4d228fdc66af54269afa26ab4463042e33785b887c628ba8"
+            ],
+            "version": "==0.2.0"
+        },
+        "jedi": {
+            "hashes": [
+                "sha256:2bb0603e3506f708e792c7f4ad8fc2a7a9d9c2d292a358fbbd58da531695595b",
+                "sha256:2c6bcd9545c7d6440951b12b44d373479bf18123a401a52025cf98563fbd826c"
+            ],
+            "version": "==0.13.3"
+        },
+        "mccabe": {
+            "hashes": [
+                "sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42",
+                "sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f"
+            ],
+            "version": "==0.6.1"
+        },
+        "mypy": {
+            "hashes": [
+                "sha256:2afe51527b1f6cdc4a5f34fc90473109b22bf7f21086ba3e9451857cf11489e6",
+                "sha256:56a16df3e0abb145d8accd5dbb70eba6c4bd26e2f89042b491faa78c9635d1e2",
+                "sha256:5764f10d27b2e93c84f70af5778941b8f4aa1379b2430f85c827e0f5464e8714",
+                "sha256:5bbc86374f04a3aa817622f98e40375ccb28c4836f36b66706cf3c6ccce86eda",
+                "sha256:6a9343089f6377e71e20ca734cd8e7ac25d36478a9df580efabfe9059819bf82",
+                "sha256:6c9851bc4a23dc1d854d3f5dfd5f20a016f8da86bcdbb42687879bb5f86434b0",
+                "sha256:b8e85956af3fcf043d6f87c91cbe8705073fc67029ba6e22d3468bfee42c4823",
+                "sha256:b9a0af8fae490306bc112229000aa0c2ccc837b49d29a5c42e088c132a2334dd",
+                "sha256:bbf643528e2a55df2c1587008d6e3bda5c0445f1240dfa85129af22ae16d7a9a",
+                "sha256:c46ab3438bd21511db0f2c612d89d8344154c0c9494afc7fbc932de514cf8d15",
+                "sha256:f7a83d6bd805855ef83ec605eb01ab4fa42bcef254b13631e451cbb44914a9b0"
+            ],
+            "index": "pypi",
+            "version": "==0.701"
+        },
+        "mypy-extensions": {
+            "hashes": [
+                "sha256:37e0e956f41369209a3d5f34580150bcacfabaa57b33a15c0b25f4b5725e0812",
+                "sha256:b16cabe759f55e3409a7d231ebd2841378fb0c27a5d1994719e340e4f429ac3e"
+            ],
+            "version": "==0.4.1"
+        },
+        "parso": {
+            "hashes": [
+                "sha256:17cc2d7a945eb42c3569d4564cdf49bde221bc2b552af3eca9c1aad517dcdd33",
+                "sha256:2e9574cb12e7112a87253e14e2c380ce312060269d04bd018478a3c92ea9a376"
+            ],
+            "version": "==0.4.0"
+        },
+        "pexpect": {
+            "hashes": [
+                "sha256:2094eefdfcf37a1fdbfb9aa090862c1a4878e5c7e0e7e7088bdb511c558e5cd1",
+                "sha256:9e2c1fd0e6ee3a49b28f95d4b33bc389c89b20af6a1255906e90ff1262ce62eb"
+            ],
+            "markers": "sys_platform != 'win32'",
+            "version": "==4.7.0"
+        },
+        "pickleshare": {
+            "hashes": [
+                "sha256:87683d47965c1da65cdacaf31c8441d12b8044cdec9aca500cd78fc2c683afca",
+                "sha256:9649af414d74d4df115d5d718f82acb59c9d418196b7b4290ed47a12ce62df56"
+            ],
+            "version": "==0.7.5"
+        },
+        "prompt-toolkit": {
+            "hashes": [
+                "sha256:11adf3389a996a6d45cc277580d0d53e8a5afd281d0c9ec71b28e6f121463780",
+                "sha256:2519ad1d8038fd5fc8e770362237ad0364d16a7650fb5724af6997ed5515e3c1",
+                "sha256:977c6583ae813a37dc1c2e1b715892461fcbdaa57f6fc62f33a528c4886c8f55"
+            ],
+            "version": "==2.0.9"
+        },
+        "ptyprocess": {
+            "hashes": [
+                "sha256:923f299cc5ad920c68f2bc0bc98b75b9f838b93b599941a6b63ddbc2476394c0",
+                "sha256:d7cc528d76e76342423ca640335bd3633420dc1366f258cb31d05e865ef5ca1f"
+            ],
+            "version": "==0.6.0"
+        },
+        "pycodestyle": {
+            "hashes": [
+                "sha256:95a2219d12372f05704562a14ec30bc76b05a5b297b21a5dfe3f6fac3491ae56",
+                "sha256:e40a936c9a450ad81df37f549d676d127b1b66000a6c500caa2b085bc0ca976c"
+            ],
+            "version": "==2.5.0"
+        },
+        "pyflakes": {
+            "hashes": [
+                "sha256:17dbeb2e3f4d772725c777fabc446d5634d1038f234e77343108ce445ea69ce0",
+                "sha256:d976835886f8c5b31d47970ed689944a0262b5f3afa00a5a7b4dc81e5449f8a2"
+            ],
+            "version": "==2.1.1"
+        },
+        "pygments": {
+            "hashes": [
+                "sha256:5ffada19f6203563680669ee7f53b64dabbeb100eb51b61996085e99c03b284a",
+                "sha256:e8218dd399a61674745138520d0d4cf2621d7e032439341bc3f647bff125818d"
+            ],
+            "version": "==2.3.1"
+        },
+        "six": {
+            "hashes": [
+                "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c",
+                "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73"
+            ],
+            "version": "==1.12.0"
+        },
+        "traitlets": {
+            "hashes": [
+                "sha256:9c4bd2d267b7153df9152698efb1050a5d84982d3384a37b2c1f7723ba3e7835",
+                "sha256:c6cb5e6f57c5a9bdaa40fa71ce7b4af30298fbab9ece9815b5d995ab6217c7d9"
+            ],
+            "version": "==4.3.2"
+        },
+        "typed-ast": {
+            "hashes": [
+                "sha256:04894d268ba6eab7e093d43107869ad49e7b5ef40d1a94243ea49b352061b200",
+                "sha256:16616ece19daddc586e499a3d2f560302c11f122b9c692bc216e821ae32aa0d0",
+                "sha256:252fdae740964b2d3cdfb3f84dcb4d6247a48a6abe2579e8029ab3be3cdc026c",
+                "sha256:2af80a373af123d0b9f44941a46df67ef0ff7a60f95872412a145f4500a7fc99",
+                "sha256:2c88d0a913229a06282b285f42a31e063c3bf9071ff65c5ea4c12acb6977c6a7",
+                "sha256:2ea99c029ebd4b5a308d915cc7fb95b8e1201d60b065450d5d26deb65d3f2bc1",
+                "sha256:3d2e3ab175fc097d2a51c7a0d3fda442f35ebcc93bb1d7bd9b95ad893e44c04d",
+                "sha256:4766dd695548a15ee766927bf883fb90c6ac8321be5a60c141f18628fb7f8da8",
+                "sha256:56b6978798502ef66625a2e0f80cf923da64e328da8bbe16c1ff928c70c873de",
+                "sha256:5cddb6f8bce14325b2863f9d5ac5c51e07b71b462361fd815d1d7706d3a9d682",
+                "sha256:644ee788222d81555af543b70a1098f2025db38eaa99226f3a75a6854924d4db",
+                "sha256:64cf762049fc4775efe6b27161467e76d0ba145862802a65eefc8879086fc6f8",
+                "sha256:68c362848d9fb71d3c3e5f43c09974a0ae319144634e7a47db62f0f2a54a7fa7",
+                "sha256:6c1f3c6f6635e611d58e467bf4371883568f0de9ccc4606f17048142dec14a1f",
+                "sha256:b213d4a02eec4ddf622f4d2fbc539f062af3788d1f332f028a2e19c42da53f15",
+                "sha256:bb27d4e7805a7de0e35bd0cb1411bc85f807968b2b0539597a49a23b00a622ae",
+                "sha256:c9d414512eaa417aadae7758bc118868cd2396b0e6138c1dd4fda96679c079d3",
+                "sha256:f0937165d1e25477b01081c4763d2d9cdc3b18af69cb259dd4f640c9b900fe5e",
+                "sha256:fb96a6e2c11059ecf84e6741a319f93f683e440e341d4489c9b161eca251cf2a",
+                "sha256:fc71d2d6ae56a091a8d94f33ec9d0f2001d1cb1db423d8b4355debfe9ce689b7"
+            ],
+            "version": "==1.3.4"
+        },
+        "typing-extensions": {
+            "hashes": [
+                "sha256:07b2c978670896022a43c4b915df8958bec4a6b84add7f2c87b2b728bda3ba64",
+                "sha256:f3f0e67e1d42de47b5c67c32c9b26641642e9170fe7e292991793705cd5fef7c",
+                "sha256:fb2cd053238d33a8ec939190f30cfd736c00653a85a2919415cecf7dc3d9da71"
+            ],
+            "version": "==3.7.2"
+        },
+        "wcwidth": {
+            "hashes": [
+                "sha256:3df37372226d6e63e1b1e1eda15c594bca98a22d33a23832a90998faa96bc65e",
+                "sha256:f4ebe71925af7b40a864553f761ed559b43544f8f71746c2d756c7fe788ade7c"
+            ],
+            "version": "==0.1.7"
+        }
+    }
+}

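The _meta.hash in Pipfile.lock pins the Pipfile contents, so installs can be made reproducible. For example (a hedged sketch of a deploy setup):

    pipenv install --deploy   # fail if Pipfile.lock is out of date with the Pipfile
    pipenv sync               # install exactly the versions recorded in the lock file
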
+ 39 - 10
archivebox/cli/__init__.py

@@ -1,30 +1,59 @@
 __package__ = 'archivebox.cli'
 
 import os
+
+from typing import Dict
 from importlib import import_module
 
 CLI_DIR = os.path.dirname(os.path.abspath(__file__))
 
-required_attrs = ('__package__', '__command__', '__description__', 'main')
+# these common commands will appear sorted before any others for ease-of-use
+display_first = ('help', 'version', 'init', 'list', 'update', 'add', 'remove')
 
+# every imported command module must have these properties in order to be valid
+required_attrs = ('__package__', '__command__', 'main')
 
-order = ('help', 'version', 'init', 'list', 'update', 'add', 'remove')
+# basic checks to make sure imported files are valid subcommands
+is_cli_module = lambda fname: fname.startswith('archivebox_') and fname.endswith('.py')
+is_valid_cli_module = lambda module, subcommand: (
+    all(hasattr(module, attr) for attr in required_attrs)
+    and module.__command__.split(' ')[-1] == subcommand
+)
 
+def list_subcommands() -> Dict[str, str]:
+    """find and import all valid archivebox_<subcommand>.py files in CLI_DIR"""
 
-def list_subcommands():
     COMMANDS = []
     for filename in os.listdir(CLI_DIR):
-        if filename.startswith('archivebox_') and filename.endswith('.py'):
+        if is_cli_module(filename):
             subcommand = filename.replace('archivebox_', '').replace('.py', '')
             module = import_module('.archivebox_{}'.format(subcommand), __package__)
+            assert is_valid_cli_module(module, subcommand)
+            COMMANDS.append((subcommand, module.__description__))  # type: ignore
+            globals()[subcommand] = module.main
+            module.main.__doc__ = module.__description__
+
+    display_order = lambda cmd: (
+        display_first.index(cmd[0])
+        if cmd[0] in display_first else
+        100 + len(cmd[0])
+    )
 
-            assert all(hasattr(module, attr) for attr in required_attrs)
-            assert module.__command__.split(' ')[-1] == subcommand
-            COMMANDS.append((subcommand, module.__description__))
+    return dict(sorted(COMMANDS, key=display_order))
 
-    return dict(sorted(COMMANDS, key=lambda cmd: order.index(cmd[0]) if cmd[0] in order else 10 + len(cmd[0]))) 
 
+def run_subcommand(subcommand: str, args=None) -> None:
+    """run a given ArchiveBox subcommand with the given list of args"""
 
-def run_subcommand(subcommand: str, args=None):
     module = import_module('.archivebox_{}'.format(subcommand), __package__)
-    return module.main(args)    # type: ignore
+    module.main(args)    # type: ignore
+
+
+SUBCOMMANDS = list_subcommands()
+
+__all__ = (
+    'SUBCOMMANDS',
+    'list_subcommands',
+    'run_subcommand',
+    *SUBCOMMANDS.keys(),
+)

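Because list_subcommands() now injects each module's main() into the package globals and __all__, subcommands are callable as plain functions. A minimal sketch (hypothetical usage, assuming the package is importable):

    from archivebox.cli import SUBCOMMANDS, run_subcommand

    for name, description in SUBCOMMANDS.items():
        print(f'{name:<10} {description}')    # commands in display_first sort before the rest

    run_subcommand('version', args=[])        # same as running `archivebox version`
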
+ 1 - 0
archivebox/cli/archivebox_add.py

@@ -82,5 +82,6 @@ def main(args=None, stdin=None):
         only_new=command.only_new,
     )
 
+
 if __name__ == '__main__':
     main()

+ 0 - 1
archivebox/cli/archivebox_init.py

@@ -4,7 +4,6 @@ __package__ = 'archivebox.cli'
 __command__ = 'archivebox init'
 __description__ = 'Initialize a new ArchiveBox collection in the current directory'
 
-import os
 import sys
 import argparse
 

+ 194 - 0
archivebox/cli/archivebox_schedule.py

@@ -0,0 +1,194 @@
+#!/usr/bin/env python3
+
+__package__ = 'archivebox.cli'
+__command__ = 'archivebox schedule'
+__description__ = 'Set ArchiveBox to run regularly at a specific time'
+
+import os
+import sys
+import argparse
+
+from datetime import datetime
+from crontab import CronTab, CronSlices
+
+
+from ..legacy.util import reject_stdin
+from ..legacy.config import (
+    OUTPUT_DIR,
+    LOGS_DIR,
+    ARCHIVEBOX_BINARY,
+    USER,
+    ANSI,
+    stderr,
+)
+
+
+CRON_COMMENT = 'archivebox_schedule'
+
+
+def main(args=None):
+    args = sys.argv[1:] if args is None else args
+
+    parser = argparse.ArgumentParser(
+        prog=__command__,
+        description=__description__,
+        add_help=True,
+    )
+    parser.add_argument(
+        '--quiet', '-q',
+        action='store_true',
+        help=("Don't warn about storage space."),
+    )
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument(
+        '--add', # '-a',
+        action='store_true',
+        help='Add a new scheduled ArchiveBox update job to cron',
+    )
+    parser.add_argument(
+        '--every', # '-e',
+        type=str,
+        default='daily',
+        help='Run ArchiveBox once every [timeperiod] (hour/day/week/month/year or cron format e.g. "0 0 * * *")',
+    )
+    group.add_argument(
+        '--clear', # '-c'
+        action='store_true',
+        help=("Stop all ArchiveBox scheduled runs, clear it completely from cron"),
+    )
+    group.add_argument(
+        '--show', # '-s'
+        action='store_true',
+        help=("Print a list of currently active ArchiveBox cron jobs"),
+    )
+    group.add_argument(
+        '--foreground', '-f',
+        action='store_true',
+        help=("Launch ArchiveBox as a long-running foreground task "
+              "instead of using cron."),
+    )
+    group.add_argument(
+        '--run-all', # '-a',
+        action='store_true',
+        help='Run all the scheduled jobs once immediately, independent of their configured schedules',
+    )
+    parser.add_argument(
+        'import_path',
+        nargs='?',
+        type=str,
+        default=None,
+        help=("Check this path and import any new links on every run "
+              "(can be either local file or remote URL)"),
+    )
+    command = parser.parse_args(args)
+    reject_stdin(__command__)
+
+    os.makedirs(LOGS_DIR, exist_ok=True)
+
+    cron = CronTab(user=True)
+    cron = dedupe_jobs(cron)
+
+    existing_jobs = list(cron.find_comment(CRON_COMMENT))
+    if command.foreground or command.run_all:
+        if command.import_path or (not existing_jobs):
+            stderr('{red}[X] You must schedule some jobs first before running in foreground mode.{reset}'.format(**ANSI))
+            stderr('    archivebox schedule --every=hour https://example.com/some/rss/feed.xml')
+            raise SystemExit(1)
+        print('{green}[*] Running {} ArchiveBox jobs in foreground task scheduler...{reset}'.format(len(existing_jobs), **ANSI))
+        if command.run_all:
+            try:
+                for job in existing_jobs:
+                    sys.stdout.write(f'  > {job.command}')
+                    sys.stdout.flush()
+                    job.run()
+                    sys.stdout.write(f'\r  √ {job.command}\n')
+            except KeyboardInterrupt:
+                print('\n{green}[√] Stopped.{reset}'.format(**ANSI))
+                raise SystemExit(1)
+        if command.foreground:
+            try:
+                for result in cron.run_scheduler():
+                    print(result)
+            except KeyboardInterrupt:
+                print('\n{green}[√] Stopped.{reset}'.format(**ANSI))
+                raise SystemExit(1)
+
+    elif command.show:
+        if existing_jobs:
+            print('\n'.join(str(cmd) for cmd in existing_jobs))
+        else:
+            stderr('{red}[X] There are no ArchiveBox cron jobs scheduled for your user ({}).{reset}'.format(USER, **ANSI))
+            stderr('    To schedule a new job, run:')
+            stderr('        archivebox schedule --every=[timeperiod] https://example.com/some/rss/feed.xml')
+        raise SystemExit(0)
+
+    elif command.clear:
+        print(cron.remove_all(comment=CRON_COMMENT))
+        cron.write()
+        raise SystemExit(0)
+
+    elif command.every:
+        quoted = lambda s: f'"{s}"' if s and ' ' in s else s
+        cmd = [
+            'cd',
+            quoted(OUTPUT_DIR),
+            '&&',
+            quoted(ARCHIVEBOX_BINARY),
+            *(('add', f'"{command.import_path}"',) if command.import_path else ('update',)),
+            # redirect both stdout and stderr into the logfile ('> file 2>&1')
+            '>',
+            quoted(os.path.join(LOGS_DIR, 'archivebox.log')),
+            '2>&1',
+        ]
+        new_job = cron.new(command=' '.join(cmd), comment=CRON_COMMENT)
+
+        if command.every in ('minute', 'hour', 'day', 'week', 'month', 'year'):
+            set_every = getattr(new_job.every(), command.every)
+            set_every()
+        elif CronSlices.is_valid(command.every):
+            new_job.setall(command.every)
+        else:
+            stderr('{red}[X] Got invalid timeperiod for cron task.{reset}'.format(**ANSI))
+        stderr('    It must be one of minute/hour/day/week/month/year')
+        stderr('    or a quoted cron-format schedule like:')
+        stderr('        archivebox schedule --every=day https://example.com/some/rss/feed.xml')
+        stderr('        archivebox schedule --every="0/5 * * * *" https://example.com/some/rss/feed.xml')
+            raise SystemExit(1)
+
+        cron = dedupe_jobs(cron)
+        cron.write()
+
+        total_runs = sum(j.frequency_per_year() for j in cron)
+        existing_jobs = list(cron.find_comment(CRON_COMMENT))
+
+        print()
+        print('{green}[√] Scheduled new ArchiveBox cron job for user: {} ({} jobs are active).{reset}'.format(USER, len(existing_jobs), **ANSI))
+        print('\n'.join(f'  > {cmd}' if str(cmd) == str(new_job) else f'    {cmd}' for cmd in existing_jobs))
+        if total_runs > 60 and not command.quiet:
+            stderr()
+            stderr('{lightyellow}[!] With the current cron config, ArchiveBox is estimated to run >{} times per year.{reset}'.format(total_runs, **ANSI))
+            stderr(f'    Congrats on being an enthusiastic internet archiver! 👌')
+            stderr()
+            stderr('    Make sure you have enough storage space available to hold all the data.')
+            stderr('    Using a compressed/deduped filesystem like ZFS is recommended if you plan on archiving a lot.')
+        raise SystemExit(0)
+
+
+def dedupe_jobs(cron: CronTab) -> CronTab:
+    deduped = set()
+    for job in list(cron):
+        unique_tuple = (str(job.slices), job.command)
+        if unique_tuple not in deduped:
+            deduped.add(unique_tuple)
+        cron.remove(job)
+
+    for schedule, command in deduped:
+        job = cron.new(command=command, comment=CRON_COMMENT)
+        job.setall(schedule)
+        job.enable()
+
+    return cron
+
+
+if __name__ == '__main__':
+    main()

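Example invocations of the new schedule subcommand, based on the help text above:

    archivebox schedule --every=day https://example.com/some/rss/feed.xml    # add a daily import job
    archivebox schedule --every="0/5 * * * *" https://example.com/feed.xml   # custom cron expression
    archivebox schedule --show                                               # list active ArchiveBox jobs
    archivebox schedule --foreground                                         # run the scheduler in-process
    archivebox schedule --clear                                              # remove all ArchiveBox jobs

Each job is written to the user's crontab tagged with the archivebox_schedule comment, producing an entry roughly like this (illustrative only, paths vary):

    0 0 * * * cd "/path/to/output" && archivebox add "https://example.com/feed.xml" > /path/to/output/logs/archivebox.log 2>&1 # archivebox_schedule
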
+ 2 - 2
archivebox/cli/archivebox_server.py

@@ -7,7 +7,7 @@ __description__ = 'Run the ArchiveBox HTTP server'
 import sys
 import argparse
 
-from ..legacy.config import setup_django
+from ..legacy.config import setup_django, OUTPUT_DIR
 from ..legacy.util import reject_stdin
 
 
@@ -29,7 +29,7 @@ def main(args=None):
     command = parser.parse_args(args)
     reject_stdin(__command__)
     
-    setup_django()
+    setup_django(OUTPUT_DIR)
     from django.core.management import call_command
     call_command("runserver", *command.runserver_args)
 

+ 2 - 2
archivebox/cli/archivebox_shell.py

@@ -7,7 +7,7 @@ __description__ = 'Enter an interactive ArchiveBox Django shell'
 import sys
 import argparse
 
-from ..legacy.config import setup_django
+from ..legacy.config import setup_django, OUTPUT_DIR
 from ..legacy.util import reject_stdin
 
 
@@ -22,7 +22,7 @@ def main(args=None):
     parser.parse_args(args)
     reject_stdin(__command__)
     
-    setup_django()
+    setup_django(OUTPUT_DIR)
     from django.core.management import call_command
     call_command("shell_plus")
 

+ 3 - 5
archivebox/core/settings.py

@@ -5,10 +5,8 @@ import os
 SECRET_KEY = '---------------- not a valid secret key ! ----------------'
 DEBUG = True
 
-OUTPUT_DIR = os.path.abspath(os.curdir)
-DATABASE_DIR_NAME = 'database'
-DATABASE_FILE_NAME = 'database.sqlite3'
-DATABASE_FILE = os.path.join(OUTPUT_DIR, DATABASE_DIR_NAME, DATABASE_FILE_NAME)
+OUTPUT_DIR = os.path.abspath(os.getenv('OUTPUT_DIR', os.curdir))
+DATABASE_FILE = os.path.join(OUTPUT_DIR, 'index.sqlite3')
 
 
 INSTALLED_APPS = [
@@ -38,7 +36,7 @@ ROOT_URLCONF = 'core.urls'
 TEMPLATES = [
     {
         'BACKEND': 'django.template.backends.django.DjangoTemplates',
-        'DIRS': ['templates'],
+        'DIRS': ['themes'],
         'APP_DIRS': True,
         'OPTIONS': {
             'context_processors': [

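core/settings.py now derives the database location from the OUTPUT_DIR environment variable instead of hardcoding database/database.sqlite3. A short sketch of the effect (path hypothetical):

    import os
    os.environ['OUTPUT_DIR'] = '/data/archive'
    # after django.setup(), settings.DATABASE_FILE == '/data/archive/index.sqlite3'
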
+ 0 - 15
archivebox/env.py

@@ -1,15 +0,0 @@
-import os
-import sys
-
-
-PYTHON_DIR = os.path.dirname(os.path.abspath(__file__))
-
-sys.path.append(PYTHON_DIR)
-os.environ.setdefault("DJANGO_SETTINGS_MODULE", "core.settings")
-
-import django
-django.setup()
-
-from django.conf import settings
-
-DATABASE_FILE = settings.DATABASE_FILE

+ 28 - 14
archivebox/legacy/config.py

@@ -60,7 +60,6 @@ WGET_BINARY =            os.getenv('WGET_BINARY',            'wget')
 YOUTUBEDL_BINARY =       os.getenv('YOUTUBEDL_BINARY',       'youtube-dl')
 CHROME_BINARY =          os.getenv('CHROME_BINARY',          None)
 
-
 # ******************************************************************************
 
 ### Terminal Configuration
@@ -84,6 +83,7 @@ def stderr(*args):
     sys.stderr.write(' '.join(str(a) for a in args) + '\n')
 
 USER = getpass.getuser() or os.getlogin()
+ARCHIVEBOX_BINARY = sys.argv[0]
 
 REPO_DIR = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', '..'))
 if OUTPUT_DIR:
@@ -91,14 +91,15 @@ if OUTPUT_DIR:
 else:
     OUTPUT_DIR = os.path.abspath(os.curdir)
 
+SQL_INDEX_FILENAME = 'index.sqlite3'
+JSON_INDEX_FILENAME = 'index.json'
+HTML_INDEX_FILENAME = 'index.html'
 ARCHIVE_DIR_NAME = 'archive'
 SOURCES_DIR_NAME = 'sources'
-DATABASE_DIR_NAME = 'database'
-DATABASE_FILE_NAME = 'database.sqlite3'
+LOGS_DIR_NAME = 'logs'
 ARCHIVE_DIR = os.path.join(OUTPUT_DIR, ARCHIVE_DIR_NAME)
 SOURCES_DIR = os.path.join(OUTPUT_DIR, SOURCES_DIR_NAME)
-DATABASE_DIR = os.path.join(OUTPUT_DIR, DATABASE_DIR_NAME)
-DATABASE_FILE = os.path.join(OUTPUT_DIR, DATABASE_DIR_NAME, DATABASE_FILE_NAME)
+LOGS_DIR = os.path.join(OUTPUT_DIR, LOGS_DIR_NAME)
 
 PYTHON_DIR = os.path.join(REPO_DIR, 'archivebox')
 LEGACY_DIR = os.path.join(PYTHON_DIR, 'legacy')
@@ -126,9 +127,10 @@ if USER == 'root':
     raise SystemExit(1)
 
 ### Check Python environment
-python_vers = float('{}.{}'.format(sys.version_info.major, sys.version_info.minor))
-if python_vers < 3.6:
-    stderr('{}[X] Python version is not new enough: {} (>3.6 is required){}'.format(ANSI['red'], python_vers, ANSI['reset']))
+PYTHON_BINARY = sys.executable
+PYTHON_VERSION = '{}.{}'.format(sys.version_info.major, sys.version_info.minor)
+if float(PYTHON_VERSION) < 3.6:
+    stderr('{}[X] Python version is not new enough: {} (>3.6 is required){}'.format(ANSI['red'], PYTHON_VERSION, ANSI['reset']))
     stderr('    See https://github.com/pirate/ArchiveBox/wiki/Troubleshooting#python for help upgrading your Python installation.')
     raise SystemExit(1)
 
@@ -150,6 +152,7 @@ if sys.stdout.encoding.upper() not in ('UTF-8', 'UTF8'):
 
 def bin_version(binary: str) -> Optional[str]:
     """check the presence and return valid version line of a specified binary"""
+
     global HAS_INVALID_DEPENDENCIES
     binary = os.path.expanduser(binary)
     try:
@@ -223,12 +226,17 @@ def find_chrome_data_dir() -> Optional[str]:
     return None
 
 
-def setup_django():
+def setup_django(out_dir: str=OUTPUT_DIR, check_db=False):
     import django
     sys.path.append(PYTHON_DIR)
+    os.environ.setdefault('OUTPUT_DIR', out_dir)
     os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings')
     django.setup()
 
+    if check_db:
+        assert os.path.exists(os.path.join(out_dir, SQL_INDEX_FILENAME)), (
+            f'No database file {SQL_INDEX_FILENAME} found in OUTPUT_DIR: {out_dir}')
+
 # ******************************************************************************
 # ************************ Environment & Dependencies **************************
 # ******************************************************************************
@@ -338,16 +346,16 @@ try:
             'enabled': True,
             'is_valid': os.path.exists(SOURCES_DIR),
         },
+        'LOGS_DIR': {
+            'path': os.path.abspath(LOGS_DIR),
+            'enabled': True,
+            'is_valid': os.path.exists(LOGS_DIR),
+        },
         'ARCHIVE_DIR': {
             'path': os.path.abspath(ARCHIVE_DIR),
             'enabled': True,
             'is_valid': os.path.exists(ARCHIVE_DIR),
         },
-        'DATABASE_DIR': {
-            'path': os.path.abspath(DATABASE_DIR),
-            'enabled': True,
-            'is_valid': os.path.exists(DATABASE_FILE),
-        },
         'CHROME_USER_DATA_DIR': {
             'path': CHROME_USER_DATA_DIR and os.path.abspath(CHROME_USER_DATA_DIR),
             'enabled': USE_CHROME and CHROME_USER_DATA_DIR,
@@ -361,6 +369,12 @@ try:
     }
 
     DEPENDENCIES = {
+        'PYTHON_BINARY': {
+            'path': PYTHON_BINARY,
+            'version': PYTHON_VERSION,
+            'enabled': True,
+            'is_valid': bool(DJANGO_VERSION),
+        },
         'DJANGO_BINARY': {
             'path': DJANGO_BINARY,
             'version': DJANGO_VERSION,

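setup_django() now takes the collection directory explicitly and can verify the SQL index exists before the ORM is used. A minimal usage sketch:

    from archivebox.legacy.config import setup_django, OUTPUT_DIR

    setup_django(OUTPUT_DIR, check_db=True)   # asserts index.sqlite3 exists in OUTPUT_DIR
    from core.models import Page              # ORM models are safe to import after setup
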
+ 46 - 44
archivebox/legacy/index.py

@@ -1,13 +1,17 @@
+__package__ = 'archivebox.legacy'
+
 import os
 import json
 
 from typing import List, Tuple, Optional, Iterable
 from collections import OrderedDict
+from contextlib import contextmanager
 
 from .schema import Link, ArchiveResult
 from .config import (
-    DATABASE_DIR,
-    DATABASE_FILE_NAME,
+    SQL_INDEX_FILENAME,
+    JSON_INDEX_FILENAME,
+    HTML_INDEX_FILENAME,
     OUTPUT_DIR,
     TIMEOUT,
     URL_BLACKLIST_PTN,
@@ -35,14 +39,13 @@ from .util import (
 from .parse import parse_links
 from .logs import (
     log_indexing_process_started,
+    log_indexing_process_finished,
     log_indexing_started,
     log_indexing_finished,
     log_parsing_started,
     log_parsing_finished,
 )
 
-
-
 ### Link filtering and checking
 
 @enforce_types
@@ -117,7 +120,7 @@ def validate_links(links: Iterable[Link]) -> Iterable[Link]:
     links = uniquefied_links(links)  # merge/dedupe duplicate timestamps & urls
 
     if not links:
-        stderr('{red}[X] No links found in index.json{reset}'.format(**ANSI))
+        stderr('{red}[X] No links found in index.{reset}'.format(**ANSI))
         stderr('    To add a link to your archive, run:')
         stderr("        archivebox add 'https://example.com'")
         stderr()
@@ -204,58 +207,63 @@ def lowest_uniq_timestamp(used_timestamps: OrderedDict, timestamp: str) -> str:
 
 ### Main Links Index
 
+@contextmanager
 @enforce_types
-def write_main_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool=False) -> None:
-    """create index.html file for a given list of links"""
-
-    log_indexing_process_started()
-
-    log_indexing_started(DATABASE_DIR, DATABASE_FILE_NAME)
+def timed_index_update(out_path: str):
+    log_indexing_started(out_path)
     timer = TimedProgress(TIMEOUT * 2, prefix='      ')
     try:
-        write_sql_main_index(links)
+        yield
     finally:
         timer.end()
-    log_indexing_finished(DATABASE_DIR, DATABASE_FILE_NAME)
 
-    log_indexing_started(out_dir, 'index.json')
-    timer = TimedProgress(TIMEOUT * 2, prefix='      ')
-    try:
+    assert os.path.exists(out_path), f'Failed to write index file: {out_path}'
+    log_indexing_finished(out_path)
+
+
+@enforce_types
+def write_main_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool=False) -> None:
+    """create index.html file for a given list of links"""
+
+    log_indexing_process_started(len(links))
+
+    with timed_index_update(os.path.join(out_dir, SQL_INDEX_FILENAME)):
+        write_sql_main_index(links, out_dir=out_dir)
+
+    with timed_index_update(os.path.join(out_dir, JSON_INDEX_FILENAME)):
         write_json_main_index(links, out_dir=out_dir)
-    finally:
-        timer.end()
-    log_indexing_finished(out_dir, 'index.json')
-    
-    log_indexing_started(out_dir, 'index.html')
-    timer = TimedProgress(TIMEOUT * 2, prefix='      ')
-    try:
+
+    with timed_index_update(os.path.join(out_dir, HTML_INDEX_FILENAME)):
         write_html_main_index(links, out_dir=out_dir, finished=finished)
-    finally:
-        timer.end()
-    log_indexing_finished(out_dir, 'index.html')
+
+    log_indexing_process_finished()
 
 
 @enforce_types
-def load_main_index(out_dir: str=OUTPUT_DIR, import_path: Optional[str]=None) -> Tuple[List[Link], List[Link]]:
+def load_main_index(out_dir: str=OUTPUT_DIR) -> List[Link]:
     """parse and load existing index with any new links from import_path merged in"""
 
-    existing_links: List[Link] = []
-    if out_dir:
-        existing_links = list(parse_json_main_index(out_dir))
-        existing_sql_links = list(parse_sql_main_index())
-        assert set(l.url for l in existing_links) == set(l['url'] for l in existing_sql_links)
+    all_links: List[Link] = []
+    all_links = list(parse_json_main_index(out_dir))
+    links_from_sql = list(parse_sql_main_index())
+    assert set(l.url for l in all_links) == set(l['url'] for l in links_from_sql)
+
+    return all_links
 
+
+@enforce_types
+def import_new_links(existing_links: List[Link], import_path: str) -> Tuple[List[Link], List[Link]]:
     new_links: List[Link] = []
-    if import_path:
-        # parse and validate the import file
-        log_parsing_started(import_path)
-        raw_links, parser_name = parse_links(import_path)
-        new_links = list(validate_links(raw_links))
+
+    # parse and validate the import file
+    log_parsing_started(import_path)
+    raw_links, parser_name = parse_links(import_path)
+    new_links = list(validate_links(raw_links))
 
     # merge existing links in out_dir and new links
     all_links = list(validate_links(existing_links + new_links))
 
-    if import_path and parser_name:
+    if parser_name:
         num_parsed = len(raw_links)
         num_new_links = len(all_links) - len(existing_links)
         log_parsing_finished(num_parsed, num_new_links, parser_name)
@@ -323,9 +331,3 @@ def load_link_details(link: Link, out_dir: Optional[str]=None) -> Link:
         return merge_links(existing_link, link)
 
     return link
-
-
-
-
-
-

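The per-format index writes are now wrapped in the shared timed_index_update() context manager, and importing new links is split out of load_main_index(). The resulting caller flow (as used in legacy/main.py below) looks roughly like:

    all_links = load_main_index(out_dir=OUTPUT_DIR)
    if import_path:
        all_links, new_links = import_new_links(all_links, import_path)
    write_main_index(links=all_links, out_dir=OUTPUT_DIR)
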
+ 14 - 10
archivebox/legacy/logs.py

@@ -6,7 +6,7 @@ from dataclasses import dataclass
 from typing import Optional, List
 
 from .schema import Link, ArchiveResult
-from .config import ANSI, OUTPUT_DIR
+from .config import ANSI, OUTPUT_DIR, IS_TTY
 
 
 @dataclass
@@ -42,7 +42,7 @@ def pretty_path(path: str) -> str:
 def log_parsing_started(source_file: str):
     start_ts = datetime.now()
     _LAST_RUN_STATS.parse_start_ts = start_ts
-    print('{green}[*] [{}] Parsing new links from output/sources/{}...{reset}'.format(
+    print('\n{green}[*] [{}] Parsing new links from output/sources/{}...{reset}'.format(
         start_ts.strftime('%Y-%m-%d %H:%M:%S'),
         source_file.rsplit('/', 1)[-1],
         **ANSI,
@@ -56,22 +56,26 @@ def log_parsing_finished(num_parsed: int, num_new_links: int, parser_name: str):
 
 ### Indexing Stage
 
-def log_indexing_process_started():
+def log_indexing_process_started(num_links: int):
     start_ts = datetime.now()
     _LAST_RUN_STATS.index_start_ts = start_ts
     print()
-    print('{green}[*] [{}] Saving main index files...{reset}'.format(
+    print('{green}[*] [{}] Updating {} links in main index...{reset}'.format(
         start_ts.strftime('%Y-%m-%d %H:%M:%S'),
+        num_links,
         **ANSI,
     ))
 
-def log_indexing_started(out_dir: str, out_file: str):
-    sys.stdout.write('    > {}/{}'.format(pretty_path(out_dir), out_file))
-
-def log_indexing_finished(out_dir: str, out_file: str):
+def log_indexing_process_finished():
     end_ts = datetime.now()
     _LAST_RUN_STATS.index_end_ts = end_ts
-    print('\r    √ {}/{}'.format(out_dir, out_file))
+
+def log_indexing_started(out_path: str):
+    if IS_TTY:
+        sys.stdout.write(f'    > {out_path}')
+
+def log_indexing_finished(out_path: str):
+    print(f'\r    √ {out_path}')
 
 
 ### Archiving Stage
@@ -108,7 +112,7 @@ def log_archiving_paused(num_links: int, idx: int, timestamp: str):
     print('    To view your archive, open:')
     print('        {}/index.html'.format(OUTPUT_DIR))
     print('    Continue archiving where you left off by running:')
-    print('        archivebox {}'.format(timestamp))
+    print('        archivebox update --resume={}'.format(timestamp))
 
 def log_archiving_finished(num_links: int):
     end_ts = datetime.now()

+ 66 - 35
archivebox/legacy/main.py

@@ -9,6 +9,7 @@ from .util import enforce_types, TimedProgress
 from .index import (
     links_after_timestamp,
     load_main_index,
+    import_new_links,
     write_main_index,
 )
 from .archive_methods import archive_link
@@ -19,8 +20,9 @@ from .config import (
     OUTPUT_DIR,
     SOURCES_DIR,
     ARCHIVE_DIR,
-    DATABASE_DIR,
-    DATABASE_FILE,
+    LOGS_DIR,
+    JSON_INDEX_FILENAME,
+    SQL_INDEX_FILENAME,
     check_dependencies,
     check_data_folder,
     setup_django,
@@ -36,60 +38,85 @@ from .logs import (
 )
 
 
+ALLOWED_IN_OUTPUT_DIR = {
+    '.DS_Store',
+    '.venv',
+    'venv',
+    'virtualenv',
+    '.virtualenv',
+    'sources',
+    'archive',
+    'logs',
+    'static',
+}
+
+
 @enforce_types
 def init():
     os.makedirs(OUTPUT_DIR, exist_ok=True)
 
-    harmless_files = {'.DS_Store', '.venv', 'venv', 'virtualenv', '.virtualenv', 'sources', 'archive', 'database', 'logs', 'static'}
-    is_empty = not len(set(os.listdir(OUTPUT_DIR)) - harmless_files)
-    existing_index = os.path.exists(os.path.join(OUTPUT_DIR, 'index.json'))
+    is_empty = not len(set(os.listdir(OUTPUT_DIR)) - ALLOWED_IN_OUTPUT_DIR)
+    existing_index = os.path.exists(os.path.join(OUTPUT_DIR, JSON_INDEX_FILENAME))
 
     if is_empty:
-        stderr('{green}[+] Initializing new archive directory: {}{reset}'.format(OUTPUT_DIR, **ANSI))
-        write_main_index([], out_dir=OUTPUT_DIR, finished=True)
+        print('{green}[+] Initializing new archive directory: {}{reset}'.format(OUTPUT_DIR, **ANSI))
+        print('{green}----------------------------------------------------------------{reset}'.format(**ANSI))
     else:
         if existing_index:
-            stderr('{green}[√] You already have an ArchiveBox collection in the current folder.{reset}'.format(**ANSI))
-            stderr(f'    {OUTPUT_DIR}')
-            stderr(f'    > index.html')
-            stderr(f'    > index.json')
+            print('{green}[√] You already have an ArchiveBox collection in the current folder.{reset}'.format(**ANSI))
+            print('{green}----------------------------------------------------------------{reset}'.format(**ANSI))
+            print(f'    {OUTPUT_DIR}')
         else:
             stderr(
-                ("{red}[X] This folder already has files in it. You must run init inside a completely empty directory.{reset}"
+                ("{red}[X] This folder appears to have non-ArchiveBox files in it. You must run 'archivebox init' inside a completely empty directory.{reset}"
                 "\n\n"
                 "    {lightred}Hint:{reset} To import a data folder created by an older version of ArchiveBox, \n"
-                "    just cd into the folder and run the archivebox command to pick up where you left off.\n\n"
+                "    just cd into the folder and run 'archivebox update' to pick up where you left off.\n\n"
                 "    (Always make sure your data folder is backed up first before updating ArchiveBox)"
                 ).format(OUTPUT_DIR, **ANSI)
             )
             raise SystemExit(1)
 
     os.makedirs(SOURCES_DIR, exist_ok=True)
-    stderr(f'    > sources/')
+    print(f'    > {SOURCES_DIR}')
+    
     os.makedirs(ARCHIVE_DIR, exist_ok=True)
-    stderr(f'    > archive/')
-    os.makedirs(DATABASE_DIR, exist_ok=True)
+    print(f'    > {ARCHIVE_DIR}')
 
-    setup_django()
-    from django.core.management import call_command
-    from django.contrib.auth.models import User
-    stderr(f'    > database/')
+    os.makedirs(LOGS_DIR, exist_ok=True)
+    print(f'    > {LOGS_DIR}')
     
-    stderr('\n{green}[+] Running Django migrations...{reset}'.format(**ANSI))
+    print('\n{green}[+] Running Django migrations...{reset}'.format(**ANSI))
+    setup_django(OUTPUT_DIR, check_db=False)
+    from django.core.management import call_command
+    from django.conf import settings
+    assert settings.DATABASE_FILE == os.path.join(OUTPUT_DIR, SQL_INDEX_FILENAME)
+    print(f'    {settings.DATABASE_FILE}')
+
+
     call_command("makemigrations", interactive=False)
     call_command("migrate", interactive=False)
+
+    assert os.path.exists(settings.DATABASE_FILE)
     
-    if not User.objects.filter(is_superuser=True).exists():
-        stderr('{green}[+] Creating admin user account...{reset}'.format(**ANSI))
-        call_command("createsuperuser", interactive=True)
+    # from django.contrib.auth.models import User
+    # if IS_TTY and not User.objects.filter(is_superuser=True).exists():
+    #     print('{green}[+] Creating admin user account...{reset}'.format(**ANSI))
+    #     call_command("createsuperuser", interactive=True)
+
+    if existing_index:
+        all_links = load_main_index(out_dir=OUTPUT_DIR)
+        write_main_index(links=list(all_links), out_dir=OUTPUT_DIR)
+    else:
+        write_main_index([], out_dir=OUTPUT_DIR)
 
-    stderr('\n{green}------------------------------------------------------------{reset}'.format(**ANSI))
-    stderr('{green}[√] Done. ArchiveBox collection is set up in current folder.{reset}'.format(**ANSI))
-    stderr('    To add new links, you can run:')
-    stderr("        archivebox add 'https://example.com'")
-    stderr()
-    stderr('    For more usage and examples, run:')
-    stderr('        archivebox help')
+    print('\n{green}----------------------------------------------------------------{reset}'.format(**ANSI))
+    print('{green}[√] Done. ArchiveBox collection is set up in the current folder.{reset}'.format(**ANSI))
+    print('    To add new links, you can run:')
+    print("        archivebox add 'https://example.com'")
+    print()
+    print('    For more usage and examples, run:')
+    print('        archivebox help')
 
 
 
@@ -102,7 +129,11 @@ def update_archive_data(import_path: Optional[str]=None, resume: Optional[float]
 
     # Step 1: Load list of links from the existing index
     #         merge in and dedupe new links from import_path
-    all_links, new_links = load_main_index(out_dir=OUTPUT_DIR, import_path=import_path)
+    new_links: List[Link] = []
+    all_links = load_main_index(out_dir=OUTPUT_DIR)
+    if import_path:
+        all_links, new_links = import_new_links(all_links, import_path)
 
     # Step 2: Write updated index with deduped old and new links back to disk
     write_main_index(links=list(all_links), out_dir=OUTPUT_DIR)
@@ -127,7 +158,7 @@ def update_archive_data(import_path: Optional[str]=None, resume: Optional[float]
     log_archiving_finished(len(links))
 
     # Step 4: Re-write links index with updated titles, icons, and resources
-    all_links, _ = load_main_index(out_dir=OUTPUT_DIR)
+    all_links = load_main_index(out_dir=OUTPUT_DIR)
     write_main_index(links=list(all_links), out_dir=OUTPUT_DIR, finished=True)
     return all_links
 
@@ -152,7 +183,7 @@ def link_matches_filter(link: Link, filter_patterns: List[str], filter_type: str
 def list_archive_data(filter_patterns: Optional[List[str]]=None, filter_type: str='exact',
                       after: Optional[float]=None, before: Optional[float]=None) -> Iterable[Link]:
     
-    all_links, _ = load_main_index(out_dir=OUTPUT_DIR)
+    all_links = load_main_index(out_dir=OUTPUT_DIR)
 
     for link in all_links:
         if after is not None and float(link.timestamp) < after:
@@ -198,7 +229,7 @@ def remove_archive_links(filter_patterns: List[str], filter_type: str='exact',
     timer = TimedProgress(360, prefix='      ')
     try:
         to_keep = []
-        all_links, _ = load_main_index(out_dir=OUTPUT_DIR)
+        all_links = load_main_index(out_dir=OUTPUT_DIR)
         for link in all_links:
             should_remove = (
                 (after is not None and float(link.timestamp) < after)

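After a successful `archivebox init`, the collection folder is expected to contain (as asserted by tests.py below):

    output/
    ├── index.sqlite3    # SQL main index, created by the Django migrations
    ├── index.json       # JSON main index
    ├── index.html       # HTML main index
    ├── archive/
    ├── sources/
    └── logs/
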
+ 3 - 2
archivebox/legacy/storage/html.py

@@ -13,6 +13,7 @@ from ..config import (
     GIT_SHA,
     FOOTER_INFO,
     ARCHIVE_DIR_NAME,
+    HTML_INDEX_FILENAME,
 )
 from ..util import (
     enforce_types,
@@ -44,7 +45,7 @@ def write_html_main_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished:
     copy_and_overwrite(join(TEMPLATES_DIR, 'static'), join(out_dir, 'static'))
     
     rendered_html = main_index_template(links, finished=finished)
-    atomic_write(rendered_html, join(out_dir, 'index.html'))
+    atomic_write(rendered_html, join(out_dir, HTML_INDEX_FILENAME))
 
 
 @enforce_types
@@ -100,7 +101,7 @@ def write_html_link_details(link: Link, out_dir: Optional[str]=None) -> None:
     out_dir = out_dir or link.link_dir
 
     rendered_html = link_details_template(link)
-    atomic_write(rendered_html, join(out_dir, 'index.html'))
+    atomic_write(rendered_html, join(out_dir, HTML_INDEX_FILENAME))
 
 
 @enforce_types

+ 28 - 11
archivebox/legacy/storage/json.py

@@ -1,6 +1,7 @@
 __package__ = 'archivebox.legacy.storage'
 
 import os
+import sys
 import json
 
 from datetime import datetime
@@ -10,12 +11,33 @@ from ..schema import Link, ArchiveResult
 from ..config import (
     VERSION,
     OUTPUT_DIR,
+    FOOTER_INFO,
+    GIT_SHA,
+    DEPENDENCIES,
+    JSON_INDEX_FILENAME,
 )
 from ..util import (
     enforce_types,
     atomic_write,
 )
 
+MAIN_INDEX_HEADER = {
+    'info': 'This is an index of site data archived by ArchiveBox: The self-hosted web archive.',
+    'schema': 'archivebox.legacy.storage.json',
+    'copyright_info': FOOTER_INFO,
+    'meta': {
+        'project': 'ArchiveBox',
+        'cmd': sys.argv,
+        'version': VERSION,
+        'git_sha': GIT_SHA,
+        'website': 'https://ArchiveBox.io',
+        'docs': 'https://github.com/pirate/ArchiveBox/wiki',
+        'source': 'https://github.com/pirate/ArchiveBox',
+        'issues': 'https://github.com/pirate/ArchiveBox/issues',
+        'dependencies': DEPENDENCIES,
+    },
+}
+
 
 ### Main Links Index
 
@@ -23,7 +45,7 @@ from ..util import (
 def parse_json_main_index(out_dir: str=OUTPUT_DIR) -> Iterator[Link]:
     """parse a archive index json file and return the list of links"""
 
-    index_path = os.path.join(out_dir, 'index.json')
+    index_path = os.path.join(out_dir, JSON_INDEX_FILENAME)
     if os.path.exists(index_path):
         with open(index_path, 'r', encoding='utf-8') as f:
             links = json.load(f)['links']
@@ -46,18 +68,13 @@ def write_json_main_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:
     if links and links[0].sources:
         assert isinstance(links[0].sources[0], str)
 
-    path = os.path.join(out_dir, 'index.json')
-
-    index_json = {
-        'info': 'ArchiveBox Index',
-        'source': 'https://github.com/pirate/ArchiveBox',
-        'docs': 'https://github.com/pirate/ArchiveBox/wiki',
-        'version': VERSION,
+    main_index_json = {
+        **MAIN_INDEX_HEADER,
         'num_links': len(links),
         'updated': datetime.now(),
         'links': links,
     }
-    atomic_write(index_json, path)
+    atomic_write(main_index_json, os.path.join(out_dir, JSON_INDEX_FILENAME))
 
 
 ### Link Details Index
@@ -67,7 +84,7 @@ def write_json_link_details(link: Link, out_dir: Optional[str]=None) -> None:
     """write a json file with some info about the link"""
     
     out_dir = out_dir or link.link_dir
-    path = os.path.join(out_dir, 'index.json')
+    path = os.path.join(out_dir, JSON_INDEX_FILENAME)
 
     atomic_write(link._asdict(extended=True), path)
 
@@ -75,7 +92,7 @@ def write_json_link_details(link: Link, out_dir: Optional[str]=None) -> None:
 @enforce_types
 def parse_json_link_details(out_dir: str) -> Optional[Link]:
     """load the json link index from a given directory"""
-    existing_index = os.path.join(out_dir, 'index.json')
+    existing_index = os.path.join(out_dir, JSON_INDEX_FILENAME)
     if os.path.exists(existing_index):
         with open(existing_index, 'r', encoding='utf-8') as f:
             link_json = json.load(f)

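With the shared MAIN_INDEX_HEADER, reading the JSON index back is symmetric with writing it. A short sketch (directory hypothetical):

    from archivebox.legacy.storage.json import parse_json_main_index, write_json_main_index

    write_json_main_index(links, out_dir='/data/archive')
    links = list(parse_json_main_index(out_dir='/data/archive'))
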
+ 5 - 5
archivebox/legacy/storage/sql.py

@@ -4,14 +4,14 @@ from typing import List, Iterator
 
 from ..schema import Link
 from ..util import enforce_types
-from ..config import setup_django
+from ..config import setup_django, OUTPUT_DIR
 
 
 ### Main Links Index
 
 @enforce_types
-def parse_sql_main_index() -> Iterator[Link]:
-    setup_django()
+def parse_sql_main_index(out_dir: str=OUTPUT_DIR) -> Iterator[Link]:
+    setup_django(out_dir, check_db=True)
     from core.models import Page
 
     return (
@@ -20,8 +20,8 @@ def parse_sql_main_index() -> Iterator[Link]:
     )
 
 @enforce_types
-def write_sql_main_index(links: List[Link]) -> None:
-    setup_django()
+def write_sql_main_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:
+    setup_django(out_dir, check_db=True)
     from core.models import Page
 
     for link in links:

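The SQL storage functions now accept out_dir so each call operates on the intended collection. Usage mirrors the JSON layer (a sketch):

    from archivebox.legacy.storage.sql import parse_sql_main_index, write_sql_main_index

    write_sql_main_index(links, out_dir=OUTPUT_DIR)
    rows = list(parse_sql_main_index(out_dir=OUTPUT_DIR))   # compared against the JSON index by url in load_main_index()
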
+ 40 - 11
archivebox/tests.py

@@ -27,6 +27,11 @@ os.environ.update(TEST_CONFIG)
 
 from .legacy.main import init
 from .legacy.index import load_main_index
+from .legacy.config import (
+    SQL_INDEX_FILENAME,
+    JSON_INDEX_FILENAME,
+    HTML_INDEX_FILENAME,
+)
 
 from .cli import (
     archivebox_init,
@@ -55,12 +60,12 @@ and example14.badb
 <or>htt://example15.badc</that>
 '''
 
+stdout = sys.stdout
+stderr = sys.stderr
+
 
 @contextmanager
 def output_hidden(show_failing=True):
-    stdout = sys.stdout
-    stderr = sys.stderr
-
     if not HIDE_CLI_OUTPUT:
         yield
         return
@@ -100,6 +105,11 @@ class TestInit(unittest.TestCase):
         with output_hidden():
             archivebox_init.main([])
 
+        assert os.path.exists(os.path.join(OUTPUT_DIR, SQL_INDEX_FILENAME))
+        assert os.path.exists(os.path.join(OUTPUT_DIR, JSON_INDEX_FILENAME))
+        assert os.path.exists(os.path.join(OUTPUT_DIR, HTML_INDEX_FILENAME))
+        assert len(load_main_index(out_dir=OUTPUT_DIR)) == 0
+
     def test_conflicting_init(self):
         with open(os.path.join(OUTPUT_DIR, 'test_conflict.txt'), 'w+') as f:
             f.write('test')
@@ -108,9 +118,25 @@ class TestInit(unittest.TestCase):
             with output_hidden(show_failing=False):
                 archivebox_init.main([])
             assert False, 'Init should have exited with an exception'
+        except SystemExit:
+            pass
+
+        assert not os.path.exists(os.path.join(OUTPUT_DIR, SQL_INDEX_FILENAME))
+        assert not os.path.exists(os.path.join(OUTPUT_DIR, JSON_INDEX_FILENAME))
+        assert not os.path.exists(os.path.join(OUTPUT_DIR, HTML_INDEX_FILENAME))
+        try:
+            load_main_index(out_dir=OUTPUT_DIR)
+        except:
+            pass
+        else:
+            # keep the assert out of the try block so the bare except can't swallow it
+            assert False, 'load_main_index should raise an exception when no index is present'
 
+    def test_no_dirty_state(self):
+        with output_hidden():
+            init()
+        shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
+        with output_hidden():
+            init()
+
 
 class TestAdd(unittest.TestCase):
     def setUp(self):
@@ -125,7 +151,7 @@ class TestAdd(unittest.TestCase):
         with output_hidden():
             archivebox_add.main(['https://getpocket.com/users/nikisweeting/feed/all'])
 
-        all_links, _ = load_main_index(out_dir=OUTPUT_DIR)
+        all_links = load_main_index(out_dir=OUTPUT_DIR)
         assert len(all_links) == 30
 
     def test_add_arg_file(self):
@@ -136,7 +162,7 @@ class TestAdd(unittest.TestCase):
         with output_hidden():
             archivebox_add.main([test_file])
 
-        all_links, _ = load_main_index(out_dir=OUTPUT_DIR)
+        all_links = load_main_index(out_dir=OUTPUT_DIR)
         assert len(all_links) == 12
         os.remove(test_file)
 
@@ -144,7 +170,7 @@ class TestAdd(unittest.TestCase):
         with output_hidden():
             archivebox_add.main([], stdin=test_urls)
 
-        all_links, _ = load_main_index(out_dir=OUTPUT_DIR)
+        all_links = load_main_index(out_dir=OUTPUT_DIR)
         assert len(all_links) == 12
 
 
@@ -155,29 +181,29 @@ class TestRemove(unittest.TestCase):
             init()
             archivebox_add.main([], stdin=test_urls)
 
-    def tearDown(self):
-        shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
+    # def tearDown(self):
+        # shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
 
 
     def test_remove_exact(self):
         with output_hidden():
             archivebox_remove.main(['--yes', '--delete', 'https://example5.com/'])
 
-        all_links, _ = load_main_index(out_dir=OUTPUT_DIR)
+        all_links = load_main_index(out_dir=OUTPUT_DIR)
         assert len(all_links) == 11
 
     def test_remove_regex(self):
         with output_hidden():
             archivebox_remove.main(['--yes', '--delete', '--filter-type=regex', 'http(s)?:\/\/(.+\.)?(example\d\.com)'])
 
-        all_links, _ = load_main_index(out_dir=OUTPUT_DIR)
+        all_links = load_main_index(out_dir=OUTPUT_DIR)
         assert len(all_links) == 4
 
     def test_remove_domain(self):
         with output_hidden():
             archivebox_remove.main(['--yes', '--delete', '--filter-type=domain', 'example5.com', 'example6.com'])
 
-        all_links, _ = load_main_index(out_dir=OUTPUT_DIR)
+        all_links = load_main_index(out_dir=OUTPUT_DIR)
         assert len(all_links) == 10
 
     def test_remove_none(self):
@@ -190,4 +216,7 @@ class TestRemove(unittest.TestCase):
 
 
 if __name__ == '__main__':
+    if '--verbose' in sys.argv or '-v' in sys.argv:
+        HIDE_CLI_OUTPUT = False
+    
     unittest.main()

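The test suite can be run as a module, with subcommand output shown when --verbose is passed (assuming the package is on PYTHONPATH):

    python -m archivebox.tests             # run all tests with CLI output hidden
    python -m archivebox.tests --verbose   # also show output from the archivebox subcommands
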
+ 0 - 17
requirements.txt

@@ -1,17 +0,0 @@
-dataclasses
-django
-base32-crockford
-
-setuptools
-ipdb
-mypy
-django-stubs
-flake8
-
-#wpull
-#pywb
-#pyppeteer
-#GitPython
-#youtube-dl
-#archivenow
-#requests

+ 10 - 1
setup.py

@@ -31,7 +31,7 @@ setuptools.setup(
         'Bug Tracker': 'https://github.com/pirate/ArchiveBox/issues',
         'Roadmap': 'https://github.com/pirate/ArchiveBox/wiki/Roadmap',
         'Changelog': 'https://github.com/pirate/ArchiveBox/wiki/Changelog',
-        'Donations': 'https://github.com/pirate/ArchiveBox/wiki/Donations',
+        'Patreon': 'https://github.com/pirate/ArchiveBox/wiki/Donations',
     },
     packages=setuptools.find_packages(),
     python_requires='>=3.6',
@@ -40,6 +40,15 @@ setuptools.setup(
         "base32-crockford==0.3.0",
         "django==2.2",
         "django-extensions==2.1.6",
+        "youtube-dl",
+
+        # Some/all of these will likely be added in the future:
+        # wpull
+        # pywb
+        # pyppeteer
+        # archivenow
+        # requests
+
     ],
     entry_points={
         'console_scripts': [